Step 1 - Install PySpark and other dependencies

In [1]:
# Install PySpark
!pip install pyspark findspark

print("\n" + "="*50)
print("PySpark installed successfully!")
print("="*50)

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1

PySpark installed successfully!


Step 2 - Import Libraries and Initialize Spark Session

In [2]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col, month, year, avg, stddev,
    min as spark_min, max as spark_max,
    when, lit, count, to_date
)
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import (
    LinearRegression,
    RandomForestRegressor,
    GBTRegressor
)
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline

# Build (or reuse) the Spark session for this analysis, requesting 4g of
# memory for both the driver and the executors.
spark = (
    SparkSession.builder
    .appName("Evapotranspiration_ML_Analysis")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Restrict Spark's own logging to the ERROR level.
spark.sparkContext.setLogLevel("ERROR")

# Report what we are running on.
print("Spark Version:", spark.version)
print("App Name:", spark.sparkContext.appName)

Spark Version: 4.0.1
App Name: Evapotranspiration_ML_Analysis


Step 3 - Load and Explore Data

In [3]:
print("=" * 60, "STEP 3: Loading and Exploring Data", "=" * 60, sep="\n")

# Weather observations: the first CSV row is the header and the column types
# are inferred by Spark.
weather_df = spark.read.csv("weatherData.csv", header=True, inferSchema=True)

# Location table, read with the same options.
location_df = spark.read.csv("locationData.csv", header=True, inferSchema=True)

# Row counts; only the weather count is printed with a thousands separator.
print(f"\nWeather data records: {weather_df.count():,}")
print(f"Location data records: {location_df.count()}")

# Inferred schema of the weather data.
print("", "-" * 60, "Weather Data Schema:", "-" * 60, sep="\n")
weather_df.printSchema()

# First five weather rows, without truncating wide values.
print("", "-" * 60, "Sample Weather Data (5 rows):", "-" * 60, sep="\n")
weather_df.show(5, truncate=False)

STEP 3: Loading and Exploring Data

Weather data records: 142,371
Location data records: 27

------------------------------------------------------------
Weather Data Schema:
------------------------------------------------------------
root
 |-- location_id: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- weather_code (wmo code): integer (nullable = true)
 |-- temperature_2m_max (°C): double (nullable = true)
 |-- temperature_2m_min (°C): double (nullable = true)
 |-- temperature_2m_mean (°C): double (nullable = true)
 |-- apparent_temperature_max (°C): double (nullable = true)
 |-- apparent_temperature_min (°C): double (nullable = true)
 |-- apparent_temperature_mean (°C): double (nullable = true)
 |-- daylight_duration (s): double (nullable = true)
 |-- sunshine_duration (s): double (nullable = true)
 |-- precipitation_sum (mm): double (nullable = true)
 |-- rain_sum (mm): double (nullable = true)
 |-- precipitation_hours (h): integer (nullable = true)
 |-- wind_sp

Step 4 - Data Preparation and Cleaning

In [4]:
print("=" * 60, "STEP 4: Data Preparation and Cleaning", "=" * 60, sep="\n")

# Original CSV header -> short column name without units or special characters.
column_renames = {
    "weather_code (wmo code)": "weather_code",
    "temperature_2m_max (°C)": "temp_max",
    "temperature_2m_min (°C)": "temp_min",
    "temperature_2m_mean (°C)": "temp_mean",
    "apparent_temperature_max (°C)": "apparent_temp_max",
    "apparent_temperature_min (°C)": "apparent_temp_min",
    "apparent_temperature_mean (°C)": "apparent_temp_mean",
    "daylight_duration (s)": "daylight_duration",
    "sunshine_duration (s)": "sunshine_duration",
    "precipitation_sum (mm)": "precipitation_sum",
    "rain_sum (mm)": "rain_sum",
    "precipitation_hours (h)": "precipitation_hours",
    "wind_speed_10m_max (km/h)": "wind_speed",
    "wind_gusts_10m_max (km/h)": "wind_gusts",
    "wind_direction_10m_dominant (°)": "wind_direction",
    "shortwave_radiation_sum (MJ/m²)": "shortwave_radiation",
    "et0_fao_evapotranspiration (mm)": "evapotranspiration",
}

# Apply the renames one by one, in the order listed above.
weather_df_clean = weather_df
for original_name, short_name in column_renames.items():
    weather_df_clean = weather_df_clean.withColumnRenamed(original_name, short_name)

# Parse the "M/d/yyyy" date string, derive month and year from it, and express
# the sunshine duration in hours instead of seconds.
weather_df_clean = (
    weather_df_clean
    .withColumn("date_parsed", to_date(col("date"), "M/d/yyyy"))
    .withColumn("month", month(col("date_parsed")))
    .withColumn("year", year(col("date_parsed")))
    .withColumn("sunshine_hours", col("sunshine_duration") / 3600.0)
)

# Attach the location attributes; the left join keeps every weather row.
full_df = weather_df_clean.join(location_df, on="location_id", how="left")

print("\nData cleaning completed!")
print("\nTransformations applied:")
for applied_step in (
    "  ✓ Renamed columns (removed special characters)",
    "  ✓ Parsed date and extracted month/year",
    "  ✓ Converted sunshine duration to hours",
    "  ✓ Joined with location data",
):
    print(applied_step)

# Preview a few of the cleaned columns.
print("", "-" * 60, "Sample Cleaned Data:", "-" * 60, sep="\n")
full_df.select(
    "date",
    "city_name",
    "precipitation_hours",
    "sunshine_hours",
    "wind_speed",
    "evapotranspiration",
    "month",
    "year",
).show(5)

STEP 4: Data Preparation and Cleaning

Data cleaning completed!

Transformations applied:
  ✓ Renamed columns (removed special characters)
  ✓ Parsed date and extracted month/year
  ✓ Converted sunshine duration to hours
  ✓ Joined with location data

------------------------------------------------------------
Sample Cleaned Data:
------------------------------------------------------------
+--------+---------+-------------------+------------------+----------+------------------+-----+----+
|    date|city_name|precipitation_hours|    sunshine_hours|wind_speed|evapotranspiration|month|year|
+--------+---------+-------------------+------------------+----------+------------------+-----+----+
|1/1/2010|  Colombo|                  0|10.807147222222223|      12.2|              4.61|    1|2010|
|1/2/2010|  Colombo|                  1|10.403058333333334|      13.0|              3.91|    1|2010|
|1/3/2010|  Colombo|                  3| 9.215675000000001|      12.3|              3.66|    1|2010|

Step 5 - Filter Data for May

In [5]:
print("=" * 60, "STEP 5: Filter Data for May (Month = 5)", "=" * 60, sep="\n")

# Keep only the rows whose extracted month equals 5 (May).
may_df = full_df.filter(col("month") == 5)

print(f"\nTotal records for May: {may_df.count():,}")

# Distinct years present in the May subset, in ascending order.
print("\nYears covered in May data:")
(
    may_df
    .select("year")
    .distinct()
    .orderBy("year")
    .show(20)
)

# Summary statistics of the key columns.
print("", "-" * 60, "Descriptive Statistics for May Data:", "-" * 60, sep="\n")
(
    may_df
    .select("precipitation_hours", "sunshine_hours", "wind_speed", "evapotranspiration")
    .describe()
    .show()
)

# Null count per key column (one Spark job per column).
print("", "-" * 60, "Null Value Counts in Key Columns:", "-" * 60, sep="\n")
for column in ("precipitation_hours", "sunshine_hours", "wind_speed", "evapotranspiration"):
    is_missing = col(column).isNull()
    null_count = may_df.filter(is_missing).count()
    print(f"  {column}: {null_count} nulls")

STEP 5: Filter Data for May (Month = 5)

Total records for May: 12,555

Years covered in May data:
+----+
|year|
+----+
|2010|
|2011|
|2012|
|2013|
|2014|
|2015|
|2016|
|2017|
|2018|
|2019|
|2020|
|2021|
|2022|
|2023|
|2024|
+----+


------------------------------------------------------------
Descriptive Statistics for May Data:
------------------------------------------------------------
+-------+-------------------+------------------+-----------------+------------------+
|summary|precipitation_hours|    sunshine_hours|       wind_speed|evapotranspiration|
+-------+-------------------+------------------+-----------------+------------------+
|  count|              12555|             12555|            12555|             12555|
|   mean|  8.011867781760255| 9.528481240320387|17.84893667861412| 4.225671843886905|
| stddev|  6.946432602146947|2.5585355610205562|7.709660922461054|1.1799446246284135|
|    min|                  0|               0.0|              2.5|              0.59|
|    

Step 6 - Feature Selection

In [6]:
print("=" * 60, "STEP 6: Feature Selection", "=" * 60, sep="\n")
# Predictor columns used for modelling, in this order.
feature_columns = [
    "precipitation_hours", "sunshine_hours", "wind_speed",
    "temp_mean", "shortwave_radiation",
]

print("", "=" * 60, "FEATURE SELECTION", "=" * 60, sep="\n")
print("\nSelected Features for Modeling:")
# Numbered listing, starting at 1.
for position, feature_name in enumerate(feature_columns, start=1):
    print(f"  {position}. {feature_name}")
print("\nTarget Variable: evapotranspiration (mm)")

STEP 6: Feature Selection

FEATURE SELECTION

Selected Features for Modeling:
  1. precipitation_hours
  2. sunshine_hours
  3. wind_speed
  4. temp_mean
  5. shortwave_radiation

Target Variable: evapotranspiration (mm)


Step 7 - Data Preparation for ML

In [7]:
print("="*60)
print("STEP 7: Data Preparation for Machine Learning")
print("="*60)

# Columns the models need: every feature plus the target.
model_columns = feature_columns + ["evapotranspiration"]

# Ensure all modelling columns are DoubleType.
may_df_ml = may_df
for col_name in model_columns:
    may_df_ml = may_df_ml.withColumn(col_name, col(col_name).cast(DoubleType()))

# Remove rows with a null in any modelling column and mark the result for
# caching. cache() is lazy, so it has to be requested BEFORE the first action:
# the count() below then both reports the size and materializes the cache.
# (Previously cache() was called after count(), so nothing was cached yet when
# the "cached" message was printed.)
may_df_ml = may_df_ml.dropna(subset=model_columns).cache()

print(f"\nFinal dataset size after cleaning: {may_df_ml.count():,} records")
print("Dataset cached for faster processing.")

STEP 7: Data Preparation for Machine Learning

Final dataset size after cleaning: 12,555 records
Dataset cached for faster processing.


Step 8 - Correlation Analysis

In [8]:
print("=" * 60, "STEP 8: Correlation Analysis", "=" * 60, sep="\n")

# Pull the features and the target into pandas for the correlation analysis.
pandas_df = may_df_ml.select(*feature_columns, "evapotranspiration").toPandas()

print("\nCorrelation with Evapotranspiration:")
print("-" * 50)

# feature name -> correlation of that feature with evapotranspiration
correlations = {}
for feature in feature_columns:
    corr = pandas_df[feature].corr(pandas_df["evapotranspiration"])
    correlations[feature] = corr

    # Anything that is not strictly positive is labelled negative.
    if corr > 0:
        direction = "↑ Positive"
    else:
        direction = "↓ Negative"

    # Strength buckets on the absolute value: > 0.5 strong, > 0.3 moderate.
    if abs(corr) > 0.5:
        strength = "Strong"
    elif abs(corr) > 0.3:
        strength = "Moderate"
    else:
        strength = "Weak"

    print(f"  {feature:25} : {corr:>8.4f}  ({strength} {direction})")

print("", "-" * 50, "Interpretation:", "-" * 50, sep="\n")
for guidance_line in (
    "• Positive correlation: As feature increases, ET increases",
    "• Negative correlation: As feature increases, ET decreases",
    "• For LOW evapotranspiration, we want:",
    "  - Higher values of negatively correlated features",
    "  - Lower values of positively correlated features",
):
    print(guidance_line)

STEP 8: Correlation Analysis

Correlation with Evapotranspiration:
--------------------------------------------------
  precipitation_hours       :  -0.7430  (Strong ↓ Negative)
  sunshine_hours            :   0.7394  (Strong ↑ Positive)
  wind_speed                :   0.3994  (Moderate ↑ Positive)
  temp_mean                 :   0.5832  (Strong ↑ Positive)
  shortwave_radiation       :   0.8842  (Strong ↑ Positive)

--------------------------------------------------
Interpretation:
--------------------------------------------------
• Positive correlation: As feature increases, ET increases
• Negative correlation: As feature increases, ET decreases
• For LOW evapotranspiration, we want:
  - Higher values of negatively correlated features
  - Lower values of positively correlated features


Step 9 - Train-Test Split (80-20)

In [9]:
print("=" * 60, "STEP 9: Train-Test Split (80% Training, 20% Validation)", "=" * 60, sep="\n")

# Packs the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_raw")

# Standardizes the assembled vector (centering and scaling both enabled).
scaler = StandardScaler(
    inputCol="features_raw", outputCol="features", withStd=True, withMean=True
)

# Seeded random split with 0.8 / 0.2 weights.
train_df, test_df = may_df_ml.randomSplit([0.8, 0.2], seed=42)

train_count = train_df.count()
test_count = test_df.count()
total_count = train_count + test_count

# Summary table of the realised split sizes.
print(f"\n{'Dataset':<20} {'Records':>10} {'Percentage':>12}")
print("-" * 45)
for split_label, split_size in (("Training Set", train_count), ("Validation Set", test_count)):
    print(f"{split_label:<20} {split_size:>10,} {split_size/total_count*100:>11.1f}%")
print("-" * 45)
print(f"{'Total':<20} {total_count:>10,} {'100.0%':>12}")

STEP 9: Train-Test Split (80% Training, 20% Validation)

Dataset                 Records   Percentage
---------------------------------------------
Training Set             10,128        80.7%
Validation Set            2,427        19.3%
---------------------------------------------
Total                    12,555       100.0%


Step 10 - Train Linear Regression Model

In [10]:
print("=" * 60, "STEP 10: Training Linear Regression Model", "=" * 60, sep="\n")

# Evaluator for the models; its metric is switched before each evaluation.
evaluator = RegressionEvaluator(labelCol="evapotranspiration", predictionCol="prediction")

# model display name -> {"model", "RMSE", "R2", "MAE"}
model_results = {}

# Linear regression with elastic-net regularisation.
lr = LinearRegression(
    featuresCol="features",
    labelCol="evapotranspiration",
    maxIter=100,
    regParam=0.01,
    elasticNetParam=0.5,
)

# assemble -> scale -> regress
lr_pipeline = Pipeline(stages=[assembler, scaler, lr])

# Fit on the training split, predict on the held-out split.
print("\nTraining Linear Regression model...")
lr_model = lr_pipeline.fit(train_df)
lr_predictions = lr_model.transform(test_df)

# The metric setter returns the evaluator itself, so each metric is selected
# and evaluated in one expression (the evaluator is left on "mae" afterwards).
lr_rmse = evaluator.setMetricName("rmse").evaluate(lr_predictions)
lr_r2 = evaluator.setMetricName("r2").evaluate(lr_predictions)
lr_mae = evaluator.setMetricName("mae").evaluate(lr_predictions)

model_results["Linear Regression"] = dict(
    model=lr_model, RMSE=lr_rmse, R2=lr_r2, MAE=lr_mae
)

print("", "-" * 50, "Linear Regression - Evaluation Metrics:", "-" * 50, sep="\n")
print(f"  RMSE (Root Mean Square Error): {lr_rmse:.4f}")
print(f"  R² (Coefficient of Determination): {lr_r2:.4f}")
print(f"  MAE (Mean Absolute Error): {lr_mae:.4f}")

# The fitted regression is the last pipeline stage.
lr_coefficients = lr_model.stages[-1].coefficients.toArray()
lr_intercept = lr_model.stages[-1].intercept

print("", "-" * 50, "Model Coefficients (Standardized):", "-" * 50, sep="\n")
for i, feature in enumerate(feature_columns):
    # A coefficient that is not strictly positive is reported as "decreases".
    if lr_coefficients[i] > 0:
        impact = "increases"
    else:
        impact = "decreases"
    print(f"  {feature:25}: {lr_coefficients[i]:>10.6f}  (ET {impact})")
print(f"  {'Intercept':25}: {lr_intercept:>10.6f}")

STEP 10: Training Linear Regression Model

Training Linear Regression model...

--------------------------------------------------
Linear Regression - Evaluation Metrics:
--------------------------------------------------
  RMSE (Root Mean Square Error): 0.3266
  R² (Coefficient of Determination): 0.9247
  MAE (Mean Absolute Error): 0.2393

--------------------------------------------------
Model Coefficients (Standardized):
--------------------------------------------------
  precipitation_hours      :  -0.246957  (ET decreases)
  sunshine_hours           :  -0.091560  (ET decreases)
  wind_speed               :   0.204664  (ET increases)
  temp_mean                :   0.255108  (ET increases)
  shortwave_radiation      :   0.830731  (ET increases)
  Intercept                :   4.225302


Step 11 - Train Random Forest Model

In [11]:
print("=" * 60, "STEP 11: Training Random Forest Model", "=" * 60, sep="\n")

# Seeded random forest: 100 trees, depth limit 10.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="evapotranspiration",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# assemble -> scale -> forest
rf_pipeline = Pipeline(stages=[assembler, scaler, rf])

# Fit on the training split, predict on the held-out split.
print("\nTraining Random Forest model (100 trees)...")
rf_model = rf_pipeline.fit(train_df)
rf_predictions = rf_model.transform(test_df)

# Select and evaluate each metric in turn (the evaluator ends up on "mae").
rf_rmse = evaluator.setMetricName("rmse").evaluate(rf_predictions)
rf_r2 = evaluator.setMetricName("r2").evaluate(rf_predictions)
rf_mae = evaluator.setMetricName("mae").evaluate(rf_predictions)

model_results["Random Forest"] = dict(
    model=rf_model, RMSE=rf_rmse, R2=rf_r2, MAE=rf_mae
)

print("", "-" * 50, "Random Forest - Evaluation Metrics:", "-" * 50, sep="\n")
print(f"  RMSE (Root Mean Square Error): {rf_rmse:.4f}")
print(f"  R² (Coefficient of Determination): {rf_r2:.4f}")
print(f"  MAE (Mean Absolute Error): {rf_mae:.4f}")

# Importances come from the fitted forest, the last pipeline stage.
rf_importance = rf_model.stages[-1].featureImportances.toArray()

print("", "-" * 50, "Feature Importance:", "-" * 50, sep="\n")
# (feature, importance) pairs, most important first.
importance_list = sorted(
    ((feature, rf_importance[i]) for i, feature in enumerate(feature_columns)),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in importance_list:
    # One block character per 0.02 of importance, rounded down.
    bar = "█" * int(importance * 50)
    print(f"  {feature:25}: {importance:.4f} {bar}")

STEP 11: Training Random Forest Model

Training Random Forest model (100 trees)...

--------------------------------------------------
Random Forest - Evaluation Metrics:
--------------------------------------------------
  RMSE (Root Mean Square Error): 0.2451
  R² (Coefficient of Determination): 0.9576
  MAE (Mean Absolute Error): 0.1683

--------------------------------------------------
Feature Importance:
--------------------------------------------------
  shortwave_radiation      : 0.4322 █████████████████████
  temp_mean                : 0.2349 ███████████
  precipitation_hours      : 0.1739 ████████
  sunshine_hours           : 0.1148 █████
  wind_speed               : 0.0442 ██


Step 12 - Train Gradient Boosted Trees Model

In [12]:
print("="*60)
print("STEP 12: Training Gradient Boosted Trees Model")
print("="*60)

# Gradient Boosted Trees model: 100 boosting iterations, depth limit 5, seeded.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="evapotranspiration",
    maxIter=100,
    maxDepth=5,
    seed=42
)

# Pipeline: assemble -> scale -> boosted trees
gbt_pipeline = Pipeline(stages=[assembler, scaler, gbt])

# Fit on the training split, predict on the held-out split.
print("\nTraining Gradient Boosted Trees model (100 iterations)...")
gbt_model = gbt_pipeline.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)

# Evaluate with the shared evaluator, switching the metric each time.
evaluator.setMetricName("rmse")
gbt_rmse = evaluator.evaluate(gbt_predictions)
evaluator.setMetricName("r2")
gbt_r2 = evaluator.evaluate(gbt_predictions)
evaluator.setMetricName("mae")
gbt_mae = evaluator.evaluate(gbt_predictions)

model_results["Gradient Boosted Trees"] = {
    "model": gbt_model,
    "RMSE": gbt_rmse,
    "R2": gbt_r2,
    "MAE": gbt_mae
}

print("\n" + "-"*50)
print("Gradient Boosted Trees - Evaluation Metrics:")
print("-"*50)
print(f"  RMSE (Root Mean Square Error): {gbt_rmse:.4f}")
print(f"  R² (Coefficient of Determination): {gbt_r2:.4f}")
print(f"  MAE (Mean Absolute Error): {gbt_mae:.4f}")

print("\n" + "-"*50)
print("Feature Importance (Baseline):")
print("-"*50)
# Importances come from the fitted GBT model, the last pipeline stage.
# The variable used to be misspelled `ggt_importance`; it now follows the
# `gbt_*` naming of this cell.
gbt_importance = gbt_model.stages[-1].featureImportances.toArray()
ggt_importance = gbt_importance  # backward-compatible alias for the old misspelled name
importance_list = [(feature, gbt_importance[i]) for i, feature in enumerate(feature_columns)]
importance_list.sort(key=lambda x: x[1], reverse=True)
for feature, importance in importance_list:
    # One block character per 0.02 of importance, rounded down.
    bar = "█" * int(importance * 50)
    print(f"  {feature:25}: {importance:.4f} {bar}")

STEP 12: Training Gradient Boosted Trees Model

Training Gradient Boosted Trees model (100 iterations)...

--------------------------------------------------
Gradient Boosted Trees - Evaluation Metrics:
--------------------------------------------------
  RMSE (Root Mean Square Error): 0.2546
  R² (Coefficient of Determination): 0.9543
  MAE (Mean Absolute Error): 0.1735

--------------------------------------------------
Feature Importance (Baseline):
--------------------------------------------------
  shortwave_radiation      : 0.5737 ████████████████████████████
  temp_mean                : 0.2379 ███████████
  wind_speed               : 0.0969 ████
  precipitation_hours      : 0.0647 ███
  sunshine_hours           : 0.0268 █


Step 13 - Model Comparison and Selection

In [13]:
print("=" * 60, "STEP 13: Model Comparison and Selection", "=" * 60, sep="\n")

# Side-by-side metrics table, one row per trained model.
print("", "-" * 70, sep="\n")
print(f"{'Model':<25} {'RMSE':>12} {'R²':>12} {'MAE':>12}")
print("-" * 70)
for model_name, metrics in model_results.items():
    print(f"{model_name:<25} {metrics['RMSE']:>12.4f} {metrics['R2']:>12.4f} {metrics['MAE']:>12.4f}")
print("-" * 70)

# The winner is the model with the highest R²; on a tie the first one in
# insertion order is kept.
best_model_name = max(model_results, key=lambda name: model_results[name]["R2"])
best_entry = model_results[best_model_name]
best_model = best_entry["model"]

print(f"\n★ Best Performing Model: {best_model_name}")
print(f"  - Highest R² Score: {best_entry['R2']:.4f}")
print(f"  - RMSE: {best_entry['RMSE']:.4f}")
print(f"  - MAE: {best_entry['MAE']:.4f}")

STEP 13: Model Comparison and Selection

----------------------------------------------------------------------
Model                             RMSE           R²          MAE
----------------------------------------------------------------------
Linear Regression               0.3266       0.9247       0.2393
Random Forest                   0.2451       0.9576       0.1683
Gradient Boosted Trees          0.2546       0.9543       0.1735
----------------------------------------------------------------------

★ Best Performing Model: Random Forest
  - Highest R² Score: 0.9576
  - RMSE: 0.2451
  - MAE: 0.1683


Step 14 - Analysis of Low Evapotranspiration Conditions (without ML)

In [19]:
print("=" * 60)
print("STEP 14: Determining Conditions for Low Evapotranspiration (without ML)")
print("=" * 60)

# Historical May records whose evapotranspiration is below 1.5 mm.
low_et_df = may_df_ml.filter(col("evapotranspiration") < 1.5)
low_et_count = low_et_df.count()

# Count the May records once and guard the percentage against an empty dataset.
may_record_count = may_df_ml.count()
low_et_share = low_et_count / may_record_count * 100 if may_record_count else 0.0

print(f"\nHistorical records with evapotranspiration < 1.5mm in May: {low_et_count:,}")
print(f"Percentage of total May records: {low_et_share:.2f}%")

# Always (re)bind low_et_stats. It used to be assigned only inside the `if`
# below, so when no low-ET records were found a later cell would either hit a
# NameError or silently reuse a stale value left in the kernel by an earlier run.
low_et_stats = None

if low_et_count > 0:
    # Mean / std dev / min / max of precipitation hours, sunshine hours and
    # wind speed, plus mean / min / max of ET, over the low-ET records.
    # (temp_mean and shortwave_radiation are not summarised here.)
    low_et_stats = low_et_df.select(
        # Precipitation hours
        avg("precipitation_hours").alias("mean_precip_hours"),
        stddev("precipitation_hours").alias("std_precip_hours"),
        spark_min("precipitation_hours").alias("min_precip_hours"),
        spark_max("precipitation_hours").alias("max_precip_hours"),
        # Sunshine hours
        avg("sunshine_hours").alias("mean_sunshine_hours"),
        stddev("sunshine_hours").alias("std_sunshine_hours"),
        spark_min("sunshine_hours").alias("min_sunshine_hours"),
        spark_max("sunshine_hours").alias("max_sunshine_hours"),
        # Wind speed
        avg("wind_speed").alias("mean_wind_speed"),
        stddev("wind_speed").alias("std_wind_speed"),
        spark_min("wind_speed").alias("min_wind_speed"),
        spark_max("wind_speed").alias("max_wind_speed"),
        # ET
        avg("evapotranspiration").alias("mean_et"),
        spark_min("evapotranspiration").alias("min_et"),
        spark_max("evapotranspiration").alias("max_et")
    ).collect()[0]

    print("\n" + "-" * 60)
    print("Statistics for Low Evapotranspiration Conditions (ET < 1.5mm):")
    print("-" * 60)

    print("\n Precipitation Hours:")
    print(f"      Mean:     {low_et_stats['mean_precip_hours']:.2f} hours")
    print(f"      Std Dev:  {low_et_stats['std_precip_hours']:.2f}")
    print(f"      Range:    {low_et_stats['min_precip_hours']:.2f} - {low_et_stats['max_precip_hours']:.2f}")

    print("\n Sunshine Hours:")
    print(f"      Mean:     {low_et_stats['mean_sunshine_hours']:.2f} hours")
    print(f"      Std Dev:  {low_et_stats['std_sunshine_hours']:.2f}")
    print(f"      Range:    {low_et_stats['min_sunshine_hours']:.2f} - {low_et_stats['max_sunshine_hours']:.2f}")

    print("\n Wind Speed:")
    print(f"      Mean:     {low_et_stats['mean_wind_speed']:.2f} km/h")
    print(f"      Std Dev:  {low_et_stats['std_wind_speed']:.2f}")
    print(f"      Range:    {low_et_stats['min_wind_speed']:.2f} - {low_et_stats['max_wind_speed']:.2f}")

    print(f"\n Average Evapotranspiration: {low_et_stats['mean_et']:.2f} mm")
else:
    # Make the empty case visible instead of printing nothing.
    print("\nNo May records with evapotranspiration < 1.5mm; low-ET statistics are unavailable.")

STEP 14: Determining Conditions for Low Evapotranspiration (without ML)

Historical records with evapotranspiration < 1.5mm in May: 134
Percentage of total May records: 1.07%

------------------------------------------------------------
Statistics for Low Evapotranspiration Conditions (ET < 1.5mm):
------------------------------------------------------------

 Precipitation Hours:
      Mean:     22.13 hours
      Std Dev:  3.04
      Range:    11.00 - 24.00

 Sunshine Hours:
      Mean:     0.44 hours
      Std Dev:  0.75
      Range:    0.00 - 4.85

 Wind Speed:
      Mean:     18.49 km/h
      Std Dev:  7.49
      Range:    4.20 - 40.00

 Average Evapotranspiration: 1.19 mm


Step 15 - Final Prediction for May 2026 (with Average Conditions for Shortwave Radiation and Temperature)

In [17]:
print("=" * 60)
print("STEP 15: Final Prediction for May 2026 (with Average Conditions for Shortwave Radiation and Temperature)")
print("=" * 60)

# Historical May averages of every feature and of the target, used as baseline.
may_stats = may_df_ml.select(
    *[
        avg(source_column).alias(stat_alias)
        for source_column, stat_alias in (
            ("precipitation_hours", "avg_precip"),
            ("sunshine_hours", "avg_sunshine"),
            ("wind_speed", "avg_wind"),
            ("temp_mean", "avg_temp"),
            ("shortwave_radiation", "avg_radiation"),
            ("evapotranspiration", "avg_et"),
        )
    ]
).collect()[0]

print("\nTypical May Conditions (Historical Average):")
print("-" * 50)
print(f"  Precipitation Hours:    {may_stats['avg_precip']:.2f} hours")
print(f"  Sunshine Hours:         {may_stats['avg_sunshine']:.2f} hours")
print(f"  Wind Speed:             {may_stats['avg_wind']:.2f} km/h")
print(f"  Temperature (Mean):     {may_stats['avg_temp']:.2f} °C")
print(f"  Shortwave Radiation:    {may_stats['avg_radiation']:.2f} MJ/m²")
print(f"  Evapotranspiration:     {may_stats['avg_et']:.2f} mm")

# Recommended conditions are the means observed in the low-ET records
# (low_et_stats is produced by the Step 14 cell).
recommended_precip = low_et_stats['mean_precip_hours']
recommended_sunshine = low_et_stats['mean_sunshine_hours']
recommended_wind = low_et_stats['mean_wind_speed']

# Single scenario row: recommended precipitation / sunshine / wind combined
# with the historical May averages for temperature and radiation.
scenario_row = (
    float(recommended_precip),
    float(recommended_sunshine),
    float(recommended_wind),
    float(may_stats['avg_temp']),
    float(may_stats['avg_radiation']),
)
scenario_columns = [
    "precipitation_hours",
    "sunshine_hours",
    "wind_speed",
    "temp_mean",
    "shortwave_radiation",
]
test_data = spark.createDataFrame([scenario_row], scenario_columns)

# Score the scenario with the best model and read back the single prediction.
prediction = best_model.transform(test_data)
predicted_row = prediction.select("prediction").collect()[0]
predicted_et = predicted_row[0]

print("\n" + "**************************************************")
print("             PREDICTION RESULTS")
print("               FOR MAY 2026")
print("**************************************************")

print(f"\n  Model Used: {best_model_name}")

print("\nTo achieve evapotranspiration LOWER than 1.5mm in May 2026,")
print("the following daily weather conditions are recommended:")

print("\n==================================================================")
print("                 RECOMMENDED WEATHER CONDITIONS                   ")
print("==================================================================")
print(f" Mean Precipitation Hours:   {recommended_precip:>8.2f} hours")
print(f" Mean Sunshine Duration:     {recommended_sunshine:>8.2f} hours")
print(f" Mean Wind Speed:            {recommended_wind:>8.2f} km/h")
print("==================================================================")
print(f" Predicted Evapotranspiration: {predicted_et:>6.4f} mm")
print(f" Target:                       < 1.5000 mm")
print("==================================================================")

# Compare the prediction with the 1.5 mm target and report the outcome.
if predicted_et < 1.5:
    print("\n STATUS: TARGET ACHIEVABLE")
    print(f"    The predicted ET ({predicted_et:.4f} mm) is below the target (1.5 mm)")
else:
    print("\n STATUS: TARGET NOT MET WITH AVERAGE CONDITIONS")
    print(f"    The predicted ET ({predicted_et:.4f} mm) exceeds the target (1.5 mm)")

STEP 15: Final Prediction for May 2026 (with Average Conditions for Shortwave Radiation and Temperature)

Typical May Conditions (Historical Average):
--------------------------------------------------
  Precipitation Hours:    8.01 hours
  Sunshine Hours:         9.53 hours
  Wind Speed:             17.85 km/h
  Temperature (Mean):     26.78 °C
  Shortwave Radiation:    19.15 MJ/m²
  Evapotranspiration:     4.23 mm

**************************************************
             PREDICTION RESULTS
               FOR MAY 2026
**************************************************

  Model Used: Random Forest

To achieve evapotranspiration LOWER than 1.5mm in May 2026,
the following daily weather conditions are recommended:

                 RECOMMENDED WEATHER CONDITIONS                   
 Mean Precipitation Hours:      22.13 hours
 Mean Sunshine Duration:         0.44 hours
 Mean Wind Speed:               18.49 km/h
 Predicted Evapotranspiration: 3.2781 mm
 Target:                       

Step 16 - Analysis of Low Evapotranspiration Conditions (with ML predictions)

In [18]:
print("=" * 60)
print("STEP 16: Determining Conditions for Low Evapotranspiration (with ML Predictions)")
print("=" * 60)

print("\n" + "-"*60)
print("Step 16.1: Calculate Fixed Values from Historical Low-ET Data")
print("-"*60)

# Historical May records where ET < 1.5mm.
historical_low_et = may_df_ml.filter(col("evapotranspiration") < 1.5)
low_et_count = historical_low_et.count()
total_may_count = may_df_ml.count()

# Guard the percentage against an empty May dataset (would divide by zero).
low_et_pct = (low_et_count/total_may_count)*100 if total_may_count else 0.0

print(f"\nHistorical records with ET < 1.5mm: {low_et_count}")
print(f"Percentage of total May records: {low_et_pct:.2f}%")

# With no low-ET records both averages below would be None and float(None)
# would fail with an opaque TypeError; fail early with a clear message instead.
if low_et_count == 0:
    raise ValueError(
        "No historical May records with ET < 1.5mm; "
        "cannot derive fixed radiation and temperature values."
    )

# Mean shortwave_radiation and temp_mean over the low-ET records.
fixed_values = historical_low_et.select(
    avg("shortwave_radiation").alias("mean_radiation"),
    avg("temp_mean").alias("mean_temp")
).collect()[0]

FIXED_RADIATION = float(fixed_values['mean_radiation'])
FIXED_TEMP = float(fixed_values['mean_temp'])

print("\nFixed Values (from historical ET < 1.5mm records):")
print(f"    Shortwave Radiation (Mean): {FIXED_RADIATION:.2f} MJ/m²")
print(f"    Temperature Mean:           {FIXED_TEMP:.2f} °C")

STEP 16: Determining Conditions for Low Evapotranspiration (with ML Predictions)

------------------------------------------------------------
Step 16.1: Calculate Fixed Values from Historical Low-ET Data
------------------------------------------------------------

Historical records with ET < 1.5mm: 134
Percentage of total May records: 1.07%

Fixed Values (from historical ET < 1.5mm records):
    Shortwave Radiation (Mean): 5.42 MJ/m²
    Temperature Mean:           23.69 °C


In [20]:
print("\n" + "-"*60)
print("Step 16.2: Predict ET Using Fixed Values")
print("-"*60)

# Take precipitation_hours, sunshine_hours and wind_speed from every May record
# (rows are NOT de-duplicated) and replace the actual shortwave_radiation and
# temp_mean with the fixed values.

print("\nCreating prediction dataset with fixed values:")
print("   - Using actual precipitation_hours, sunshine_hours, wind_speed from May data")
print(f"   - Replacing shortwave_radiation with: {FIXED_RADIATION:.2f} MJ/m²")
print(f"   - Replacing temp_mean with: {FIXED_TEMP:.2f} °C")

# Modified dataset with constant radiation and temperature columns.
prediction_df = may_df_ml.select(
    col("precipitation_hours"),
    col("sunshine_hours"),
    col("wind_speed")
).withColumn("temp_mean", lit(FIXED_TEMP)) \
 .withColumn("shortwave_radiation", lit(FIXED_RADIATION))

# Reorder columns to match feature_columns order
prediction_df = prediction_df.select(feature_columns)

print(f"\nTotal records for prediction: {prediction_df.count():,}")
print("\nSample of prediction dataset:")
prediction_df.show(5)

# Predict with the model chosen in Step 13. The message reports
# best_model_name instead of a hard-coded "Random Forest", so it stays correct
# if a different model wins the comparison.
print(f"\nMaking predictions using {best_model_name} model...")

# Transform the prediction dataset
predictions_with_et = best_model.transform(prediction_df)

print("Predictions complete!")
print("\nSample predictions:")
predictions_with_et.select(
    "precipitation_hours", "sunshine_hours", "wind_speed",
    "temp_mean", "shortwave_radiation", "prediction"
).show(5)


------------------------------------------------------------
Step 16.2: Predict ET Using Fixed Values
------------------------------------------------------------

Creating prediction dataset with fixed values:
   - Using actual precipitation_hours, sunshine_hours, wind_speed from May data
   - Replacing shortwave_radiation with: 5.42 MJ/m²
   - Replacing temp_mean with: 23.69 °C

Total records for prediction: 12,555

Sample of prediction dataset:
+-------------------+------------------+----------+------------------+-------------------+
|precipitation_hours|    sunshine_hours|wind_speed|         temp_mean|shortwave_radiation|
+-------------------+------------------+----------+------------------+-------------------+
|               14.0|           11.2708|      10.3|23.688059701492534|   5.41820895522388|
|               10.0|              11.0|       9.2|23.688059701492534|   5.41820895522388|
|                9.0|10.886522222222224|      11.9|23.688059701492534|   5.41820895522388|
|

In [21]:
print("\n" + "-"*60)
print("Step 16.3: Filter Combinations with Predicted ET < 1.5mm")
print("-"*60)

# Keep only the rows whose predicted ET is below the 1.5 mm target
low_et_predictions = predictions_with_et.filter(col("prediction") < 1.5)

# Row counts before and after filtering
valid_count = low_et_predictions.count()
total_count = predictions_with_et.count()

# Guard the ratio: an empty prediction set would otherwise raise
# ZeroDivisionError and hide the real problem (no input rows).
success_rate_pct = (valid_count / total_count) * 100 if total_count else 0.0

print(f"\nResults:")
print(f"   Total predictions made: {total_count:,}")
print(f"   Combinations with predicted ET < 1.5mm: {valid_count:,}")
print(f"   Success rate: {success_rate_pct:.2f}%")

# Show sample of valid combinations
print("\nSample of valid combinations (predicted ET < 1.5mm):")
low_et_predictions.select(
    "precipitation_hours", "sunshine_hours", "wind_speed", "prediction"
).show(10)


------------------------------------------------------------
Step 16.3: Filter Combinations with Predicted ET < 1.5mm
------------------------------------------------------------

Results:
   Total predictions made: 12,555
   Combinations with predicted ET < 1.5mm: 187
   Success rate: 1.49%

Sample of valid combinations (predicted ET < 1.5mm):
+-------------------+-------------------+----------+------------------+
|precipitation_hours|     sunshine_hours|wind_speed|        prediction|
+-------------------+-------------------+----------+------------------+
|               23.0|0.47094444444444444|      23.0|1.2980010536600892|
|               23.0|             1.8227|       9.5|1.3959749899538734|
|               24.0| 0.4560472222222222|      11.8|1.1618661064669258|
|               24.0|                0.0|       9.8|1.1419866127557976|
|               24.0|                0.0|      26.3|1.2893803705633133|
|               23.0|                0.0|      23.3|1.2980010536600892|
|   

In [22]:
print("\n" + "-"*60)
print("Step 16.4: Statistics for Low ET Conditions (Predicted ET < 1.5mm)")
print("-"*60)


def _format_stat(value, spec=".2f"):
    """Format an aggregate for display.

    Spark returns NULL (None in Python) when an aggregate is undefined, e.g.
    the sample standard deviation of a single row. Formatting None with a
    float spec raises TypeError, so undefined values are shown as "n/a".
    """
    return "n/a" if value is None else format(value, spec)


# (source column, alias suffix, printed heading, unit printed after the mean)
low_et_feature_specs = [
    ("precipitation_hours", "precip_hours", "Precipitation Hours", "hours"),
    ("sunshine_hours", "sunshine_hours", "Sunshine Hours", "hours"),
    ("wind_speed", "wind_speed", "Wind Speed", "km/h"),
]

# mean / std / min / max for each of the three features, followed by
# mean / min / max of the predicted ET. Alias names are part of the contract:
# later cells read them from low_et_stats by key.
low_et_aggregates = []
for column_name, suffix, _heading, _unit in low_et_feature_specs:
    low_et_aggregates += [
        avg(column_name).alias(f"mean_{suffix}"),
        stddev(column_name).alias(f"std_{suffix}"),
        spark_min(column_name).alias(f"min_{suffix}"),
        spark_max(column_name).alias(f"max_{suffix}"),
    ]
low_et_aggregates += [
    avg("prediction").alias("mean_predicted_et"),
    spark_min("prediction").alias("min_predicted_et"),
    spark_max("prediction").alias("max_predicted_et"),
]

low_et_stats = low_et_predictions.select(*low_et_aggregates).collect()[0]

# With zero matching rows every aggregate is NULL; fail with a clear message
# instead of an opaque formatting TypeError further down.
if low_et_stats["mean_predicted_et"] is None:
    raise ValueError(
        "No combinations with predicted ET < 1.5mm were found; "
        "low-ET statistics are undefined."
    )

for _column, suffix, heading, unit in low_et_feature_specs:
    print(f"\n {heading}:")
    print(f"      Mean:     {_format_stat(low_et_stats[f'mean_{suffix}'])} {unit}")
    print(f"      Std Dev:  {_format_stat(low_et_stats[f'std_{suffix}'])}")
    print(f"      Range:    {_format_stat(low_et_stats[f'min_{suffix}'])} - {_format_stat(low_et_stats[f'max_{suffix}'])}")

print(f"\n Predicted Evapotranspiration:")
print(f"      Mean:     {_format_stat(low_et_stats['mean_predicted_et'], '.4f')} mm")
print(f"      Range:    {_format_stat(low_et_stats['min_predicted_et'], '.4f')} - {_format_stat(low_et_stats['max_predicted_et'], '.4f')} mm")



------------------------------------------------------------
Step 16.4: Statistics for Low ET Conditions (Predicted ET < 1.5mm)
------------------------------------------------------------

 Precipitation Hours:
      Mean:     23.23 hours
      Std Dev:  1.52
      Range:    16.00 - 24.00

 Sunshine Hours:
      Mean:     0.58 hours
      Std Dev:  0.65
      Range:    0.00 - 2.03

 Wind Speed:
      Mean:     19.37 km/h
      Std Dev:  7.29
      Range:    4.20 - 40.00

 Predicted Evapotranspiration:
      Mean:     1.2689 mm
      Range:    1.1397 - 1.4981 mm


Step 17 - Final Prediction for May 2026 (with predicted mean values)

In [24]:
print("="*70)
print("STEP 17: Final Prediction for May 2026 (with predicted mean values)")
print("="*70)

# Recommended values = mean of the combinations whose predicted ET < 1.5mm
recommended_precip = low_et_stats['mean_precip_hours']
recommended_sunshine = low_et_stats['mean_sunshine_hours']
recommended_wind = low_et_stats['mean_wind_speed']

# Re-score the recommended point with the selected model: the mean of valid
# combinations is not itself guaranteed to fall below the target.
# Values are keyed by feature name and then laid out in feature_columns order,
# so a reordered feature list cannot silently feed a value into the wrong
# column (an unknown feature name fails loudly with KeyError instead).
validation_inputs = {
    "precipitation_hours": float(recommended_precip),
    "sunshine_hours": float(recommended_sunshine),
    "wind_speed": float(recommended_wind),
    "temp_mean": FIXED_TEMP,
    "shortwave_radiation": FIXED_RADIATION,
}
validation_point = spark.createDataFrame(
    [tuple(validation_inputs[name] for name in feature_columns)],
    feature_columns
)

validation_pred = best_model.transform(validation_point)
predicted_et = validation_pred.select("prediction").collect()[0][0]

# Evaluated once so the report cannot disagree with itself
target_status = 'TARGET ACHIEVED' if predicted_et < 1.5 else 'TARGET NOT MET'

print(f"""
========================================================================
      OBJECTIVE: Predict mean precipitation_hours, sunshine, and
     wind_speed for May 2026 to achieve evapotranspiration < 1.5mm
========================================================================

METHODOLOGY:
========================================================================
1. Fixed shortwave_radiation and temp_mean at their mean values from
   historical records where ET < 1.5mm:
   - Shortwave Radiation: {FIXED_RADIATION:.2f} MJ/m²
   - Temperature Mean:    {FIXED_TEMP:.2f} °C

2. Used {best_model_name} model to predict ET for May dataset with
   these fixed values

3. Filtered combinations where predicted ET < 1.5mm
   - Valid combinations found: {valid_count:,} out of {total_count:,}

4. Calculated mean values from valid combinations
========================================================================
""")

print("========================================================================")
print("                    RECOMMENDED OPTIMAL VALUES")
print("========================================================================")

print(f"    Model Used: {best_model_name}")

print(f"""
To achieve evapotranspiration LOWER than 1.5mm in May 2026,
the following daily weather conditions are recommended:

========================================================================
    OPTIMAL WEATHER CONDITIONS
========================================================================
    Mean Precipitation Hours:   {recommended_precip:>8.2f} hours
    Mean Sunshine Duration:     {recommended_sunshine:>8.2f} hours
    Mean Wind Speed:            {recommended_wind:>8.2f} km/h
========================================================================
    Predicted Evapotranspiration: {predicted_et:>8.4f} mm
    Target:                       <  1.5000 mm
    Status: {target_status}
========================================================================
""")

STEP 17: Final Prediction for May 2026 (with predicted mean values)

      OBJECTIVE: Predict mean precipitation_hours, sunshine, and          
     wind_speed for May 2026 to achieve evapotranspiration < 1.5mm 

METHODOLOGY:
1. Fixed shortwave_radiation and temp_mean at their mean values from 
   historical records where ET < 1.5mm:
   - Shortwave Radiation: 5.42 MJ/m²
   - Temperature Mean:    23.69 °C

2. Used Random Forest model to predict ET for May dataset with 
   these fixed values

3. Filtered combinations where predicted ET < 1.5mm
   - Valid combinations found: 187 out of 12,555

4. Calculated mean values from valid combinations

                    RECOMMENDED OPTIMAL VALUES
    Model Used: Random Forest

To achieve evapotranspiration LOWER than 1.5mm in May 2026,
the following daily weather conditions are recommended:

    OPTIMAL WEATHER CONDITIONS                                    
    Mean Precipitation Hours:      23.23 hours              
    Mean Sunshine Duration: 

Step 18 - Save Results to CSV Files

In [26]:
import pandas as pd

print("="*60)
print("STEP 18: Saving Results to CSV Files")
print("="*60)

# Values reported in more than one file are derived once, so the exported
# files cannot contradict each other. In particular the status is computed
# from predicted_et rather than hard-coded as achieved.
target_status = 'TARGET ACHIEVED' if predicted_et < 1.5 else 'TARGET NOT MET'
# Guarded so an empty prediction set does not raise ZeroDivisionError here.
success_rate_text = f"{(valid_count/total_count)*100:.2f}%" if total_count else "0.00%"

# 1. Model Performance Comparison
print("\n1. Saving Model Performance Comparison...")
model_performance_data = []
for model_name, metrics in model_results.items():
    model_performance_data.append({
        'Model': model_name,
        'RMSE': round(metrics['RMSE'], 4),
        'R2': round(metrics['R2'], 4),
        'MAE': round(metrics['MAE'], 4)
    })

model_performance_df = pd.DataFrame(model_performance_data)
model_performance_df.to_csv('model_performance_comparison.csv', index=False)
print("     Saved: model_performance_comparison.csv")

# 2. Feature Correlations with Evapotranspiration
print("\n2. Saving Feature Correlations...")
correlation_data = []
for feature in feature_columns:
    corr = pandas_df[feature].corr(pandas_df["evapotranspiration"])
    # Strength buckets: |r| > 0.5 strong, |r| > 0.3 moderate, otherwise weak
    strength = "Strong" if abs(corr) > 0.5 else "Moderate" if abs(corr) > 0.3 else "Weak"
    direction = "Positive" if corr > 0 else "Negative"
    correlation_data.append({
        'Feature': feature,
        'Correlation': round(corr, 4),
        'Strength': strength,
        'Direction': direction
    })

correlation_df = pd.DataFrame(correlation_data)
correlation_df.to_csv('feature_correlations.csv', index=False)
print("     Saved: feature_correlations.csv")

# 3. Linear Regression Coefficients
print("\n3. Saving Linear Regression Coefficients...")
lr_coef_data = []
for i, feature in enumerate(feature_columns):
    lr_coef_data.append({
        'Feature': feature,
        'Coefficient': round(lr_coefficients[i], 6),
        'Impact': 'Increases ET' if lr_coefficients[i] > 0 else 'Decreases ET'
    })
# The intercept is appended as a final pseudo-feature row
lr_coef_data.append({
    'Feature': 'Intercept',
    'Coefficient': round(lr_intercept, 6),
    'Impact': '-'
})

lr_coef_df = pd.DataFrame(lr_coef_data)
lr_coef_df.to_csv('linear_regression_coefficients.csv', index=False)
print("     Saved: linear_regression_coefficients.csv")

# 4. Random Forest Feature Importance
print("\n4. Saving Random Forest Feature Importance...")
rf_importance_data = []
for i, feature in enumerate(feature_columns):
    rf_importance_data.append({
        'Feature': feature,
        'Importance': round(rf_importance[i], 4)
    })

rf_importance_df = pd.DataFrame(rf_importance_data)
rf_importance_df = rf_importance_df.sort_values('Importance', ascending=False)
rf_importance_df.to_csv('random_forest_feature_importance.csv', index=False)
print("     Saved: random_forest_feature_importance.csv")

# 5. Fixed Values for ML Prediction (from Step 16.1)
print("\n5. Saving Fixed Values for ML Prediction...")
fixed_values_data = {
    'Parameter': ['Shortwave Radiation', 'Temperature Mean'],
    'Value': [round(FIXED_RADIATION, 2), round(FIXED_TEMP, 2)],
    'Unit': ['MJ/m²', '°C'],
    'Source': ['Mean from historical ET < 1.5mm records', 'Mean from historical ET < 1.5mm records']
}

fixed_values_df = pd.DataFrame(fixed_values_data)
fixed_values_df.to_csv('fixed_values_for_prediction.csv', index=False)
print("     Saved: fixed_values_for_prediction.csv")

# 6. ML-Predicted Low ET Statistics (from Step 16.4)
print("\n6. Saving ML-Predicted Low ET Statistics...")
ml_low_et_data = {
    'Metric': ['Mean', 'Std Dev', 'Min', 'Max'],
    'Precipitation_Hours': [
        round(low_et_stats['mean_precip_hours'], 2),
        round(low_et_stats['std_precip_hours'], 2),
        round(low_et_stats['min_precip_hours'], 2),
        round(low_et_stats['max_precip_hours'], 2)
    ],
    'Sunshine_Hours': [
        round(low_et_stats['mean_sunshine_hours'], 2),
        round(low_et_stats['std_sunshine_hours'], 2),
        round(low_et_stats['min_sunshine_hours'], 2),
        round(low_et_stats['max_sunshine_hours'], 2)
    ],
    'Wind_Speed': [
        round(low_et_stats['mean_wind_speed'], 2),
        round(low_et_stats['std_wind_speed'], 2),
        round(low_et_stats['min_wind_speed'], 2),
        round(low_et_stats['max_wind_speed'], 2)
    ],
    # No standard deviation was aggregated for the prediction, hence '-'
    'Predicted_ET': [
        round(low_et_stats['mean_predicted_et'], 4),
        '-',
        round(low_et_stats['min_predicted_et'], 4),
        round(low_et_stats['max_predicted_et'], 4)
    ]
}

ml_low_et_df = pd.DataFrame(ml_low_et_data)
ml_low_et_df.to_csv('low_et_statistics.csv', index=False)
print("     Saved: low_et_statistics.csv")

# 7. Valid Combinations (from Step 16.3)
print("\n7. Saving Valid Combinations (Predicted ET < 1.5mm)...")
valid_combinations_pd = low_et_predictions.select(
    "precipitation_hours", "sunshine_hours", "wind_speed", "prediction"
).toPandas()
valid_combinations_pd.columns = ['Precipitation_Hours', 'Sunshine_Hours', 'Wind_Speed', 'Predicted_ET']
valid_combinations_pd = valid_combinations_pd.round(4)
valid_combinations_pd.to_csv('valid_combinations.csv', index=False)
print(f"    Saved: valid_combinations.csv ({len(valid_combinations_pd)} records)")

# 8. Final Prediction Results (from Step 17)
print("\n8. Saving Final Prediction Results...")
final_prediction_data = {
    'Parameter': [
        'Mean Precipitation Hours',
        'Mean Sunshine Duration',
        'Mean Wind Speed',
        'Fixed Temperature',
        'Fixed Shortwave Radiation',
        'Predicted Evapotranspiration',
        'Target Evapotranspiration',
        'Status',
        'Best Model Used',
        'Model R² Score',
        'Valid Combinations Found',
        'Total Predictions Made',
        'Success Rate'
    ],
    'Value': [
        round(recommended_precip, 2),
        round(recommended_sunshine, 2),
        round(recommended_wind, 2),
        round(FIXED_TEMP, 2),
        round(FIXED_RADIATION, 2),
        round(predicted_et, 4),
        '< 1.5',
        target_status,
        best_model_name,
        round(model_results[best_model_name]['R2'], 4),
        valid_count,
        total_count,
        success_rate_text
    ],
    'Unit': [
        'hours',
        'hours',
        'km/h',
        '°C',
        'MJ/m²',
        'mm',
        'mm',
        '-',
        '-',
        '-',
        'records',
        'records',
        '-'
    ]
}

final_prediction_df = pd.DataFrame(final_prediction_data)
final_prediction_df.to_csv('final_prediction_may2026.csv', index=False)
print("     Saved: final_prediction_may2026.csv")

# 9. Train-Test Split Information
print("\n9. Saving Train-Test Split Information...")
split_info = {
    'Dataset': ['Training Set', 'Validation Set', 'Total'],
    'Records': [train_count, test_count, train_count + test_count],
    'Percentage': [
        round(train_count/(train_count + test_count)*100, 1),
        round(test_count/(train_count + test_count)*100, 1),
        100.0
    ]
}

split_df = pd.DataFrame(split_info)
split_df.to_csv('train_test_split_info.csv', index=False)
print("     Saved: train_test_split_info.csv")

# 10. Methodology Summary
print("\n10. Saving Methodology Summary...")
methodology_data = {
    'Step': [
        'Step 14',
        'Step 15',
        'Step 16.1',
        'Step 16.2',
        'Step 16.3',
        'Step 16.4',
        'Step 17'
    ],
    'Description': [
        'Analyze historical records where actual ET < 1.5mm',
        'Baseline prediction using historical means with May average temp/radiation',
        'Get mean shortwave_radiation and temp_mean from historical ET < 1.5mm records as fixed values',
        'Create prediction dataset with fixed radiation and temperature values',
        # Names the model actually used instead of assuming Random Forest
        f'Predict ET using {best_model_name} model and filter where predicted ET < 1.5mm',
        'Calculate statistics (mean, std, range) for valid combinations',
        'Use mean values from valid combinations as final recommendation'
    ],
    'Result': [
        f"{low_et_count} historical records found with ET < 1.5mm",
        'Baseline comparison - ET exceeds target with May averages',
        f"Fixed Radiation: {FIXED_RADIATION:.2f} MJ/m², Fixed Temp: {FIXED_TEMP:.2f} °C",
        f"{total_count:,} predictions made with fixed values",
        f"{valid_count:,} valid combinations ({success_rate_text} success rate)",
        f"Mean: Precip={low_et_stats['mean_precip_hours']:.2f}h, Sun={low_et_stats['mean_sunshine_hours']:.2f}h, Wind={low_et_stats['mean_wind_speed']:.2f}km/h",
        # Status follows the actual prediction; it was previously always "TARGET ACHIEVED"
        f"Predicted ET: {predicted_et:.4f} mm - {target_status}"
    ]
}

methodology_df = pd.DataFrame(methodology_data)
methodology_df.to_csv('methodology_summary.csv', index=False)
print("   Saved: methodology_summary.csv")

print("\n" + "="*60)
print("ALL CSV FILES SAVED SUCCESSFULLY!")
print("="*60)
print("\nFiles created:")
print("  1. model_performance_comparison.csv")
print("  2. feature_correlations.csv")
print("  3. linear_regression_coefficients.csv")
print("  4. random_forest_feature_importance.csv")
print("  5. fixed_values_for_prediction.csv")
print("  6. low_et_statistics.csv")
print("  7. valid_combinations.csv")
print("  8. final_prediction_may2026.csv")
print("  9. train_test_split_info.csv")
print("  10. methodology_summary.csv")

STEP 18: Saving Results to CSV Files

1. Saving Model Performance Comparison...
     Saved: model_performance_comparison.csv

2. Saving Feature Correlations...
     Saved: feature_correlations.csv

3. Saving Linear Regression Coefficients...
     Saved: linear_regression_coefficients.csv

4. Saving Random Forest Feature Importance...
     Saved: random_forest_feature_importance.csv

5. Saving Fixed Values for ML Prediction...
     Saved: fixed_values_for_prediction.csv

6. Saving ML-Predicted Low ET Statistics...
     Saved: low_et_statistics.csv

7. Saving Valid Combinations (Predicted ET < 1.5mm)...
    Saved: valid_combinations.csv (187 records)

8. Saving Final Prediction Results...
     Saved: final_prediction_may2026.csv

9. Saving Train-Test Split Information...
     Saved: train_test_split_info.csv

10. Saving Methodology Summary...
   Saved: methodology_summary.csv

ALL CSV FILES SAVED SUCCESSFULLY!

Files created:
  1. model_performance_comparison.csv
  2. feature_correlation