In [35]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, StackingRegressor)
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import xgboost as xgb
import catboost as cb
import time

In [36]:
##Configs
DATA_PATH = "output\odisha_merged_tabular.parquet"
OUTPUT_DIR = "output"
SEED = 42
# API_K = 0.85
# RAIN_THR = 0.5
# GAP_BUFFER = 7
# np.random.seed(SEED)

  DATA_PATH = "output\odisha_merged_tabular.parquet"


In [37]:
df = pd.read_parquet(DATA_PATH)
df.columns = df.columns.str.lower()
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values(["lat", "lon", "date"]).reset_index(drop=True)


In [38]:
full_date_range = pd.date_range(start='2020-01-01', end='2025-12-31', freq="D")
missing_reports = []
for (lat, lon), group in df.groupby(['lat', 'lon']):
    existing_dates = set(group['date'])
    missing_dates = set(full_date_range) - existing_dates
    if missing_dates != set():
        missing_reports.append({
            'lat': lat,
            'lon': lon,
            'missing_dates': sorted(missing_dates)
        })
if missing_reports == []:
    print("No missing reports found.")
    in_sync = True
else:
    print(f"Found {len(missing_reports)} locations with missing reports.")
    in_sync = False


No missing reports found.


In [39]:
df["loc_id"] = df["lat"].astype(str) + "_" + df["lon"].astype(str)
df = df.sort_values(["loc_id", "date"]).reset_index(drop=True)
df["sm_4_prior"] = df["soil_moisture"].shift(4)
df["sm_3_prior"] = df["soil_moisture"].shift(3)
df["sm_2_prior"] = df["soil_moisture"].shift(2)
df["sm_1_prior"] = df["soil_moisture"].shift(1)
df["sum_rainfall_4"] = sum(df["rainfall"].shift(i) for i in range(1, 5))
df["sum_rainfall_3"] = sum(df["rainfall"].shift(i) for i in range(1, 4))
df["sum_rainfall_2"] = sum(df["rainfall"].shift(i) for i in range(1, 3))
df["sum_rainfall_1"] = sum(df["rainfall"].shift(i) for i in range(1, 2))
df["mean_temp_4"] = sum(df["temperature"].shift(i) for i in range(1, 5)) / 4
df["mean_temp_3"] = sum(df["temperature"].shift(i) for i in range(1, 4)) / 3
df["mean_temp_2"] = sum(df["temperature"].shift(i) for i in range(1, 3)) / 2
df["mean_temp_1"] = sum(df["temperature"].shift(i) for i in range(1, 2)) / 1
df["doy"]     = df["date"].dt.dayofyear
df["doy_sin"] = np.sin(2 * np.pi * df["doy"] / 365.25)
df["doy_cos"] = np.cos(2 * np.pi * df["doy"] / 365.25)
df["year"] = df["date"].dt.year
df["month"] = df["date"].dt.month
df.dropna(subset=["sm_4_prior", "sm_3_prior", "sm_2_prior", "sm_1_prior", "sum_rainfall_4", "sum_rainfall_3", "sum_rainfall_2", "sum_rainfall_1", "mean_temp_4", "mean_temp_3", "mean_temp_2", "mean_temp_1"], inplace=True)



In [40]:
df_test = df[df["date"] > '2025-01-01']
df = df[df["date"] <= '2025-01-01']

In [41]:
models = {}
models["RandomForest"] = RandomForestRegressor(
        n_estimators=300, max_depth=20, min_samples_split=5,
        min_samples_leaf=2, max_features="sqrt", random_state=SEED, n_jobs=-1)
models["GradientBoosting"] = GradientBoostingRegressor(
            n_estimators=300, max_depth=6, learning_rate=0.05,
            subsample=0.8, min_samples_leaf=10, random_state=SEED)
models["XGBoost"] = xgb.XGBRegressor(
    n_estimators=500, max_depth=8, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_alpha=0.1,
    reg_lambda=1.0, random_state=SEED, n_jobs=-1, verbosity=0)  
models["CatBoost"] = cb.CatBoostRegressor(
    iterations=500, depth=8, learning_rate=0.05,
    l2_leaf_reg=3.0, subsample=0.8, random_seed=SEED, verbose=0)
models["Ridge"] = Pipeline([
    ("scaler", StandardScaler()), ("ridge", Ridge(alpha=1.0))])
models["ElasticNet"] = Pipeline([
    ("scaler", StandardScaler()),
    ("enet", ElasticNet(alpha=0.01, l1_ratio=0.5, max_iter=5000,random_state=SEED))])

base = []
base.append(("xgb", xgb.XGBRegressor(
    n_estimators=300, max_depth=6, learning_rate=0.05,
    random_state=SEED, n_jobs=-1, verbosity=0)))
base.append(("cb", cb.CatBoostRegressor(
    iterations=300, depth=6, learning_rate=0.05,
    random_seed=SEED, verbose=0)))
base.append(("rf", RandomForestRegressor(
    n_estimators=200, max_depth=15,
    random_state=SEED, n_jobs=-1)))
base.append(("gbr", GradientBoostingRegressor(
    n_estimators=200, max_depth=5,
    learning_rate=0.05, random_state=SEED)))
models["StackingEnsemble"] = StackingRegressor(
    estimators=base, final_estimator=Ridge(alpha=1.0), cv=3, n_jobs=-1)

In [42]:
def compute_metrics(yt, yp):
    rmse = np.sqrt(mean_squared_error(yt, yp))
    mae  = mean_absolute_error(yt, yp)
    r2   = r2_score(yt, yp)
    bias = np.mean(yp - yt)
    ubrmse = np.sqrt(max(rmse**2 - bias**2, 0))
    return dict(rmse=rmse, mae=mae, r2=r2, ubrmse=ubrmse, bias=bias)

In [50]:
def get_cv_splits(df, cv_type):
    years = df["year"].unique()
    splits = []
    if cv_type == "groupkfold":
        gfk = GroupKFold(n_splits=min(5,len(years)))
        for tr, te in gfk.split(df, groups=df["year"].values):
            ty_ = df.iloc[te]["year"].unique()
            try_ = df.iloc[tr]["year"].unique()
            splits.append((tr, te, f"Train {try_} → Test {ty_}"))
    elif cv_type == "timeseries_year":
        for i in range(1, len(years)):
            train_years = years[:i]
            test_year = years[i]
            tr_mask = df["year"].isin(train_years)
            te_mask = df["year"] == test_year
            if len(df[tr_mask]) > 0 and len(df[te_mask]) > 0:
                splits.append((tr_mask, te_mask, f"Train {train_years} → Test {test_year}"))  
    return splits        
        

In [51]:
def run_experiment(X, y, df, cv_type, models):
    splits = get_cv_splits(df, cv_type)
    results = {}
    for mname, mtemplate in models.items():
        print(f"Running {mname}...")
        folds = []
        predictions = []
        t0 = time.time()
        for f1, (tr_i, te_i, desc) in enumerate(splits, 1):
            model = clone(mtemplate)
            model.fit(X[tr_i], y[tr_i])
            yp = model.predict(X[te_i])
            metrics = compute_metrics(y[te_i], yp)
            metrics.update(fold=f1, label=desc, n_train=len(tr_i), n_test=len(te_i))
            folds.append(metrics)
            predictions.append((dict(y_test = y[te_i], y_pred=yp)))
        elapsed = time.time() - t0
        avg = {k: np.mean([f[k] for f in folds])
               for k in ["rmse","mae","r2","ubrmse","bias"]}
        print(f"  │  Avg RMSE={avg['rmse']:.4f}  MAE={avg['mae']:.4f}  "
              f"R²={avg['r2']:.4f}  ubRMSE={avg['ubrmse']:.4f}  "
              f"Bias={avg['bias']:.4f}  [{elapsed:.1f}s]")
        for f in folds:
            print(f"  │    Fold {f['fold']}: RMSE={f['rmse']:.4f}  "
                  f"R²={f['r2']:.4f}  [{f['n_train']:,}→{f['n_test']:,}]")
        print(f"  └─\n")
        results[mname] = dict(folds=folds, predictions=predictions, average=avg, elapsed=elapsed)       
    return results
        

In [52]:
def print_summary(rg, rt):
    print("\n" + "=" * 75)
    print("  FINAL SUMMARY")
    print("=" * 75)
    hdr = f"{'Model':<28}{'CV':<16}{'RMSE':>8}{'MAE':>8}{'R²':>8}{'ubRMSE':>8}{'Bias':>8}{'Time':>7}"
    print(hdr); print("-"*len(hdr))
    for cn, res in [("GroupKFold", rg), ("TSplit", rt)]:
        for mn, r in res.items():
            a = r["avg"]
            print(f"  {mn:<26}{cn:<16}{a['rmse']:>8.4f}{a['mae']:>8.4f}"
                  f"{a['r2']:>8.4f}{a['ubrmse']:>8.4f}{a['bias']:>8.4f}"
                  f"{r['elapsed']:>6.1f}s")
        print("-"*len(hdr))
    all_r2 = {}
    for cv, res in [("GKF",rg),("TS",rt)]:
        for n,r in res.items(): all_r2[f"{n} ({cv})"] = r["avg"]["r2"]
    best = max(all_r2, key=all_r2.get)
    print(f"\n  ★ Best: {best}  (R² = {all_r2[best]:.4f})")

In [53]:
FEAT = ["sm_4_prior", "sm_3_prior", "sm_2_prior", "sm_1_prior", "sum_rainfall_4", "sum_rainfall_3", "sum_rainfall_2", "sum_rainfall_1", "mean_temp_4", "mean_temp_3", "mean_temp_2", "mean_temp_1", "doy_sin", "doy_cos"]
TARGET = "soil_moisture"
X = df[FEAT].values
y = df[TARGET].values
r_groupkfold = run_experiment(X, y, df, "groupkfold", models)
r_timeseries = run_experiment(X, y, df, "timeseries_year", models)
print_summary(r_groupkfold,r_timeseries)



Running RandomForest...


KeyboardInterrupt: 