<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/11_lgbm_tweedie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only setup: attach Google Drive so the project folder under
# /content/drive is readable and writable from this runtime.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# ==== CONFIG ====
# Project root on the mounted Drive; every input and output path hangs off it.
BASE_DIR = "/content/drive/MyDrive/ai4trade"  # change if needed

# Feature tables: the train table carries the target, the test table is scored later.
FEATURES_TRAIN = BASE_DIR + "/data/features/features_train_h2.parquet"
FEATURES_TEST = BASE_DIR + "/data/features/features_test_h2.parquet"

# Names embedded in artifact file names. RUN_ID is only a prefix here;
# a datetime suffix is appended in a later cell.
MODEL_NAME = "lgbm_tweedie"
RUN_ID = "run_h2_"

# Candidate Tweedie variance powers compared during cross-validation.
POWER_GRID = [1.2, 1.35, 1.5]

# Seed handed to LightGBM and to the random fallback split.
SEED = 42

# ==== IMPORTS ====
import os, json, math, time, datetime as dt
import re
import warnings

import numpy as np
import pandas as pd

# LightGBM (CPU wheel; no source build)
try:
    import lightgbm as lgb
except Exception as e:
    !pip -q install lightgbm
    import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

In [None]:
from pathlib import Path

# Output folders for models, OOF / forecast predictions and per-run logs.
paths = [
    f"{BASE_DIR}/models/{MODEL_NAME}",
    f"{BASE_DIR}/predictions/oof",
    f"{BASE_DIR}/predictions/forecast",
    f"{BASE_DIR}/logs/runs",
]
for p in paths:
    Path(p).mkdir(parents=True, exist_ok=True)

# Stamp the run id with the current time. Any stamp left by a previous
# execution of this cell is stripped first, so re-running the cell replaces the
# suffix instead of stacking a second one (run_h2_<stamp><stamp>).
RUN_ID = re.sub(r"\d{8}_\d{6}$", "", RUN_ID) + dt.datetime.now().strftime("%Y%m%d_%H%M%S")
print("RUN_ID:", RUN_ID)

RUN_ID: run_h2_20251023_165145


In [None]:
def smape(y_true, y_pred, eps=1.0):
    """Symmetric MAPE on a 0-2 scale: mean of 2*|a - f| / max(|a| + |f|, eps).

    Actuals and forecasts are floored at zero before scoring, and the
    denominator is floored at `eps` so that pairs which are both (near) zero
    do not divide by zero.
    """
    actual = np.maximum(np.asarray(y_true, dtype=float), 0.0)
    forecast = np.maximum(np.asarray(y_pred, dtype=float), 0.0)
    # Both arrays are non-negative here, so their sum equals |a| + |f|.
    scale = np.maximum(actual + forecast, eps)
    per_row = 2.0 * np.abs(actual - forecast) / scale
    return per_row.mean()

def now():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return f"{dt.datetime.now():%Y-%m-%d %H:%M:%S}"

In [None]:
# Load the engineered feature tables from Drive.
train = pd.read_parquet(FEATURES_TRAIN)
test = pd.read_parquet(FEATURES_TEST)

# Expected identifier schema (at HS-6 level)
id_cols = ["origin", "destination", "hs6", "trade_flow", "month"]
target_col = "y_target"  # per h=2 policy

# Basic sanity: fail early if either table lacks a column the notebook relies on.
required_ids = set(id_cols)
assert required_ids <= set(train.columns), "Missing id columns in train!"
assert target_col in set(train.columns), "Missing y_target in train!"
assert required_ids <= set(test.columns), "Missing id columns in test!"

# Quick look at the table sizes and the calendar coverage of each one.
train_months = train["month"]
test_months = test["month"]
print("Train shape:", train.shape, " Test shape:", test.shape)
print("Train month range:", train_months.min(), "→", train_months.max())
print("Test  month unique:", sorted(test_months.unique()))

Train shape: (5979239, 42)  Test shape: (320208, 42)
Train month range: 2023-01-01 00:00:00 → 2024-08-01 00:00:00
Test  month unique: [Timestamp('2024-08-01 00:00:00')]


In [None]:
# Explicit validation target months (y_target dates) per policy doc
val_target_months = [
    pd.Timestamp("2023-09-01"),
    pd.Timestamp("2024-03-01"),
    pd.Timestamp("2024-09-01"),
]

# Forecast horizon: a feature row at month t carries the target observed at t + 2 months.
HORIZON_MONTHS = 2
horizon_offset = pd.DateOffset(months=HORIZON_MONTHS)

folds = []
for vm in val_target_months:
    # Feature month t whose target falls on the validation month vm.
    t_feat_month = pd.Timestamp(vm) - horizon_offset
    # Validation rows: feature month t, i.e. rows whose target is vm.
    valid_mask = train["month"] == t_feat_month
    # Training rows: only rows whose target is already observed at forecast
    # origin t (feature month + horizon <= t). Using `month <= t` would put the
    # validation rows themselves into the training set, and would also train on
    # targets from t+1 and t+2 that are not known when forecasting from t.
    train_mask = train["month"] <= t_feat_month - horizon_offset
    assert not (train_mask & valid_mask).any(), "Train and validation rows overlap!"
    folds.append((train_mask.values, valid_mask.values, vm))

for i, (_, _, vm) in enumerate(folds, 1):
    print(f"Fold {i}: validate y_target at {vm.date()} (features from t={vm - horizon_offset})")

Fold 1: validate y_target at 2023-09-01 (features from t=2023-07-01 00:00:00)
Fold 2: validate y_target at 2024-03-01 (features from t=2024-01-01 00:00:00)
Fold 3: validate y_target at 2024-09-01 (features from t=2024-07-01 00:00:00)


In [None]:
# Model inputs: every numeric column that is neither an identifier nor the target.
exclude = {*id_cols, target_col}
num_cols = [
    col
    for col in train.columns
    if pd.api.types.is_numeric_dtype(train[col]) and col not in exclude
]

# Optional sample weights to balance scale: use ma_12 when the table provides it.
weight_col = None
if "ma_12" in train.columns:
    weight_col = "ma_12"

print("Num features:", len(num_cols))
print("First 10:", num_cols[:10])
print("Weight column:", weight_col)

Num features: 34
First 10: ['value', 'y', 'month_num', 'quarter', 'month_id', 'lag_1', 'lag_2', 'lag_3', 'lag_6', 'lag_12']
Weight column: ma_12


In [None]:
def _usable_weights(weights, label):
    """Return sample weights LightGBM can train on, or None to train unweighted.

    Non-finite and negative entries are replaced by 0 (those rows are then
    ignored). If no entry is positive, the weights carry no information and
    would make every weighted metric 0/0 = NaN, so None is returned instead.
    A RuntimeWarning is emitted in both situations.
    """
    if weights is None:
        return None
    w = np.asarray(weights, dtype=float)
    with np.errstate(invalid="ignore"):
        bad = ~np.isfinite(w) | (w < 0)
    n_bad = int(bad.sum())
    if n_bad:
        warnings.warn(
            f"{label} weights: {n_bad} non-finite/negative value(s) set to 0.",
            RuntimeWarning,
            stacklevel=3,
        )
        w = np.where(bad, 0.0, w)
    if not (w > 0).any():
        warnings.warn(
            f"{label} weights contain no positive value; using unweighted rows instead.",
            RuntimeWarning,
            stacklevel=3,
        )
        return None
    return w


def train_fold_lgbm_tweedie(X_tr, y_tr, X_va, y_va, power, w_tr=None, w_va=None, seed=SEED):
    """Fit one LightGBM Tweedie model with early stopping and score the validation rows.

    Parameters
    ----------
    X_tr, y_tr : training features and target.
    X_va, y_va : validation features and target (drive early stopping).
    power : Tweedie variance power passed as `tweedie_variance_power`.
    w_tr, w_va : optional sample weights for the two sets. Unusable weights
        (NaN/inf/negative, or no positive entry) are sanitised; see
        `_usable_weights`.
    seed : LightGBM random seed.

    Returns
    -------
    (model, pred_va) : the trained Booster and its validation predictions at
        the best iteration, clipped below at 0.
    """
    params = {
        "objective": "tweedie",
        "tweedie_variance_power": power,
        "metric": "mae",                # track MAE; we compute sMAPE ourselves
        "learning_rate": 0.05,
        "num_leaves": 63,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        "min_data_in_leaf": 40,
        "lambda_l1": 0.0,
        "lambda_l2": 0.0,
        "max_depth": -1,
        "verbosity": -1,
        "seed": seed,
        # os.cpu_count() may return None; assume 2 cores in that case so the
        # subtraction cannot raise. Always leave at least one thread.
        "num_threads": max(1, (os.cpu_count() or 2) - 1),
        "force_col_wise": True,  # stable memory on wide data
    }
    w_tr = _usable_weights(w_tr, "train")
    w_va = _usable_weights(w_va, "valid")

    dtrain = lgb.Dataset(X_tr, label=y_tr, weight=w_tr)
    dvalid = lgb.Dataset(X_va, label=y_va, weight=w_va, reference=dtrain)
    model = lgb.train(
        params,
        dtrain,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        num_boost_round=5000,
        # early_stopping_rounds is deprecated, use callbacks.
        # `verbose` is a boolean flag; True keeps the early-stopping log lines.
        callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=True)],
    )
    # Tweedie predictions should already be non-negative; the clip is a final guard.
    pred_va = np.clip(model.predict(X_va, num_iteration=model.best_iteration), 0, None)
    return model, pred_va

In [None]:
oof_frames = []
cv_summary = []

# Rows used for validation in at least one fold. The set does not depend on the
# Tweedie power, so it is built once instead of once per power.
oof_mask = np.zeros(len(train), dtype=bool)
for _, va_mask, _ in folds:
    oof_mask |= va_mask

for power in POWER_GRID:
    print(f"[{now()}] Power={power} CV start...")
    preds_oof = np.zeros(len(train), dtype=float)
    fold_scores = []
    fold_sizes = []

    for k, (tr_mask, va_mask, vm) in enumerate(folds, 1):
        # Index with the boolean masks themselves: a boolean array in .loc
        # selects by position, so the right rows are picked whatever index
        # `train` carries. Passing positions from np.where() to .loc would
        # treat them as index *labels*, which is only correct on a default
        # 0..n-1 index.
        X_tr, y_tr = train.loc[tr_mask, num_cols], train.loc[tr_mask, target_col]
        X_va, y_va = train.loc[va_mask, num_cols], train.loc[va_mask, target_col]
        # Square-root damping of the weight column when one is configured.
        w_tr = train.loc[tr_mask, weight_col].pow(0.5) if weight_col else None
        w_va = train.loc[va_mask, weight_col].pow(0.5) if weight_col else None
        n_va = int(va_mask.sum())

        model, pred_va = train_fold_lgbm_tweedie(X_tr, y_tr, X_va, y_va, power, w_tr, w_va)
        preds_oof[va_mask] = pred_va

        score = smape(y_va.values, pred_va)
        fold_scores.append(score)
        fold_sizes.append(n_va)
        print(f"  Fold {k}: val_month={vm.date()} sMAPE={score:.4f} (n={n_va})")

    # OOF only over union of validation rows (those folds)
    oof_score = smape(train.loc[oof_mask, target_col].values, preds_oof[oof_mask])
    wavg = np.average(fold_scores, weights=fold_sizes)

    print(f"[{now()}] Power={power} OOF sMAPE={oof_score:.4f} (fold wavg={wavg:.4f})")

    # Store per-power OOF
    oof_df = train.loc[oof_mask, id_cols + [target_col]].copy()
    oof_df[f"y_pred_{MODEL_NAME}_p{power}"] = preds_oof[oof_mask]
    oof_df["power"] = power
    oof_frames.append(oof_df)

    cv_summary.append({
        "run_id": RUN_ID,
        "model": MODEL_NAME,
        "power": power,
        "fold_scores": fold_scores,
        "fold_sizes": fold_sizes,
        "oof_smape": float(oof_score),
        "timestamp": now(),
    })

cv_tbl = pd.DataFrame(cv_summary)
cv_tbl

[2025-10-23 16:51:58] Power=1.2 CV start...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's l1: nan	valid's l1: nan
  Fold 1: val_month=2023-09-01 sMAPE=1.6025 (n=316445)
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[5000]	train's l1: 685751	valid's l1: 685751
  Fold 2: val_month=2024-03-01 sMAPE=0.6872 (n=253011)
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[4986]	train's l1: 2.62497e+06	valid's l1: 2.5426e+06
  Fold 3: val_month=2024-09-01 sMAPE=0.9176 (n=311662)
[2025-10-23 19:10:31] Power=1.2 OOF sMAPE=1.0974 (fold wavg=1.0974)
[2025-10-23 19:10:33] Power=1.35 CV start...
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[1]	train's l1: nan	valid's l1: nan
  Fold 1: val_month=2023-09-01 sMAPE=1.6025 (n=316445)
Training until validation scores don't

Unnamed: 0,run_id,model,power,fold_scores,fold_sizes,oof_smape,timestamp
0,run_h2_20251023_165145,lgbm_tweedie,1.2,"[1.6025407258765347, 0.6871820749741039, 0.917...","[316445, 253011, 311662]",1.097418,2025-10-23 19:10:33
1,run_h2_20251023_165145,lgbm_tweedie,1.35,"[1.6025407258765347, 0.6904808086555311, 0.914...","[316445, 253011, 311662]",1.097173,2025-10-23 21:26:36
2,run_h2_20251023_165145,lgbm_tweedie,1.5,"[1.6025407258765347, 0.6941873189946405, 0.909...","[316445, 253011, 311662]",1.096668,2025-10-23 23:42:18


In [None]:
# Pick the Tweedie power with the lowest out-of-fold sMAPE.
best_row = cv_tbl.sort_values("oof_smape", ascending=True).iloc[0]
BEST_POWER = float(best_row["power"])
print("Best power by OOF sMAPE:", BEST_POWER, "score=", best_row["oof_smape"])

# Full-train refit using all available training rows with non-null target
tr_mask_full = train[target_col].notnull().values
X_full = train.loc[tr_mask_full, num_cols]
y_full = train.loc[tr_mask_full, target_col]
w_full = train.loc[tr_mask_full, weight_col].pow(0.5) if weight_col else None

# Simple validation split from tail just to enable early stopping:
# use the latest month that still has labelled rows as pseudo-valid.
last_t = train.loc[tr_mask_full, "month"].max()
va_mask_full = (train["month"] == last_t).values & tr_mask_full
if va_mask_full.sum() < 100:  # safeguard: if tiny, fall back to random 5%
    rng = np.random.default_rng(SEED)
    labelled_pos = np.flatnonzero(tr_mask_full)
    # rng.choice already returns row positions; they must not be used to
    # index `labelled_pos` a second time.
    chosen = rng.choice(labelled_pos, size=max(1, int(labelled_pos.size * 0.05)), replace=False)
    va_mask_full = np.zeros_like(tr_mask_full, dtype=bool)
    va_mask_full[chosen] = True

# va_mask_full spans every row of `train`, whereas X_full / y_full / w_full only
# hold the labelled rows. Restrict the mask to those rows so the lengths match
# even when some targets are null.
va_in_full = va_mask_full[tr_mask_full]

X_tr, y_tr = X_full.loc[~va_in_full], y_full.loc[~va_in_full]
X_va, y_va = X_full.loc[va_in_full], y_full.loc[va_in_full]
w_tr = w_full.loc[~va_in_full] if weight_col else None
w_va = w_full.loc[va_in_full] if weight_col else None

final_model, _ = train_fold_lgbm_tweedie(X_tr, y_tr, X_va, y_va, BEST_POWER, w_tr, w_va)

# Predict the test rows (their target lies two months after the test feature month).
test_pred = np.clip(final_model.predict(test[num_cols], num_iteration=final_model.best_iteration), 0, None)
print("Test predictions shape:", test_pred.shape)

Best power by OOF sMAPE: 1.5 score= 1.0966679063990767
Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[62]	train's l1: 3.24699e+07	valid's l1: 4.21181e+07
Test predictions shape: (320208,)


In [None]:
# Merge all oof (per-power) for record; also save the best as canonical OOF
oof_all = pd.concat(oof_frames, ignore_index=True)
oof_all_path = f"{BASE_DIR}/predictions/oof/{MODEL_NAME}_oof_all_powers_{RUN_ID}.parquet"
oof_all.to_parquet(oof_all_path, index=False)

# Canonical OOF for best power.
# Locate the prediction column by parsing the power out of its name rather than
# re-formatting BEST_POWER: BEST_POWER is a float, so a grid entry written as an
# int (e.g. 1 -> "..._p1") would otherwise not match "..._p1.0".
pred_prefix = f"y_pred_{MODEL_NAME}_p"
matching_cols = [
    c for c in oof_all.columns
    if c.startswith(pred_prefix) and float(c[len(pred_prefix):]) == BEST_POWER
]
if not matching_cols:
    raise KeyError(f"No OOF prediction column found for power {BEST_POWER}")
best_pred_col = matching_cols[0]

best_mask = oof_all["power"] == BEST_POWER
oof_best = oof_all.loc[best_mask, id_cols + [target_col, best_pred_col]].rename(
    columns={best_pred_col: f"y_pred_{MODEL_NAME}"}
)
oof_best_path = f"{BASE_DIR}/predictions/oof/{MODEL_NAME}_oof_{RUN_ID}.parquet"
oof_best.to_parquet(oof_best_path, index=False)

# Forecast file
forecast_df = test[id_cols].copy()
forecast_df[f"y_pred_{MODEL_NAME}"] = test_pred
forecast_path = f"{BASE_DIR}/predictions/forecast/{MODEL_NAME}_forecast_{RUN_ID}.parquet"
forecast_df.to_parquet(forecast_path, index=False)

print("Saved:")
print("  OOF (all powers):", oof_all_path)
print("  OOF (best):      ", oof_best_path)
print("  Forecast:        ", forecast_path)

# Append CV scores to a CSV log (the header is written only when the file is new)
cv_csv = f"{BASE_DIR}/logs/cv_scores.csv"
cv_tbl2 = cv_tbl.copy()
cv_tbl2["run_id"] = RUN_ID
cv_tbl2["model"] = MODEL_NAME
cv_tbl2.to_csv(cv_csv, mode="a", index=False, header=not os.path.exists(cv_csv))

# Save run config
run_cfg = {
    "run_id": RUN_ID,
    "model": MODEL_NAME,
    "power_grid": POWER_GRID,
    "best_power": BEST_POWER,
    "features_train": FEATURES_TRAIN,
    "features_test": FEATURES_TEST,
    "num_features": len(num_cols),
    "feature_cols": num_cols,
    "weight_col": weight_col,
    # Record the months actually used to build the folds, not a second
    # hard-coded copy that can drift from them.
    "val_target_months": [str(pd.Timestamp(x).date()) for x in val_target_months],
    "seed": SEED,
    "timestamp": now(),
    "horizon": 2
}
with open(f"{BASE_DIR}/logs/runs/{MODEL_NAME}_{RUN_ID}.json", "w", encoding="utf-8") as f:
    json.dump(run_cfg, f, indent=2)

Saved:
  OOF (all powers): /content/drive/MyDrive/ai4trade/predictions/oof/lgbm_tweedie_oof_all_powers_run_h2_20251023_165145.parquet
  OOF (best):       /content/drive/MyDrive/ai4trade/predictions/oof/lgbm_tweedie_oof_run_h2_20251023_165145.parquet
  Forecast:         /content/drive/MyDrive/ai4trade/predictions/forecast/lgbm_tweedie_forecast_run_h2_20251023_165145.parquet


In [None]:
# OOF sMAPE for the best power (computed on union of folds)
best_power_rows = cv_tbl[cv_tbl["power"] == BEST_POWER]
oof_score_best = float(best_power_rows["oof_smape"].iloc[0])
display({"best_power": BEST_POWER, "oof_smape": round(oof_score_best, 6)})

# Sanity: check non-negativity and basic stats of the saved forecast column
forecast_values = forecast_df[f"y_pred_{MODEL_NAME}"]
print("Forecast min/max:", float(forecast_values.min()), float(forecast_values.max()))
print("Share of zeros in forecast:", float((forecast_values == 0).mean()))

{'best_power': 1.5, 'oof_smape': 1.096668}

Forecast min/max: 214949.70572128068 2475814980.0947886
Share of zeros in forecast: 0.0
