<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/10_lgbm_rmse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Title: 10_lgbm_rmse.ipynb
Purpose: Train LightGBM (RMSE objective, log1p target) for direct h=2 forecasting.
Data: features_train_h2.parquet (train ≤ 2025-08), features_test_h2.parquet (score 2025-10).
Outputs:
	•	OOF predictions → predictions/oof/lgbm_rmse_oof.parquet
	•	Forecast      → predictions/forecast/lgbm_rmse_forecast.parquet
	•	CV scores     → logs/cv_scores.csv
	•	Run metadata  → logs/runs/run_YYYYMMDD_HHMM.json

In [2]:
from datetime import datetime
import os

USE_DRIVE = True  # set False if running locally

if USE_DRIVE:
    from google.colab import drive  # noqa: F401
    drive.mount('/content/drive')

BASE_DIR = "/content/drive/MyDrive/ai4trade"

DATA_DIR    = os.path.join(BASE_DIR, "data")
FEATURE_DIR = os.path.join(DATA_DIR, "features")

PRED_DIR  = os.path.join(BASE_DIR, "predictions")
OOF_DIR   = os.path.join(PRED_DIR, "oof")
FC_DIR    = os.path.join(PRED_DIR, "forecast")
MERGED_DIR= os.path.join(PRED_DIR, "merged")

LOG_DIR   = os.path.join(BASE_DIR, "logs")

for d in [OOF_DIR, FC_DIR, MERGED_DIR, os.path.join(LOG_DIR, "runs")]:
    os.makedirs(d, exist_ok=True)

RUN_ID = datetime.now().strftime("run_%Y%m%d_%H%M")
MODEL_NAME = "lgbm_rmse"

print("BASE_DIR:", BASE_DIR)
print("RUN_ID  :", RUN_ID)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
BASE_DIR: /content/drive/MyDrive/ai4trade
RUN_ID  : run_20251021_2333


In [3]:
!pip -q install lightgbm==4.5.0 pyarrow==17.0.0

import json
import numpy as np
import pandas as pd
import lightgbm as lgb
from pathlib import Path

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 180)

In [13]:
from typing import Dict

def smape(y_true, y_pred, eps: float = 1.0):
    """Symmetric MAPE with epsilon floor."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true) + np.abs(y_pred), eps)
    return np.mean(2.0 * np.abs(y_true - y_pred) / denom)

# --- improved save_json with timestamp / numpy handling ---
import json
from pathlib import Path

def _json_default(o):
    if isinstance(o, (pd.Timestamp, )):
        return o.isoformat()
    if isinstance(o, (np.integer, )):
        return int(o)
    if isinstance(o, (np.floating, )):
        return float(o)
    if isinstance(o, (np.bool_, )):
        return bool(o)
    if isinstance(o, (np.ndarray, )):
        return o.tolist()
    try:
        return str(o)
    except Exception:
        return None

def save_json(obj, path: str):
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w") as f:
        json.dump(obj, f, indent=2, default=_json_default)

def append_cv_score(log_path: str, row: Dict):
    if os.path.exists(log_path):
        df = pd.read_csv(log_path)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    else:
        df = pd.DataFrame([row])
    df.to_csv(log_path, index=False)

In [5]:
train_path = os.path.join(FEATURE_DIR, "features_train_h2.parquet")
test_path  = os.path.join(FEATURE_DIR, "features_test_h2.parquet")

df_train = pd.read_parquet(train_path)
df_test  = pd.read_parquet(test_path)

print("Train shape:", df_train.shape)
print("Test  shape:", df_test.shape)

# Expected core columns (train must include y_target; test may not)
core_cols = {"origin", "destination", "hs6", "hs4", "trade_flow", "month", "y"}
if not core_cols.issubset(df_train.columns):
    missing = core_cols - set(df_train.columns)
    raise ValueError(f"Train missing core cols: {missing}")
if "y_target" not in df_train.columns:
    raise ValueError("Train must include y_target for h=2.")

print("Train min/max month:", df_train["month"].min(), "→", df_train["month"].max())
print("Test  min/max month:", df_test["month"].min(),  "→", df_test["month"].max())

Train shape: (5979239, 42)
Test  shape: (320208, 42)
Train min/max month: 2023-01-01 00:00:00 → 2024-08-01 00:00:00
Test  min/max month: 2024-08-01 00:00:00 → 2024-08-01 00:00:00


In [6]:
# IDs and target settings
id_cols    = ["origin", "destination", "hs6", "hs4", "trade_flow", "month"]
id_cats    = ["origin", "destination", "hs6", "hs4", "trade_flow"]
target_col = "y_target"

# Keep rows with valid two-month-ahead targets
df_train = df_train[df_train[target_col].notna()].copy()

# Safety: clip negatives
for c in ["y", target_col]:
    if c in df_train:
        df_train[c] = df_train[c].clip(lower=0)

# Cast ID categoricals
for c in id_cats:
    if c in df_train: df_train[c] = df_train[c].astype("category")
    if c in df_test:  df_test[c]  = df_test[c].astype("category")

# Candidate features:
# Keep *all* columns except: month (datetime), and y/y_target
# NOTE: We DO include categorical IDs as features (helps GBMs).
candidate_features = [c for c in df_train.columns if c not in {"y", target_col, "month"}]
print("Initial candidate features (incl. categorical IDs):", len(candidate_features))

Initial candidate features (incl. categorical IDs): 39


We’ll inspect dtypes of all candidate features and then apply a single sanitization policy so LightGBM gets only numeric or categorical columns, with train/test aligned.

In [7]:
from collections import defaultdict

by_dtype = defaultdict(list)
for c in candidate_features:
    by_dtype[str(df_train[c].dtype)].append(c)

print("=== DTYPE AUDIT (train) ===")
for dt, cols in sorted(by_dtype.items(), key=lambda kv: kv[0]):
    head = ", ".join(cols[:12])
    more = " ..." if len(cols) > 12 else ""
    print(f"{dt:>16} : {len(cols)} cols -> {head}{more}")

missing_in_test = sorted(set(candidate_features) - set(df_test.columns))
if missing_in_test:
    print("\nWARNING: features missing in TEST (will drop):", missing_in_test[:20], "...")

=== DTYPE AUDIT (train) ===
        category : 5 cols -> origin, destination, hs6, hs4, trade_flow
  datetime64[ns] : 1 cols -> cutoff_month
         float32 : 21 cols -> lag_1, lag_2, lag_3, lag_6, lag_12, ma_3, ma_6, ma_12, roll_std_6, pctchg_1, pctchg_3, cross_flow_lag1 ...
         float64 : 5 cols -> cf_ma3_import, cf_ma3_export, origin_total_exports, origin_total_imports, origin_total_trade
           int16 : 1 cols -> consec_zero_run
           int32 : 1 cols -> month_id
           int64 : 1 cols -> value
            int8 : 4 cols -> month_num, quarter, was_trade_lag1, horizon


In [8]:
import numpy as np
import pandas as pd
import pandas.api.types as pdt

OBJ_TO_NUMERIC_THRESHOLD = 0.97  # try convert object→numeric if ≥97% convertible

# --- Robust dtype testers (work fine with CategoricalDtype) ---
def is_datetime_dtype(s: pd.Series) -> bool:
    return pdt.is_datetime64_any_dtype(s)

def is_period_dtype(s: pd.Series) -> bool:
    return pdt.is_period_dtype(s)

def is_timedelta_dtype(s: pd.Series) -> bool:
    return pdt.is_timedelta64_dtype(s)

def is_sparse_dtype(s: pd.Series) -> bool:
    return pdt.is_sparse(s.dtype)

def is_category_dtype(s: pd.Series) -> bool:
    return pdt.is_categorical_dtype(s)

def is_numeric_or_bool(s: pd.Series) -> bool:
    return pdt.is_integer_dtype(s) or pdt.is_float_dtype(s) or pdt.is_bool_dtype(s)

def try_object_to_numeric(series: pd.Series):
    conv = pd.to_numeric(series, errors="coerce")
    ok_ratio = conv.notna().mean()
    return ok_ratio >= OBJ_TO_NUMERIC_THRESHOLD, conv

# --- Build plan on TRAIN ---
plan = {}  # col -> action
for c in candidate_features:
    s = df_train[c]

    if is_datetime_dtype(s) or is_period_dtype(s) or is_timedelta_dtype(s):
        plan[c] = "drop_dt"
    elif is_sparse_dtype(s):
        plan[c] = "sparse_to_dense"
    elif is_category_dtype(s):
        plan[c] = "keep_category"
    elif is_numeric_or_bool(s):
        if pdt.is_bool_dtype(s):
            plan[c] = "bool_to_int8"
        elif pdt.is_float_dtype(s):
            plan[c] = "float_to_float32"
        else:
            plan[c] = "int_to_int32"
    elif pdt.is_object_dtype(s):
        ok, _ = try_object_to_numeric(s)
        plan[c] = "object_to_numeric" if ok else "drop_object"
    else:
        plan[c] = "drop_unknown"

# --- Apply plan to TRAIN & TEST ---
def apply_plan(df: pd.DataFrame, plan: dict) -> pd.DataFrame:
    out = df.copy()
    drops = []
    for c, action in plan.items():
        if c not in out.columns:
            continue
        s = out[c]
        if action in ("drop_dt", "drop_object", "drop_unknown"):
            drops.append(c)
        elif action == "sparse_to_dense":
            out[c] = pd.Series(pd.arrays.SparseArray(s).to_dense(), index=s.index)
        elif action == "keep_category":
            out[c] = out[c].astype("category")
        elif action == "bool_to_int8":
            out[c] = out[c].astype("int8")
        elif action == "float_to_float32":
            out[c] = out[c].astype("float32")
        elif action == "int_to_int32":
            out[c] = out[c].astype("int32")
        elif action == "object_to_numeric":
            out[c] = pd.to_numeric(out[c], errors="coerce").astype("float32")
    if drops:
        out = out.drop(columns=[c for c in drops if c in out.columns])
    return out

df_train_s = apply_plan(df_train, plan)
df_test_s  = apply_plan(df_test,  plan)

# Common, model-usable features
common_feats = sorted((set(df_train_s.columns) & set(df_test_s.columns)) - {"y", target_col, "month"})
feature_cols = []
for c in common_feats:
    s = df_train_s[c]
    if pdt.is_integer_dtype(s) or pdt.is_float_dtype(s) or pdt.is_categorical_dtype(s):
        feature_cols.append(c)

print(f"Kept {len(feature_cols)} features after sanitization.")
print("Sample:", feature_cols[:20])

# Final safety
bad_now = [c for c in feature_cols if (pdt.is_object_dtype(df_train_s[c]) or pdt.is_datetime64_any_dtype(df_train_s[c]))]
assert not bad_now, f"Unexpected dtypes remain: {bad_now}"

# Replace downstream dataframes
df_train = df_train_s
df_test  = df_test_s

print("\nTrain dtypes (features):")
print(df_train[feature_cols].dtypes.value_counts())

  return pdt.is_period_dtype(s)
  return pdt.is_sparse(s.dtype)
  return pdt.is_categorical_dtype(s)
  if pdt.is_integer_dtype(s) or pdt.is_float_dtype(s) or pdt.is_categorical_dtype(s):


Kept 38 features after sanitization.
Sample: ['cf_ma3_export', 'cf_ma3_import', 'consec_zero_run', 'cross_flow_lag1', 'cross_flow_lag10', 'cross_flow_ma3', 'destination', 'horizon', 'hs4', 'hs6', 'lag_1', 'lag_12', 'lag_2', 'lag_3', 'lag_6', 'ma_12', 'ma_3', 'ma_6', 'month_id', 'month_num']

Train dtypes (features):
float32     26
int32        7
category     1
category     1
category     1
category     1
category     1
Name: count, dtype: int64


Validation mirrors h=2 setup (target months are 2023-09, 2024-03, 2024-09).
For each fold: train on rows with month ≤ train_end, validate on rows where month = val_target − 2 months.

In [9]:
def ts(ym: str) -> pd.Timestamp:
    return pd.to_datetime(ym + "-01")

folds = [
    {"fold": "F1", "train_end": ts("2023-07"), "val_target": ts("2023-09")},
    {"fold": "F2", "train_end": ts("2024-01"), "val_target": ts("2024-03")},
    {"fold": "F3", "train_end": ts("2024-07"), "val_target": ts("2024-09")},
]
folds

[{'fold': 'F1',
  'train_end': Timestamp('2023-07-01 00:00:00'),
  'val_target': Timestamp('2023-09-01 00:00:00')},
 {'fold': 'F2',
  'train_end': Timestamp('2024-01-01 00:00:00'),
  'val_target': Timestamp('2024-03-01 00:00:00')},
 {'fold': 'F3',
  'train_end': Timestamp('2024-07-01 00:00:00'),
  'val_target': Timestamp('2024-09-01 00:00:00')}]

In [10]:
SEED = 2025

lgb_params = {
    "objective": "rmse",   # RMSE on log1p(y)
    "metric": "rmse",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "min_data_in_leaf": 100,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "lambda_l2": 2.0,
    "max_depth": -1,
    "verbosity": -1,
    "force_row_wise": True,
    "seed": SEED,
}

# Categorical features = any of the feature_cols that are categorical dtypes
cat_cols = [c for c in feature_cols if pd.api.types.is_categorical_dtype(df_train[c])]

def fit_one_fold(train_df, val_df, features, cat_cols, params,
                 early_stopping_rounds=200, num_boost_round=5000, verbose_eval=200):
    y_tr = np.log1p(train_df["y_target"].values)
    y_va = np.log1p(val_df["y_target"].values)

    dtrain = lgb.Dataset(train_df[features], label=y_tr,
                         categorical_feature=cat_cols, free_raw_data=False)
    dvalid = lgb.Dataset(val_df[features], label=y_va,
                         categorical_feature=cat_cols, reference=dtrain, free_raw_data=False)

    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=True),
        lgb.log_evaluation(period=verbose_eval),
    ]

    model = lgb.train(
        params=params,
        train_set=dtrain,
        valid_sets=[dtrain, dvalid],
        valid_names=["train", "valid"],
        num_boost_round=num_boost_round,
        callbacks=callbacks
    )
    return model, model.best_iteration

  cat_cols = [c for c in feature_cols if pd.api.types.is_categorical_dtype(df_train[c])]


In [43]:
oof_rows = []
cv_logs  = []

for f in folds:
    fold_name  = f["fold"]
    train_end  = f["train_end"]
    val_target = f["val_target"]

    # Train: month ≤ train_end
    trn_mask = df_train["month"] <= train_end

    # Valid: feature month = val_target - 2 months
    val_month = (val_target - pd.DateOffset(months=2)).replace(day=1)
    va_mask = df_train["month"] == val_month

    trn_df = df_train.loc[trn_mask].copy()
    va_df  = df_train.loc[va_mask].copy()

    print(f"\n=== {fold_name} | train_end={train_end.date()} | val_target={val_target.date()} ===")
    print("Train rows:", len(trn_df), " Valid rows:", len(va_df))
    if len(va_df) == 0:
        print("WARNING: No validation rows for this fold — skipping.")
        continue

    model, best_iter = fit_one_fold(
        trn_df, va_df,
        feature_cols, cat_cols, lgb_params,
        early_stopping_rounds=200, num_boost_round=5000, verbose_eval=200
    )

    va_pred = model.predict(va_df[feature_cols], num_iteration=best_iter)
    va_pred = np.expm1(va_pred)
    va_pred = np.clip(va_pred, 0.0, None)

    fold_smape = smape(va_df["y_target"].values, va_pred, eps=1.0)
    print(f"{fold_name} sMAPE:", round(fold_smape, 4))

    tmp = va_df[["origin", "destination", "hs6", "hs4", "trade_flow", "month"]].copy()
    tmp["y_true"] = va_df["y_target"].values
    tmp["y_pred_" + MODEL_NAME] = va_pred
    tmp["fold"] = fold_name
    oof_rows.append(tmp)

    cv_logs.append({
        "run_id": RUN_ID,
        "model": MODEL_NAME,
        "fold": fold_name,
        "train_end": str(train_end.date()),
        "val_target": str(val_target.date()),
        "rows_train": len(trn_df),
        "rows_valid": len(va_df),
        "smape": fold_smape
    })

if len(oof_rows):
    oof_df = pd.concat(oof_rows, ignore_index=True)
    overall_smape = smape(oof_df["y_true"].values, oof_df["y_pred_" + MODEL_NAME].values, eps=1.0)
    print("\n=== OOF Overall sMAPE:", round(overall_smape, 4), "===")

    oof_path = os.path.join(OOF_DIR, f"{MODEL_NAME}_oof.parquet")
    oof_df.to_parquet(oof_path, index=False)
    print("Saved OOF to:", oof_path)

    for row in cv_logs:
        append_cv_score(os.path.join(LOG_DIR, "cv_scores.csv"), row)

    append_cv_score(os.path.join(LOG_DIR, "cv_scores.csv"), {
        "run_id": RUN_ID, "model": MODEL_NAME, "fold": "OOF",
        "train_end": "", "val_target": "",
        "rows_train": int(df_train.shape[0]), "rows_valid": int(oof_df.shape[0]),
        "smape": overall_smape
    })
else:
    print("No OOF rows collected — check fold date logic or coverage.")


=== F1 | train_end=2023-07-01 | val_target=2023-09-01 ===
Train rows: 2047602  Valid rows: 316445
Training until validation scores don't improve for 200 rounds
[200]	train's rmse: 2.68629	valid's rmse: 2.66956
[400]	train's rmse: 2.63627	valid's rmse: 2.60291
[600]	train's rmse: 2.60878	valid's rmse: 2.57342
[800]	train's rmse: 2.58937	valid's rmse: 2.55374
[1000]	train's rmse: 2.57494	valid's rmse: 2.53895
[1200]	train's rmse: 2.56178	valid's rmse: 2.52505
[1400]	train's rmse: 2.5496	valid's rmse: 2.51355
[1600]	train's rmse: 2.53914	valid's rmse: 2.50289
[1800]	train's rmse: 2.52866	valid's rmse: 2.492
[2000]	train's rmse: 2.51961	valid's rmse: 2.48285
[2200]	train's rmse: 2.51076	valid's rmse: 2.47304
[2400]	train's rmse: 2.50273	valid's rmse: 2.46479
[2600]	train's rmse: 2.49448	valid's rmse: 2.45547
[2800]	train's rmse: 2.48652	valid's rmse: 2.44702
[3000]	train's rmse: 2.4784	valid's rmse: 2.43831
[3200]	train's rmse: 2.47082	valid's rmse: 2.4303
[3400]	train's rmse: 2.46309	val

In [11]:
# Train on full train set
full_train = df_train.copy()
y_full = np.log1p(full_train["y_target"].values)
dtrain_full = lgb.Dataset(full_train[feature_cols], label=y_full,
                          categorical_feature=cat_cols, free_raw_data=False)

# Use a conservative round cap; adjust later based on CV
final_params = lgb_params.copy()
final_num_boost_round = 2500

final_model = lgb.train(
    params=final_params,
    train_set=dtrain_full,
    num_boost_round=final_num_boost_round,
    valid_sets=[dtrain_full],
    valid_names=["train"],
    callbacks=[lgb.log_evaluation(period=500)]
)

# Forecast for test_h2 (these rows map to target = 2025-10)
test_pred = final_model.predict(df_test[feature_cols], num_iteration=final_model.best_iteration or final_num_boost_round)
test_pred = np.expm1(test_pred)
test_pred = np.clip(test_pred, 0.0, None)

fc_df = df_test[["origin", "destination", "hs6", "hs4", "trade_flow", "month"]].copy()
fc_df["y_pred_" + MODEL_NAME] = test_pred

fc_path = os.path.join(FC_DIR, f"{MODEL_NAME}_forecast.parquet")
fc_df.to_parquet(fc_path, index=False)
print("Saved forecast to:", fc_path)

[500]	train's rmse: 2.57944
[1000]	train's rmse: 2.54759
[1500]	train's rmse: 2.52877
[2000]	train's rmse: 2.51456
[2500]	train's rmse: 2.50252
Saved forecast to: /content/drive/MyDrive/ai4trade/predictions/forecast/lgbm_rmse_forecast.parquet


In [14]:
# Try to pull OOF overall sMAPE if available
try:
    overall_smape
except NameError:
    overall_smape = None

run_meta = {
    "run_id": RUN_ID,
    "model": MODEL_NAME,
    "time": datetime.now().isoformat(),
    "params": lgb_params,
    "feature_count": len(feature_cols),
    "categorical_features": cat_cols,
    "folds": folds,
    "oof_smape": float(overall_smape) if overall_smape is not None else None,
    "train_path": train_path,
    "test_path": test_path,
    "oof_output": os.path.join(OOF_DIR, f"{MODEL_NAME}_oof.parquet"),
    "forecast_output": os.path.join(FC_DIR, f"{MODEL_NAME}_forecast.parquet"),
}
meta_path = os.path.join(LOG_DIR, "runs", f"{RUN_ID}.json")
save_json(run_meta, meta_path)
print("Saved metadata to:", meta_path)

Saved metadata to: /content/drive/MyDrive/ai4trade/logs/runs/run_20251021_2333.json


So we now have:
	•	OOF: predictions/oof/lgbm_rmse_oof.parquet
	•	Forecast: predictions/forecast/lgbm_rmse_forecast.parquet
	•	Logs: logs/cv_scores.csv, logs/runs/{RUN_ID}.json

Next steps:
	1.	Run 11_lgbm_tweedie.ipynb (same folds; objective='tweedie', sweep tweedie_variance_power∈[1.2,1.5]).
	2.	Train xgb_tweedie, catboost_log1p, stl_ets, seasonal_naive, (optional) nHiTS.
	3.	Blend in 40_blend_weights.ipynb (non-negative weights minimizing sMAPE).
	4.	Aggregate HS-6→HS-4 and export submission CSV in 50_make_submission.ipynb.