<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/LGBM_Log_China_imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab mount: attach Google Drive at /content/drive.
# Every project path used later in this notebook lives under /content/drive/MyDrive/ai4trade.

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# === 13_lgbm_rmse_CHN_import_h2_final.ipynb | Cell 1 ===
import os, json, gc, time, hashlib, datetime as dt
import numpy as np
import pandas as pd
import lightgbm as lgb

# --------------------------
# Project directories (same as earlier)
# --------------------------
BASE_DIR = "/content/drive/MyDrive/ai4trade"
DATA_DIR = f"{BASE_DIR}/data"
FEAT_DIR = f"{DATA_DIR}/features"
PRED_DIR = f"{BASE_DIR}/predictions"
OOF_DIR  = f"{PRED_DIR}/oof"
FC_DIR   = f"{PRED_DIR}/forecast"
LOG_DIR  = f"{BASE_DIR}/logs"
RUNS_DIR = f"{LOG_DIR}/runs"
# Only output directories are created here; FEAT_DIR is an input and must already exist.
for d in [OOF_DIR, FC_DIR, LOG_DIR, RUNS_DIR]:
    os.makedirs(d, exist_ok=True)

# --------------------------
# Segment / horizon config
# --------------------------
# Defined before the run metadata so that RUN_ID is derived from it and the
# segment name is spelled out in exactly one place.
ORIGIN      = "CHN"
TRADE_FLOW  = "Import"
H           = 2
SEGMENT_KEY = f"{ORIGIN}_{TRADE_FLOW.lower()}_h{H}"

# --------------------------
# Run metadata
# --------------------------
RUN_TIME = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID   = f"lgbm_rmse_{SEGMENT_KEY}_{RUN_TIME}_final"
SEED     = 42
np.random.seed(SEED)

print("RUN_ID:", RUN_ID)
print("Segment:", SEGMENT_KEY)


RUN_ID: lgbm_rmse_CHN_import_h2_20251030_144444_final
Segment: CHN_import_h2


In [3]:
# === Cell 2: Load feature splits ===
# File names are built from the segment config so the files that get loaded are
# always the ones the sanity checks below expect.
train_path = f"{FEAT_DIR}/features_{ORIGIN}_{TRADE_FLOW.lower()}_train_h{H}_final.parquet"
test_path  = f"{FEAT_DIR}/features_{ORIGIN}_{TRADE_FLOW.lower()}_test_h{H}_final.parquet"

df_train = pd.read_parquet(train_path)
df_test  = pd.read_parquet(test_path)

# Sanity checks: each split must contain only rows of the configured segment.
for name, df in [("train", df_train), ("test", df_test)]:
    print(f"{name}: shape={df.shape}, months ~ [{df['month'].min()} → {df['month'].max()}]")
    assert df["origin"].eq(ORIGIN).all(), f"{name}: rows with origin != {ORIGIN!r}"
    assert df["trade_flow"].eq(TRADE_FLOW).all(), f"{name}: rows with trade_flow != {TRADE_FLOW!r}"

# Add target_month = month + H (for transparent logging/saving)
def add_target_month(frame, h):
    """Return a copy of ``frame`` with a ``target_month`` column equal to ``month`` + ``h`` months.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``month`` column that ``pd.to_datetime`` can parse.
    h : int
        Number of months to shift forward.

    Returns
    -------
    pandas.DataFrame
        New frame; ``target_month`` holds the timestamp at the start of the
        shifted month. The input frame is left untouched, so calling this
        function has no side effect on the caller's data.
    """
    # Work in monthly periods so that adding an integer means "add h months".
    target_period = pd.to_datetime(frame["month"]).dt.to_period("M") + h
    return frame.assign(target_month=target_period.dt.to_timestamp())

df_train, df_test = (add_target_month(split, H) for split in (df_train, df_test))

# Every training row must be labelled; the test split must carry no labels at all.
assert not df_train["y_target"].isna().any()
assert not df_test["y_target"].notna().any()


train: shape=(1118879, 39), months ~ [2023-01-01 00:00:00 → 2025-06-01 00:00:00]
test: shape=(35732, 39), months ~ [2025-08-01 00:00:00 → 2025-08-01 00:00:00]


In [4]:
# === Cell 3: Utilities ===
# Identifier columns: kept alongside predictions in the saved OOF/forecast frames.
ID_COLS = [
    "origin", "destination", "hs6", "hs4",
    "trade_flow", "month", "target_month",
]
TARGET_COL = "y_target"

# Columns that must never be selected as features: identifiers, "y", and the label.
BLACKLIST = {*ID_COLS, "y", TARGET_COL}

def get_feature_cols(df):
    """Return the alphabetically sorted numeric column names of ``df`` that are not in BLACKLIST."""
    numeric_names = df.select_dtypes(include=[np.number]).columns
    return sorted(col for col in numeric_names if col not in BLACKLIST)

def safe_fillna(frame, cols):
    """Replace missing values with 0.0 in the given columns of ``frame``.

    The frame is modified in place and the same object is returned.
    """
    filled = frame[cols].fillna(0.0)
    frame[cols] = filled
    return frame

def smape(y_true, y_pred, eps=1.0):
    """Mean of ``2 * |pred - true| / (|true| + |pred| + eps)`` over all elements.

    ``eps`` is added to every denominator, so pairs where both values are zero
    contribute 0 instead of dividing by zero. Returns a Python float.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(forecast - actual)
    scale = (np.abs(actual) + np.abs(forecast)) + eps
    ratios = 2.0 * abs_error / scale
    return float(ratios.mean())

def score_hs4_smape(valid_df, y_pred_col):
    """Score predictions with sMAPE after summing them to HS-4 level.

    Rows are grouped by origin, destination, hs4, trade_flow and target_month;
    within each group ``y_target`` and ``valid_df[y_pred_col]`` are summed, and
    ``smape`` is computed over the resulting group totals.
    """
    group_keys = ["origin", "destination", "hs4", "trade_flow", "target_month"]
    totals = valid_df.groupby(group_keys, as_index=False).agg(
        y_true=("y_target", "sum"),
        y_pred=(y_pred_col, "sum"),
    )
    return smape(totals["y_true"].values, totals["y_pred"].values)


In [5]:
# === Cell 4: CV folds (C1–C6) ===
# Each tuple: (fold name, last training month, gap-start month, validation target month).
folds = [
    ("C1", "2024-05-01", "2024-06-01", "2024-08-01"),
    ("C2", "2024-06-01", "2024-07-01", "2024-09-01"),
    ("C3", "2024-07-01", "2024-08-01", "2024-10-01"),
    ("C4", "2024-09-01", "2024-10-01", "2024-12-01"),
    ("C5", "2025-03-01", "2025-04-01", "2025-06-01"),
    ("C6", "2025-04-01", "2025-05-01", "2025-07-01"),
]
# Per-fold weight used when averaging the fold scores.
fold_weights = {"C1":1.0, "C2":1.0, "C3":1.25, "C4":1.0, "C5":1.5, "C6":1.5}

# Same information as `folds`, with the dates parsed to Timestamps and the weight attached.
folds_parsed = [
    {
        "name": fold_name,
        "train_end": pd.Timestamp(last_train_month),
        "gap_start": pd.Timestamp(gap_month),
        "val_target": pd.Timestamp(target_month),
        "weight": fold_weights[fold_name],
    }
    for fold_name, last_train_month, gap_month, target_month in folds
]

pd.DataFrame(folds_parsed)


Unnamed: 0,name,train_end,gap_start,val_target,weight
0,C1,2024-05-01,2024-06-01,2024-08-01,1.0
1,C2,2024-06-01,2024-07-01,2024-09-01,1.0
2,C3,2024-07-01,2024-08-01,2024-10-01,1.25
3,C4,2024-09-01,2024-10-01,2024-12-01,1.0
4,C5,2025-03-01,2025-04-01,2025-06-01,1.5
5,C6,2025-04-01,2025-05-01,2025-07-01,1.5


In [6]:
# === Cell 5: Feature prep ===
feature_cols = get_feature_cols(df_train)

# Store every feature as float32 in both splits (intended to speed up CPU training/inference).
for split in (df_train, df_test):
    split[feature_cols] = split[feature_cols].astype("float32")

# Missing feature values become 0.0 in both splits.
df_train, df_test = (safe_fillna(split, feature_cols) for split in (df_train, df_test))

def build_weights(frame):
    """Return one float32 sample weight per row of ``frame``.

    When the frame has an ``ma_12`` column the weight is ``sqrt(max(ma_12, 0) + 1)``
    (a pandas Series); otherwise every row gets weight 1 (a NumPy array).
    """
    if "ma_12" not in frame.columns:
        return np.ones(len(frame), dtype=np.float32)
    floored = frame["ma_12"].clip(lower=0.0)
    return np.sqrt(floored + 1.0).astype(np.float32)

# Quick look at the selected feature set: how many columns, and the first 12 names (sorted).
print("Num features:", len(feature_cols))
print("Feature sample:", feature_cols[:12])


Num features: 31
Feature sample: ['chinaCLI_ma2', 'china_GSCPI_ma2', 'cross_flow_ma3', 'forecast_horizon', 'horizon', 'lag_1', 'lag_12', 'lag_2', 'lag_3', 'lag_6', 'lag_year_eq', 'ma_12']


In [7]:
# === Cell 6: CV training (LightGBM RMSE on log1p target) ===
# Booster parameters shared by every CV fold and by the final full fit.
lgb_params = {
    "objective": "rmse",
    "metric": "rmse",
    "learning_rate": 0.07,
    "num_leaves": 127,
    "max_depth": -1,
    "feature_fraction": 0.7,
    "bagging_fraction": 0.7,
    "bagging_freq": 1,
    "min_data_in_leaf": 100,
    "lambda_l1": 0.0,
    "lambda_l2": 1.0,
    "verbosity": -1,
    "seed": SEED,
    "num_threads": -1,
    # "device_type": "cpu",   # set to "gpu" if you have a GPU-enabled build
}

# Boosting budget per fold and the early-stopping patience (in rounds).
EARLY_STOP = 400
NUM_BOOST  = 8000

# Accumulators filled by the fold loop: OOF frames, per-fold score rows, best iterations.
oof, cv_rows, best_iters = [], [], []
y_pred_col = "y_pred_lgbm_rmse"   # keep same column name for downstream blend

# Train one model per fold, predict that fold's validation rows, and collect the
# out-of-fold predictions plus a score row for each fold.
for f in folds_parsed:
    name   = f["name"]
    trn_end = f["train_end"]
    val_tgt = f["val_target"]
    w      = f["weight"]  # not used for training; only recorded in cv_rows for the weighted mean

    # Training rows: context month up to and including train_end.
    # Validation rows: rows whose target_month (= month + H) is the fold's validation target.
    dtr = df_train.loc[df_train["month"] <= trn_end].copy()
    dval = df_train.loc[df_train["target_month"] == val_tgt].copy()

    # ---- log1p target ----
    # The model is fit on log1p(y); predictions are mapped back with expm1 further down.
    y_tr = np.log1p(dtr[TARGET_COL].astype("float32").values)
    y_va = np.log1p(dval[TARGET_COL].astype("float32").values)

    X_tr, X_va = dtr[feature_cols], dval[feature_cols]
    # Try without weights for LGBM (avoids biasing to very large series)
    w_tr = None  # build_weights(dtr)  # <- re-enable later only if needed

    lgb_train = lgb.Dataset(X_tr, label=y_tr, weight=w_tr, free_raw_data=True)
    lgb_valid = lgb.Dataset(X_va, label=y_va, free_raw_data=True)

    # Early stopping watches the validation RMSE, which is measured on the log1p scale.
    # NOTE(review): the same validation rows are scored below, so the fold sMAPE is
    # computed on data that also picked the iteration count — possibly optimistic.
    model = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=NUM_BOOST,
        valid_sets=[lgb_valid],
        valid_names=["valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOP, verbose=False),
            lgb.log_evaluation(period=200),
        ],
    )

    # Fall back to the full budget if best_iteration is falsy (0/None) —
    # presumably the case when early stopping never triggered; TODO confirm.
    best_iter = model.best_iteration or NUM_BOOST
    best_iters.append(best_iter)

    # invert log1p -> value, clip negatives
    dval[y_pred_col] = np.clip(np.expm1(model.predict(X_va, num_iteration=best_iter)), 0.0, None)

    # Fold score: sMAPE on HS-4 aggregated totals, on the original (non-log) scale.
    fold_smape = score_hs4_smape(dval, y_pred_col)
    cv_rows.append({"fold": name, "train_end": trn_end, "val_target": val_tgt,
                    "weight": w, "best_iter": int(best_iter), "hs4_smape": fold_smape})

    # Keep only identifiers, the true target and the prediction for the OOF frame.
    oof.append(dval[ID_COLS + [TARGET_COL, y_pred_col]].copy())

    # Drop the per-fold objects before the next iteration to release memory.
    del dtr, dval, X_tr, X_va, y_tr, y_va, lgb_train, lgb_valid, model
    gc.collect()

# Per-fold scores plus the fold-weighted contribution of each score.
cv_df = pd.DataFrame(cv_rows).assign(
    weighted=lambda scores: scores["hs4_smape"] * scores["weight"]
)
wm_smape = cv_df["weighted"].sum() / cv_df["weight"].sum()
print(cv_df)
print("Weighted mean HS-4 sMAPE across folds:", round(wm_smape, 6))
print("Best iters (per fold):", best_iters, "→ median:", int(np.median(best_iters)))

# Stack the per-fold OOF frames and expose the label under the name "y_true".
oof_df = pd.concat(oof, ignore_index=True)
oof_df = oof_df.rename(columns={TARGET_COL: "y_true"})
oof_df = oof_df.loc[:, ~oof_df.columns.duplicated()].copy()


[200]	valid's rmse: 1.47384
[400]	valid's rmse: 1.47756
[200]	valid's rmse: 1.40365
[400]	valid's rmse: 1.40621
[200]	valid's rmse: 1.38907
[400]	valid's rmse: 1.39127
[200]	valid's rmse: 1.30005
[400]	valid's rmse: 1.30209
[200]	valid's rmse: 1.04826
[400]	valid's rmse: 1.04833
[600]	valid's rmse: 1.04864
[200]	valid's rmse: 1.02045
[400]	valid's rmse: 1.01989
[600]	valid's rmse: 1.02111
  fold  train_end val_target  weight  best_iter  hs4_smape  weighted
0   C1 2024-05-01 2024-08-01    1.00         96   0.640609  0.640609
1   C2 2024-06-01 2024-09-01    1.00        141   0.620311  0.620311
2   C3 2024-07-01 2024-10-01    1.25        156   0.637940  0.797425
3   C4 2024-09-01 2024-12-01    1.00        133   0.615091  0.615091
4   C5 2025-03-01 2025-06-01    1.50        213   0.537138  0.805707
5   C6 2025-04-01 2025-07-01    1.50        374   0.543606  0.815410
Weighted mean HS-4 sMAPE across folds: 0.592352
Best iters (per fold): [96, 141, 156, 133, 213, 374] → median: 148


In [8]:
# === Cell 7: Persist OOF + CV logs ===
# Artifact names and columns are built from the shared config (SEGMENT_KEY, ID_COLS,
# y_pred_col) rather than re-typed, so they cannot drift from the rest of the notebook.
oof_path = f"{OOF_DIR}/lgbm_rmse_{SEGMENT_KEY}_final.parquet"
cvlog_path = f"{LOG_DIR}/lgbm_rmse_{SEGMENT_KEY}_cv_scores_final.csv"

# Identifiers, then the true value, then this model's prediction.
oof_save_cols = ID_COLS + ["y_true", y_pred_col]

# Drop any accidental duplicate columns just in case
oof_df = oof_df.loc[:, ~oof_df.columns.duplicated()].copy()

oof_df[oof_save_cols].to_parquet(oof_path, index=False)
cv_df.to_csv(cvlog_path, index=False)
print("Saved OOF:", oof_path)
print("Saved CV log:", cvlog_path)


Saved OOF: /content/drive/MyDrive/ai4trade/predictions/oof/lgbm_rmse_CHN_import_h2_final.parquet
Saved CV log: /content/drive/MyDrive/ai4trade/logs/lgbm_rmse_CHN_import_h2_cv_scores_final.csv


In [9]:
# === Cell 8: Full-fit and test forecast ===
# Refit on all labelled rows with the same recipe as the CV loop: log1p target, no
# sample weights. An earlier draft of this cell (raw-scale target, ma_12-based
# weights) was left here commented out; it has been removed because it no longer
# matched what the CV folds validate.

# Number of boosting rounds: median of the per-fold best iterations
# (3000 is only a fallback for the case where no fold was run).
final_iter = int(np.median(best_iters)) if len(best_iters) else 3000

X_full = df_train[feature_cols]
y_full = np.log1p(df_train[TARGET_COL].astype("float32").values)
# no weights for now
lgb_full = lgb.Dataset(X_full, label=y_full, free_raw_data=True)

final_model = lgb.train(
    params=lgb_params,
    train_set=lgb_full,
    num_boost_round=final_iter,
    valid_sets=None,
)

# Forecast on the test split: each row's target_month is its context month + H.
X_te = df_test[feature_cols]
# Map predictions back from the log1p scale and floor them at zero.
test_pred = np.clip(np.expm1(final_model.predict(X_te, num_iteration=final_iter)), 0.0, None)
forecast_df = df_test[ID_COLS].copy()
forecast_df[y_pred_col] = test_pred

# Same identifier columns and prediction column name as the OOF file.
forecast_save_cols = ID_COLS + [y_pred_col]
forecast_path = f"{FC_DIR}/lgbm_rmse_{SEGMENT_KEY}_final.parquet"
forecast_df[forecast_save_cols].to_parquet(forecast_path, index=False)
print("Saved forecast:", forecast_path)


Saved forecast: /content/drive/MyDrive/ai4trade/predictions/forecast/lgbm_rmse_CHN_import_h2_final.parquet


In [10]:
# === Cell 9: JSON run log ===
# One JSON document per run: configuration, artifact paths, parameters and CV scores.
run_log = {
    "run_id": RUN_ID,
    "segment": SEGMENT_KEY,
    "horizon": H,
    "model": "lightgbm_rmse",
    "seed": SEED,
    "timestamps": {
        "started_at": RUN_TIME,
        "ended_at": dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    },
    "paths": {
        "train_features": train_path,
        "test_features": test_path,
        "oof": oof_path,
        "forecast": forecast_path,
        "cv_log": cvlog_path
    },
    # json cannot serialise NumPy scalars, so convert any of them to plain Python values.
    "params": {k: (v.item() if isinstance(v, np.generic) else v) for k, v in lgb_params.items()},
    "cv": [
        dict(
            fold=str(r["fold"]),
            train_end=str(pd.Timestamp(r["train_end"]).date()),
            val_target=str(pd.Timestamp(r["val_target"]).date()),
            weight=float(r["weight"]),
            best_iter=int(r["best_iter"]),
            hs4_smape=float(r["hs4_smape"]),
        )
        for _, r in cv_df.iterrows()
    ],
    "cv_weighted_hs4_smape": float(wm_smape),
    "final_num_boost_round": int(final_iter),
}

# RUN_ID already ends in "_final"; appending the suffix a second time produced
# file names ending in "_final_final.json".
log_path = f"{RUNS_DIR}/{RUN_ID}.json"
with open(log_path, "w") as log_file:
    json.dump(run_log, log_file, indent=2)
print("Wrote run log:", log_path)


Wrote run log: /content/drive/MyDrive/ai4trade/logs/runs/lgbm_rmse_CHN_import_h2_20251030_144444_final_final.json


Notes

Fold policy, features, IDs, and HS-4 sMAPE exactly mirror your XGB-Tweedie run → OOFs will align perfectly for the 3-model blend.

Speed tips (CPU): float32 features ✅, num_threads=-1 ✅, early stopping ✅. If you have a GPU-enabled LightGBM wheel, set device_type="gpu" and keep everything else identical.

Output artifacts (OOF, forecast, CV-log, JSON) all use the _final suffix per the Final Execution Checklist.