<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/XGBoost_tweedie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
# The project paths configured later in this notebook live under this mount point,
# so this cell has to run before any data is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip -q show xgboost

import os, json, gc, numpy as np, pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from xgboost import XGBRegressor

# Project layout on the mounted Drive: one root, with fixed sub-folders for
# features (input), predictions, run logs and saved models.
BASE_DIR  = "/content/drive/MyDrive/ai4trade"
FEAT_DIR  = f"{BASE_DIR}/data/features"
OOF_DIR   = f"{BASE_DIR}/predictions/oof"
FCST_DIR  = f"{BASE_DIR}/predictions/forecast"
LOG_DIR   = f"{BASE_DIR}/logs/runs"
MODEL_DIR = f"{BASE_DIR}/models/xgb_tweedie"

# Create every output folder up front; FEAT_DIR is not created here.
for out_dir in [OOF_DIR, FCST_DIR, LOG_DIR, MODEL_DIR]:
    os.makedirs(out_dir, exist_ok=True)

# Minute-resolution timestamp makes each run's artefacts distinguishable.
run_stamp = datetime.now().strftime("%Y%m%d_%H%M")
RUN_ID = "xgb_tweedie_h2_" + run_stamp
print("RUN:", RUN_ID)


RUN: xgb_tweedie_h2_20251024_1438


In [3]:
# Load the pre-built horizon-2 feature tables.
train = pd.read_parquet(f"{FEAT_DIR}/features_train_h2.parquet")
test  = pd.read_parquet(f"{FEAT_DIR}/features_test_h2.parquet")

ID_COLS = ["origin","destination","hs6","trade_flow","month"]
TARGET  = "y_target"   # already shifted to t+2 in the feature pipeline

# Candidate features: every numeric train column that is neither an ID nor the target.
drop_cols = set(ID_COLS + [TARGET])
FEATS = []
for col in train.columns:
    if col in drop_cols:
        continue
    if np.issubdtype(train[col].dtype, np.number):
        FEATS.append(col)

# Zero-impute missing feature values in both frames (only columns that contain NaNs are rewritten).
for frame in (train, test):
    for col in FEATS:
        if frame[col].isna().any():
            frame[col] = frame[col].fillna(0.0)

# Drop features that are constant on train: they cannot contribute a split.
nunq = train[FEATS].nunique()
FEATS = [name for name, n_distinct in nunq.items() if n_distinct > 1]

# Cast to float32 to cut memory use and speed up training.
train[FEATS] = train[FEATS].astype(np.float32)
test[FEATS]  = test[FEATS].astype(np.float32)

# One key per (origin, destination, hs6, trade_flow) series, so that a whole
# series always lands in the same CV fold.
key_parts = [train[name] for name in ("origin", "destination", "hs6", "trade_flow")]
series_key = key_parts[0]
for part in key_parts[1:]:
    series_key = series_key + "|" + part
train["series_key"] = series_key

gkf = GroupKFold(n_splits=3)
folds = list(gkf.split(train[FEATS], groups=train["series_key"]))
len(FEATS), len(train), len(test)


(33, 5979239, 320208)

In [4]:
def smape(y_true, y_pred, eps=1.0):
    """Symmetric MAPE on the [0, 2] scale with a floored denominator.

    Args:
        y_true: array-like of observed values.
        y_pred: array-like of predictions; negative values are clipped to 0
            before scoring.
        eps: lower bound applied to ``|y_true| + |y_pred|`` so that pairs where
            both values are (near) zero do not divide by zero.

    Returns:
        Mean of ``2 * |y_true - y_pred| / max(|y_true| + |y_pred|, eps)`` as a
        Python float.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.clip(np.asarray(y_pred, dtype=float), 0, None)
    abs_err = np.abs(actual - forecast)
    scale = np.maximum(np.abs(actual) + np.abs(forecast), eps)
    return float((2.0 * abs_err / scale).mean())


In [5]:
def _detect_xgb_device():
    """Return "cuda" if XGBoost can actually train on a GPU here, else "cpu".

    Constructing an estimator and reading its parameters only builds a Python
    dict and never touches the GPU, so that check cannot fail on a CPU-only
    runtime. This probe instead runs a one-tree fit on a tiny array and then
    inspects the device recorded in the trained booster's configuration.
    """
    try:
        probe = XGBRegressor(tree_method="hist", device="cuda",
                             n_estimators=1, max_depth=1)
        x_probe = np.arange(8, dtype=np.float32).reshape(-1, 1)
        probe.fit(x_probe, x_probe.ravel())
        # NOTE(review): some builds appear to warn and fall back to CPU rather
        # than raise, so trust the device stored in the booster config.
        cfg = json.loads(probe.get_booster().save_config())
        resolved = str(cfg["learner"]["generic_param"]["device"])
    except Exception as exc:
        # Best-effort probe: report why the GPU is unusable, then use the CPU.
        print(f"GPU probe failed ({type(exc).__name__}: {exc}); falling back to CPU")
        return "cpu"
    return "cuda" if resolved.startswith("cuda") else "cpu"


DEVICE = _detect_xgb_device()
print("Using device:", DEVICE)


Using device: cuda


In [6]:
# Hyper-parameters shared by every fit; the Tweedie variance power is added per run.
base_params = {
    "n_estimators": 1500,            # upper bound; early stopping usually ends sooner
    "learning_rate": 0.05,
    "max_depth": 7,
    "min_child_weight": 4,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.0,
    "reg_lambda": 2.0,
    "n_jobs": -1,
    "tree_method": "hist",
    "objective": "reg:tweedie",
    "eval_metric": "rmse",           # metric monitored by early stopping
    "max_bin": 128,
    "early_stopping_rounds": 100,    # configured on the estimator, not in fit()
}
# Only request the GPU when the device probe selected it.
if DEVICE == "cuda":
    base_params.update(device="cuda")

# Tweedie variance powers to sweep.
POWER_GRID = [1.20, 1.35, 1.50]


In [9]:
# Cell 6 — Train CV across powers, collect OOF (robust to missing best_iteration)

import numpy as np

def _predict_best(model, X):
    """Use best_iteration if early stopping set it; else use full model."""
    bi = getattr(model, "best_iteration", None)
    if isinstance(bi, (int, np.integer)) and bi >= 0:
        return model.predict(X, iteration_range=(0, bi + 1))
    return model.predict(X)

# Grouped-CV sweep over the Tweedie variance powers.
# For each power: fit one model per fold, collect out-of-fold (OOF) predictions,
# score each fold with sMAPE, and keep the per-power OOF vector.
results = []
oof_by_power = {}

# Select model inputs once. Slicing the whole `train` frame per fold would also
# copy the ID strings and the prediction columns added below, on every fold.
X_all = train[FEATS]
y_all = train[TARGET]

# Pre-allocate once for speed; the buffer is reset and overwritten per power.
oof = np.zeros(len(train), dtype=np.float32)

for p in POWER_GRID:
    params = {**base_params, "tweedie_variance_power": p}
    print(f"\n=== Training Tweedie power={p} ===")
    fold_scores = []
    oof[:] = 0.0  # reset buffer

    for k, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, y_tr = X_all.iloc[tr_idx], y_all.iloc[tr_idx]
        X_va, y_va = X_all.iloc[va_idx], y_all.iloc[va_idx]

        # The validation fold doubles as the early-stopping eval set.
        model = XGBRegressor(**params)
        model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)

        # Clip at zero before scoring and storing.
        pred = _predict_best(model, X_va).clip(min=0)
        score = smape(y_va.values, pred)
        oof[va_idx] = pred.astype(np.float32)

        print(f"  Fold {k} sMAPE: {score:.4f}")
        fold_scores.append(score)

    cv = float(np.mean(fold_scores))
    print(f"  -> CV sMAPE (p={p}): {cv:.4f}")

    # Keep per-power OOF (copied, because `oof` is reused) and a column on train for inspection.
    train[f"y_pred_xgb_tweedie_p{p}"] = oof
    oof_by_power[p] = oof.copy()
    results.append({"power": p, "cv_smape": cv, "fold_smape": fold_scores})

# The feature/target slices are only needed during the sweep.
del X_all, y_all

# Select the power with the lowest mean fold sMAPE and expose its OOF as the canonical column.
best = min(results, key=lambda d: d["cv_smape"])
best_power = best["power"]
train["y_pred_xgb_tweedie"] = train[f"y_pred_xgb_tweedie_p{best_power}"].astype(np.float32)
# Pooled OOF sMAPE over all rows (can differ slightly from the mean of fold scores).
cv_smape = smape(train[TARGET], train["y_pred_xgb_tweedie"])
print("\nBest Tweedie power:", best_power, "CV sMAPE:", round(best["cv_smape"], 4))
print("OOF sMAPE (best power):", round(cv_smape, 4))


=== Training Tweedie power=1.2 ===


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


  Fold 1 sMAPE: 0.9066
  Fold 2 sMAPE: 0.9060
  Fold 3 sMAPE: 0.9064
  -> CV sMAPE (p=1.2): 0.9063

=== Training Tweedie power=1.35 ===
  Fold 1 sMAPE: 0.9035
  Fold 2 sMAPE: 0.9035
  Fold 3 sMAPE: 0.9031
  -> CV sMAPE (p=1.35): 0.9034

=== Training Tweedie power=1.5 ===
  Fold 1 sMAPE: 0.9009
  Fold 2 sMAPE: 0.9011
  Fold 3 sMAPE: 0.9005
  -> CV sMAPE (p=1.5): 0.9008

Best Tweedie power: 1.5 CV sMAPE: 0.9008
OOF sMAPE (best power): 0.9008


In [10]:
# Cell 7 — Persist the OOF predictions and a JSON record of this run

# OOF table: identifiers, target and the best-power OOF prediction.
oof_columns = ID_COLS + [TARGET, "y_pred_xgb_tweedie"]
oof_out = train[oof_columns].copy()
oof_path = f"{OOF_DIR}/xgb_tweedie_oof_{RUN_ID}_p{best_power}.parquet"
oof_out.to_parquet(oof_path, index=False)

# Convert every score to a built-in float so the grid is JSON-serialisable.
grid_log = []
for entry in results:
    grid_log.append({
        "power": float(entry["power"]),
        "cv_smape": float(entry["cv_smape"]),
        "fold_smape": [float(s) for s in entry["fold_smape"]],
    })

# Copy of the shared params with the device value forced to a string.
params_for_log = dict(base_params)
if "device" in params_for_log:
    params_for_log["device"] = str(params_for_log["device"])

run_meta = {
    "run_id": RUN_ID,
    "model": "xgb_tweedie",
    "best_power": float(best_power),
    "cv_smape_best": float(cv_smape),
    "grid": grid_log,
    "features": FEATS,
    "params_base": params_for_log,
}
log_path = f"{LOG_DIR}/{RUN_ID}.json"
with open(log_path, "w") as log_file:
    json.dump(run_meta, log_file)

print("Saved OOF:", oof_path)
print("Saved LOG:", log_path)

Saved OOF: /content/drive/MyDrive/ai4trade/predictions/oof/xgb_tweedie_oof_xgb_tweedie_h2_20251024_1438_p1.5.parquet
Saved LOG: /content/drive/MyDrive/ai4trade/logs/runs/xgb_tweedie_h2_20251024_1438.json


In [11]:
# Cell 8 — Final fit on full train (best power) + forecast

# Early stopping is disabled for the final fit. With no held-out data left, the
# only available eval set would be the training data itself; monitoring that
# says nothing about generalisation, costs a full-train evaluation every round,
# and could cut the model short on a training-RMSE plateau.
# NOTE(review): the final model therefore grows all `n_estimators` trees, while
# the CV folds may have stopped earlier — consider fixing n_estimators from the
# folds' best iterations.
final_params = {
    **{name: value for name, value in base_params.items()
       if name != "early_stopping_rounds"},
    "tweedie_variance_power": best_power,
}
final = XGBRegressor(**final_params)
final.fit(train[FEATS], train[TARGET], verbose=False)

# _predict_best falls back to the full model when no best_iteration is recorded.
# Predictions are clipped at zero, as in CV.
test["y_pred_xgb_tweedie"] = _predict_best(final, test[FEATS]).clip(min=0).astype(np.float32)

fcst_out = test[ID_COLS + ["y_pred_xgb_tweedie"]].copy()
fcst_path = f"{FCST_DIR}/xgb_tweedie_forecast_{RUN_ID}_p{best_power}.parquet"
fcst_out.to_parquet(fcst_path, index=False)

print("Saved forecast:", fcst_path)

Saved forecast: /content/drive/MyDrive/ai4trade/predictions/forecast/xgb_tweedie_forecast_xgb_tweedie_h2_20251024_1438_p1.5.parquet


In [12]:
# Compare the location/spread of the target with that of the OOF predictions.
target_col = train[TARGET]
oof_pred = train["y_pred_xgb_tweedie"]
print("Train target mean/std:", target_col.mean(), target_col.std())
print("OOF pred mean/std:", oof_pred.mean(), oof_pred.std())

# Both prediction columns were clipped at zero, so neither should contain negatives.
has_negative_oof = (oof_pred < 0).any()
has_negative_fcst = (test["y_pred_xgb_tweedie"] < 0).any()
print("Any negative OOF preds?", has_negative_oof)
print("Any negative FCST preds?", has_negative_fcst)


Train target mean/std: 2766904.697287732 43519207.314558625
OOF pred mean/std: 2605624.0 24830836.0
Any negative OOF preds? False
Any negative FCST preds? False
