<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/XGBoost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab VM. Every path used later in this
# notebook lives under /content/drive, so this cell has to run first.
from google.colab import drive  # noqa: F401
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, json, gc, numpy as np, pandas as pd
from datetime import datetime
from sklearn.model_selection import GroupKFold
from xgboost import XGBRegressor

# Project layout on the mounted Drive. All locations derive from BASE_DIR.
BASE_DIR = "/content/drive/MyDrive/ai4trade"
FEAT_DIR = f"{BASE_DIR}/data/features"          # feature tables read below
OOF_DIR = f"{BASE_DIR}/predictions/oof"         # out-of-fold predictions
FCST_DIR = f"{BASE_DIR}/predictions/forecast"   # test-set forecasts
LOG_DIR = f"{BASE_DIR}/logs/runs"               # one JSON log per run
MODEL_DIR = f"{BASE_DIR}/models/xgb_log1p"

# Create every output location up front; exist_ok makes the cell re-runnable.
for _out_dir in (OOF_DIR, FCST_DIR, LOG_DIR, MODEL_DIR):
    os.makedirs(_out_dir, exist_ok=True)

# Minute-resolution run identifier; used as the run-log file name.
RUN_ID = datetime.now().strftime("xgb_log1p_h2_%Y%m%d_%H%M")

In [None]:
# Load the h2 train/test feature tables from FEAT_DIR.
train = pd.read_parquet(f"{FEAT_DIR}/features_train_h2.parquet")
test = pd.read_parquet(f"{FEAT_DIR}/features_test_h2.parquet")

# Canonical identifiers
ID_COLS = ["origin", "destination", "hs6", "trade_flow", "month"]
TARGET = "y_target"  # already shifted t->t+2 per policy

# Feature columns: numeric, not an id, not the target -- and present in BOTH
# frames. Deriving FEATS from `train` alone would raise a KeyError on
# `test[c]` below (and again at predict time) for any numeric column that
# only exists in the training table.
drop_cols = set(ID_COLS + [TARGET])
numeric_train_cols = [
    c for c in train.columns
    if c not in drop_cols and np.issubdtype(train[c].dtype, np.number)
]
train_only_cols = [c for c in numeric_train_cols if c not in test.columns]
if train_only_cols:
    # Make the exclusion visible instead of silently changing the feature set.
    print("Excluded from FEATS (numeric in train, missing in test):", train_only_cols)
FEATS = [c for c in numeric_train_cols if c in test.columns]

# Safety: replace missing feature values with 0.0 in both frames.
for c in FEATS:
    train[c] = train[c].fillna(0.0)
    test[c] = test[c].fillna(0.0)

# Log1p target; negatives are clipped to 0 first so log1p is always defined.
train["y_log1p"] = np.log1p(train[TARGET].clip(lower=0))

In [None]:
def smape(y_true, y_pred, eps=1.0):
    """Mean symmetric absolute percentage error with a floored denominator.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predictions; negative values are clipped to 0
            before scoring.
        eps: Lower bound applied to ``|y_true| + |y_pred|`` so that pairs
            where both values are (near) zero do not divide by zero.

    Returns:
        The mean of ``2 * |y_true - y_pred| / max(|y_true| + |y_pred|, eps)``.
    """
    actual = np.asarray(y_true, float)
    forecast = np.clip(np.asarray(y_pred, float), 0, None)
    scale = np.maximum(np.abs(actual) + np.abs(forecast), eps)
    per_row = 2.0 * np.abs(actual - forecast) / scale
    return np.mean(per_row)

In [None]:
# One key per (origin, destination, hs6, flow) series, so that all rows of a
# series land in the same fold and nothing leaks across the split.
SERIES_KEY_PARTS = ["origin", "destination", "hs6", "trade_flow"]
series_key = train[SERIES_KEY_PARTS[0]]
for part in SERIES_KEY_PARTS[1:]:
    series_key = series_key + "|" + train[part]
train["series_key"] = series_key

# Fallback fold scheme: GroupKFold over the series key. If explicit policy
# fold ids were saved during the feature build, prefer those instead.
gkf = GroupKFold(n_splits=3)
folds = list(gkf.split(train[FEATS], groups=train["series_key"]))

In [None]:
# Hyper-parameters passed to every XGBRegressor in this notebook
# (the per-fold CV models and the final full-data fit).
params = {
    # boosting schedule
    "n_estimators": 1500,
    "learning_rate": 0.03,
    # tree shape
    "max_depth": 8,
    "min_child_weight": 4,
    # row / column sampling
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # regularisation
    "reg_alpha": 0.0,
    "reg_lambda": 2.0,
    # runtime
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",  # fallback on CPU
    # loss and reported metric
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}

In [None]:
# Out-of-fold predictions on the original target scale, one slot per train row.
oof = np.zeros(len(train))
models = []

for fold_no, (fit_rows, holdout_rows) in enumerate(folds, 1):
    fit_df = train.iloc[fit_rows]
    holdout_df = train.iloc[holdout_rows]
    X_fit, y_fit = fit_df[FEATS], fit_df["y_log1p"]
    X_holdout, y_holdout = holdout_df[FEATS], holdout_df["y_log1p"]

    fold_model = XGBRegressor(**params)
    # The hold-out fold is supplied as eval_set; no early-stopping option is
    # passed here, so every fold trains the full n_estimators.
    fold_model.fit(X_fit, y_fit, eval_set=[(X_holdout, y_holdout)], verbose=False)
    models.append(fold_model)

    # The model predicts log1p(y): invert, then floor at zero.
    holdout_pred = np.clip(np.expm1(fold_model.predict(X_holdout)), 0, None)
    oof[holdout_rows] = holdout_pred

    fold_score = smape(holdout_df[TARGET].values, holdout_pred)
    print(f"Fold {fold_no} sMAPE: {fold_score:.4f}")

# Attach the OOF predictions and score them over the whole training set.
train["y_pred_xgb_log1p"] = oof
cv_smape = smape(train[TARGET], train["y_pred_xgb_log1p"])
print("CV sMAPE:", round(cv_smape, 4))

Fold 1 sMAPE: 0.9922
Fold 2 sMAPE: 0.9909
Fold 3 sMAPE: 0.9905
CV sMAPE: 0.9912


In [None]:
# Persist ids, target and OOF prediction for downstream use.
oof_path = f"{OOF_DIR}/xgb_log1p_oof.parquet"
oof_columns = ID_COLS + [TARGET, "y_pred_xgb_log1p"]
oof_out = train[oof_columns].copy()
oof_out.to_parquet(oof_path, index=False)

# Run log: enough to tell which features/params produced this CV score.
run_log = {
    "run_id": RUN_ID,
    "model": "xgb_log1p",
    "cv_smape": float(cv_smape),  # cast numpy float so json can serialise it
    "features": FEATS,
    "params": params,
}
with open(f"{LOG_DIR}/{RUN_ID}.json", "w") as f:
    json.dump(run_log, f)
print("Saved:", oof_path)

Saved: /content/drive/MyDrive/ai4trade/predictions/oof/xgb_log1p_oof.parquet


In [None]:
# Refit on the complete training set with the same hyper-parameters as CV.
full = XGBRegressor(**params)
full.fit(train[FEATS], train["y_log1p"])

# Predict in log1p space, map back to the original scale, floor at zero.
test_pred_log = full.predict(test[FEATS])
test["y_pred_xgb_log1p"] = np.clip(np.expm1(test_pred_log), 0, None)

# Write ids + forecast only.
# NOTE(review): the fitted model itself is not persisted by this cell.
fcst_path = f"{FCST_DIR}/xgb_log1p_forecast.parquet"
fcst_columns = ID_COLS + ["y_pred_xgb_log1p"]
fcst_out = test[fcst_columns].copy()
fcst_out.to_parquet(fcst_path, index=False)
print("Saved:", fcst_path)

Saved: /content/drive/MyDrive/ai4trade/predictions/forecast/xgb_log1p_forecast.parquet
