<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/XGB_T_USA_imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab mount

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# === 12_xgb_tweedie_USA_export_h3_final.ipynb | Cell 1 ===
import os, json, math, gc, time, hashlib, datetime as dt
import numpy as np
import pandas as pd

from xgboost import XGBRegressor

# --------------------------
# Project directories (see OEC_Modeling_Workflow_and_Directory_Guide)
# Adjust BASE_DIR if your Drive mount differs.
# --------------------------
BASE_DIR = "/content/drive/MyDrive/ai4trade"
DATA_DIR = f"{BASE_DIR}/data"
FEAT_DIR = f"{DATA_DIR}/features"
PRED_DIR = f"{BASE_DIR}/predictions"
OOF_DIR  = f"{PRED_DIR}/oof"
FC_DIR   = f"{PRED_DIR}/forecast"
LOG_DIR  = f"{BASE_DIR}/logs"
RUNS_DIR = f"{LOG_DIR}/runs"
os.makedirs(OOF_DIR, exist_ok=True)
os.makedirs(FC_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(RUNS_DIR, exist_ok=True)

# --------------------------
# Run metadata
# --------------------------
RUN_TIME = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID   = f"xgb_tweedie_USA_import_h3_{RUN_TIME}_final"
SEED     = 42
np.random.seed(SEED)

# Segment / horizon config (per Horizon Policy)
ORIGIN      = "USA"
TRADE_FLOW  = "Import"
H           = 3   # direct horizon
SEGMENT_KEY = f"{ORIGIN}_{TRADE_FLOW.lower()}_h{H}"

print("RUN_ID:", RUN_ID)
print("Segment:", SEGMENT_KEY)

# Tweedie config
TWEEDIE_POWER = 1.35  # 1.2–1.5 generally robust; can be tuned fold-wise


RUN_ID: xgb_tweedie_USA_import_h3_20251030_155410_final
Segment: USA_import_h3


In [3]:
# === Cell 2: Load feature splits ===
train_path = f"{FEAT_DIR}/features_USA_import_train_h3_final.parquet"
test_path  = f"{FEAT_DIR}/features_USA_import_test_h3_final.parquet"

df_train = pd.read_parquet(train_path)
df_test  = pd.read_parquet(test_path)

# Expectations from our feature schema:
# columns include: origin, destination, hs6, hs4, trade_flow, month,
# y (t), y_target (t+h), and engineered features (lags, mas, cross-flow, macro).
for name, df in [("train", df_train), ("test", df_test)]:
    print(f"{name}: shape={df.shape}, months ~ [{df['month'].min()} → {df['month'].max()}]")
    assert df["origin"].eq(ORIGIN).all(), "Origin mismatch"
    assert df["trade_flow"].eq(TRADE_FLOW).all(), "Flow mismatch"

# Create a consistent 'target_month' = month + H for transparent evaluation & saving
def add_target_month(frame, h):
    # month is 1st of month -> add months safely (via Period)
    tm = pd.to_datetime(frame["month"]).dt.to_period("M") + h
    frame["target_month"] = tm.dt.to_timestamp()
    return frame

df_train = add_target_month(df_train, H)
df_test  = add_target_month(df_test,  H)

# The train split must have y_target non-null; test intentionally has NaN
assert df_train["y_target"].notna().all()
assert df_test["y_target"].isna().all()


train: shape=(1952673, 39), months ~ [2023-01-01 00:00:00 → 2025-04-01 00:00:00]
test: shape=(57393, 39), months ~ [2025-07-01 00:00:00 → 2025-07-01 00:00:00]


In [4]:
# === Cell 3: Utilities ===

ID_COLS = ["origin", "destination", "hs6", "hs4", "trade_flow", "month", "target_month"]
TARGET_COL = "y_target"

# Some helper feature blacklist to avoid leakage or non-numerics
BLACKLIST = set(ID_COLS + ["y"]) | {TARGET_COL}

def get_feature_cols(df):
    # Start with numeric columns
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # Remove explicit blacklist + any accidental future columns
    feat_cols = [c for c in num_cols if c not in BLACKLIST]
    return sorted(feat_cols)

def safe_fillna(frame, cols):
    # Tree models tolerate zeros better than NaNs on our schema
    frame[cols] = frame[cols].fillna(0.0)
    return frame

def smape(y_true, y_pred, eps=1.0):
    # sMAPE with epsilon stabilization (see our validation docs)
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    numer = np.abs(y_pred - y_true)
    denom = (np.abs(y_true) + np.abs(y_pred)) + eps
    return float(np.mean(2.0 * numer / denom))

def score_hs4_smape(valid_df, y_pred_col):
    """
    Aggregate to HS-4 and compute sMAPE across all HS4 cells for the validation target_month.
    Expect columns: ['hs4', 'y_target', y_pred_col, 'target_month', 'origin','destination','trade_flow']
    """
    g = (valid_df
         .groupby(["origin","destination","hs4","trade_flow","target_month"], as_index=False)
         .agg(y_true=("y_target","sum"), y_pred=(y_pred_col,"sum")))
    return smape(g["y_true"].values, g["y_pred"].values)


In [5]:
# === Cell 4: CV folds for USA h=2 (C1–C6) ===
# Ref: OEC_Horizon_Policy_Direct_MultiHorizon_Forecasting (folds C1–C6)
# Train ≤ date; 1-month gap; validate at t+2 listed below.

folds = [
    # name, train_end,   gap_month, validate_target_month
    ("U1", "2024-05-01", "2024-06-01", "2024-08-01"),  # Early H2 anchor
    ("U2", "2024-07-01", "2024-08-01", "2024-10-01"),  # Mirror final target
    ("U3", "2024-08-01", "2024-09-01", "2024-11-01"),  # Peak seasonality
    ("U4", "2025-02-01", "2025-03-01", "2025-06-01"),  # Close to submission
    ("U5", "2025-03-01", "2025-04-01", "2025-07-01"),  # Last observable target
]

fold_weights = {"U1":1.0, "U2":1.25, "U3":1.0, "U4":1.5, "U5":1.5}

# Parse to timestamps for filtering
folds_parsed = []
for name, trn_end, gap, val_tgt in folds:
    folds_parsed.append({
        "name": name,
        "train_end": pd.Timestamp(trn_end),
        "gap_start": pd.Timestamp(gap),
        "val_target": pd.Timestamp(val_tgt),
        "weight": fold_weights[name]
    })

pd.DataFrame(folds_parsed)


Unnamed: 0,name,train_end,gap_start,val_target,weight
0,U1,2024-05-01,2024-06-01,2024-08-01,1.0
1,U2,2024-07-01,2024-08-01,2024-10-01,1.25
2,U3,2024-08-01,2024-09-01,2024-11-01,1.0
3,U4,2025-02-01,2025-03-01,2025-06-01,1.5
4,U5,2025-03-01,2025-04-01,2025-07-01,1.5


In [6]:
# === Cell 5: Feature lists and weights ===

feature_cols = get_feature_cols(df_train)

# Optional: sample weights to de-emphasize tiny/noisy series (per schema's suggestion)
# Use sqrt(ma_12 + 1) if present; else uniform weights.
def build_weights(frame):
    col = "ma_12"
    if col in frame.columns:
        w = np.sqrt(frame[col].clip(lower=0.0) + 1.0)
        return w.astype(np.float32)
    else:
        return np.ones(len(frame), dtype=np.float32)

# Ensure no NaNs in features
df_train = safe_fillna(df_train, feature_cols)
df_test  = safe_fillna(df_test,  feature_cols)

# Sanity
print("Num features:", len(feature_cols))
print("Examples:", feature_cols[:12])


Num features: 31
Examples: ['chinaCLI_ma2', 'china_GSCPI_ma2', 'cross_flow_ma3', 'forecast_horizon', 'horizon', 'lag_1', 'lag_12', 'lag_2', 'lag_3', 'lag_6', 'lag_year_eq', 'ma_12']


In [7]:
# === Cell 6: XGB Tweedie CV (GPU-safe; uses folds_parsed) ===
import gc
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from xgboost.core import XGBoostError

# keep output column name consistent with Cell 7
y_pred_col = "y_pred_xgb_tweedie"

def make_model(use_gpu=True):
    """
    Version-safe XGBoost config:
    - XGBoost ≥ 2.0: tree_method="hist" + device="cuda"/"cpu"
    - Works on CPU-only too (device="cpu")
    """
    print("Using GPU T/F: ", use_gpu)
    return XGBRegressor(
        objective="reg:tweedie",
        tweedie_variance_power=TWEEDIE_POWER,
        max_depth=7,
        learning_rate=0.06,
        n_estimators=2500,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=SEED,
        verbosity=1,
        tree_method="hist",
        device=("cuda" if use_gpu else "cpu"),
        eval_metric="rmse",  # set in constructor, not .fit(...)
    )

cv_rows, oof = [], []

# >>> Use your parsed dict folds built in Cell 4
for f in folds_parsed:
    name      = f["name"]
    trn_end   = f["train_end"]
    val_tgt   = f["val_target"]
    w_fold    = float(f["weight"])

    trn_idx = (df_train["month"] <= trn_end)
    val_idx = (df_train["target_month"] == val_tgt)

    dtr  = df_train.loc[trn_idx]
    dval = df_train.loc[val_idx]

    X_tr, y_tr = dtr[feature_cols].values,  dtr[TARGET_COL].values
    X_va, y_va = dval[feature_cols].values, dval[TARGET_COL].values
    w_tr = build_weights(dtr)

    # GPU first, fall back to CPU if needed (handles gpu_hist incompatibility)
    try:
        model = make_model(use_gpu=True)
        model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_va, y_va)], verbose=False)
    except XGBoostError as e:
        print("GPU fit failed, retrying on CPU. Reason:", str(e))
        model = make_model(use_gpu=False)
        model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_va, y_va)], verbose=False)

    val_pred = np.clip(model.predict(X_va), 0.0, None)

    # assemble OOF rows
    val_oof = dval[ID_COLS + [TARGET_COL]].copy()
    val_oof[y_pred_col] = val_pred
    oof.append(val_oof)

    # HS-4 sMAPE using your utility
    hs4_smape = score_hs4_smape(val_oof, y_pred_col=y_pred_col)
    cv_rows.append({
        "fold": name,
        "train_end": trn_end,
        "val_target": val_tgt,
        "weight": w_fold,
        "hs4_smape": hs4_smape,
    })

    del dtr, dval, X_tr, y_tr, X_va, y_va, w_tr, model
    gc.collect()

cv_df = pd.DataFrame(cv_rows)
cv_df["weighted"] = cv_df["hs4_smape"] * cv_df["weight"]
wm_smape = cv_df["weighted"].sum() / cv_df["weight"].sum()

print(cv_df)
print("Weighted mean HS-4 sMAPE across folds:", round(wm_smape, 6))

oof_df = pd.concat(oof, ignore_index=True).rename(columns={TARGET_COL: "y_true"})


Using GPU T/F:  True


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Using GPU T/F:  True
Using GPU T/F:  True
Using GPU T/F:  True
Using GPU T/F:  True
  fold  train_end val_target  weight  hs4_smape  weighted
0   U1 2024-05-01 2024-08-01    1.00   0.817465  0.817465
1   U2 2024-07-01 2024-10-01    1.25   0.857959  1.072449
2   U3 2024-08-01 2024-11-01    1.00   0.870897  0.870897
3   U4 2025-02-01 2025-06-01    1.50   0.518565  0.777848
4   U5 2025-03-01 2025-07-01    1.50   0.478859  0.718289
Weighted mean HS-4 sMAPE across folds: 0.681112


In [8]:
# === Cell 7: Persist OOF + CV logs ===
oof_path = f"{OOF_DIR}/xgb_tweedie_USA_import_h3_final.parquet"
cvlog_path = f"{LOG_DIR}/xgb_tweedie_USA_import_h3_cv_scores_final.csv"

oof_save_cols = ["origin","destination","hs6","hs4","trade_flow",
                 "month","target_month","y_true","y_pred_xgb_tweedie"]

# Drop any accidental duplicate columns just in case
oof_df = oof_df.loc[:, ~oof_df.columns.duplicated()].copy()

oof_df[oof_save_cols].to_parquet(oof_path, index=False)
cv_df.to_csv(cvlog_path, index=False)
print("Saved OOF:", oof_path)
print("Saved CV log:", cvlog_path)


Saved OOF: /content/drive/MyDrive/ai4trade/predictions/oof/xgb_tweedie_USA_import_h3_final.parquet
Saved CV log: /content/drive/MyDrive/ai4trade/logs/xgb_tweedie_USA_import_h3_cv_scores_final.csv


In [9]:
# === Cell 8: Final model on full train → test forecast ===

# Full train data
X_full = df_train[feature_cols].values
y_full = df_train[TARGET_COL].values
w_full = build_weights(df_train)

# Reuse the same constructor + GPU→CPU fallback
try:
    final_model = make_model(use_gpu=True)
    final_model.fit(X_full, y_full, sample_weight=w_full, verbose=False)
except XGBoostError as e:
    print("GPU final fit failed, retrying on CPU. Reason:", str(e))
    final_model = make_model(use_gpu=False)
    final_model.fit(X_full, y_full, sample_weight=w_full, verbose=False)

# Forecast on test split (context t = 2025-08 → target t+2 = 2025-10)
X_te = df_test[feature_cols].values
test_pred = np.clip(final_model.predict(X_te), 0.0, None)

forecast_df = df_test[ID_COLS].copy()
forecast_df["y_pred_xgb_tweedie"] = test_pred

# y_target is intentionally NaN in test; keep schema tidy
forecast_save_cols = [
    "origin","destination","hs6","hs4","trade_flow",
    "month","target_month","y_pred_xgb_tweedie"
]
forecast_path = f"{FC_DIR}/xgb_tweedie_USA_import_h3_final.parquet"
forecast_df[forecast_save_cols].to_parquet(forecast_path, index=False)
print("Saved forecast:", forecast_path)


Using GPU T/F:  True
Saved forecast: /content/drive/MyDrive/ai4trade/predictions/forecast/xgb_tweedie_USA_import_h3_final.parquet


In [10]:
# === Cell 9: JSON run log ===
run_log = {
    "run_id": RUN_ID,
    "segment": SEGMENT_KEY,
    "horizon": H,
    "model": "xgb_tweedie",
    "tweedie_power": TWEEDIE_POWER,
    "seed": SEED,
    "timestamps": {
        "started_at": RUN_TIME,
        "ended_at": dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    },
    "paths": {
        "train_features": train_path,
        "test_features": test_path,
        "oof": oof_path,
        "forecast": forecast_path,
        "cv_log": cvlog_path
    },
    "folds": [
        dict(fold=r["fold"],
             train_end=str(pd.Timestamp(r["train_end"]).date()),
             val_target=str(pd.Timestamp(r["val_target"]).date()),
             weight=float(r["weight"]),
             hs4_smape=float(r["hs4_smape"]))
        for _, r in cv_df.iterrows()
    ],
    "cv_weighted_hs4_smape": float(wm_smape)
}

log_path = f"{RUNS_DIR}/{RUN_ID}_final.json"
with open(log_path, "w") as f:
    json.dump(run_log, f, indent=2)
print("Wrote run log:", log_path)


Wrote run log: /content/drive/MyDrive/ai4trade/logs/runs/xgb_tweedie_USA_import_h3_20251030_155410_final_final.json


Notes & alignment

Folds & horizons: match our dual-horizon policy and late-period CV template (C1–C6) so validation mirrors the submission window.

Artifacts & folders: follow the modeling workflow & checklist; file names include _final and live under /predictions/oof, /predictions/forecast, and /logs.

Evaluation metric: sMAPE computed after aggregating to HS-4, as required for model selection and later blending.

GPU-first: tree_method="gpu_hist" and predictor="gpu_predictor", with a simple fallback (edit make_model() to "hist" if needed).