<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/XGB_Log_USA_imports.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab mount

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# === 13a_xgb_log1p_USA_import_h3_final.ipynb | Cell 1 ===
import os, json, gc, datetime as dt
import numpy as np
import pandas as pd
from xgboost import XGBRegressor

# Paths
BASE_DIR = "/content/drive/MyDrive/ai4trade"
DATA_DIR = f"{BASE_DIR}/data"
FEAT_DIR = f"{DATA_DIR}/features"
PRED_DIR = f"{BASE_DIR}/predictions"
OOF_DIR  = f"{PRED_DIR}/oof"
FC_DIR   = f"{PRED_DIR}/forecast"
LOG_DIR  = f"{BASE_DIR}/logs"
RUNS_DIR = f"{LOG_DIR}/runs"
for d in [OOF_DIR, FC_DIR, LOG_DIR, RUNS_DIR]:
    os.makedirs(d, exist_ok=True)

# Run meta
RUN_TIME = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
RUN_ID   = f"xgb_log1p_USA_import_h3_{RUN_TIME}_final"
SEED     = 42
np.random.seed(SEED)

# Segment
ORIGIN, TRADE_FLOW, H = "USA", "Import", 3
SEGMENT_KEY = f"{ORIGIN}_{TRADE_FLOW.lower()}_h{H}"
print("RUN_ID:", RUN_ID, "| Segment:", SEGMENT_KEY)


RUN_ID: xgb_log1p_USA_import_h3_20251030_164533_final | Segment: USA_import_h3


In [3]:
# === Cell 2: Load feature splits ===
train_path = f"{FEAT_DIR}/features_USA_import_train_h3_final.parquet"
test_path  = f"{FEAT_DIR}/features_USA_import_test_h3_final.parquet"

df_train = pd.read_parquet(train_path)
df_test  = pd.read_parquet(test_path)

for name, df in [("train", df_train), ("test", df_test)]:
    print(f"{name}: shape={df.shape}, months ~ [{df['month'].min()} → {df['month'].max()}]")
    assert df["origin"].eq(ORIGIN).all()
    assert df["trade_flow"].eq(TRADE_FLOW).all()

def add_target_month(frame, h):
    tm = pd.to_datetime(frame["month"]).dt.to_period("M") + h
    frame["target_month"] = tm.dt.to_timestamp()
    return frame

df_train = add_target_month(df_train, H)
df_test  = add_target_month(df_test,  H)

assert df_train["y_target"].notna().all()
assert df_test["y_target"].isna().all()


train: shape=(1952673, 39), months ~ [2023-01-01 00:00:00 → 2025-04-01 00:00:00]
test: shape=(57393, 39), months ~ [2025-07-01 00:00:00 → 2025-07-01 00:00:00]


In [4]:
# === Cell 3: Utilities ===
ID_COLS    = ["origin","destination","hs6","hs4","trade_flow","month","target_month"]
TARGET_COL = "y_target"
BLACKLIST  = set(ID_COLS + ["y"]) | {TARGET_COL}

def get_feature_cols(df):
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    return sorted([c for c in num_cols if c not in BLACKLIST])

def safe_fillna(frame, cols):
    frame[cols] = frame[cols].fillna(0.0)
    return frame

def smape(y_true, y_pred, eps=1.0):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(2.0 * np.abs(y_pred - y_true) / (np.abs(y_true)+np.abs(y_pred)+eps)))

def score_hs4_smape(valid_df, y_pred_col):
    g = (valid_df
         .groupby(["origin","destination","hs4","trade_flow","target_month"], as_index=False)
         .agg(y_true=("y_target","sum"), y_pred=(y_pred_col,"sum")))
    return smape(g["y_true"].values, g["y_pred"].values)


In [5]:
# === Cell 4: CV folds (U1–U5) ===

folds = [
    # name, train_end,   gap_month, validate_target_month
    ("U1", "2024-05-01", "2024-06-01", "2024-08-01"),  # Early H2 anchor
    ("U2", "2024-07-01", "2024-08-01", "2024-10-01"),  # Mirror final target
    ("U3", "2024-08-01", "2024-09-01", "2024-11-01"),  # Peak seasonality
    ("U4", "2025-02-01", "2025-03-01", "2025-06-01"),  # Close to submission
    ("U5", "2025-03-01", "2025-04-01", "2025-07-01"),  # Last observable target
]

fold_weights = {"U1":1.0, "U2":1.25, "U3":1.0, "U4":1.5, "U5":1.5}

folds_parsed=[]
for name, trn_end, gap, val_tgt in folds:
    folds_parsed.append({
        "name": name,
        "train_end": pd.Timestamp(trn_end),
        "gap_start": pd.Timestamp(gap),
        "val_target": pd.Timestamp(val_tgt),
        "weight": fold_weights[name]
    })

pd.DataFrame(folds_parsed)


Unnamed: 0,name,train_end,gap_start,val_target,weight
0,U1,2024-05-01,2024-06-01,2024-08-01,1.0
1,U2,2024-07-01,2024-08-01,2024-10-01,1.25
2,U3,2024-08-01,2024-09-01,2024-11-01,1.0
3,U4,2025-02-01,2025-03-01,2025-06-01,1.5
4,U5,2025-03-01,2025-04-01,2025-07-01,1.5


In [6]:
# === Cell 5: Feature prep ===
feature_cols = get_feature_cols(df_train)
df_train[feature_cols] = df_train[feature_cols].astype("float32")
df_test[feature_cols]  = df_test[feature_cols].astype("float32")
df_train = safe_fillna(df_train, feature_cols)
df_test  = safe_fillna(df_test,  feature_cols)

def build_weights(frame):
    if "ma_12" in frame.columns:
        return np.sqrt(frame["ma_12"].clip(lower=0.0) + 1.0).astype("float32")
    return np.ones(len(frame), dtype=np.float32)

print("Num features:", len(feature_cols))
print("Feature sample:", feature_cols[:12])


Num features: 31
Feature sample: ['chinaCLI_ma2', 'china_GSCPI_ma2', 'cross_flow_ma3', 'forecast_horizon', 'horizon', 'lag_1', 'lag_12', 'lag_2', 'lag_3', 'lag_6', 'lag_year_eq', 'ma_12']


In [7]:
# === Cell 6 (PATCHED): CV training (XGB - Log1p) ===

def make_model(use_gpu=True):
    params = dict(
        objective="reg:squarederror",
        eval_metric="rmse",          # << set here (not in .fit)
        max_depth=7,
        learning_rate=0.06,
        n_estimators=2500,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.0,
        reg_lambda=1.0,
        random_state=SEED,
        verbosity=1,
    )
    if use_gpu:
        params.update(tree_method="hist", device='cuda')
    else:
        params.update(tree_method="hist") # Assuming 'hist' is also suitable for CPU
    return XGBRegressor(**params)

oof = []
cv_rows = []
y_pred_col = "y_pred_xgb_log1p"

for f in folds_parsed:
    name, trn_end, val_tgt, w = f["name"], f["train_end"], f["val_target"], f["weight"]

    dtr  = df_train.loc[df_train["month"] <= trn_end].copy()
    dval = df_train.loc[df_train["target_month"] == val_tgt].copy()

    # log1p transform of target
    y_tr = np.log1p(dtr[TARGET_COL].values.astype(np.float64))
    y_va = np.log1p(dval[TARGET_COL].values.astype(np.float64))
    X_tr = dtr[feature_cols].values
    X_va = dval[feature_cols].values
    w_tr = build_weights(dtr)

    # Fit with GPU→CPU fallback (NO eval_metric kwarg here)
    model = make_model(use_gpu=True)
    try:
        # Check for GPU availability and print
        if model.get_params().get('device') == 'cuda':
            print(f"{name}: Attempting to use GPU.")
        else:
             print(f"{name}: GPU not requested or available, using CPU.")

        model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_va, y_va)], verbose=False)
        print(f"{name}: Training complete.")
    except Exception as e:
        print(f"{name}: Training failed ({e}). Falling back to CPU.")
        model = make_model(use_gpu=False)
        model.fit(X_tr, y_tr, sample_weight=w_tr, eval_set=[(X_va, y_va)], verbose=False)
        print(f"{name}: Training complete on CPU.")


    # Predict (log space → invert), no negatives
    dval[y_pred_col] = np.clip(np.expm1(model.predict(X_va)), 0.0, None)

    # HS-4 sMAPE
    fold_smape = score_hs4_smape(dval, y_pred_col)
    cv_rows.append({"fold": name, "train_end": trn_end, "val_target": val_tgt,
                    "weight": w, "hs4_smape": fold_smape})

    # Avoid duplicate 'target_month'
    oof.append(dval[ID_COLS + [TARGET_COL, y_pred_col]].copy())

    del dtr, dval, X_tr, y_tr, X_va, y_va, w_tr, model
    gc.collect()

cv_df = pd.DataFrame(cv_rows)
cv_df["weighted"] = cv_df["hs4_smape"] * cv_df["weight"]
wm_smape = cv_df["weighted"].sum() / cv_df["weight"].sum()

print(cv_df)
print("Weighted mean HS-4 sMAPE across folds:", round(wm_smape, 6))

oof_df = pd.concat(oof, ignore_index=True).rename(columns={TARGET_COL: "y_true"})
oof_df = oof_df.loc[:, ~oof_df.columns.duplicated()].copy()

U1: Attempting to use GPU.
U1: Training complete.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


U2: Attempting to use GPU.
U2: Training complete.
U3: Attempting to use GPU.
U3: Training complete.
U4: Attempting to use GPU.
U4: Training complete.
U5: Attempting to use GPU.
U5: Training complete.
  fold  train_end val_target  weight  hs4_smape  weighted
0   U1 2024-05-01 2024-08-01    1.00   0.981582  0.981582
1   U2 2024-07-01 2024-10-01    1.25   1.020577  1.275721
2   U3 2024-08-01 2024-11-01    1.00   1.028191  1.028191
3   U4 2025-02-01 2025-06-01    1.50   0.573003  0.859504
4   U5 2025-03-01 2025-07-01    1.50   0.551043  0.826565
Weighted mean HS-4 sMAPE across folds: 0.79545


In [12]:
# === Cell 10: Feature Importance ===
# Get feature importances from the final model
feature_importances = final_model.get_booster().get_score(importance_type='weight')

# Create a mapping from the internal 'f' names to the actual feature names
feature_name_map = {f'f{i}': name for i, name in enumerate(feature_cols)}

# Convert to a pandas DataFrame for easier viewing and sorting
importance_df = pd.DataFrame(list(feature_importances.items()),
                             columns=['feature', 'importance'])

# Replace the 'f' names with the actual feature names
importance_df['feature'] = importance_df['feature'].map(feature_name_map)

# Sort by importance
importance_df = importance_df.sort_values('importance', ascending=False)

# Display the top features
print("Top 20 Feature Importances:")
display(importance_df.head(20))

Top 20 Feature Importances:


Unnamed: 0,feature,importance
2,cross_flow_ma3,29821.0
9,ma_12,21604.0
3,lag_1,21482.0
5,lag_2,19434.0
24,roll_std_6,18419.0
11,ma_6,16976.0
0,chinaCLI_ma2,16651.0
6,lag_3,16637.0
10,ma_3,16226.0
7,lag_6,15696.0


In [8]:
# === Cell 7: Persist OOF + CV logs ===
oof_path  = f"{OOF_DIR}/xgb_log1p_USA_import_h3_final.parquet"
cvlog_path = f"{LOG_DIR}/xgb_log1p_USA_import_h3_cv_scores_final.csv"

oof_save_cols = ["origin","destination","hs6","hs4","trade_flow",
                 "month","target_month","y_true","y_pred_xgb_log1p"]
oof_df[oof_save_cols].to_parquet(oof_path, index=False)
cv_df.to_csv(cvlog_path, index=False)
print("Saved OOF:", oof_path)
print("Saved CV log:", cvlog_path)


Saved OOF: /content/drive/MyDrive/ai4trade/predictions/oof/xgb_log1p_USA_import_h3_final.parquet
Saved CV log: /content/drive/MyDrive/ai4trade/logs/xgb_log1p_USA_import_h3_cv_scores_final.csv


In [9]:
# === Cell 8: Full-fit and test forecast ===
# Use all rounds (no early stop here); or set n_estimators to a slightly higher value if desired.
final_model = make_model(use_gpu=True)
try:
    y_full = np.log1p(df_train[TARGET_COL].values.astype(np.float64))
    X_full = df_train[feature_cols].values
    w_full = build_weights(df_train)
    final_model.fit(X_full, y_full, sample_weight=w_full, verbose=False)
    print("Full-fit: Using GPU.")
except Exception as e:
    print(f"Full-fit: GPU failed ({e}). Falling back to CPU.")
    final_model = make_model(use_gpu=False)
    final_model.fit(X_full, y_full, sample_weight=w_full, verbose=False)

# Forecast on test (context 2025-07 → target 2025-10)
X_te = df_test[feature_cols].values
test_pred = np.clip(np.expm1(final_model.predict(X_te)), 0.0, None)

forecast_df = df_test[ID_COLS].copy()
forecast_df["y_pred_xgb_log1p"] = test_pred

forecast_save_cols = ["origin","destination","hs6","hs4","trade_flow",
                      "month","target_month","y_pred_xgb_log1p"]
forecast_path = f"{FC_DIR}/xgb_log1p_USA_import_h3_final.parquet"
forecast_df[forecast_save_cols].to_parquet(forecast_path, index=False)
print("Saved forecast:", forecast_path)


Full-fit: Using GPU.
Saved forecast: /content/drive/MyDrive/ai4trade/predictions/forecast/xgb_log1p_USA_import_h3_final.parquet


In [10]:
# === Cell 9: JSON run log ===
run_log = {
    "run_id": RUN_ID,
    "segment": SEGMENT_KEY,
    "horizon": H,
    "model": "xgb_log1p",
    "seed": SEED,
    "timestamps": {
        "started_at": RUN_TIME,
        "ended_at": dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    },
    "paths": {
        "train_features": train_path,
        "test_features": test_path,
        "oof": oof_path,
        "forecast": forecast_path,
        "cv_log": cvlog_path
    },
    "params": {
        "objective": "reg:squarederror",
        "max_depth": 7,
        "learning_rate": 0.06,
        "n_estimators": 2500,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_alpha": 0.0,
        "reg_lambda": 1.0
    },
    "cv": [
        dict(
            fold=str(r["fold"]),
            train_end=str(pd.Timestamp(r["train_end"]).date()),
            val_target=str(pd.Timestamp(r["val_target"]).date()),
            weight=float(r["weight"]),
            hs4_smape=float(r["hs4_smape"]),
        )
        for _, r in cv_df.iterrows()
    ],
    "cv_weighted_hs4_smape": float(wm_smape),
}

log_path = f"{RUNS_DIR}/{RUN_ID}_final.json"
with open(log_path, "w") as f:
    json.dump(run_log, f, indent=2)
print("Wrote run log:", log_path)


Wrote run log: /content/drive/MyDrive/ai4trade/logs/runs/xgb_log1p_USA_import_h3_20251030_164533_final_final.json
