<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/oct_2024_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
# ==== REAL-WORLD VALIDATION (Oct 2024) — CLEAN RESTART ====
# This notebook:
# 1) Loads 4 h=2 forecast parquets (month column = BASE Aug-2024).
# 2) Computes TARGET month = BASE + 2M → Oct-2024.
# 3) Merges & blends with fixed weights.
# 4) Aggregates HS6→HS4, saves ensemble & submission-style CSV.
# 5) Reads real Sept & Oct 2024 from filtered universe and evaluates sMAPE vs naive (Sep→Oct carry).

import os
import gc
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 160)

# Adjust if your Drive mount differs
BASE_DIR = "/content/drive/MyDrive/ai4trade"

# Forecast parquet files (h=2 runs trained ≤ Aug 2024)
PRED_DIR = os.path.join(BASE_DIR, "predictions", "forecast")
PRED_FILES = {
    "xgb_tweedie": "xgb_tweedie_forecast_xgb_tweedie_h2_20251024_1438_p1.5.parquet",
    "xgb_log1p":   "xgb_log1p_forecast.parquet",
    "lgbm_rmse":   "lgbm_rmse_forecast.parquet",
    "lgbm_tweedie":"lgbm_tweedie_forecast_run_h2_20251023_165145.parquet",
}

# Blend weights (fixed)
BLEND_WEIGHTS = {
    "xgb_tweedie": 0.4444,
    "xgb_log1p":   0.1667,
    "lgbm_rmse":   0.2778,
    "lgbm_tweedie":0.1111,
}

# Filtered “30/30/30/40” universe (for actuals)
FILTERED_PATH = os.path.join(BASE_DIR, "data", "interim", "filtered_30_30_30_40.parquet")

# Outputs
OUT_DIR = os.path.join(BASE_DIR, "predictions", "final")
os.makedirs(OUT_DIR, exist_ok=True)
ENSEMBLE_HS6_OUT = os.path.join(OUT_DIR, "ensemble_h2_hs6.parquet")
ENSEMBLE_HS4_OUT = os.path.join(OUT_DIR, "ensemble_h2_hs4.parquet")
SUBMISSION_HS4_CSV = os.path.join(OUT_DIR, "submission_h2_hs4.csv")
EVAL_OUT = os.path.join(OUT_DIR, "eval_oct2024_hs4_comparison.parquet")

# Months
MONTH_SEP_2024 = pd.Timestamp("2024-09-01")
MONTH_OCT_2024 = pd.Timestamp("2024-10-01")

In [26]:
# sMAPE with epsilon floor to stabilize zeros/tiny denominators
def smape(y_true, y_pred, eps=1.0):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.maximum(np.abs(y_true) + np.abs(y_pred), eps)
    return np.mean(2.0 * np.abs(y_true - y_pred) / denom)

# Loader:
# - Reads a model parquet
# - Normalizes columns (origin/destination/hs6/trade_flow)
# - Computes month_target = month(BASE) + 2 months (h=2)
# - Returns minimal schema: [origin,destination,hs6,trade_flow,month_target,y_pred_<model>]
def load_and_standardize_model(model_key, fname):
    path = os.path.join(PRED_DIR, fname)
    df = pd.read_parquet(path)

    # Convert categoricals (e.g., lgbm output) → string
    for c in df.select_dtypes(include=["category"]).columns:
        df[c] = df[c].astype(str)

    # Common alias normalization
    ren = {}
    if "flow" in df.columns and "trade_flow" not in df.columns:
        ren["flow"] = "trade_flow"
    if "reporter" in df.columns and "origin" not in df.columns:
        ren["reporter"] = "origin"
    if "partner" in df.columns and "destination" not in df.columns:
        ren["partner"] = "destination"
    if "product" in df.columns and "hs6" not in df.columns:
        ren["product"] = "hs6"
    df = df.rename(columns=ren)

    # Types / casing
    df["trade_flow"] = df["trade_flow"].astype(str).str.strip().str.title()  # Export/Import
    df["hs6"] = df["hs6"].astype(str).str.zfill(6)

    # Compute TARGET month = BASE month + 2 months (files store Aug-2024 as base)
    base = pd.to_datetime(df["month"], errors="coerce").dt.to_period("M").dt.to_timestamp()
    df["month_target"] = base + pd.DateOffset(months=2)

    # Prediction column should already be named y_pred_<model>
    pred_cols = [c for c in df.columns if c.startswith("y_pred_")]
    if len(pred_cols) != 1:
        raise ValueError(f"{model_key}: expected exactly one y_pred_* column, found {pred_cols}")
    pred_col = pred_cols[0]

    # Minimal schema
    out = df[["origin","destination","hs6","trade_flow","month_target", pred_col]].copy()
    return out.rename(columns={pred_col: f"y_pred_{model_key}"})

In [27]:
from functools import reduce

# Load all model DFs
dfs = []
for mk, fn in PRED_FILES.items():
    print(f"Loading {mk}: {fn}")
    dfs.append(load_and_standardize_model(mk, fn))

# Outer-merge on IDs + month_target so no model drops rows prematurely
merged = reduce(lambda l, r: pd.merge(
    l, r,
    on=["origin","destination","hs6","trade_flow","month_target"],
    how="outer"
), dfs)

# Fill any missing preds with 0 and clip non-neg
for mk in PRED_FILES:
    col = f"y_pred_{mk}"
    if col not in merged.columns:
        merged[col] = 0.0
    merged[col] = merged[col].fillna(0.0).clip(lower=0.0)

# Keep only Oct-2024 (target) and rename month_target → month for downstream
merged = merged[merged["month_target"] == MONTH_OCT_2024].copy()
merged = merged.rename(columns={"month_target": "month"})

print("Unique months in merged (should be ['2024-10']):",
      merged["month"].dt.to_period("M").astype(str).unique().tolist())
print("Rows (HS6 level) before blending:", len(merged))

# Blend with fixed weights
w = BLEND_WEIGHTS
merged["y_pred_ensemble"] = (
    merged["y_pred_xgb_tweedie"] * w["xgb_tweedie"] +
    merged["y_pred_xgb_log1p"]   * w["xgb_log1p"]   +
    merged["y_pred_lgbm_rmse"]   * w["lgbm_rmse"]   +
    merged["y_pred_lgbm_tweedie"]* w["lgbm_tweedie"]
).astype(float).clip(lower=0.0)

print("Merged & blended shape (HS6):", merged.shape)
merged.to_parquet(ENSEMBLE_HS6_OUT, index=False)
print("Saved HS6 ensemble to:", ENSEMBLE_HS6_OUT)

Loading xgb_tweedie: xgb_tweedie_forecast_xgb_tweedie_h2_20251024_1438_p1.5.parquet
Loading xgb_log1p: xgb_log1p_forecast.parquet
Loading lgbm_rmse: lgbm_rmse_forecast.parquet
Loading lgbm_tweedie: lgbm_tweedie_forecast_run_h2_20251023_165145.parquet
Unique months in merged (should be ['2024-10']): ['2024-10']
Rows (HS6 level) before blending: 320208
Merged & blended shape (HS6): (320208, 10)
Saved HS6 ensemble to: /content/drive/MyDrive/ai4trade/predictions/final/ensemble_h2_hs6.parquet


In [28]:
# Derive hs4 and aggregate predictions
ens_hs4 = merged.copy()
ens_hs4["hs4"] = ens_hs4["hs6"].str[:4]

grp = ["origin","destination","hs4","trade_flow","month"]
ens_hs4 = (ens_hs4.groupby(grp, as_index=False)["y_pred_ensemble"].sum())

# Save HS4 ensemble
ens_hs4.to_parquet(ENSEMBLE_HS4_OUT, index=False)
print("Saved HS4 ensemble to:", ENSEMBLE_HS4_OUT)

# Submission format: Country1, Country2, ProductCode, TradeFlow, Value
submission = (ens_hs4[["origin","destination","hs4","trade_flow","y_pred_ensemble"]]
              .rename(columns={
                  "origin":"Country1",
                  "destination":"Country2",
                  "hs4":"ProductCode",
                  "trade_flow":"TradeFlow",
                  "y_pred_ensemble":"Value"
              }))
submission.to_csv(SUBMISSION_HS4_CSV, index=False)
print("Saved submission CSV to:", SUBMISSION_HS4_CSV)

Saved HS4 ensemble to: /content/drive/MyDrive/ai4trade/predictions/final/ensemble_h2_hs4.parquet
Saved submission CSV to: /content/drive/MyDrive/ai4trade/predictions/final/submission_h2_hs4.csv


In [29]:
# Load filtered harmonized data and extract Sep & Oct 2024 actuals
filt = pd.read_parquet(FILTERED_PATH)

# Normalize types/casing
filt["month"] = pd.to_datetime(filt["month"]).dt.to_period("M").dt.to_timestamp()
filt["trade_flow"] = filt["trade_flow"].astype(str).str.strip().str.title()
filt["hs6"] = filt["hs6"].astype(str).str.zfill(6)
filt["hs4"] = filt["hs4"].astype(str).str.zfill(4)

# Project scope guard
filt = filt[filt["origin"].isin(["USA","CHN"])]

# Actuals at HS4 (sum HS6)
actual_sep = (filt[filt["month"] == MONTH_SEP_2024]
              .groupby(["origin","destination","hs4","trade_flow","month"], as_index=False)["value"].sum()
              .rename(columns={"value":"y_sep"}))

actual_oct = (filt[filt["month"] == MONTH_OCT_2024]
              .groupby(["origin","destination","hs4","trade_flow","month"], as_index=False)["value"].sum()
              .rename(columns={"value":"y_oct"}))

print("Actual Sep rows:", actual_sep.shape, "Actual Oct rows:", actual_oct.shape)

Actual Sep rows: (104592, 6) Actual Oct rows: (105200, 6)


In [30]:
# Prepare evaluation table on Oct-2024 keys
keys = ["origin","destination","hs4","trade_flow"]

oct_keys = actual_oct[keys].drop_duplicates().copy()
eval_df = (oct_keys
           .merge(actual_oct[keys + ["y_oct"]], on=keys, how="left")
           .merge(actual_sep[keys + ["y_sep"]], on=keys, how="left"))

# Naive prediction: Oct_hat = Sep
eval_df["y_sep"] = eval_df["y_sep"].fillna(0.0).clip(lower=0.0)
eval_df["y_hat_naive"] = eval_df["y_sep"]

# Attach ensemble predictions (HS4)
ens_pred_hs4 = ens_hs4[keys + ["y_pred_ensemble"]].copy()
eval_df = eval_df.merge(ens_pred_hs4, on=keys, how="left")
eval_df["y_hat_ens"] = eval_df["y_pred_ensemble"].fillna(0.0).clip(lower=0.0)

# Tidy
eval_df = eval_df[keys + ["y_oct","y_hat_naive","y_hat_ens"]].copy()
print("Eval HS4 rows:", len(eval_df))
eval_df.head()

Eval HS4 rows: 105200


Unnamed: 0,origin,destination,hs4,trade_flow,y_oct,y_hat_naive,y_hat_ens
0,CHN,AGO,2506,Import,169394,483221.0,479564.1
1,CHN,AGO,2516,Import,2805274,5305998.0,4145079.0
2,CHN,AGO,2601,Import,2831,0.0,34416.17
3,CHN,AGO,2709,Import,1256774279,1425815000.0,1413954000.0
4,CHN,AGO,2711,Import,16776123,0.0,34122240.0


In [31]:
# Overall sMAPE
smape_ens = smape(eval_df["y_oct"], eval_df["y_hat_ens"], eps=1.0)
smape_naive = smape(eval_df["y_oct"], eval_df["y_hat_naive"], eps=1.0)
delta = smape_naive - smape_ens

print(f"Overall sMAPE — Ensemble: {smape_ens:.4f}")
print(f"Overall sMAPE — Naive:    {smape_naive:.4f}")
print(f"Δ (naive − ensemble):      {delta:.4f}  --> {'✅ Ensemble better' if delta>0 else '⚠️ Not better'}")

# Breakdowns
def group_smape(df, by_cols):
    rows = []
    for k, g in df.groupby(by_cols):
        s_e = smape(g["y_oct"], g["y_hat_ens"], eps=1.0)
        s_n = smape(g["y_oct"], g["y_hat_naive"], eps=1.0)
        rows.append((*((k,) if not isinstance(k, tuple) else k), s_e, s_n, s_n - s_e))
    return pd.DataFrame(rows, columns=[*by_cols, "sMAPE_ensemble", "sMAPE_naive", "delta_naive_minus_ens"])\
             .sort_values("delta_naive_minus_ens", ascending=False)

by_flow  = group_smape(eval_df, ["trade_flow"])
by_origin= group_smape(eval_df, ["origin"])
by_both  = group_smape(eval_df, ["origin","trade_flow"])

print("\nBreakdown — by trade_flow:\n", by_flow)
print("\nBreakdown — by origin:\n", by_origin)
print("\nBreakdown — by origin x trade_flow:\n", by_both)

# Save detailed eval table (attach group stats for convenience)
eval_save = eval_df.merge(by_both, on=["origin","trade_flow"], how="left", suffixes=("","_grp"))
eval_save.to_parquet(EVAL_OUT, index=False)
print("\nSaved evaluation comparison to:", EVAL_OUT)

Overall sMAPE — Ensemble: 0.8935
Overall sMAPE — Naive:    0.7056
Δ (naive − ensemble):      -0.1879  --> ⚠️ Not better

Breakdown — by trade_flow:
   trade_flow  sMAPE_ensemble  sMAPE_naive  delta_naive_minus_ens
0     Export        0.835441      0.67289              -0.162551
1     Import        0.971005      0.74920              -0.221805

Breakdown — by origin:
   origin  sMAPE_ensemble  sMAPE_naive  delta_naive_minus_ens
0    CHN        0.779712     0.670998              -0.108714
1    USA        0.980823     0.732099              -0.248724

Breakdown — by origin x trade_flow:
   origin trade_flow  sMAPE_ensemble  sMAPE_naive  delta_naive_minus_ens
0    CHN     Export        0.655741     0.570102              -0.085639
1    CHN     Import        0.981799     0.835471              -0.146329
2    USA     Export        0.995246     0.764298              -0.230947
3    USA     Import        0.964230     0.695053              -0.269177

Saved evaluation comparison to: /content/drive/My

In [32]:
# Largest misses (absolute error) for ensemble
diag = eval_df.copy()
diag["abs_err_ens"] = (diag["y_oct"] - diag["y_hat_ens"]).abs()
diag["abs_err_naive"] = (diag["y_oct"] - diag["y_hat_naive"]).abs()

print("\nTop 15 absolute misses (ensemble):")
display(diag.sort_values("abs_err_ens", ascending=False)
            .head(15)[["origin","destination","trade_flow","hs4","y_oct","y_hat_ens","y_hat_naive","abs_err_ens","abs_err_naive"]])

# Any HS4 pairs present in Oct actuals but predicted zero?
missing_preds = eval_df[eval_df["y_hat_ens"]==0]
print("\nZero-prediction rows (count):", len(missing_preds))
display(missing_preds.head(10))


Top 15 absolute misses (ensemble):


Unnamed: 0,origin,destination,trade_flow,hs4,y_oct,y_hat_ens,y_hat_naive,abs_err_ens,abs_err_naive
41463,CHN,TWN,Import,8542,13254238369,5493236000.0,13652390000.0,7761002000.0,398147100.0
56090,USA,CAN,Export,9999,7299015070,1265032000.0,2050359000.0,6033983000.0,5248657000.0
53844,USA,CAN,Import,2709,8052446006,2831083000.0,8285286000.0,5221363000.0,232840200.0
1356,CHN,AUS,Import,2601,5964303457,1344915000.0,6147818000.0,4619389000.0,183514600.0
61393,USA,CHN,Import,8517,7055189992,2933809000.0,6154652000.0,4121381000.0,900537900.0
33656,CHN,RUS,Import,2709,5239886963,1753679000.0,4650472000.0,3486208000.0,589414900.0
26048,CHN,KOR,Import,8542,7413555361,4164858000.0,6734496000.0,3248697000.0,679059400.0
16034,CHN,HKG,Export,8542,5552333665,2506367000.0,6306333000.0,3045967000.0,753999200.0
89370,USA,MEX,Import,8703,4664494022,1769999000.0,4731948000.0,2894495000.0,67453670.0
15989,CHN,HKG,Export,8517,4522325659,1753474000.0,4880759000.0,2768851000.0,358433500.0



Zero-prediction rows (count): 12515


Unnamed: 0,origin,destination,hs4,trade_flow,y_oct,y_hat_naive,y_hat_ens
7,CHN,AGO,6802,Import,19980,8824.0,0.0
10,CHN,AGO,9403,Import,42,0.0,0.0
12,CHN,ARE,1061,Export,2200,2500.0,0.0
13,CHN,ARE,1062,Export,780,780.0,0.0
14,CHN,ARE,1069,Export,280,840.0,0.0
29,CHN,ARE,1514,Export,4482,2232.0,0.0
32,CHN,ARE,1516,Export,42579,0.0,0.0
40,CHN,ARE,1701,Import,475,32.0,0.0
43,CHN,ARE,1805,Export,73160,0.0,0.0
62,CHN,ARE,2044,Export,101377,0.0,0.0
