<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/ensemble_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# === Optimize ensemble weights on OOF (trees only) — robust to column names ===
OOF_DIR = "/content/drive/MyDrive/ai4trade/predictions/oof"

paths = {
    "xgb_tweedie":  f"{OOF_DIR}/xgb_tweedie_oof_xgb_tweedie_h2_20251024_1438_p1.5.parquet",
    "xgb_log1p":    f"{OOF_DIR}/xgb_log1p_oof.parquet",
    "lgbm_rmse":    f"{OOF_DIR}/lgbm_rmse_oof.parquet",
    "lgbm_tweedie": f"{OOF_DIR}/lgbm_tweedie_oof_run_h2_20251023_165145.parquet",
}

import pandas as pd, numpy as np, os, json, re

KEYS = ['origin','destination','hs6','trade_flow','month']
Y_CANDIDATES = ['y','y_true','y_target','target','label','value']
PRED_CANDIDATES_BASE = ['y_pred','prediction','pred','yhat','y_pred_mean']

def detect_truth_col(df):
    for c in Y_CANDIDATES:
        if c in df.columns: return c
    return None

def detect_pred_col(df, model_name):
    # try exact pattern first (e.g., y_pred_xgb_tweedie)
    exact = f"y_pred_{model_name}"
    if exact in df.columns: return exact
    # then common names
    for c in PRED_CANDIDATES_BASE:
        if c in df.columns: return c
    # then any column that starts with 'y_pred'
    for c in df.columns:
        if str(c).startswith('y_pred'): return c
    # last resort: any column containing the model name
    for c in df.columns:
        if model_name in str(c): return c
    return None

def load_oof(name, path):
    df = pd.read_parquet(path)
    # keys present?
    missing = set(KEYS) - set(df.columns)
    if missing:
        raise ValueError(f"{name}: missing keys {sorted(missing)} in {path}")
    # truth column (optional here)
    ycol = detect_truth_col(df)
    # pred column (required)
    pcol = detect_pred_col(df, name)
    if pcol is None:
        raise ValueError(f"{name}: could not find prediction column in {path}. Columns: {df.columns.tolist()}")
    # build minimal frame; if no truth, mark it
    cols = KEYS + ([ycol] if ycol else []) + [pcol]
    df = df[cols].copy()
    if ycol: df = df.rename(columns={ycol:'y'})
    df = df.rename(columns={pcol:name})
    if 'y' not in df.columns: df['__no_y__'] = 1
    return df

# Load all model OOFs
frames = {n: load_oof(n, p) for n,p in paths.items()}

# Choose a truth source (any file that has 'y')
truth_src = None
for n, df in frames.items():
    if 'y' in df.columns:
        truth_src = df[KEYS + ['y']].drop_duplicates()
        break
if truth_src is None:
    cols_report = {n: list(df.columns) for n,df in frames.items()}
    raise ValueError(f"No ground-truth column found in any OOF file. "
                     f"Checked {Y_CANDIDATES}. Columns: {cols_report}")

# Merge predictions onto the truth rows
o = truth_src.copy()
for n, df in frames.items():
    o = o.merge(df[KEYS + [n]], on=KEYS, how='inner')

models = list(paths.keys())
print("Joined OOF rows:", len(o), "| models:", models)

def smape_np(y, yhat, eps=1.0):
    y = np.asarray(y, float); yhat = np.asarray(yhat, float)
    return float(np.mean(2*np.abs(y - yhat) / np.maximum(np.abs(y)+np.abs(yhat), eps)))

# Start from your proposed weights
start = {"xgb_tweedie":0.40,"xgb_log1p":0.25,"lgbm_rmse":0.25,"lgbm_tweedie":0.10}
start = {m: start.get(m, 0.0) for m in models}
s = sum(start.values());
w_best = {k: (v/s if s>0 else 1.0/len(models)) for k,v in start.items()}

def yhat_from_w(w):
    yh = 0.0
    for m in models: yh += w[m]*o[m]
    return yh

best = (w_best, smape_np(o['y'], yhat_from_w(w_best)))
print("Start sMAPE:", best[1])

# Coordinate search around start (non-negative, sum=1)
grid = np.linspace(-0.10, 0.10, 21)
for m in models:
    base = w_best.copy()
    for delta in grid:
        w_try = base.copy()
        w_try[m] = max(0.0, w_try[m] + delta)
        s = sum(w_try.values())
        if s <= 0: continue
        w_try = {k: v/s for k, v in w_try.items()}
        score = smape_np(o['y'], yhat_from_w(w_try))
        if score < best[1]:
            best = (w_try, score)

w_opt, smape_opt = best
print("Optimal weights:", w_opt, "| OOF sMAPE:", smape_opt)

# Save weights + provenance
LOG_DIR = "/content/drive/MyDrive/ai4trade/logs"
os.makedirs(LOG_DIR, exist_ok=True)
with open(f"{LOG_DIR}/ensemble_weights_trees_h2.json","w") as f:
    json.dump({"weights": w_opt, "smape_oof": smape_opt, "files_used": paths}, f, indent=2)
print("Saved:", f"{LOG_DIR}/ensemble_weights_trees_h2.json")

Joined OOF rows: 881118 | models: ['xgb_tweedie', 'xgb_log1p', 'lgbm_rmse', 'lgbm_tweedie']
Start sMAPE: 0.8644095116242546
Optimal weights: {'xgb_tweedie': np.float64(0.4444444444444445), 'xgb_log1p': np.float64(0.16666666666666666), 'lgbm_rmse': np.float64(0.2777777777777778), 'lgbm_tweedie': np.float64(0.11111111111111112)} | OOF sMAPE: 0.8627073108661023
Saved: /content/drive/MyDrive/ai4trade/logs/ensemble_weights_trees_h2.json


In [6]:
# === Apply optimized weights to forecasts (trees only) ===
import os, json, glob, re
import pandas as pd
import numpy as np

BASE_DIR = "/content/drive/MyDrive/ai4trade"
PRED_DIR = f"{BASE_DIR}/predictions"
FC_DIR   = f"{PRED_DIR}/forecast"
OUT_DIR  = f"{PRED_DIR}/forecast_ensemble"
LOG_DIR  = f"{BASE_DIR}/logs"
os.makedirs(OUT_DIR, exist_ok=True)

# Load weights you just saved
with open(f"{LOG_DIR}/ensemble_weights_trees_h2.json") as f:
    meta = json.load(f)
w_opt = {k: float(v) for k, v in meta["weights"].items()}
models = list(w_opt.keys())
print("Using weights:", w_opt)

# --- Robust file finder ---
def find_parquet_for(model_key, search_dir):
    # Prefer files containing model_key and not 'oof'
    cands = [p for p in glob.glob(os.path.join(search_dir, "*.parquet"))
             if model_key in os.path.basename(p).lower() and "oof" not in os.path.basename(p).lower()]
    if not cands:
        # fallback: allow 'oof' if nothing else (but we *should* have forecast files)
        cands = [p for p in glob.glob(os.path.join(search_dir, "*.parquet"))
                 if model_key in os.path.basename(p).lower()]
    if not cands:
        raise FileNotFoundError(f"No forecast parquet found for '{model_key}' in {search_dir}")
    # If multiple, pick the most recent
    cands.sort(key=lambda p: os.path.getmtime(p), reverse=True)
    return cands[0]

# --- Column detection helpers ---
KEYS = ['origin','destination','hs6','trade_flow','month']
PRED_CANDS = ['y_pred', 'prediction', 'pred', 'yhat', 'y_pred_mean']

def detect_pred_col(df, model_key):
    exact = f"y_pred_{model_key}"
    if exact in df.columns:
        return exact
    for c in PRED_CANDS:
        if c in df.columns:
            return c
    # any column starting with y_pred*
    for c in df.columns:
        if str(c).startswith("y_pred"):
            return c
    # any column containing model key
    for c in df.columns:
        if model_key in str(c).lower():
            return c
    raise ValueError(f"Could not find prediction column in columns: {df.columns.tolist()}")

def load_fc_detect(name, path):
    df = pd.read_parquet(path)
    missing = set(KEYS) - set(df.columns)
    if missing:
        raise ValueError(f"{name}: missing keys {sorted(missing)} in {path}")
    pcol = detect_pred_col(df, name)
    df = df[KEYS + [pcol]].rename(columns={pcol: name})
    return df

# --- Discover & load forecasts for each model by filename substring keys ---
# Map model names to filename substrings to search
model_key_to_substr = {
    "xgb_tweedie":  "xgb_tweedie",
    "xgb_log1p":    "xgb_log1p",
    "lgbm_rmse":    "lgbm_rmse",
    "lgbm_tweedie": "lgbm_tweedie",
}

file_map = {}
for m in models:
    fn = find_parquet_for(model_key_to_substr[m], FC_DIR)
    file_map[m] = fn
print("Forecast files found:", file_map)

# Load and inner-join
fc = None
for m in models:
    dfm = load_fc_detect(m, file_map[m])
    fc = dfm if fc is None else fc.merge(dfm, on=KEYS, how="inner")

# Blend
fc['y_pred_ens'] = 0.0
for m, w in w_opt.items():
    fc['y_pred_ens'] = fc['y_pred_ens'] + w * fc[m]

# Post-processing: non-negativity
fc['y_pred_ens'] = fc['y_pred_ens'].clip(lower=0)

# Save HS-6 ensemble
hs6_out = fc[KEYS + ['y_pred_ens']].copy()
hs6_path = f"{OUT_DIR}/ensemble_h2_hs6.parquet"
hs6_out.to_parquet(hs6_path, index=False)
print("Saved HS-6 ensemble:", hs6_path)

# Derive hs4 and aggregate
if hs6_out['hs6'].dtype != 'O':
    hs6_out['hs6'] = hs6_out['hs6'].astype(str)
hs6_out['hs4'] = hs6_out['hs6'].str[:4]

hs4_out = (hs6_out
           .groupby(['origin','destination','hs4','trade_flow','month'], as_index=False)['y_pred_ens']
           .sum())

hs4_path = f"{OUT_DIR}/ensemble_h2_hs4.parquet"
hs4_out.to_parquet(hs4_path, index=False)
print("Saved HS-4 ensemble:", hs4_path)

# Competition-style CSV: “USA”, “CHL”, “8404”, “Export”, “1234567”
csv_path = f"{OUT_DIR}/submission_h2_hs4.csv"
sub = hs4_out.rename(columns={'y_pred_ens':'value'})
sub = sub[['origin','destination','hs4','trade_flow','month','value']]
sub.to_csv(csv_path, index=False)
print("Saved submission CSV:", csv_path)

Using weights: {'xgb_tweedie': 0.4444444444444445, 'xgb_log1p': 0.16666666666666666, 'lgbm_rmse': 0.2777777777777778, 'lgbm_tweedie': 0.11111111111111112}
Forecast files found: {'xgb_tweedie': '/content/drive/MyDrive/ai4trade/predictions/forecast/xgb_tweedie_forecast_xgb_tweedie_h2_20251024_1438_p1.5.parquet', 'xgb_log1p': '/content/drive/MyDrive/ai4trade/predictions/forecast/xgb_log1p_forecast.parquet', 'lgbm_rmse': '/content/drive/MyDrive/ai4trade/predictions/forecast/lgbm_rmse_forecast.parquet', 'lgbm_tweedie': '/content/drive/MyDrive/ai4trade/predictions/forecast/lgbm_tweedie_forecast_run_h2_20251023_165145.parquet'}
Saved HS-6 ensemble: /content/drive/MyDrive/ai4trade/predictions/forecast_ensemble/ensemble_h2_hs6.parquet
Saved HS-4 ensemble: /content/drive/MyDrive/ai4trade/predictions/forecast_ensemble/ensemble_h2_hs4.parquet
Saved submission CSV: /content/drive/MyDrive/ai4trade/predictions/forecast_ensemble/submission_h2_hs4.csv


In [7]:
import pandas as pd
ens = pd.read_parquet("/content/drive/MyDrive/ai4trade/predictions/forecast_ensemble/ensemble_h2_hs6.parquet")
print(sorted(ens['month'].unique()))

[Timestamp('2024-08-01 00:00:00')]
