<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/Final_submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

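This notebook loads the merged HS4 ensemble forecast, applies the “≥200 HS4 lines” rule (a partner is eligible only if it has at least 200 distinct HS4 lines with a positive predicted value), picks the top-20 eligible partners by total predicted value per origin × trade flow, and writes submission_final.csv, together with audit tables of the HS4 counts and the selected partners for transparency.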

In [1]:
# Colab mount
# Mounts Google Drive at /content/drive; the paths used by the rest of the
# notebook live under that mount point.
# NOTE: google.colab is supplied by the Colab runtime, so this cell is expected
# to fail with ImportError when the notebook is run elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# === cell: config & imports ===
import csv
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Project root on the mounted Drive.
BASE_DIR = Path("/content/drive/MyDrive/ai4trade")  # adjust if needed

# Folder holding the merged forecast (input) and every file this notebook writes.
FINAL_DIR = BASE_DIR.joinpath("predictions", "final")
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# Input: merged HS4 forecast, produced earlier.
MERGED_HS4_PATH = FINAL_DIR.joinpath("final_forecast_hs4_final.parquet")

# Outputs: the submission itself plus two audit tables.
SUBMISSION_CSV = FINAL_DIR.joinpath("submission_final.csv")
AUDIT_PARTNERS = FINAL_DIR.joinpath("submission_partners_selected_final.csv")
AUDIT_COUNTS = FINAL_DIR.joinpath("submission_hs4_counts_final.csv")

# Selection parameters.
FORECAST_MONTH = "2025-10-01"  # per OEC final target
HS4_MIN_LINES = 200            # eligibility threshold
TOP_K_PARTNERS = 20            # per origin×trade_flow

In [14]:
# === cell: relabel forecast months to actual target ===
# Reads the merged HS4 forecast, stamps every CHN and USA row with the target
# forecast month, and writes the frame back to the same parquet file.
# NOTE: this overwrites MERGED_HS4_PATH in place. The relabel assigns a constant,
# so running the cell again produces the same file.

df = pd.read_parquet(MERGED_HS4_PATH)
print("Before relabel:", df["month"].value_counts().head())

# Take the target month from the config constant so the value written here
# cannot drift from the one reported below and filtered on later.
target_month = pd.to_datetime(FORECAST_MONTH)

# Map CHN/USA segments to their true forecast month
df["month"] = pd.to_datetime(df["month"])
df.loc[df["origin"].isin(["CHN", "USA"]), "month"] = target_month

df.to_parquet(MERGED_HS4_PATH, index=False)
print(f"Relabeled all forecast rows to month={FORECAST_MONTH}")
print("After relabel:", df["month"].value_counts().head())

Before relabel: month
2025-10-01    85191
Name: count, dtype: int64
Relabeled all forecast rows to month=2025-10-01
After relabel: month
2025-10-01    85191
Name: count, dtype: int64


In [15]:
# Preview the first rows of the relabelled forecast frame.
df.head()

Unnamed: 0,origin,destination,trade_flow,month,hs4,y_pred_ensemble
0,CHN,ARE,Export,2025-10-01,106,19601.13
1,CHN,ARE,Export,2025-10-01,204,115634.1
2,CHN,ARE,Export,2025-10-01,206,48738.08
3,CHN,ARE,Export,2025-10-01,207,2620571.0
4,CHN,ARE,Export,2025-10-01,301,28062.63


In [16]:
# === cell: load merged HS4 & sanity ===
# Re-reads the merged forecast from disk, checks its schema, restricts it to the
# forecast month and normalises the column types used by the later cells.
df = pd.read_parquet(MERGED_HS4_PATH)

# Schema check: every one of these columns must be present.
req = ["origin","destination","trade_flow","month","hs4","y_pred_ensemble"]
missing = [c for c in req if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in {MERGED_HS4_PATH}: {missing}")

# Render the month as a YYYY-MM-DD string and keep only the forecast month (Oct 2025).
df = df.assign(month=pd.to_datetime(df["month"]).dt.strftime("%Y-%m-%d"))
df = df.loc[df["month"] == FORECAST_MONTH].copy()
if len(df) == 0:
    raise ValueError(f"No rows found for month={FORECAST_MONTH} in {MERGED_HS4_PATH}")

# Type normalisation in one pass:
#   hs4             -> zero-padded string of at least 4 characters
#   trade_flow      -> "Export"/"Import" capitalisation
#   y_pred_ensemble -> float with negative predictions floored at 0
df = df.assign(
    hs4=df["hs4"].astype(str).str.zfill(4),
    trade_flow=df["trade_flow"].str.capitalize(),
    y_pred_ensemble=df["y_pred_ensemble"].clip(lower=0).astype(float),
)

In [17]:
# === cell: eligibility (≥200 HS4 lines) ===
# Only rows with a strictly positive prediction count towards a partner's HS4
# line total, so empty noise does not inflate the count.
active = df.loc[df["y_pred_ensemble"].gt(0)]

# Number of distinct HS4 codes per origin × trade_flow × destination.
hs4_counts = (
    active.groupby(["origin","trade_flow","destination"])["hs4"]
    .nunique()
    .reset_index(name="hs4_count")
)

# Persist the counts as an audit table.
hs4_counts.to_csv(AUDIT_COUNTS, index=False)
print(f"Saved HS4 counts: {AUDIT_COUNTS}")

# Partners that meet the minimum-lines threshold.
eligible = hs4_counts.loc[hs4_counts["hs4_count"].ge(HS4_MIN_LINES)].copy()

Saved HS4 counts: /content/drive/MyDrive/ai4trade/predictions/final/submission_hs4_counts_final.csv


In [18]:
# === cell: select partners ===
# Ranks the eligible partners by total predicted value inside each
# origin × trade_flow group, keeps the top TOP_K_PARTNERS, saves that list for
# audit, and restricts the forecast rows to the chosen partners.
partner_keys = ["origin","trade_flow","destination"]

# Total predicted value per origin × trade_flow × destination.
totals = (
    df.groupby(partner_keys)["y_pred_ensemble"]
    .sum()
    .reset_index(name="total_pred")
)

# Attach totals to the eligible partners; a partner without a total gets 0.
elig_totals = eligible.merge(totals, on=partner_keys, how="left")
elig_totals["total_pred"] = elig_totals["total_pred"].fillna(0.0)

# Rank 1 = largest total; method="first" breaks ties by order of appearance.
elig_totals["rank"] = elig_totals.groupby(["origin","trade_flow"])["total_pred"].rank(
    method="first", ascending=False
)

is_top_k = elig_totals["rank"].le(TOP_K_PARTNERS)
selected_partners = (
    elig_totals.loc[is_top_k]
    .sort_values(["origin","trade_flow","rank"])
    .drop(columns="rank")
    .reset_index(drop=True)
)

# Persist the selection as an audit table.
selected_partners.to_csv(AUDIT_PARTNERS, index=False)
print(f"Saved selected partners: {AUDIT_PARTNERS}")

# Inner join keeps only forecast rows whose partner was selected.
df_sel = pd.merge(df, selected_partners[partner_keys], on=partner_keys, how="inner").copy()

Saved selected partners: /content/drive/MyDrive/ai4trade/predictions/final/submission_partners_selected_final.csv


In [21]:
# Preview the first forecast rows kept after partner selection.
df_sel.head()

Unnamed: 0,origin,destination,trade_flow,month,hs4,y_pred_ensemble
0,CHN,ARE,Export,2025-10-01,106,19601.13
1,CHN,ARE,Export,2025-10-01,204,115634.1
2,CHN,ARE,Export,2025-10-01,206,48738.08
3,CHN,ARE,Export,2025-10-01,207,2620571.0
4,CHN,ARE,Export,2025-10-01,301,28062.63


In [28]:
# === cell: build submission ===
# Turns df_sel into the five-column submission frame
# (Country1, Country2, ProductCode, TradeFlow, Value) and writes it to
# SUBMISSION_CSV with every field quoted.
#
# SUBMISSION_CSV comes from the config cell; it is deliberately not redefined
# here, so changing BASE_DIR there moves this output as well.
#
# Raises:
#     ValueError: if any selected prediction is NaN or infinite.

# Casting a non-finite float to int64 is undefined and would silently put
# garbage in "Value", so stop before that can happen.
pred_values = df_sel["y_pred_ensemble"].to_numpy(dtype=float)
finite_mask = np.isfinite(pred_values)
if not finite_mask.all():
    n_bad = int((~finite_mask).sum())
    raise ValueError(
        f"{n_bad} non-finite y_pred_ensemble value(s) in df_sel; cannot build submission"
    )

# Normalise each field once, while building the frame.
sub = pd.DataFrame({
    "Country1": df_sel["origin"].astype(str).str.upper(),             # reporter ISO3
    "Country2": df_sel["destination"].astype(str).str.upper(),        # partner ISO3
    # HS4 must be a zero-padded string (not int!), e.g. "0106"
    "ProductCode": df_sel["hs4"].astype(str).str.zfill(4),
    "TradeFlow": df_sel["trade_flow"].astype(str).str.capitalize(),   # "Export"/"Import"
    # Round to the nearest whole unit and store as integer
    "Value": np.rint(pred_values).astype(np.int64),
})

# Column order exactly per spec; rows sorted (after normalisation) for readability.
cols = ["Country1","Country2","ProductCode","TradeFlow","Value"]
sub = (
    sub[cols]
    .sort_values(["Country1","TradeFlow","Country2","ProductCode"])
    .reset_index(drop=True)
)

# Save as plain text CSV with all fields quoted
sub.to_csv(
    SUBMISSION_CSV,
    index=False,
    quoting=csv.QUOTE_ALL,
    encoding="utf-8",
    lineterminator="\n"
)

print(f"Wrote OEC submission file: {SUBMISSION_CSV}")
print(sub.head(3))

Wrote OEC submission file: /content/drive/MyDrive/ai4trade/predictions/final/submission_final.csv
  Country1 Country2 ProductCode TradeFlow   Value
0      CHN      ARE        0106    Export   19601
1      CHN      ARE        0204    Export  115634
2      CHN      ARE        0206    Export   48738


In [29]:
!head -5 /content/drive/MyDrive/ai4trade/predictions/final/submission_final.csv

"Country1","Country2","ProductCode","TradeFlow","Value"
"CHN","ARE","0106","Export","19601"
"CHN","ARE","0204","Export","115634"
"CHN","ARE","0206","Export","48738"
"CHN","ARE","0207","Export","2620571"
