<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/final_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Colab mount

from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# ===== Cell 0: Config & Imports =====
import os
import gc
import numpy as np
import pandas as pd

pd.options.display.float_format = "{:,.4f}".format

# ---- Paths (adjust to your Drive layout if needed) ----
BASE_DIR = '/content/drive/MyDrive/ai4trade'  # <-- change if your root is different
DATA_INTERIM = f"{BASE_DIR}/data/interim"
DATA_FEATURES = f"{BASE_DIR}/data/features"

# Input harmonized parquet (HS6-level; already partner-filtered 30/30/30/40)
HARMONIZED = f"{DATA_INTERIM}/final_clean_data.parquet"

# External macro (with columns: month, china_GSCPI_ma2, usa_GSCPI_ma2, chinaCLI_ma2, usaCLI_ma2)
MACRO_FILE = f"{DATA_INTERIM}/GSCPI_CLI.xlsx"

# Outputs
OUT_CHN_EXP = f"{DATA_FEATURES}/features_CHN_export.parquet"
OUT_CHN_IMP = f"{DATA_FEATURES}/features_CHN_import.parquet"
OUT_USA_EXP = f"{DATA_FEATURES}/features_USA_export.parquet"
OUT_USA_IMP = f"{DATA_FEATURES}/features_USA_import.parquet"
OUT_ALL     = f"{DATA_FEATURES}/features_all_unified.parquet"

os.makedirs(DATA_FEATURES, exist_ok=True)

In [3]:
# ===== Cell 1: Load & Basic Hygiene =====
df = pd.read_parquet(HARMONIZED)

# expected columns: origin, destination, hs6, hs4, trade_flow, month, value
# standardize dtypes and names
df = df.rename(columns={"value": "y"}).copy()

df["origin"] = df["origin"].astype(str)
df["destination"] = df["destination"].astype(str)
df["trade_flow"] = df["trade_flow"].astype(str)
df["hs6"] = df["hs6"].astype(str).str.zfill(6)
# ensure hs4 present and consistent
if "hs4" not in df.columns:
    df["hs4"] = df["hs6"].str[:4]
else:
    df["hs4"] = df["hs4"].astype(str).str.zfill(4)

# month to first-of-month datetime
df["month"] = pd.to_datetime(df["month"]).dt.to_period("M").dt.to_timestamp()

# sort for all subsequent groupby/shift ops
df = df.sort_values(["origin", "destination", "hs6", "trade_flow", "month"]).reset_index(drop=True)

print(df.head(3))

  origin destination     hs6   hs4 trade_flow      month    y
0    CHN         ARE  010619  0106     Export 2023-02-01  200
1    CHN         ARE  010619  0106     Export 2023-07-01  600
2    CHN         ARE  010619  0106     Export 2023-08-01  300


In [4]:
# ===== Cell 2: Calendar / Seasonality Features =====
df["year"] = df["month"].dt.year.astype("int16")
df["month_num"] = df["month"].dt.month.astype("int8")
df["quarter"] = df["month"].dt.quarter.astype("int8")

# cyclical encodings (12-month cycle)
df["month_sin"] = np.sin(2 * np.pi * df["month_num"] / 12.0)
df["month_cos"] = np.cos(2 * np.pi * df["month_num"] / 12.0)

# sequential month index per series or global? — use global index for trend
df = df.sort_values("month").reset_index(drop=True)
# month_id as global running index starting at 0
month_mapping = {m:i for i, m in enumerate(sorted(df["month"].unique()))}
df["month_id"] = df["month"].map(month_mapping).astype("int32")

In [5]:
# ===== Cell 3: Bilateral Lags & Rolling Stats =====
KEY = ["origin", "destination", "hs6", "trade_flow"]

g = df.groupby(KEY, group_keys=False)

# basic lags
for L in [1, 2, 3, 6, 12, 9, 10]:  # 9/10 for helper; we'll compress to lag_year_eq later
    df[f"lag_{L}"] = g["y"].shift(L)

# ===== Fixed: Bilateral Rolling Features using transform =====
# Rolling means (<= t−1)
df["ma_3"]  = g["y"].transform(lambda s: s.shift(1).rolling(3, min_periods=1).mean())
df["ma_6"]  = g["y"].transform(lambda s: s.shift(1).rolling(6, min_periods=1).mean())
df["ma_12"] = g["y"].transform(lambda s: s.shift(1).rolling(12, min_periods=1).mean())

# Volatility (6-month std, ≤ t−1)
df["roll_std_6"] = g["y"].transform(lambda s: s.shift(1).rolling(6, min_periods=2).std())

# momentum
df["pctchg_1"] = (df["y"] - df["lag_1"]) / df["lag_1"].replace(0, np.nan)
df["pctchg_3"] = (df["y"] - df["lag_3"]) / df["lag_3"].replace(0, np.nan)

# activity/inactivity flags
df["was_trade_lag1"] = (df["lag_1"].fillna(0) > 0).astype("int8")

In [6]:
# ===== Cell 4: Last-year-equivalent lag =====
df["lag_year_eq"] = np.where(df["origin"].eq("USA"), df["lag_9"], df["lag_10"])

# drop helper lag_9/lag_10 to keep schema tidy
df = df.drop(columns=["lag_9", "lag_10"])

In [7]:
# ===== Fixed: Cross-Flow (HS4-level, Opposite Flow) =====
# 1) Compute origin–month–HS4 totals by flow
totals = (
    df.groupby(["origin", "hs4", "trade_flow", "month"], as_index=False)["y"]
      .sum()
      .rename(columns={"y": "flow_total"})
)

# 2) Define opposite flow (Export ↔ Import)
totals["opposite_flow"] = np.where(totals["trade_flow"].eq("Export"), "Import", "Export")

# 3) Compute MA3 of flow_total per (origin, hs4, trade_flow) using transform (no reset_index!)
totals = totals.sort_values(["origin", "hs4", "trade_flow", "month"])
g_hs4 = totals.groupby(["origin", "hs4", "trade_flow"], group_keys=False)

totals["flow_total_ma3"] = g_hs4["flow_total"].transform(
    lambda s: s.shift(1).rolling(3, min_periods=1).mean()
)

# 4) Prepare tidy cross-flow table for merge
#    We only need one trade_flow column — the one representing the flow to match on.
cf = (
    totals.rename(columns={
        "opposite_flow": "trade_flow_to_match",
        "flow_total_ma3": "cross_flow_ma3"
    })[["origin", "hs4", "month", "trade_flow_to_match", "cross_flow_ma3"]]
    .rename(columns={"trade_flow_to_match": "trade_flow"})
)

# 5) Merge back to bilateral rows
df = df.merge(cf, on=["origin", "hs4", "month", "trade_flow"], how="left")

In [8]:
# ===== Cell 6: Origin-level Totals & Drift =====
# compute monthly origin totals per flow
origin_tot = (
    df.groupby(["origin", "trade_flow", "month"], as_index=False)["y"]
      .sum()
      .rename(columns={"y": "origin_flow_total"})
)

# pivot to two columns per month: exports & imports
orig_piv = origin_tot.pivot_table(index=["origin", "month"], columns="trade_flow", values="origin_flow_total").reset_index()
orig_piv = orig_piv.rename(columns={"Export": "origin_total_exports", "Import": "origin_total_imports"})
orig_piv["origin_total_trade"] = orig_piv[["origin_total_exports", "origin_total_imports"]].sum(axis=1)

# lags and MA3 per origin
g_o = orig_piv.groupby("origin", group_keys=False)
for col in ["origin_total_exports", "origin_total_imports", "origin_total_trade"]:
    orig_piv[col + "_lag1"] = g_o[col].shift(1)
for col in ["origin_total_exports", "origin_total_imports"]:
    orig_piv[col + "_ma3"] = g_o[col].shift(1).rolling(3, min_periods=1).mean()

keep_cols = [
    "origin","month",
    "origin_total_exports_lag1","origin_total_imports_lag1","origin_total_trade_lag1",
    "origin_total_exports_ma3","origin_total_imports_ma3"
]
orig_feats = orig_piv[keep_cols].copy()

# join back to main df
df = df.merge(orig_feats, on=["origin","month"], how="left")

In [9]:
# ===== Cell 7: External Macro Join (Excel version) =====
# file: ai4trade/data/interim/GSCPI_CLI.xlsx
# expected sheet: first sheet or specify name if needed
# columns: month, china_GSCPI_ma2, usa_GSCPI_ma2, chinaCLI_ma2, usaCLI_ma2

macro = pd.read_excel(f"{DATA_INTERIM}/GSCPI_CLI.xlsx")

# enforce proper datetime (first day of month)
macro["month"] = pd.to_datetime(macro["month"]).dt.to_period("M").dt.to_timestamp()

# keep only the required columns in correct order
macro_cols_expected = ["china_GSCPI_ma2", "usa_GSCPI_ma2", "chinaCLI_ma2", "usaCLI_ma2"]
macro_cols = ["month"] + [c for c in macro_cols_expected if c in macro.columns]
macro = macro[macro_cols].copy()

# merge to main dataframe by month
df = df.merge(macro, on="month", how="left")

print("✅ Macro data merged — columns now include:", [c for c in df.columns if "ma2" in c or "CLI" in c])

✅ Macro data merged — columns now include: ['china_GSCPI_ma2', 'usa_GSCPI_ma2', 'chinaCLI_ma2', 'usaCLI_ma2']


In [10]:
# ===== Cell 8: Direct Horizon Policy & y_target =====
# horizon: CHN -> 2, USA -> 3
df["forecast_horizon"] = np.where(df["origin"].eq("CHN"), 2, 3).astype("int8")

# y_target = y(t + h) within each bilateral series
def make_target(sub):
    h = int(sub["forecast_horizon"].iloc[0])
    sub = sub.sort_values("month").copy()
    sub["y_target"] = sub["y"].shift(-h)
    return sub

df = df.groupby(["origin","destination","hs6","trade_flow"], group_keys=False).apply(make_target)

# drop rows without target (end-of-series where future isn't observed)
df = df[~df["y_target"].isna()].reset_index(drop=True)

  df = df.groupby(["origin","destination","hs6","trade_flow"], group_keys=False).apply(make_target)


In [11]:
# ===== Cell 9: Cleanup & Column Order =====
# clip small negatives to zero (safety), but don't touch NaNs
for col in ["y", "lag_1","lag_2","lag_3","lag_6","lag_12","lag_year_eq","ma_3","ma_6","ma_12",
            "roll_std_6","cross_flow_ma3",
            "origin_total_exports_lag1","origin_total_imports_lag1","origin_total_trade_lag1",
            "origin_total_exports_ma3","origin_total_imports_ma3"]:
    if col in df.columns:
        df[col] = df[col].where(df[col].isna() | (df[col] >= 0), 0.0)

FINAL_ORDER = [
    # IDs
    "origin","destination","hs6","hs4","trade_flow","month","y",
    # calendar
    "year","month_num","quarter","month_sin","month_cos","month_id",
    # bilateral
    "lag_1","lag_2","lag_3","lag_6","lag_12","lag_year_eq",
    "ma_3","ma_6","ma_12","roll_std_6","pctchg_1","pctchg_3","was_trade_lag1","consec_zero_run",
    # cross-flow
    "cross_flow_ma3",
    # origin totals
    "origin_total_exports_lag1","origin_total_imports_lag1","origin_total_trade_lag1",
    "origin_total_exports_ma3","origin_total_imports_ma3",
    # external macro (all four retained)
    "china_GSCPI_ma2","usa_GSCPI_ma2","chinaCLI_ma2","usaCLI_ma2",
    # meta
    "forecast_horizon","y_target"
]
# keep only columns that exist to avoid KeyError if any optional missing
FINAL_ORDER = [c for c in FINAL_ORDER if c in df.columns]
df = df[FINAL_ORDER].copy()

df.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8136805 entries, 0 to 8136804
Data columns (total 38 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   origin                     8136805 non-null  object        
 1   destination                8136805 non-null  object        
 2   hs6                        8136805 non-null  object        
 3   hs4                        8136805 non-null  object        
 4   trade_flow                 8136805 non-null  object        
 5   month                      8136805 non-null  datetime64[ns]
 6   y                          8136805 non-null  int64         
 7   year                       8136805 non-null  int16         
 8   month_num                  8136805 non-null  int8          
 9   quarter                    8136805 non-null  int8          
 10  month_sin                  8136805 non-null  float64       
 11  month_cos                  8136805 no

In [12]:
# ===== Cell 10: Segment Saves + Unified =====
is_CHN = df["origin"].eq("CHN")
is_USA = df["origin"].eq("USA")
is_EXP = df["trade_flow"].eq("Export")
is_IMP = df["trade_flow"].eq("Import")

df_chn_exp = df[is_CHN & is_EXP].copy()
df_chn_imp = df[is_CHN & is_IMP].copy()
df_usa_exp = df[is_USA & is_EXP].copy()
df_usa_imp = df[is_USA & is_IMP].copy()

df_chn_exp.to_parquet(OUT_CHN_EXP, index=False)
df_chn_imp.to_parquet(OUT_CHN_IMP, index=False)
df_usa_exp.to_parquet(OUT_USA_EXP, index=False)
df_usa_imp.to_parquet(OUT_USA_IMP, index=False)

df_all = pd.concat([df_chn_exp, df_chn_imp, df_usa_exp, df_usa_imp], axis=0, ignore_index=True)
df_all.to_parquet(OUT_ALL, index=False)

# quick sanity prints
for path, d in [
    (OUT_CHN_EXP, df_chn_exp),
    (OUT_CHN_IMP, df_chn_imp),
    (OUT_USA_EXP, df_usa_exp),
    (OUT_USA_IMP, df_usa_imp),
    (OUT_ALL,     df_all),
]:
    print(f"{path} -> rows={len(d):,}, cols={d.shape[1]}")

del df_chn_exp, df_chn_imp, df_usa_exp, df_usa_imp, df_all
gc.collect()

/content/drive/MyDrive/ai4trade/data/features/features_CHN_export.parquet -> rows=2,807,460, cols=38
/content/drive/MyDrive/ai4trade/data/features/features_CHN_import.parquet -> rows=1,118,879, cols=38
/content/drive/MyDrive/ai4trade/data/features/features_USA_export.parquet -> rows=2,257,793, cols=38
/content/drive/MyDrive/ai4trade/data/features/features_USA_import.parquet -> rows=1,952,673, cols=38
/content/drive/MyDrive/ai4trade/data/features/features_all_unified.parquet -> rows=8,136,805, cols=38


0

In [13]:
# ===== Cell 11: QC Checks =====
# reload to ensure they wrote correctly
chk = {
    "CHN_export": pd.read_parquet(OUT_CHN_EXP),
    "CHN_import": pd.read_parquet(OUT_CHN_IMP),
    "USA_export": pd.read_parquet(OUT_USA_EXP),
    "USA_import": pd.read_parquet(OUT_USA_IMP),
}

# 1) identical columns across segments
cols_sets = {k: tuple(v.columns) for k, v in chk.items()}
print("Column sets identical across segments:",
      len(set(cols_sets.values())) == 1)

# 2) leakage check on a small sample: y_target month = month + horizon
def leak_check(d, n=5):
    s = d.sample(min(n, len(d)), random_state=42)
    # group keys for computing month+h (requires knowing h per origin)
    # here we just print sample context for manual eyeballing
    return s[["origin","destination","hs6","trade_flow","month","forecast_horizon","y","y_target"]]

print(leak_check(chk["CHN_export"]))
print(leak_check(chk["USA_import"]))

Column sets identical across segments: True
        origin destination     hs6 trade_flow      month  forecast_horizon  \
730744     CHN         SGP  731990     Export 2023-08-01                 2   
1751830    CHN         FRA  721114     Export 2024-07-01                 2   
2182209    CHN         THA  901310     Export 2024-11-01                 2   
1166908    CHN         USA  610459     Export 2024-01-01                 2   
207883     CHN         FRA  848291     Export 2023-03-01                 2   

               y       y_target  
730744   1007192   406,191.0000  
1751830     7241     8,774.0000  
2182209      699    17,000.0000  
1166908  2277756 1,639,594.0000  
207883   1191836   903,142.0000  
        origin destination     hs6 trade_flow      month  forecast_horizon  \
307460     USA         IND  390910     Import 2023-05-01                 3   
1879722    USA         CHN  090710     Import 2025-03-01                 3   
931488     USA         BEL  610442     Import 202