<a href="https://colab.research.google.com/github/Tiru-Kaggundi/Trade_AI/blob/main/00_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
# Define paths and parameters
base_path = '/content/drive/MyDrive/ai4trade'
input_parquet = os.path.join(base_path, 'data/interim/filtered_30_30_30_40.parquet')
output_folder = os.path.join(base_path, 'data/features/')
H = 2
train_end_month = pd.to_datetime('2024-08-01')
forecast_month = pd.to_datetime('2024-10-01')
cutoff_month = train_end_month

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)



In [33]:
# Load the data
df = pd.read_parquet(input_parquet)

In [34]:
df.shape

(8061920, 7)

In [35]:
df.head()

Unnamed: 0,origin,destination,hs6,hs4,trade_flow,month,value
0,CHN,AGO,210690,2106,Import,2023-06-01,40
1,CHN,AGO,210690,2106,Import,2023-07-01,44
2,CHN,AGO,210690,2106,Import,2023-10-01,29
3,CHN,AGO,250610,2506,Import,2023-01-01,94339
4,CHN,AGO,250610,2506,Import,2023-05-01,60876


# 00_feature_engineering.ipynb — Prototype (h = 2 → predict Oct-2024)

**Goal:** Build training & forecasting feature sets at HS6 for a **direct two-month-ahead** task (h=2).  
- **Cutoff (t):** 2024-08 → we will predict **2024-10** (t+2) later in the modeling notebooks.
- **Key rule:** All features must use information ≤ t−1 (no leakage).  
- **Seasonality anchor under h=2:** “same month last year” = **t−10** (Oct-2023 when t=Aug-2024), not t−13.

**Outputs**
- `data/features/features_train_h2.parquet`  (rows ≤ 2024-08 with non-NA y_target)
- `data/features/features_test_h2.parquet`   (rows = 2024-08 to predict 2024-10)



# 00_feature_engineering — Prototype (h=2 → predict Oct-2024)

**Goal:** Build features at HS6 to learn a **direct two-month-ahead** task (h=2).  
**Cutoff (t):** 2024-08 → forecast target is **2024-10**.  
**Rule:** All features use data ≤ t−1 (no lookahead).  
**Seasonality (h=2):** “same month last year” relative to cutoff is **t−10**.

**Outputs**
- `data/features/features_train_h2.parquet`   (rows ≤ 2024-08 with non-NA `y_target`)
- `data/features/features_test_h2.parquet`    (rows = 2024-08, to predict 2024-10)


In [36]:
import numpy as np
import pandas as pd

# Ensure expected columns exist
expected = {'origin','destination','hs6','hs4','trade_flow','month','value'}
missing = expected - set(df.columns)
assert not missing, f"Missing cols: {missing}"

# Normalize types / values
df['month'] = pd.to_datetime(df['month'], errors='coerce').dt.to_period('M').dt.to_timestamp()
df['origin'] = df['origin'].astype(str)
df['destination'] = df['destination'].astype(str)
df['hs6'] = df['hs6'].astype(str).str.zfill(6)
df['hs4'] = df['hs4'].astype(str).str.zfill(4)
df['trade_flow'] = df['trade_flow'].str.lower().str.strip()  # 'import' / 'export'
df['value'] = pd.to_numeric(df['value'], errors='coerce').fillna(0).clip(lower=0)

# Drop exact dupes on key
key_cols = ['origin','destination','hs6','trade_flow','month']
before = len(df)
df = df.sort_values(key_cols).drop_duplicates(subset=key_cols, keep='last').reset_index(drop=True)
print(f"Deduped: {before} → {len(df)}")

# Target base
df['y'] = df['value'].astype('float64')

print(df.head(3))


Deduped: 8061920 → 8061920
  origin destination     hs6   hs4 trade_flow      month   value         y
0    CHN         AGO  010612  1061     import 2024-09-01  680000  680000.0
1    CHN         AGO  010612  1061     import 2025-02-01  729707  729707.0
2    CHN         AGO  140490  1404     import 2024-12-01     287     287.0


In [37]:
df['month_num'] = df['month'].dt.month.astype('int8')
df['quarter']   = df['month'].dt.quarter.astype('int8')

# month_id = 0,1,2,... per origin in chronological order
origin_month = (
    df[['origin','month']].drop_duplicates().sort_values(['origin','month']).reset_index(drop=True)
)
origin_month['month_id'] = origin_month.groupby('origin').cumcount().astype('int32')
df = df.merge(origin_month, on=['origin','month'], how='left', validate='many_to_one')

assert df['month_id'].notna().all()
print(df[['origin','month','month_num','quarter','month_id']].head())


  origin      month  month_num  quarter  month_id
0    CHN 2024-09-01          9        3        20
1    CHN 2025-02-01          2        1        25
2    CHN 2024-12-01         12        4        23
3    CHN 2023-06-01          6        2         5
4    CHN 2023-07-01          7        3         6


In [38]:
df.shape

(8061920, 11)

In [39]:
def group_lag(s: pd.Series, k: int):
    return s.groupby(level=list(range(s.index.nlevels-1))).shift(k)

def add_group_lags(df, by, col='y', ks=(1,2,3,6,12)):
    g = df.set_index(by + ['month'])
    for k in ks:
        df[f'lag_{k}'] = g[col].groupby(level=by).shift(k).values
    return df

def add_group_rolls(df, by, col='y'):
    g = df.set_index(by + ['month'])[col]
    df['ma_3']  = g.groupby(level=by).shift(1).rolling(3).mean().values
    df['ma_6']  = g.groupby(level=by).shift(1).rolling(6).mean().values
    df['ma_12'] = g.groupby(level=by).shift(1).rolling(12).mean().values
    df['roll_std_6'] = g.groupby(level=by).shift(1).rolling(6).std(ddof=0).values
    return df

def add_safe_pctchg(df, by, col='y'):
    g = df.set_index(by + ['month'])[col]
    prev1 = g.groupby(level=by).shift(1)
    prev3 = g.groupby(level=by).shift(3)
    pct1 = (g - prev1) / np.maximum(prev1, 1.0)
    pct3 = (g - prev3) / np.maximum(prev3, 1.0)
    pct1 = pct1.where(prev1 > 0, 0.0)
    pct3 = pct3.where(prev3 > 0, 0.0)
    df['pctchg_1'] = pct1.values
    df['pctchg_3'] = pct3.values
    return df

def add_trailing_zero_run(df, by, col='y'):
    # trailing run of zeros up to t−1
    df = df.sort_values(by + ['month'])
    def _run(gr):
        z = (gr[col] == 0).astype(int)
        blocks = (z == 0).cumsum()
        runlen = z.groupby(blocks).cumsum()
        return runlen.shift(1).fillna(0).astype('int16')
    df['consec_zero_run'] = df.groupby(by, group_keys=False).apply(_run).values
    return df


In [40]:
series_keys = ['origin','destination','hs6','trade_flow']
df = df.sort_values(series_keys + ['month']).reset_index(drop=True)

df = add_group_lags(df, series_keys, col='y', ks=(1,2,3,6,12))
df = add_group_rolls(df, series_keys, col='y')
df = add_safe_pctchg(df, series_keys, col='y')
df = add_trailing_zero_run(df, series_keys, col='y')

# Cast engineered columns to compact types
float32_cols = [
    'lag_1','lag_2','lag_3','lag_6','lag_12',
    'ma_3','ma_6','ma_12','roll_std_6','pctchg_1','pctchg_3'
]
df[float32_cols] = df[float32_cols].fillna(0).astype('float32')
df['consec_zero_run'] = df['consec_zero_run'].astype('int16')
df['was_trade_lag1']  = (df['lag_1'] > 0).astype('int8')

print("Bilateral features complete.")


  df['consec_zero_run'] = df.groupby(by, group_keys=False).apply(_run).values


Bilateral features complete.


In [48]:
# 🔍 quick diagnostic after Cell 6 — nonzero feature counts

cols_to_check = [
    'lag_1','lag_2','lag_3','lag_6','lag_12',
    'ma_3','ma_6','ma_12','roll_std_6',
    'pctchg_1','pctchg_3'
]

print("Non-zero value counts in bilateral features:")
for c in cols_to_check:
    nz = (df[c] != 0).sum()
    print(f"{c:>10}: {nz:,}  ({nz/len(df):.1%} of rows)")

Non-zero value counts in bilateral features:
     lag_1: 6,526,910  (81.0% of rows)
     lag_2: 6,195,326  (76.8% of rows)
     lag_3: 5,892,133  (73.1% of rows)
     lag_6: 5,016,620  (62.2% of rows)
    lag_12: 3,343,269  (41.5% of rows)
      ma_3: 6,398,828  (79.4% of rows)
      ma_6: 5,596,492  (69.4% of rows)
     ma_12: 3,709,598  (46.0% of rows)
roll_std_6: 5,595,842  (69.4% of rows)
  pctchg_1: 6,509,974  (80.7% of rows)
  pctchg_3: 5,882,664  (73.0% of rows)


In [44]:
# 🔁 Cell 7 — Cross-flow features (robust; separate Import/Export paths; h=2 seasonal lag10)

# 1) Aggregate totals by flow = 'Import' / 'Export' (case-robust)
tmp = df.copy()
tmp['flow_lower'] = tmp['trade_flow'].str.lower()

def build_flow_totals(flow_name_lower: str, suffix: str):
    t = (tmp[tmp['flow_lower'].eq(flow_name_lower)]
         .groupby(['origin','hs6','month'], as_index=False)['y'].sum()
         .sort_values(['origin','hs6','month'])
         .rename(columns={'y': f'flow_total_{suffix}'}))
    # lags/rolls per (origin, hs6)
    g = t.groupby(['origin','hs6'])[f'flow_total_{suffix}']
    t[f'cf_lag1_{suffix}']  = g.shift(1)
    t[f'cf_ma3_{suffix}']   = g.shift(1).rolling(3).mean()
    # h=2 seasonal anchor → same calendar month last year relative to cutoff = shift(10)
    t[f'cf_lag10_{suffix}'] = g.shift(10)
    return t

imp = build_flow_totals('import', 'import')
exp = build_flow_totals('export', 'export')

# 2) Merge both onto a common (origin, hs6, month) frame
piv = pd.merge(imp, exp, on=['origin','hs6','month'], how='outer')

# 3) Merge back to bilateral rows
df = df.merge(piv, on=['origin','hs6','month'], how='left')

# 4) Pick opposite-flow features row-wise
is_export = df['trade_flow'].str.lower().eq('export')

df['cross_flow_lag1']  = np.where(is_export, df['cf_lag1_import'],  df['cf_lag1_export'])
df['cross_flow_ma3']   = np.where(is_export, df['cf_ma3_import'],   df['cf_ma3_export'])
df['cross_flow_lag10'] = np.where(is_export, df['cf_lag10_import'], df['cf_lag10_export'])

# 5) Cleanup & types
drop_cols = [c for c in df.columns if c.startswith('flow_total_') or c.startswith('cf_lag') and ('import' in c or 'export' in c)]
df.drop(columns=drop_cols, inplace=True, errors='ignore')

for c in ['cross_flow_lag1','cross_flow_ma3','cross_flow_lag10']:
    df[c] = df[c].fillna(0).astype('float32')

print("Cross-flow features complete (separate Import/Export paths; lag1, ma3, lag10).")

Cross-flow features complete (separate Import/Export paths; lag1, ma3, lag10).


In [47]:
print("Nonzero counts:",
      (df['cross_flow_lag1']>0).sum(),
      (df['cross_flow_ma3']>0).sum(),
      (df['cross_flow_lag10']>0).sum(),
      )

# Spot-check one series present in test month
sample = (df[df['month'].eq(train_end_month)]
          [['origin','hs6','trade_flow']].drop_duplicates()
          .sample(1, random_state=7).iloc[0].to_dict())
mask = (df['origin'].eq(sample['origin']) &
        df['hs6'].eq(sample['hs6']) &
        df['trade_flow'].eq(sample['trade_flow']))
display(df.loc[mask, ['month','y','cross_flow_lag1','cross_flow_ma3','cross_flow_lag10']]
          .sort_values('month').tail(14))

Nonzero counts: 7611961 7080066 4868722


Unnamed: 0,month,y,cross_flow_lag1,cross_flow_ma3,cross_flow_lag10
4415418,2025-02-01,109592.0,2089406.0,2122684.75,1662934.0
5716359,2025-02-01,18420.0,2089406.0,2122684.75,1662934.0
6368969,2025-02-01,0.0,2089406.0,2122684.75,1662934.0
3902634,2025-02-01,56240.0,2089406.0,2122684.75,1662934.0
6919723,2025-02-01,729410.0,2089406.0,2122684.75,1662934.0
6014793,2025-02-01,108000.0,2089406.0,2122684.75,1662934.0
6550522,2025-02-01,20793.0,2089406.0,2122684.75,1662934.0
7079358,2025-02-01,3922.0,2089406.0,2122684.75,1662934.0
4089618,2025-02-01,0.0,2089406.0,2122684.75,1662934.0
7463293,2025-02-01,0.0,2089406.0,2122684.75,1662934.0


In [49]:
# 🌐 Cell 8 — Macro drift (origin totals: lag1/ma3 + optional lag10) — case-robust

# Build origin-month totals by flow and pivot
macro = (
    df.groupby(['origin','month','trade_flow'], as_index=False)['y'].sum()
      .pivot(index=['origin','month'], columns='trade_flow', values='y')
      .reset_index()
)

# Normalize column names to lowercase for consistent handling
macro.columns = [c.lower() if isinstance(c, str) else c for c in macro.columns]

# Ensure both export/import columns exist even if one is missing in a subset
if 'export' not in macro.columns:
    macro['export'] = 0.0
if 'import' not in macro.columns:
    macro['import'] = 0.0

# Rename to canonical macro names
macro = macro.rename(columns={
    'export': 'origin_total_exports',
    'import': 'origin_total_imports'
})

# Compute total
macro['origin_total_trade'] = macro['origin_total_exports'] + macro['origin_total_imports']

# Lags and MAs per origin
macro = macro.sort_values(['origin','month'])
for base in ['origin_total_exports','origin_total_imports','origin_total_trade']:
    macro[f'{base}_lag1'] = macro.groupby('origin')[base].shift(1)
for base in ['origin_total_exports','origin_total_imports']:
    macro[f'{base}_ma3'] = macro.groupby('origin')[base].shift(1).rolling(3).mean()
# Optional seasonal anchor under h=2 (same-month-last-year relative to cutoff)
for base in ['origin_total_exports','origin_total_imports']:
    macro[f'{base}_lag10'] = macro.groupby('origin')[base].shift(10)

# Merge back to bilateral rows
df = df.merge(macro, on=['origin','month'], how='left')

# Fill/cast engineered macro columns
macro_cols = [c for c in df.columns
              if c.startswith('origin_total_') and (c.endswith('_lag1') or c.endswith('_ma3') or c.endswith('_lag10'))]
df[macro_cols] = df[macro_cols].fillna(0).astype('float32')

print("Macro drift features complete (case-robust for Import/Export).")

Macro drift features complete (case-robust for Import/Export).


In [50]:
# 🔍 quick diagnostic after Cell 8 — macro drift feature fill check

macro_cols_check = [
    'origin_total_exports_lag1', 'origin_total_imports_lag1', 'origin_total_trade_lag1',
    'origin_total_exports_ma3', 'origin_total_imports_ma3',
    'origin_total_exports_lag10', 'origin_total_imports_lag10'
]

print("Non-zero value counts in macro drift features:")
for c in macro_cols_check:
    if c in df.columns:
        nz = (df[c] != 0).sum()
        print(f"{c:>35}: {nz:,}  ({nz/len(df):.1%} of rows)")
    else:
        print(f"{c:>35}:  column missing")

Non-zero value counts in macro drift features:
          origin_total_exports_lag1: 7,809,136  (96.9% of rows)
          origin_total_imports_lag1: 7,809,136  (96.9% of rows)
            origin_total_trade_lag1: 7,809,136  (96.9% of rows)
           origin_total_exports_ma3: 7,242,741  (89.8% of rows)
           origin_total_imports_ma3: 7,242,741  (89.8% of rows)
         origin_total_exports_lag10: 5,021,719  (62.3% of rows)
         origin_total_imports_lag10: 5,021,719  (62.3% of rows)


In [51]:
H = 2  # direct two-month-ahead
df = df.sort_values(series_keys + ['month']).reset_index(drop=True)

# y_target = y at t+2 within each series
df['y_target'] = df.groupby(series_keys)['y'].shift(-H)

# Meta
df['horizon'] = np.int8(H)
df['cutoff_month'] = cutoff_month

# Sanity: forecast month equals train_end + 2 months
assert forecast_month == (train_end_month + pd.offsets.MonthBegin(H)), "forecast_month must equal train_end_month + H months."

# Basic key integrity
assert not df.duplicated(subset=key_cols).any(), "Duplicate keys exist after feature merges."

print(df[['origin','destination','hs6','trade_flow','month','y','y_target']].head())

  origin destination     hs6 trade_flow      month         y  y_target
0    CHN         AGO  010612     import 2024-09-01  680000.0       NaN
1    CHN         AGO  010612     import 2025-02-01  729707.0       NaN
2    CHN         AGO  140490     import 2024-12-01     287.0       NaN
3    CHN         AGO  210690     import 2023-06-01      40.0      29.0
4    CHN         AGO  210690     import 2023-07-01      44.0       NaN


In [52]:
# Train: rows with month ≤ t and y_target available
train_mask = (df['month'] <= train_end_month) & df['y_target'].notna()
# Test: rows at cutoff month (t)
test_mask  = (df['month'] == train_end_month)

df_train = df.loc[train_mask].copy()
df_test  = df.loc[test_mask].copy()

print("Rows — train:", len(df_train), " test:", len(df_test))
print("Train months:", df_train['month'].min(), "→", df_train['month'].max())
print("Test month unique:", df_test['month'].drop_duplicates().tolist())

# Final quick checks
assert df_test['month'].nunique() == 1 and df_test['month'].iloc[0] == train_end_month
assert df_train['y_target'].notna().all()

# Save
train_out = os.path.join(output_folder, 'features_train_h2.parquet')
test_out  = os.path.join(output_folder, 'features_test_h2.parquet')
df_train.to_parquet(train_out, index=False)
df_test.to_parquet(test_out, index=False)

print("Saved:")
print("  ", train_out)
print("  ", test_out)

Rows — train: 5979239  test: 320208
Train months: 2023-01-01 00:00:00 → 2024-08-01 00:00:00
Test month unique: [Timestamp('2024-08-01 00:00:00')]
Saved:
   /content/drive/MyDrive/ai4trade/data/features/features_train_h2.parquet
   /content/drive/MyDrive/ai4trade/data/features/features_test_h2.parquet


In [53]:
# pick one random series present in test and preview last 14 months
sample_keys = (df_test[series_keys]
               .drop_duplicates()
               .sample(1, random_state=123)
               .iloc[0].to_dict())
mask = (df[series_keys[0]].eq(sample_keys[series_keys[0]]) &
        df[series_keys[1]].eq(sample_keys[series_keys[1]]) &
        df[series_keys[2]].eq(sample_keys[series_keys[2]]) &
        df[series_keys[3]].eq(sample_keys[series_keys[3]]))
ser = df.loc[mask].sort_values('month').tail(14)

print("Sample series:", sample_keys)
display(ser[['month','y','lag_1','ma_3','cross_flow_lag1','cross_flow_lag10',
             'origin_total_exports_lag1','origin_total_imports_ma3','y_target']])
print("Expectations for rows at month == 2024-08:")
print(" - lag_1 uses 2024-07")
print(" - cross_flow_lag10 uses 2023-10 (h=2 same-month-last-year)")

Sample series: {'origin': 'CHN', 'destination': 'TWN', 'hs6': '691010', 'trade_flow': 'export'}


Unnamed: 0,month,y,lag_1,ma_3,cross_flow_lag1,cross_flow_lag10,origin_total_exports_lag1,origin_total_imports_ma3,y_target
3365098,2024-02-01,3695391.0,8383008.0,8775340.0,2489881.0,5732900.0,251528500000.0,206615200000.0,8254610.0
3365099,2024-03-01,8933695.0,3695391.0,7246000.0,2852595.0,4525038.0,180453000000.0,193042000000.0,4603301.0
3365100,2024-04-01,8254610.0,8933695.0,7004031.5,5290334.0,5205355.0,234591700000.0,190430500000.0,3634400.0
3365101,2024-05-01,4603301.0,8254610.0,6961232.0,4902928.0,5134858.0,241531800000.0,189137400000.0,3931932.0
3365102,2024-06-01,3634400.0,4603301.0,7263868.5,4070432.0,5876987.0,248068700000.0,200765300000.0,4561348.0
3365103,2024-07-01,3931932.0,3634400.0,5497437.0,4215443.0,4468554.0,254313200000.0,196843800000.0,3587818.0
3365104,2024-08-01,4561348.0,3931932.0,4056544.25,4860669.0,4999690.0,249625200000.0,195464900000.0,4067127.0
3365105,2024-09-01,3587818.0,4561348.0,4042560.0,4857219.0,4554962.0,254160100000.0,194378600000.0,5539317.0
3365106,2024-10-01,4067127.0,3587818.0,4027032.75,4093596.0,4410168.0,251796100000.0,199165500000.0,6847597.0
3365107,2024-11-01,5539317.0,4067127.0,4072097.75,3678189.0,2489881.0,254257300000.0,198412600000.0,3946998.0


Expectations for rows at month == 2024-08:
 - lag_1 uses 2024-07
 - cross_flow_lag10 uses 2023-10 (h=2 same-month-last-year)
