## 04_advanced_models

### Train XGBoost and LightGBM classifiers, handling class imbalance with random undersampling.

In [23]:
from pathlib import Path
import json
import joblib
import os
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Input/output locations, resolved relative to the current working directory.
BASE = '..'
INPUT = os.path.join(BASE, 'data', 'Nova_pay_features.csv')
MODELS = os.path.join(BASE, 'models')

# Load the data. Rows without a timestamp cannot be assigned to a time-based
# split, and rows without a label cannot be trained on or scored.
df = pd.read_csv(INPUT, parse_dates=['timestamp'])
df = df.dropna(subset=['timestamp', 'is_fraud'])

# Features
categorical = ['home_country','source_currency','dest_currency','channel','kyc_tier','ip_country','new_device','location_mismatch', 'ip_country_missing']
numeric = ['amount_src','amount_usd','fee','ip_risk_score','device_trust_score','account_age_days','txn_velocity_1h','txn_velocity_24h','corridor_risk','risk_score_internal', 'hour', 'dayofweek']

# Fail fast if the feature file lacks an expected column. Silently skipping a
# missing column here would only defer the failure to the column selection
# below, with a less helpful error message.
missing_cols = [c for c in categorical + numeric if c not in df.columns]
if missing_cols:
    raise KeyError(f"Missing expected feature columns in {INPUT}: {missing_cols}")

# Cast categoricals to strings so each column has a single, uniform dtype.
# Note that missing values become the literal string 'nan'.
for c in categorical:
    df[c] = df[c].astype(str)

# Chronological split anchored on the most recent timestamp in the data:
#   train: everything older than 270 days before the last timestamp
#   valid: from 270 days (inclusive) to 180 days (exclusive) before it
#   test:  the final 180 days (inclusive)
df = df.sort_values('timestamp')
max_date = df['timestamp'].max()
valid_start = max_date - pd.Timedelta(days=270)
test_start = max_date - pd.Timedelta(days=180)

train_df = df[df['timestamp'] < valid_start]
valid_df = df[(df['timestamp'] >= valid_start) & (df['timestamp'] < test_start)]
test_df = df[df['timestamp'] >= test_start]

# Feature matrices keep the categorical-then-numeric column order; labels are
# converted to plain integer arrays.
X_train = train_df[categorical + numeric]
y_train = train_df['is_fraud'].astype(int).values
X_valid = valid_df[categorical + numeric]
y_valid = valid_df['is_fraud'].astype(int).values
X_test = test_df[categorical + numeric]
y_test = test_df['is_fraud'].astype(int).values

# One-hot encode categorical features; handle_unknown="ignore" keeps transform
# from raising on categories that were not present when the encoder was fitted.
cat_encoder = OneHotEncoder(handle_unknown="ignore")

# Scale numerical features without centering them (with_mean=False).
num_scaler = StandardScaler(with_mean=False)

# Columns outside the two feature lists are dropped from the output.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", cat_encoder, categorical),
        ("num", num_scaler, numeric),
    ],
    remainder="drop",
)

# Random undersampler with a fixed seed for reproducibility. This same instance
# (like `preprocessor`) is also placed in the LightGBM pipeline further down.
rus = RandomUnderSampler(random_state=42)

# XGBoost hyperparameters, kept in one place for readability.
xgb_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    eval_metric="aucpr",
)

# Preprocess -> undersample -> classify.
xgb_rus = ImbPipeline(steps=[
    ("prep", preprocessor),
    ("rus", rus),
    ("model", XGBClassifier(**xgb_params)),
])

xgb_rus.fit(X_train, y_train)

# Column 1 of predict_proba holds the score for the positive class
# (presumably is_fraud == 1 -- confirm the label encoding).
y_prob_xgb_rus = xgb_rus.predict_proba(X_test)[:, 1]
xgb_test_auc = roc_auc_score(y_test, y_prob_xgb_rus)
print("XGB+RUS Test ROC-AUC:", xgb_test_auc)


# LightGBM hyperparameters, kept in one place for readability.
lgbm_params = dict(
    n_estimators=800,
    learning_rate=0.05,
    num_leaves=31,
    min_child_samples=5,
    min_split_gain=0.0,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# Same preprocess -> undersample -> classify layout as the XGBoost pipeline,
# reusing the `preprocessor` and `rus` objects defined earlier in the script.
lgbm_rus = ImbPipeline(steps=[
    ("prep", preprocessor),
    ("rus", rus),
    ("model", LGBMClassifier(**lgbm_params)),
])

lgbm_rus.fit(X_train, y_train)

# Column 1 of predict_proba holds the score for the positive class
# (presumably is_fraud == 1 -- confirm the label encoding).
y_prob_lgbm_rus = lgbm_rus.predict_proba(X_test)[:, 1]
lgbm_test_auc = roc_auc_score(y_test, y_prob_lgbm_rus)
print("LGBM+RUS Test ROC-AUC:", lgbm_test_auc)

def metric_pack(y_true, y_prob, thr=0.5):
    """Summarise binary-classification quality at a fixed decision threshold.

    Args:
        y_true: Array-like of ground-truth binary labels.
        y_prob: Array-like of scores for the positive class.
        thr: Scores greater than or equal to this value are predicted positive.

    Returns:
        Dict with float entries 'precision', 'recall', 'f1' and 'roc_auc',
        plus 'threshold' (``thr`` exactly as passed in). Precision, recall and
        F1 are 0.0 when undefined (``zero_division=0``); 'roc_auc' is NaN when
        ``y_true`` contains fewer than two distinct values.
    """
    # Coerce inputs so plain lists work too; comparing a list with `thr`
    # directly would raise a TypeError.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    y_pred = (y_prob >= thr).astype(int)
    p, r, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary', zero_division=0
    )
    # ROC-AUC needs both classes present; report NaN instead of raising.
    if len(np.unique(y_true)) > 1:
        auc = roc_auc_score(y_true, y_prob)
    else:
        auc = float('nan')
    return {
        'precision': float(p),
        'recall': float(r),
        'f1': float(f1),
        'roc_auc': float(auc),
        'threshold': thr,
    }

# Score both fitted pipelines on the test split using metric_pack's default
# threshold of 0.5.
test_probabilities = {
    'xgboost': y_prob_xgb_rus,
    'lightgbm': y_prob_lgbm_rus,
}
metrics = {
    model_name: metric_pack(y_test, probs)
    for model_name, probs in test_probabilities.items()
}

print(json.dumps(metrics, indent=2))

# Persist the fitted pipelines and their metrics under the models directory,
# creating it first if needed.
os.makedirs(MODELS, exist_ok=True)
for artifact_name, fitted_pipeline in (
    ('model_xgb.joblib', xgb_rus),
    ('model_lgbm.joblib', lgbm_rus),
):
    joblib.dump(fitted_pipeline, os.path.join(MODELS, artifact_name))

metrics_path = os.path.join(MODELS, 'metrics_advanced.json')
with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

XGB+RUS Test ROC-AUC: 0.9793384496915475
LGBM+RUS Test ROC-AUC: 0.974350433044637
{
  "xgboost": {
    "precision": 0.7046263345195729,
    "recall": 0.9473684210526315,
    "f1": 0.8081632653061225,
    "roc_auc": 0.9793384496915475,
    "threshold": 0.5
  },
  "lightgbm": {
    "precision": 0.6971830985915493,
    "recall": 0.9473684210526315,
    "f1": 0.8032454361054767,
    "roc_auc": 0.974350433044637,
    "threshold": 0.5
  }
}
