In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import gc
import json

In [2]:
df_final = pd.read_parquet("../data/processed/train_identity_final.parquet")

In [3]:
df_final.head()

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,TransactionAmt_log,isFraud
0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4.241327,0
1,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,3.401197,0
2,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,4.094345,0
3,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,3.931826,0
4,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,...,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,3.931826,0


In [4]:
for col in df_final.select_dtypes(include=['float64']).columns:
    df_final[col] = df_final[col].astype('float32')
gc.collect()

20

In [5]:
y_identity = df_final['isFraud'].values
df_final.drop(columns=['isFraud'], inplace=True)

In [6]:
X_identity = df_final

In [7]:
#Identify all string/object columns
object_cols = X_identity.select_dtypes(include=['object']).columns.tolist()

#Convert them to 'category' dtype
# This is memory-efficient and required by LightGBM
for col in object_cols:
    X_identity[col] = X_identity[col].astype('category')

cat_features = X_identity.select_dtypes(include=['category']).columns.tolist()
print(f"Converted {len(object_cols)} columns to category dtype.")
print(f"Total Categorical Features: {len(cat_features)}")
gc.collect()

Converted 31 columns to category dtype.
Total Categorical Features: 31


0

In [8]:
# --- ROBUSTNESS INJECTION: Interaction Features ---
# These features link Identity to Monetary Value

# Device Context: How many standard deviations is this amount from the device's mean?
X_identity['Device_Avg_Amt'] = X_identity.groupby('DeviceInfo', observed=False)['TransactionAmt'].transform('mean')
X_identity['Device_Amt_Ratio'] = X_identity['TransactionAmt'] / (X_identity['Device_Avg_Amt'] + 1)

# Browser Context: Is this transaction amount higher than any ever seen on this browser?
X_identity['Browser_Max_Amt'] = X_identity.groupby('id_31', observed=False)['TransactionAmt'].transform('max')

# Email Context: Does the email provider typically handle transactions of this size?
X_identity['Email_P_Ratio'] = X_identity['TransactionAmt'] / (X_identity.groupby('P_emaildomain', observed=False)['TransactionAmt'].transform('mean') + 1)

# Filling NaNs
new_cols = ['Device_Avg_Amt', 'Device_Amt_Ratio', 'Browser_Max_Amt', 'Email_P_Ratio']
X_identity[new_cols] = X_identity[new_cols].fillna(0)

# EXPORT REFERENCE STATS FOR API 
stats = {
    "device_avg": X_identity.groupby('DeviceInfo')['TransactionAmt'].mean().to_dict(),
    "browser_max": X_identity.groupby('id_31')['TransactionAmt'].max().to_dict(),
    "email_avg": X_identity.groupby('P_emaildomain')['TransactionAmt'].mean().to_dict(),
    "global_mean": float(X_identity['TransactionAmt'].mean())}

with open("../artifacts/schema/reference_stats.json", "w") as f:
    json.dump(stats, f)

# dropping the raw TransactionAmt to prevent redundancy with TransactionAmt_log
if 'TransactionAmt_log' in X_identity.columns:
    X_identity = X_identity.drop(columns=['TransactionAmt'])

print(f"✅ Robustness features injected. Current Features: {X_identity.shape[1]}")

  X_identity['Device_Avg_Amt'] = X_identity.groupby('DeviceInfo', observed=False)['TransactionAmt'].transform('mean')
  X_identity['Device_Amt_Ratio'] = X_identity['TransactionAmt'] / (X_identity['Device_Avg_Amt'] + 1)
  X_identity['Browser_Max_Amt'] = X_identity.groupby('id_31', observed=False)['TransactionAmt'].transform('max')
  X_identity['Email_P_Ratio'] = X_identity['TransactionAmt'] / (X_identity.groupby('P_emaildomain', observed=False)['TransactionAmt'].transform('mean') + 1)
  "device_avg": X_identity.groupby('DeviceInfo')['TransactionAmt'].mean().to_dict(),
  "browser_max": X_identity.groupby('id_31')['TransactionAmt'].max().to_dict(),
  "email_avg": X_identity.groupby('P_emaildomain')['TransactionAmt'].mean().to_dict(),


✅ Robustness features injected. Current Features: 436


In [9]:
params_identity = {
    'objective': 'binary',
    'metric': 'auc',
    'is_unbalance': False, 
    'scale_pos_weight': 5, # High priority on catching fraud(False Negavtives)
    'learning_rate': 0.01, # Small lr to find complex patterns
    'num_leaves': 63,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'seed': 10 }

In [10]:
from sklearn.model_selection import train_test_split

#index-based Splitting
indices = np.arange(len(y_identity))

# 2. Split indices
train_idx, val_idx = train_test_split(
    indices,
    test_size=0.2,
    stratify=y_identity,
    random_state=10)

In [11]:
# 3. Creating LightGBM datasets using the sliced indices
# We use X_identity (the downcasted DataFrame) and y_identity (numpy array)
train_set_identity = lgb.Dataset(
    X_identity.iloc[train_idx],
    label=y_identity[train_idx],
    categorical_feature=cat_features,
    free_raw_data=False)

val_set_identity = lgb.Dataset(
    X_identity.iloc[val_idx],
    label=y_identity[val_idx],
    reference=train_set_identity,
    categorical_feature=cat_features,
    free_raw_data=False)

print(f"Datasets created. Train size: {len(train_idx)}, Val size: {len(val_idx)}")

Datasets created. Train size: 472432, Val size: 118108


In [12]:
from pathlib import Path

schema_dir = Path("../artifacts/schema")
schema_dir.mkdir(parents=True, exist_ok=True)

# Capture final feature list and categorical types
identity_schema = {
    "features": X_identity.columns.tolist(),
    "categorical_features": cat_features,
    "category_vocabulary": {}}

# Capture the EXACT string-to-integer mapping for every categorical column
for col in cat_features:
    if col in X_identity.columns:
        if hasattr(X_identity[col], 'cat'):
            identity_schema["category_vocabulary"][col] = X_identity[col].cat.categories.tolist()

with open(schema_dir / "schema_identity_final.json", "w") as f:
    json.dump(identity_schema, f, indent=2)
    
print(f"- Total Features: {len(identity_schema['features'])}")
print(f"- Categorical Columns: {len(identity_schema['category_vocabulary'])}")

- Total Features: 436
- Categorical Columns: 31


In [13]:
# Training the Identity-Enhanced Model
model_identity = lgb.train(
    params_identity,
    train_set_identity,
    valid_sets=[train_set_identity, val_set_identity],
    num_boost_round=1000,
    callbacks=[lgb.early_stopping(stopping_rounds=100),
               lgb.log_evaluation(period=50)])

Training until validation scores don't improve for 100 rounds
[50]	training's auc: 0.907484	valid_1's auc: 0.893465
[100]	training's auc: 0.920464	valid_1's auc: 0.902858
[150]	training's auc: 0.928912	valid_1's auc: 0.909952
[200]	training's auc: 0.934985	valid_1's auc: 0.915204
[250]	training's auc: 0.940256	valid_1's auc: 0.919938
[300]	training's auc: 0.944595	valid_1's auc: 0.923852
[350]	training's auc: 0.948447	valid_1's auc: 0.927241
[400]	training's auc: 0.951848	valid_1's auc: 0.930235
[450]	training's auc: 0.954968	valid_1's auc: 0.932991
[500]	training's auc: 0.957793	valid_1's auc: 0.935258
[550]	training's auc: 0.960209	valid_1's auc: 0.937207
[600]	training's auc: 0.962566	valid_1's auc: 0.939132
[650]	training's auc: 0.964694	valid_1's auc: 0.94063
[700]	training's auc: 0.966606	valid_1's auc: 0.942124
[750]	training's auc: 0.96836	valid_1's auc: 0.943478
[800]	training's auc: 0.969836	valid_1's auc: 0.944418
[850]	training's auc: 0.971358	valid_1's auc: 0.945533
[900]	

In [14]:
X_val_final = X_identity.iloc[val_idx]
y_val_final = y_identity[val_idx]

In [15]:
y_val_pred_identity = model_identity.predict(X_identity.iloc[val_idx],
                                             num_iteration=model_identity.best_iteration)

In [16]:
from sklearn.metrics import roc_curve, auc

fpr_identity, tpr_identity, thresholds_identity = roc_curve(y_identity[val_idx], y_val_pred_identity)
auc_identity = auc(fpr_identity, tpr_identity)
auc_identity

0.9481773708975849

In [17]:
def recall_at_fpr(fpr, tpr, target_fpr):
    idx = (fpr <= target_fpr).nonzero()[0]
    if len(idx) == 0:
        return 0.0
    return tpr[idx[-1]]

recall_1pct_identity = recall_at_fpr(fpr_identity, tpr_identity, 0.01)
recall_05pct_identity = recall_at_fpr(fpr_identity, tpr_identity, 0.005)

recall_1pct_identity,recall_05pct_identity

(np.float64(0.6583595451246068), np.float64(0.597870796031938))

In [18]:
#Optimal Threshold at 1% FPR 
valid_idx = fpr_identity <= 0.01
best_idx_val = valid_idx.nonzero()[0][-1]
optimal_threshold_identity = thresholds_identity[best_idx_val]

optimal_threshold_identity

np.float64(0.474017541816528)

In [19]:
#KS Statistic
def compute_ks(y_true, y_pred):
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    ks = np.max(tpr - fpr)
    return ks

ks_identity = compute_ks(y_identity[val_idx], y_val_pred_identity)
ks_identity

np.float64(0.7625740423950371)

In [20]:
# Display Results
print(f"--- Identity Model Performance ---")
print(f"AUC: {auc_identity:.5f}")
print(f"Recall @ 1% FPR: {recall_1pct_identity:.5f}")
print(f"Recall @ 0.5% FPR: {recall_05pct_identity:.5f}")
print(f"KS Statistic: {ks_identity:.5f}")
print(f"Optimal Threshold (1% FPR): {optimal_threshold_identity:.5f}")

--- Identity Model Performance ---
AUC: 0.94818
Recall @ 1% FPR: 0.65836
Recall @ 0.5% FPR: 0.59787
KS Statistic: 0.76257
Optimal Threshold (1% FPR): 0.47402


In [21]:
# feature Importance extraction
identity_importance = pd.DataFrame({
    'feature': X_identity.columns,
    'gain': model_identity.feature_importance(importance_type='gain')
}).sort_values('gain', ascending=False)

identity_importance["gain_pct"] = identity_importance["gain"] / identity_importance["gain"].sum()
identity_importance["cum_gain_pct"] = identity_importance["gain_pct"].cumsum()

identity_importance.to_csv("../artifacts/identity_feature_importance.csv", index=False)

print(identity_importance.head(15))

           feature           gain  gain_pct  cum_gain_pct
309           V258  892937.971601  0.071859      0.071859
13   R_emaildomain  703966.562832  0.056652      0.128511
27             C14  570065.707745  0.045876      0.174387
308           V257  500557.909637  0.040282      0.214669
14              C1  495626.554499  0.039885      0.254554
345           V294  463772.068991  0.037322      0.291876
430     DeviceInfo  441699.345697  0.035546      0.327422
26             C13  371737.685940  0.029916      0.357337
12   P_emaildomain  356511.057962  0.028690      0.386028
29              D2  318984.535904  0.025670      0.411698
421          id_31  288734.934849  0.023236      0.434934
2            card1  277120.207710  0.022301      0.457235
3            card2  275542.934505  0.022174      0.479409
359           V308  252590.360399  0.020327      0.499736
252           V201  219549.450354  0.017668      0.517405


* id_31 (Browser/Software version) and DeviceInfo (Hardware model) immediately jumped into the Top 10 features by gain. This proves that the digital "fingerprint" is as important as the transaction itself for identifying modern fraud.

In [22]:
identity_metadata = {
    "model_name": "lightgbm_identity_final",
    "num_features": X_identity.shape[1],
    "auc": auc_identity,
    "ks": ks_identity,
    "recall_at_1pct_fpr": recall_1pct_identity,
    "recall_at_05pct_fpr": recall_05pct_identity,
    "optimal_threshold": float(optimal_threshold_identity),
    "fraud_rate": float(y_identity.mean()),
    "training_rows": int(len(y_identity)),
    "notes": "Final Identity-Enhanced model including Transaction + Identity data and Log Transform"
}

identity_metadata

{'model_name': 'lightgbm_identity_final',
 'num_features': 436,
 'auc': 0.9481773708975849,
 'ks': np.float64(0.7625740423950371),
 'recall_at_1pct_fpr': np.float64(0.6583595451246068),
 'recall_at_05pct_fpr': np.float64(0.597870796031938),
 'optimal_threshold': 0.474017541816528,
 'fraud_rate': 0.03499000914417313,
 'training_rows': 590540,
 'notes': 'Final Identity-Enhanced model including Transaction + Identity data and Log Transform'}

In [23]:
with open("../artifacts/identity_final_metadata.json", "w") as f:
    json.dump(identity_metadata, f, indent=4)

# 3. Save the LightGBM Model
model_identity.save_model("../artifacts/lightgbm_identity_final.json")

<lightgbm.basic.Booster at 0x1b1bd2df380>