In [1]:
# ================================================
# FRAUD DETECTION: Proactive Modeling Pipeline
# PaySim-like dataset (11 columns, 6.36M rows)
# ================================================

# --------
# 0) Imports
# --------
import os, gc, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_recall_curve,
                             roc_curve, confusion_matrix, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.inspection import permutation_importance

# For VIF (multi-collinearity)
try:
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    import statsmodels.api as sm
    HAVE_SM = True
except Exception:
    HAVE_SM = False

# Optional resampling (applied ONLY on training)
try:
    from imblearn.pipeline import Pipeline as ImbPipeline
    from imblearn.over_sampling import SMOTE
    HAVE_IMB = True
except Exception:
    HAVE_IMB = False



In [2]:

# ----------------------------
# 1) Load data (memory friendly)
# ----------------------------
# Path of the raw transactions CSV.
CSV_PATH = "Fraud.csv"  # <-- change if needed

# Explicit, compact dtype per column so the frame is parsed straight into
# small numeric / categorical types instead of pandas' 64-bit defaults.
dtypes = dict(
    step="int16",
    type="category",
    amount="float32",
    nameOrig="category",
    oldbalanceOrg="float32",
    newbalanceOrig="float32",
    nameDest="category",
    oldbalanceDest="float32",
    newbalanceDest="float32",
    isFraud="int8",
    isFlaggedFraud="int8",
)

# Read only the columns that have a declared dtype.
usecols = [*dtypes]
df = pd.read_csv(CSV_PATH, dtype=dtypes, usecols=usecols)

# Quick sanity peek: first rows and overall shape.
print(df.head())
print(df.shape, "rows, cols")


   step      type        amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.639648  C1231006815       170136.0   160296.359375   
1     1   PAYMENT   1864.280029  C1666544295        21249.0    19384.720703   
2     1  TRANSFER    181.000000  C1305486145          181.0        0.000000   
3     1  CASH_OUT    181.000000   C840083671          181.0        0.000000   
4     1   PAYMENT  11668.139648  C2048537720        41554.0    29885.859375   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
(6362620, 11) rows, cols


In [3]:
# --------------------------------------------
# 2) Basic cleaning: duplicates, NA, consistency
# --------------------------------------------
# Remove rows that are exact copies of another row.
df = df.drop_duplicates()

# Report how many genuine NaNs each column holds.
missing = df.isna().sum()
print("\nMissing values per column:\n", missing)

# Destination names starting with 'M' are treated as merchant accounts.
# Per the original author's note these typically carry 0 balances, so a 0 in a
# destination-balance column is re-labelled as "unknown" (NaN) for those rows only.
# Nothing is imputed here.
is_merchant_dest = df["nameDest"].astype(str).str.startswith("M")
for col in ("oldbalanceDest", "newbalanceDest"):
    merchant_zero = is_merchant_dest & df[col].eq(0)
    df.loc[merchant_zero, col] = np.nan



Missing values per column:
 step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [4]:
# -----------------------
# 3) Feature engineering
# -----------------------
# 3.1 Map `step` onto a 24-unit circle so that steps 24 apart share the same
#     (sin, cos) coordinates.
step_angle = 2 * np.pi * df["step"] / 24.0
df["step_sin"] = np.sin(step_angle)
df["step_cos"] = np.cos(step_angle)

# 3.2 Balance-consistency residuals: how far the before/after balances are from
#     what `amount` alone would explain (0 means the books balance exactly).
orig_residual = df["oldbalanceOrg"] - df["newbalanceOrig"] - df["amount"]
dest_residual = df["newbalanceDest"] - df["oldbalanceDest"] - df["amount"]
df["delta_orig"] = orig_residual.astype("float32")
df["delta_dest"] = dest_residual.astype("float32")

# 3.3 Transaction size relative to the sender's opening balance
#     (+1 in the denominator avoids division by zero).
sender_balance = df["oldbalanceOrg"].abs().add(1.0)
df["ratio_amount_to_oldOrg"] = df["amount"].div(sender_balance).astype("float32")

# 3.4 Merchant-destination indicator (mask computed in the cleaning cell).
df["is_merchant_dest"] = is_merchant_dest.astype("int8")

# 3.5 log1p versions of the monetary columns; negatives are floored at 0 first
#     so the logarithm is always defined.
for col in ("amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"):
    floored = df[col].clip(lower=0)
    df["log1p_" + col] = np.log1p(floored)

# 3.6 Indicator for the two transaction types the hybrid rule sends to the model.
suspicious_types = ["TRANSFER", "CASH_OUT"]
df["is_suspicious_type"] = df["type"].isin(suspicious_types).astype("int8")

In [5]:

# ------------------------
# 4) Handle outliers cleanly
# ------------------------
# Winsorize extremes for the most skewed raw features (keeps training stable).
def winsorize_series(s, low_q=0.001, high_q=0.999):
    """Clip a Series to its own [low_q, high_q] quantile range.

    Parameters
    ----------
    s : pandas.Series
        Values to clip.
    low_q, high_q : float
        Quantile levels (0..1) used as the lower and upper caps.

    Returns
    -------
    pandas.Series
        Copy of ``s`` with values below the ``low_q`` quantile raised to it and
        values above the ``high_q`` quantile lowered to it.
    """
    lower_cap = s.quantile(low_q)
    upper_cap = s.quantile(high_q)
    return s.clip(lower=lower_cap, upper=upper_cap)

# Cap the extremes of the raw monetary columns and the derived residual/ratio features.
# NOTE(review): the quantile caps are computed on the full frame, i.e. before the
# train/test split in the next cell, and the inference helper at the end of the
# notebook does not apply them — consider fitting the caps on training data only
# and persisting them with the model.
winsorized_cols = [
    "amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest",
    "delta_orig", "delta_dest", "ratio_amount_to_oldOrg",
]
for col in winsorized_cols:
    df[col] = winsorize_series(df[col])
    

In [6]:

# ---------------------------------------
# 5) Train/Validation split (STRATIFIED)
# ---------------------------------------
# Target vector.
y = df["isFraud"].astype(int)

# Feature matrix: everything except
#   - the target itself,
#   - the two account-ID columns (leakage/noise),
#   - isFlaggedFraud, which must NOT be a feature (it's a post-rule, not ground truth).
excluded_cols = ["isFraud", "nameOrig", "nameDest", "isFlaggedFraud"]
X = df.drop(columns=excluded_cols)

# 80/20 split, stratified on the label so both parts keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)

print("Class balance (train):", np.bincount(y_train))
print("Class balance (test): ", np.bincount(y_test))


Class balance (train): [5083526    6570]
Class balance (test):  [1270881    1643]


In [7]:

# -----------------------------------------------------------
# 6) Multi-collinearity check (VIF) on numeric-only features
# -----------------------------------------------------------
# Columns eligible for the VIF screen: every float/int-typed feature in X.
numeric_cols = list(X.select_dtypes(include=["float32", "float64", "int16", "int8"]).columns)
# Iteratively drop the numeric feature with the largest VIF until none exceeds the threshold.
def drop_high_vif(df_num, thresh=10.0):
    """Greedy VIF-based pruning of numeric columns.

    On each pass a variance inflation factor is computed for every remaining
    column (against all other remaining columns plus an intercept); the column
    with the largest VIF is removed if that VIF is above ``thresh``. The loop
    stops when no VIF exceeds ``thresh`` or no columns remain.

    Parameters
    ----------
    df_num : pandas.DataFrame
        Numeric features only. NaNs are replaced by 0.0 for the computation;
        ``df_num`` itself is not modified.
    thresh : float, default 10.0
        A column is dropped while the maximum VIF is strictly greater than this.

    Returns
    -------
    list
        Names of the columns that survive, in their original relative order.
        If statsmodels is unavailable (``HAVE_SM`` is False) every column is
        returned unchanged and a message is printed.
    """
    if not HAVE_SM:
        print("statsmodels not found; skipping VIF-based dropping.")
        return df_num.columns.tolist()

    cols = list(df_num.columns)
    while cols:
        # has_constant="add": always prepend the intercept. The default ("skip")
        # silently omits it when a feature is itself constant, which would leave
        # one VIF fewer than there are columns and break the Series below.
        Xn = sm.add_constant(df_num[cols].fillna(0.0), has_constant="add")
        # Materialise the design matrix once per pass instead of once per column.
        design = Xn.values
        vif = pd.Series(
            [variance_inflation_factor(design, i) for i in range(1, design.shape[1])],
            index=cols,
            name="VIF",
        )
        worst_vif = vif.max()
        # `not (x > thresh)` also stops cleanly when every VIF is NaN.
        if not worst_vif > thresh:
            break
        # Only look up the label once we know a finite/inf maximum exists.
        worst = vif.idxmax()
        print(f"Dropping '{worst}' due to high VIF={worst_vif:.2f}")
        cols.remove(worst)
    return cols

# Run the VIF screen on the training rows only.
vif_keep = drop_high_vif(X_train[numeric_cols])

# Final feature set = surviving numeric columns + every non-numeric column.
non_numeric_cols = X.select_dtypes(exclude=["float32", "float64", "int16", "int8"]).columns
# `is_suspicious_type` drives the hybrid rule in the modelling, evaluation and
# inference cells, which index it unconditionally; keep it even if the VIF
# screen would have pruned it, otherwise those cells fail with a KeyError.
required_cols = {"is_suspicious_type"}
cols_keep = sorted(set(vif_keep) | set(non_numeric_cols) | required_cols)
X_train = X_train[cols_keep]
X_test  = X_test[cols_keep]

Dropping 'newbalanceOrig' due to high VIF=19831.59
Dropping 'newbalanceDest' due to high VIF=105.70
Dropping 'delta_orig' due to high VIF=16.23
Dropping 'log1p_newbalanceDest' due to high VIF=10.55


In [8]:
# -----------------------------------------
# 7) Preprocess: OHE for 'type', scale nums
# -----------------------------------------
# Split the retained features by dtype so each group gets its own transformer.
categorical_cols = list(X_train.select_dtypes(include=["category", "object", "bool"]).columns)
numeric_cols     = list(X_train.select_dtypes(include=["int16","int8","float32","float64"]).columns)

# Numeric features: scale to unit variance without centring.
numeric_step = ("num", StandardScaler(with_mean=False), numeric_cols)
# Categorical features: dense one-hot; categories unseen at fit time encode as all zeros.
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    categorical_cols,
)

# Any column not listed above is dropped; output names carry no transformer prefix.
pre = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder="drop",
    verbose_feature_names_out=False,
)



In [None]:
# --------------------------------------------------------
# 8) Model: Hybrid "Rule + ML" (domain + classifier)
# --------------------------------------------------------
# Rule: Non-suspicious types (not TRANSFER/CASH_OUT) => predict 0 fraud.
# ML: the classifier is fitted only on rows with is_suspicious_type == 1.
train_mask = X_train["is_suspicious_type"] == 1
X_train_ml = X_train.loc[train_mask]
y_train_ml = y_train.loc[train_mask]

# Base RandomForest; these values are the starting point and are partly
# overridden by the search space below.
rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
    class_weight="balanced_subsample"
)

# Pipeline with preprocessing. When imbalanced-learn is available, SMOTE is a
# pipeline step so oversampling happens only on the data passed to fit().
if HAVE_IMB:
    model = ImbPipeline(steps=[
        ("pre", pre),
        ("smote", SMOTE(random_state=42, sampling_strategy=0.2)),
        ("rf", rf),
    ])
else:
    model = Pipeline(steps=[
        ("pre", pre),
        ("rf", rf),
    ])

# Hyperparameter space (keys address the "rf" step of the pipeline)
param_dist = {
    "rf__n_estimators": [300, 400, 600],
    "rf__max_depth": [None, 12, 18, 24],
    "rf__min_samples_leaf": [1, 2, 4],
    "rf__max_features": ["sqrt", "log2", None],
}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Stratified 200k-row sample of the suspicious training rows, used for tuning only
X_tune, _, y_tune, _ = train_test_split(
    X_train_ml, y_train_ml, train_size=200_000, stratify=y_train_ml, random_state=42
)

# Tune on subsample.
# refit=False: only the winning parameter set is needed here. The default
# (refit=True) would fit the winner once more on the subsample, and that fit
# would be thrown away by the full-data fit below.
# NOTE(review): both the forest and the search request n_jobs=-1; if the machine
# is oversubscribed, consider limiting one of them — confirm on target hardware.
print("Running hyperparameter tuning on 200k subsample...")
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=5,   # lightweight search
    scoring="average_precision",
    cv=cv,
    n_jobs=-1,
    verbose=1,
    random_state=42,
    refit=False,
)
search.fit(X_tune, y_tune)

print("Best params from search:", search.best_params_)

# Train the final model with the best params on the full suspicious dataset.
# The search works on clones, so `model` is still unfitted; set_params returns
# the same pipeline object, now configured with the winning values.
best_model = model.set_params(**search.best_params_)
best_model.fit(X_train_ml, y_train_ml)
print("Final model retrained on full suspicious subset.")


Running hyperparameter tuning on 200k subsample...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


KeyboardInterrupt: 

In [None]:

# ---------------------------------
# 9) Evaluate on the full test set
# ---------------------------------
# Produce probabilities with the hybrid rule:
#   if is_suspicious_type==0 -> prob=0
#   else -> model.predict_proba
X_test_prepped = X_test.copy()
proba = np.zeros(len(X_test_prepped), dtype=float)

sus_mask_test = X_test_prepped["is_suspicious_type"] == 1
if sus_mask_test.any():
    # Column 1 of predict_proba is the positive (fraud) class.
    proba[sus_mask_test.values] = best_model.predict_proba(X_test_prepped.loc[sus_mask_test])[:, 1]

roc = roc_auc_score(y_test, proba)
pr_auc = average_precision_score(y_test, proba)
print(f"\nROC-AUC: {roc:.4f} | PR-AUC: {pr_auc:.4f}")

# Choose the decision threshold that maximizes F1 on the PR curve.
# precision_recall_curve returns one more precision/recall pair than thresholds:
# prec[i], rec[i] are the scores obtained with `proba >= thr[i]`, and the final
# pair (precision=1, recall=0) has no threshold. Drop that last pair so the
# argmax index maps directly onto `thr`.
# NOTE(review): the threshold is selected on the same test labels that are then
# used for the confusion matrix / report below, so those numbers are optimistic;
# consider choosing it on a separate validation split.
prec, rec, thr = precision_recall_curve(y_test, proba)
if len(thr):
    f1 = 2 * (prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    best_idx = int(np.nanargmax(f1))
    best_thr = float(thr[best_idx])
else:
    # No candidate thresholds at all: fall back to the conventional 0.5.
    best_thr = 0.5
print(f"Best threshold by F1: {best_thr:.4f}")

y_pred = (proba >= best_thr).astype(int)

print("\nConfusion matrix (test):\n", confusion_matrix(y_test, y_pred))
print("\nClassification report (test):\n", classification_report(y_test, y_pred, digits=4))

In [None]:

# ---------------------------------------------------
# 10) Feature importance (tree) + permutation import.
# ---------------------------------------------------
# Get feature names after preprocessing
def get_feature_names(preprocessor, numeric_cols, categorical_cols):
    """Return the output feature names of the fitted preprocessor.

    The numeric names are passed through unchanged and are followed by the
    names the fitted "cat" transformer reports for ``categorical_cols``.

    Parameters
    ----------
    preprocessor : fitted transformer exposing ``named_transformers_["cat"]``
    numeric_cols : iterable of str
    categorical_cols : iterable of str

    Returns
    -------
    list
        Numeric names first, then the encoder's expanded categorical names.
    """
    encoder = preprocessor.named_transformers_["cat"]
    expanded = encoder.get_feature_names_out(categorical_cols)
    return [*numeric_cols, *expanded]

# Use the best model's inner RandomForest
rf_best = best_model.named_steps["rf"]
feat_names = get_feature_names(best_model.named_steps["pre"], numeric_cols, categorical_cols)

# Tree-based importances: one value per *transformed* feature (after one-hot
# encoding), so they are labelled with the preprocessor's output names.
tree_imp = pd.Series(rf_best.feature_importances_, index=feat_names).sort_values(ascending=False)
print("\nTop 15 features (Gini Importance):\n", tree_imp.head(15))

# Permutation importance on a manageable sample of suspicious-type test rows
sample_idx = np.random.RandomState(42).choice(np.where(sus_mask_test)[0], size=min(50000, sus_mask_test.sum()), replace=False)
X_perm = X_test_prepped.iloc[sample_idx]
perm = permutation_importance(best_model, X_perm, y_test.iloc[sample_idx],
                              scoring="average_precision", n_repeats=3, random_state=42, n_jobs=-1)
# The whole pipeline is the estimator here, so columns of the *raw* input frame
# are what gets shuffled: there is one importance per input column, in the
# frame's column order. Label with those columns, not the post-encoding names
# (which differ in both length and order).
perm_imp = pd.Series(perm.importances_mean, index=X_perm.columns).sort_values(ascending=False)
print("\nTop 15 features (Permutation Importance):\n", perm_imp.head(15))


In [None]:
# -----------------------------------------
# 11) Save model + threshold for deployment
# -----------------------------------------
import joblib
# Output folder for deployable artifacts (created if it does not exist yet).
ARTIFACT_DIR = "artifacts"
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# One bundle holds everything the inference helper needs: the fitted pipeline,
# the chosen decision threshold and the exact feature columns it was trained on.
artifact_bundle = {
    "model": best_model,
    "threshold": float(best_thr),
    "cols_keep": cols_keep,
}
artifact_file = os.path.join(ARTIFACT_DIR, "fraud_hybrid_rf.joblib")
joblib.dump(artifact_bundle, artifact_file)
print("\nSaved model to artifacts/fraud_hybrid_rf.joblib")

In [None]:
# ----------------------
# 12) Inference helper
# ----------------------
def predict_fraud_proba(df_new, bundle_path=os.path.join(ARTIFACT_DIR, "fraud_hybrid_rf.joblib")):
    """Return fraud probability using the saved hybrid model bundle.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Raw transactions. Must contain the columns ``step``, ``type``,
        ``amount``, ``nameDest``, ``oldbalanceOrg``, ``newbalanceOrig``,
        ``oldbalanceDest`` and ``newbalanceDest``. It is not modified.
    bundle_path : str
        Path of the joblib bundle with keys ``"model"`` and ``"cols_keep"``
        (the bundle also stores ``"threshold"``; it is not applied here, so
        callers that need hard labels compare the result against it themselves).

    Returns
    -------
    numpy.ndarray
        One probability per input row, in input order. Rows whose ``type`` is
        not TRANSFER/CASH_OUT get 0.0 by rule; all others get the model's
        positive-class probability.

    Raises
    ------
    KeyError
        If ``df_new`` lacks any of the required raw columns.
    """
    # SECURITY: joblib.load unpickles arbitrary objects — only load bundles
    # from a trusted location.
    bundle = joblib.load(bundle_path)
    model, cols_keep = bundle["model"], bundle["cols_keep"]

    # Fail early with an explicit message instead of a bare KeyError from deep
    # inside the feature code.
    required_cols = ["step", "type", "amount", "nameDest", "oldbalanceOrg",
                     "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]
    missing_cols = [c for c in required_cols if c not in df_new.columns]
    if missing_cols:
        raise KeyError(f"df_new is missing required columns: {missing_cols}")

    # Feature engineering must mirror training (wrap as needed for prod).
    # NOTE(review): the training cells additionally set merchant zero balances
    # to NaN and winsorize several columns; neither step is reproduced here —
    # confirm whether that difference matters for the deployed model.
    out = df_new.copy()
    out["step_sin"] = np.sin(2 * np.pi * out["step"] / 24.0)
    out["step_cos"] = np.cos(2 * np.pi * out["step"] / 24.0)
    out["delta_orig"] = (out["oldbalanceOrg"] - out["newbalanceOrig"] - out["amount"])
    out["delta_dest"] = (out["newbalanceDest"] - out["oldbalanceDest"] - out["amount"])
    out["ratio_amount_to_oldOrg"] = (out["amount"] / (out["oldbalanceOrg"].abs() + 1.0))
    out["is_merchant_dest"] = out["nameDest"].astype(str).str.startswith("M").astype(int)
    for col in ["amount", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]:
        out[f"log1p_{col}"] = np.log1p(out[col].clip(lower=0))
    out["is_suspicious_type"] = out["type"].isin(["TRANSFER", "CASH_OUT"]).astype(int)

    # Keep exactly the columns the model was trained on.
    out = out[cols_keep].copy()

    # Hybrid rule: probability stays 0 unless the row is a suspicious type.
    proba = np.zeros(len(out), dtype=float)
    sus_mask = out["is_suspicious_type"] == 1
    if sus_mask.any():
        proba[sus_mask.values] = model.predict_proba(out.loc[sus_mask])[:, 1]
    return proba

print("\nPipeline complete.")