In [None]:
import numpy as np
import pandas as pd
import random
from ctgan import CTGAN
import shap
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, precision_recall_curve
import seaborn as sns

#==================================Part 1==============================================================
# Load sample dataset and train CTGAN

# Seed the Python and NumPy global RNGs so every later draw from them is
# repeatable under "Restart Kernel -> Run All".
# NOTE(review): CTGAN presumably also draws from torch's RNG, which is not
# seeded here -- confirm if bit-for-bit reproducible synthesis is required.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

seed_df = pd.read_csv('sample_with_3_percent_fraud.csv')
print("Sample dataset shape:", seed_df.shape)
print("Sample fraud rate:", seed_df['isFraud'].mean())

# Train CTGAN
cat_cols = ['type','nameOrig','nameDest']
# Binary 0/1 flags have to be declared discrete as well: any column that is not
# listed in `discrete_columns` is modelled by CTGAN as a continuous variable,
# which would turn the fraud label into a real-valued quantity instead of a class.
binary_cols = [col for col in ('isFraud', 'isFlaggedFraud') if col in seed_df.columns]
discrete_cols = cat_cols + binary_cols
ctgan = CTGAN(epochs=150, batch_size=1000)
ctgan.fit(seed_df, discrete_columns=discrete_cols)

# Generate 100,000 synthetic transactions
syn = ctgan.sample(100000)
print("\nSynthetic dataset shape:", syn.shape)
print("Synthetic fraud rate:", syn['isFraud'].mean())

In [None]:
#==================================Part 2==============================================================
from geopy.distance import geodesic

# Assign one random lat/lon per account.
# - A dedicated, seeded generator makes the coordinates identical on every run
#   without depending on the state of the global `random` module.
# - Coordinates are drawn once per account over the union of origin and
#   destination names, so an account that appears in both roles keeps the same
#   location in both (previously it received two unrelated locations).
_loc_rng = random.Random(42)
_all_accounts = pd.unique(pd.concat([syn['nameOrig'], syn['nameDest']], ignore_index=True))
account_loc = {acc: (_loc_rng.uniform(-90,90), _loc_rng.uniform(-180,180))
               for acc in _all_accounts}

# `loc_o` / `loc_d` keep their original names and key sets.
loc_o = {acc: account_loc[acc] for acc in syn['nameOrig'].unique()}
loc_d = {acc: account_loc[acc] for acc in syn['nameDest'].unique()}
syn['locOrig'] = syn['nameOrig'].map(loc_o)
syn['locDest'] = syn['nameDest'].map(loc_d)


#==================================Part 3==============================================================
def apply_rules(df):
    """Flag every transaction against seven heuristic fraud rules.

    The frame is sorted IN PLACE by ``nameOrig`` then ``step`` so that each
    account's transactions are visited in step order; the per-account history
    consulted by rules 2, 5 and 7 therefore only contains rows visited earlier.

    Rules (1 = triggered, 0 = not triggered):
      rule1: ``amount`` > 50000.
      rule2: the account already has >= 5 earlier transactions whose ``step``
             is at most 1 below the current one.
      rule3: ``step % 24`` < 5.
      rule4: CASH_OUT with ``amount`` < 10, or TRANSFER with ``amount`` > 80000.
      rule5: ``nameDest`` has not been used by this account before (so an
             account's first transaction always triggers it).
      rule6: geodesic distance between ``locOrig`` and ``locDest`` > 500 km.
      rule7: once the account has >= 10 earlier amounts of the same ``type``,
             the amount differs from their mean by more than 3 population
             standard deviations.

    Flags are accumulated by row position and written back as whole columns,
    rather than through ``df.at[label, ...]``; writing through labels would set
    the flag on every row sharing a non-unique index label.

    Args:
        df: DataFrame with columns ``nameOrig``, ``nameDest``, ``step``,
            ``amount``, ``type``, ``locOrig`` and ``locDest``. Modified in place.

    Returns:
        The same ``df`` object, sorted, with ``rule1``..``rule7`` added and
        ``isFlaggedFraud`` set to the row-wise maximum of the seven rule flags.
    """
    rule_cols = [f'rule{i}' for i in range(1, 8)]
    df.sort_values(['nameOrig','step'], inplace=True)

    # One row per transaction (in sorted order), one column per rule.
    flags = np.zeros((len(df), len(rule_cols)), dtype=np.int64)
    history = {}

    rows = zip(df['nameOrig'], df['amount'], df['type'], df['nameDest'],
               df['step'], df['locOrig'], df['locDest'])
    for pos, (acct, amt, tp, dest, ts, loc1, loc2) in enumerate(rows):
        h = history.get(acct)
        if h is None:
            h = history[acct] = {'times': [], 'dests': set(), 'type_amts': {}}

        if amt > 50000:
            flags[pos, 0] = 1

        # Velocity: count earlier transactions of this account within one step.
        if sum(1 for t in h['times'] if ts - t <= 1) >= 5:
            flags[pos, 1] = 1

        if (ts % 24) < 5:
            flags[pos, 2] = 1

        if (tp == 'CASH_OUT' and amt < 10) or (tp == 'TRANSFER' and amt > 80000):
            flags[pos, 3] = 1

        if dest not in h['dests']:
            flags[pos, 4] = 1

        if geodesic(loc1, loc2).km > 500:
            flags[pos, 5] = 1

        # Amount outlier relative to this account's earlier amounts of the same type.
        prior_amts = h['type_amts'].setdefault(tp, [])
        if len(prior_amts) >= 10:
            mu, sd = np.mean(prior_amts), np.std(prior_amts)
            if abs(amt - mu) > 3 * sd:
                flags[pos, 6] = 1

        # Update the history only after all rules have been evaluated, so the
        # current transaction never counts as its own history.
        h['times'].append(ts)
        h['dests'].add(dest)
        prior_amts.append(amt)

    for col_pos, col in enumerate(rule_cols):
        df[col] = flags[:, col_pos]

    # Use consistent column name (was isFlaggedFraud_rules)
    df['isFlaggedFraud'] = df[rule_cols].max(axis=1)
    return df

# Run the rule engine over the synthetic transactions, then preview the seven
# individual rule flags together with the combined flag.
syn = apply_rules(syn)
flag_cols = [f'rule{i}' for i in range(1, 8)] + ['isFlaggedFraud']
print(syn[flag_cols].head())

In [None]:
#==================================Part 4==============================================================
# Data Preprocessing

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import joblib

# Drop the helper columns (coordinates and the individual rule flags); the
# combined `isFlaggedFraud` column stays in the frame.
helper_cols = ['locOrig', 'locDest', *(f'rule{i}' for i in range(1, 8))]
model_df = syn.drop(columns=helper_cols)

# Integer-encode the categorical columns, keeping every fitted encoder so the
# same mapping can be reloaded from disk later.
label_encoders = {}
for col in ('type', 'nameOrig', 'nameDest'):
    le = LabelEncoder().fit(model_df[col])
    model_df[col] = le.transform(model_df[col])
    label_encoders[col] = le
joblib.dump(label_encoders, 'label_encoders.pkl')
print("Label encoders saved")

# Target is `isFraud`; every remaining column (including `isFlaggedFraud`) is a feature.
y = model_df['isFraud']
X = model_df.drop(columns=['isFraud'])
print("Features:", X.columns.tolist())

# Stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# The scaler learns its min/max from the training rows only and is then
# applied unchanged to the test rows.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, 'scaler.pkl')
print("Scaler saved")

# Oversample the training split only; the test split is left untouched.
# NOTE(review): SMOTE interpolates the integer-encoded categorical columns as
# if they were numeric -- confirm this is intended.
sm = SMOTE(random_state=42)
X_tr_bal, y_tr_bal = sm.fit_resample(X_train_scaled, y_train)
y_tr_bal = y_tr_bal.astype(int)
print("Post-SMOTE fraud rate:", y_tr_bal.mean())

In [None]:
#==================================Part 5==============================================================
# Train Models

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.ensemble import RandomForestClassifier

# Seed TensorFlow (this call also seeds Python's `random` and NumPy) so the
# autoencoder's weight initialisation and batch shuffling are repeatable, in
# line with the `random_state=42` used for the split, SMOTE and the forest.
tf.keras.utils.set_random_seed(42)

# Autoencoder: trained on the non-fraud rows only, so that a high
# reconstruction error can later be used as an anomaly signal.
X_norm = X_tr_bal[np.asarray(y_tr_bal) == 0]
inp_dim = X_norm.shape[1]
# Guard the layer widths: a Dense layer needs at least one unit, and
# `inp_dim // 4` would be 0 for fewer than four features.
hidden_dim = max(1, inp_dim // 2)
bottleneck_dim = max(1, inp_dim // 4)

inp = layers.Input(shape=(inp_dim,))
e = layers.Dense(hidden_dim, activation='relu')(inp)
e = layers.Dense(bottleneck_dim, activation='relu')(e)
d = layers.Dense(hidden_dim, activation='relu')(e)
# Sigmoid output matches the [0, 1] range of the min-max scaled training features.
out = layers.Dense(inp_dim, activation='sigmoid')(d)

ae = models.Model(inp, out)
ae.compile(optimizer='adam', loss='mse')
ae.fit(X_norm, X_norm, epochs=30, batch_size=256, validation_split=0.1, verbose=1)

# Random Forest: fitted on a DataFrame so the model records the feature names.
X_train_df = pd.DataFrame(X_tr_bal, columns=X.columns)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_df, y_tr_bal)
print("Models trained successfully")

In [None]:
#==================================Part 6==============================================================
# Ensemble Optimization
#
# NOTE(review): alpha and the threshold are selected on (X_test_scaled, y_test),
# the same data the evaluation cell scores afterwards, so the reported metrics
# are optimistic. A separate validation split would remove that bias.

# `rf` was fitted on a DataFrame with named columns, so score it on one too;
# passing the bare array triggers scikit-learn's feature-name mismatch warning.
X_test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
p_rf = rf.predict_proba(X_test_df)[:,1]

# Autoencoder anomaly score: per-row mean squared reconstruction error,
# min-max scaled to [0, 1] so it can be blended with the forest probability.
recon = ae.predict(X_test_scaled)
err = np.mean((recon - X_test_scaled)**2, axis=1)
err_span = err.max() - err.min()
# A constant error vector would otherwise divide by zero and fill p_ae with NaN.
p_ae = (err - err.min()) / err_span if err_span > 0 else np.zeros_like(err)

# Grid-search the blend weight; for each alpha take the threshold with the best F1.
# Starting below any attainable F1 guarantees `best` always ends up with
# 'alpha' and 'threshold' keys, even when every candidate scores F1 == 0.
best = {'alpha': None, 'threshold': None, 'f1': -1.0}
for alpha in np.linspace(0,1,11):
    score = alpha*p_rf + (1-alpha)*p_ae
    prec, recs, th = precision_recall_curve(y_test, score)
    # The final (precision=1, recall=0) point has no matching threshold, so it
    # is excluded to keep the F1 array aligned one-to-one with `th`.
    prec, recs = prec[:-1], recs[:-1]
    f1s = 2*prec*recs/(prec+recs+1e-8)
    idx = int(np.nanargmax(f1s))
    if f1s[idx] > best['f1']:
        best = {'alpha':alpha, 'threshold':th[idx], 'f1':f1s[idx]}

print(f"Best alpha: {best['alpha']}")
print(f"Best threshold: {best['threshold']}")
print(f"Best F1: {best['f1']}")

In [None]:
#==================================Part 7==============================================================
# Save Models (save the actual trained models)

import joblib

# The ensemble threshold only makes sense together with the min/max that were
# used to scale the autoencoder reconstruction error into p_ae, so those two
# constants are stored next to alpha and the threshold. The original
# 'best_alpha' / 'best_thresh' keys are unchanged.
joblib.dump(
    {
        'best_alpha': best['alpha'],
        'best_thresh': best['threshold'],
        'ae_err_min': float(err.min()),
        'ae_err_max': float(err.max()),
    },
    'thresholds.pkl',
)
joblib.dump(rf, 'rf_model.pkl')  # the fitted random forest
ae.save('ae_model.keras')

print("All models saved successfully!")

In [None]:
#==================================Part 8==============================================================
# Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Rebuild the blended score with the selected weight and binarise it at the
# selected threshold.
alpha = best['alpha']
thr = best['threshold']
final_score = alpha*p_rf + (1-alpha)*p_ae
y_pred = (final_score >= thr).astype(int)

banner = "=" * 50
print(banner)
print("MODEL EVALUATION RESULTS")
print(banner)

# (label, metric function, what it is computed on). The threshold-based metrics
# use the hard predictions; AUC-ROC uses the continuous score.
metric_table = (
    ("Accuracy:  ", accuracy_score, y_pred),
    ("Precision: ", precision_score, y_pred),
    ("Recall:    ", recall_score, y_pred),
    ("F1-score:  ", f1_score, y_pred),
    ("AUC-ROC:   ", roc_auc_score, final_score),
)
for label, metric_fn, estimate in metric_table:
    print(f"{label}{metric_fn(y_test, estimate):.4f}")
print(banner)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
#==================================Part 9==============================================================
# Visualizations

# One row of three panels: confusion matrix, ROC curve, precision-recall curve.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
ax_cm, ax_roc, ax_pr = axes

# Confusion Matrix (counts of actual vs. predicted class)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax_cm)
ax_cm.set_title("Confusion Matrix")
ax_cm.set_xlabel("Predicted")
ax_cm.set_ylabel("Actual")

# ROC Curve, with the chance diagonal drawn for reference
fpr, tpr, _ = roc_curve(y_test, final_score)
auc_value = roc_auc_score(y_test, final_score)
ax_roc.plot(fpr, tpr, label=f"AUC = {auc_value:.2f}")
ax_roc.plot([0, 1], [0, 1], 'k--')
ax_roc.set_title("ROC Curve")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.legend()
ax_roc.grid(True)

# Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_test, final_score)
ax_pr.plot(recall, precision)
ax_pr.set_title("Precision-Recall Curve")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.grid(True)

plt.tight_layout()
plt.show()

In [None]:
#==================================Part 10==============================================================
# SHAP Analysis

# Explain the forest on the same representation it was trained on: the
# min-max scaled features. Feeding it the unscaled `X_test` would attribute
# predictions made on inputs far outside the [0, 1] range seen during training.
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X.columns)
X_sample = X_test_scaled_df.sample(n=min(200, len(X_test_scaled_df)), random_state=42)

explainer = shap.TreeExplainer(rf)
shap_vals = explainer.shap_values(X_sample)

print("X_sample shape:", X_sample.shape)

# Select the attributions for the fraud class (label 1) rather than class 0,
# so a positive SHAP value means "pushes towards fraud".
fraud_pos = list(rf.classes_).index(1)
if isinstance(shap_vals, list):
    # Some SHAP versions return one (n_samples, n_features) array per class.
    shap_vals_reduced = shap_vals[fraud_pos]
else:
    shap_vals = np.asarray(shap_vals)
    # Other versions return a single (n_samples, n_features, n_classes) array.
    shap_vals_reduced = shap_vals[:, :, fraud_pos] if shap_vals.ndim == 3 else shap_vals
shap.summary_plot(shap_vals_reduced, X_sample)