In [20]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Preprocessing and modeling imports
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    classification_report,
    confusion_matrix
)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

## 1. Load data

In [21]:
fraud_df = pd.read_csv("fraudData.csv")
print(f"Dataset shape: {fraud_df.shape}")
print(f"Columns: {list(fraud_df.columns)}")
print(f"Fraud rate: {fraud_df['fraud'].mean()*100:.2f}%")

Dataset shape: (594643, 10)
Columns: ['step', 'customer', 'age', 'gender', 'zipcodeOri', 'merchant', 'zipMerchant', 'category', 'amount', 'fraud']
Fraud rate: 1.21%


## 2. Data cleaning

In [22]:
text_cols = ["customer", "age", "gender", "zipcodeOri", "merchant", "zipMerchant", "category"]
for col in text_cols:
    if col in fraud_df.columns:
        fraud_df[col] = fraud_df[col].astype(str).str.replace(r"[^\w\s]", "", regex=True)
fraud_df.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,C1093826151,4,M,28007,M348934600,28007,es_transportation,4.55,0
1,0,C352968107,2,M,28007,M348934600,28007,es_transportation,39.68,0
2,0,C2054744914,4,F,28007,M1823072687,28007,es_transportation,26.89,0
3,0,C1760612790,3,M,28007,M348934600,28007,es_transportation,17.25,0
4,0,C757503768,5,M,28007,M348934600,28007,es_transportation,35.72,0


## 3. Feature cardinality check

In [23]:
for col in ["customer", "merchant", "age", "gender", "category"]:
    if col in fraud_df.columns:
        print(f"{col}: {fraud_df[col].nunique()} unique values")

customer: 4112 unique values
merchant: 50 unique values
age: 8 unique values
gender: 4 unique values
category: 15 unique values


## 4.frequency encoding to avoid high cardinality
### Customer-based features

In [24]:
customer_stats = fraud_df.groupby('customer').agg({
                                                    'fraud': ['count', 'mean', 'sum'],
                                                    'amount': ['mean', 'std', 'min', 'max']
                                                  }).fillna(0)

customer_stats.columns = ['customer_transaction_count', 'customer_fraud_rate', 'customer_total_fraud',
                         'customer_avg_amount', 'customer_amount_std', 'customer_min_amount', 'customer_max_amount']

fraud_df = fraud_df.merge(customer_stats, left_on='customer', right_index=True, how='left')

### Merchant-based features

In [25]:
merchant_stats = fraud_df.groupby('merchant').agg({
                                                    'fraud': ['count', 'mean'],
                                                    'amount': ['mean', 'std']
                                                }).fillna(0)

merchant_stats.columns = ['merchant_transaction_count', 'merchant_fraud_rate', 'merchant_avg_amount', 'merchant_amount_std']

fraud_df = fraud_df.merge(merchant_stats, left_on='merchant', right_index=True, how='left')

### Amount deviation features

In [26]:
fraud_df['amount_vs_customer_avg'] = (fraud_df['amount'] - fraud_df['customer_avg_amount']) / (fraud_df['customer_amount_std'] + 1e-8)
fraud_df['amount_vs_merchant_avg'] = (fraud_df['amount'] - fraud_df['merchant_avg_amount']) / (fraud_df['merchant_amount_std'] + 1e-8)

### Time-based features (if step represents time)

In [27]:
fraud_df['step_mod_24'] = fraud_df['step'] % 24  # Hour of day pattern
fraud_df['step_mod_7'] = fraud_df['step'] % 7    # Day of week pattern

# Drop original high-cardinality and unnecessary columns
fraud_df = fraud_df.drop(['customer', 'zipcodeOri', 'zipMerchant'], axis=1)

print(f"Enhanced dataset shape: {fraud_df.shape}")
print(f"New features: {list(fraud_df.columns)}")
fraud_df.head()

Enhanced dataset shape: (594643, 22)
New features: ['step', 'age', 'gender', 'merchant', 'category', 'amount', 'fraud', 'customer_transaction_count', 'customer_fraud_rate', 'customer_total_fraud', 'customer_avg_amount', 'customer_amount_std', 'customer_min_amount', 'customer_max_amount', 'merchant_transaction_count', 'merchant_fraud_rate', 'merchant_avg_amount', 'merchant_amount_std', 'amount_vs_customer_avg', 'amount_vs_merchant_avg', 'step_mod_24', 'step_mod_7']


Unnamed: 0,step,age,gender,merchant,category,amount,fraud,customer_transaction_count,customer_fraud_rate,customer_total_fraud,...,customer_min_amount,customer_max_amount,merchant_transaction_count,merchant_fraud_rate,merchant_avg_amount,merchant_amount_std,amount_vs_customer_avg,amount_vs_merchant_avg,step_mod_24,step_mod_7
0,0,4,M,M348934600,es_transportation,4.55,0,167,0.0,0,...,0.44,95.91,205426,0.0,26.965542,17.526489,-1.313545,-1.278952,0,0
1,0,2,M,M348934600,es_transportation,39.68,0,169,0.0,0,...,0.53,135.86,205426,0.0,26.965542,17.526489,0.302604,0.725442,0,0
2,0,4,F,M1823072687,es_transportation,26.89,0,65,0.0,0,...,0.32,189.59,299693,0.0,26.953146,17.528857,-0.410479,-0.003602,0,0
3,0,3,M,M348934600,es_transportation,17.25,0,171,0.0,0,...,0.07,162.48,205426,0.0,26.965542,17.526489,-0.521501,-0.554335,0,0
4,0,5,M,M348934600,es_transportation,35.72,0,145,0.0,0,...,0.01,168.45,205426,0.0,26.965542,17.526489,0.170832,0.499499,0,0


## 5. Define features and target

In [28]:
X = fraud_df.drop(columns=["fraud"])
y = fraud_df["fraud"].astype(int)

## 6. Feature categorization

In [29]:
numeric_features = [
    'step', 'amount', 
    'customer_transaction_count', 'customer_fraud_rate', 'customer_total_fraud',
    'customer_avg_amount', 'customer_amount_std', 'customer_min_amount', 'customer_max_amount',
    'merchant_transaction_count', 'merchant_fraud_rate', 'merchant_avg_amount', 'merchant_amount_std',
    'amount_vs_customer_avg', 'amount_vs_merchant_avg', 'step_mod_24', 'step_mod_7'
]

categorical_features = ['merchant', 'age', 'gender', 'category']

print(f"\nNumeric features ({len(numeric_features)}): {numeric_features}")
print(f"Categorical features ({len(categorical_features)}): {categorical_features}")

# Verifying all features exist
numeric_features = [f for f in numeric_features if f in X.columns]
categorical_features = [f for f in categorical_features if f in X.columns]

print(f"\nFinal feature counts:")
print(f"- Numeric: {len(numeric_features)}")
print(f"- Categorical: {len(categorical_features)}")


Numeric features (17): ['step', 'amount', 'customer_transaction_count', 'customer_fraud_rate', 'customer_total_fraud', 'customer_avg_amount', 'customer_amount_std', 'customer_min_amount', 'customer_max_amount', 'merchant_transaction_count', 'merchant_fraud_rate', 'merchant_avg_amount', 'merchant_amount_std', 'amount_vs_customer_avg', 'amount_vs_merchant_avg', 'step_mod_24', 'step_mod_7']
Categorical features (4): ['merchant', 'age', 'gender', 'category']

Final feature counts:
- Numeric: 17
- Categorical: 4


## 8. Train/test split

In [30]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training: {X_train.shape}, Fraud rate: {y_train.mean()*100:.2f}%")
print(f"Testing: {X_test.shape}, Fraud rate: {y_test.mean()*100:.2f}%")

Training: (475714, 21), Fraud rate: 1.21%
Testing: (118929, 21), Fraud rate: 1.21%


## 9. Preprocessing pipeline

In [31]:
numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder='drop')

## 10. Model configuration

In [32]:
print("\nConfiguring RandomForest model...")
model = RandomForestClassifier(
    n_estimators=200,           
    max_depth=20,              
    min_samples_split=3,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=2,                  
    class_weight='balanced',  
    oob_score=True           
)


Configuring RandomForest model...


## 11. Complete pipeline

In [33]:
pipeline = ImbPipeline([
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42, k_neighbors=5)),
    ('classifier', model)
])

## 12. Model training

In [34]:
print("\nTraining model (this will take 5-10 minutes)...")
pipeline.fit(X_train, y_train)

print(f"Out-of-bag score: {pipeline.named_steps['classifier'].oob_score_:.4f}")


Training model (this will take 5-10 minutes)...
Out-of-bag score: 0.9970


## 13. Predictions

In [36]:
print("\nMaking predictions...")
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]


Making predictions...


## 14. Comprehensive evaluation

In [37]:
print("\n" + "="*70)
print("COMPREHENSIVE MODEL EVALUATION")
print("="*70)

# Classification report
print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, digits=4))

# Key metrics
print("\n=== Key Performance Metrics ===")
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)
pr_auc = average_precision_score(y_test, y_proba)

print(f"Precision:    {precision:.4f}")
print(f"Recall:       {recall:.4f}")
print(f"F1-Score:     {f1:.4f}")
print(f"ROC-AUC:      {roc_auc:.4f}")
print(f"PR-AUC:       {pr_auc:.4f}")

# Confusion matrix
print("\n=== Confusion Matrix ===")
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()

print("Predicted:     No Fraud  |  Fraud")
print("Actual:")
print(f"No Fraud:      {tn:8d}  |  {fp:5d}")
print(f"Fraud:         {fn:8d}  |  {tp:5d}")

# Business impact
fraud_detection_rate = tp / (tp + fn) if (tp + fn) > 0 else 0
false_alarm_rate = fp / (fp + tn) if (fp + tn) > 0 else 0

print("\n=== Business Impact Analysis ===")
print(f"Fraud Detection Rate:     {fraud_detection_rate:.1%}")
print(f"False Alarm Rate:         {false_alarm_rate:.1%}")
print(f"Frauds Caught:            {tp:,} transactions")
print(f"Frauds Missed:            {fn:,} transactions")
print(f"False Alarms:             {fp:,} transactions")
print(f"Correct Legitimate:       {tn:,} transactions")

# Feature importance
print("\n=== Top 20 Most Important Features ===")
try:
    # Get feature names after preprocessing
    feature_names = (numeric_features + 
                    list(pipeline.named_steps['preprocess']
                        .named_transformers_['cat']
                        .named_steps['onehot']
                        .get_feature_names_out(categorical_features)))

    importances = pipeline.named_steps['classifier'].feature_importances_
    feature_importance = list(zip(feature_names, importances))
    feature_importance.sort(key=lambda x: x[1], reverse=True)

    for i, (feature, importance) in enumerate(feature_importance[:20]):
        print(f"{i+1:2d}. {feature:<35} {importance:.4f}")

except Exception as e:
    print(f"Feature importance extraction failed: {e}")


COMPREHENSIVE MODEL EVALUATION

=== Classification Report ===
              precision    recall  f1-score   support

           0     0.9994    0.9942    0.9968    117489
           1     0.6681    0.9493    0.7843      1440

    accuracy                         0.9937    118929
   macro avg     0.8338    0.9718    0.8905    118929
weighted avg     0.9954    0.9937    0.9942    118929


=== Key Performance Metrics ===
Precision:    0.6681
Recall:       0.9493
F1-Score:     0.7843
ROC-AUC:      0.9988
PR-AUC:       0.9170

=== Confusion Matrix ===
Predicted:     No Fraud  |  Fraud
Actual:
No Fraud:        116810  |    679
Fraud:               73  |   1367

=== Business Impact Analysis ===
Fraud Detection Rate:     94.9%
False Alarm Rate:         0.6%
Frauds Caught:            1,367 transactions
Frauds Missed:            73 transactions
False Alarms:             679 transactions
Correct Legitimate:       116,810 transactions

=== Top 20 Most Important Features ===
 1. merchant_fraud_rat

## 15. Model persistence

In [40]:
import joblib
joblib.dump(pipeline, "fraud_detector_final.pkl")
print("Model saved as 'fraud_detector_final.pkl'")

Model saved as 'fraud_detector_final.pkl'


## 16. Performance summary

In [39]:
print("\n" + "="*70)
print("FINAL PERFORMANCE SUMMARY")
print("="*70)
print(f"✓ Model Type: RandomForest with advanced feature engineering")
print(f"✓ Training Data: {X_train.shape[0]:,} transactions")
print(f"✓ Features Used: {len(numeric_features + categorical_features)} total")
print(f"✓ Precision: {precision:.1%} (of flagged transactions, {precision:.1%} are actually fraud)")
print(f"✓ Recall: {recall:.1%} (catches {recall:.1%} of all fraud cases)")
print(f"✓ F1-Score: {f1:.3f} (balanced performance metric)")
print(f"✓ ROC-AUC: {roc_auc:.3f} (overall classification ability)")
print(f"✓ PR-AUC: {pr_auc:.3f} (performance on imbalanced data)")
print("="*70)
print("MODEL TRAINING COMPLETE - READY FOR PRODUCTION!")
print("="*70)


FINAL PERFORMANCE SUMMARY
✓ Model Type: RandomForest with advanced feature engineering
✓ Training Data: 475,714 transactions
✓ Features Used: 21 total
✓ Precision: 66.8% (of flagged transactions, 66.8% are actually fraud)
✓ Recall: 94.9% (catches 94.9% of all fraud cases)
✓ F1-Score: 0.784 (balanced performance metric)
✓ ROC-AUC: 0.999 (overall classification ability)
✓ PR-AUC: 0.917 (performance on imbalanced data)
MODEL TRAINING COMPLETE - READY FOR PRODUCTION!
