# EDA & Model Development for Fraud Detection

- Load and explore the dataset
- Preprocess: scale features and handle class imbalance
- Train baseline models: Logistic Regression, Random Forest, XGBoost
- Save best model for backend integration

In [6]:
# Import libraries
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [5]:
# Load the banking transactions dataset and print a structural overview.
df = pd.read_csv('../data/realistic_transactions.csv')
print('Dataset shape:', df.shape)
print('\nDataset info:')
df.info()
print('\nFirst few rows:')
# A bare `df.head()` in the middle of a cell is evaluated and thrown away; only the
# last expression of a cell is displayed. Print it so rows appear under the header.
print(df.head())

# EDA: class distribution. The code below treats 'Class' as a 0/1 label
# (its mean is reported as the fraud rate, its sum as the number of fraud cases).
print('\nFraud distribution:')
print(df['Class'].value_counts())
print(f'Fraud rate: {df["Class"].mean():.3%}')

# EDA: per merchant type -> number of transactions, number of frauds, fraud rate.
print('\nFraud by Merchant Type:')
fraud_by_merchant = df.groupby('Merchant_Type')['Class'].agg(['count', 'sum', 'mean']).round(3)
fraud_by_merchant.columns = ['Total_Transactions', 'Fraud_Cases', 'Fraud_Rate']
print(fraud_by_merchant.sort_values('Fraud_Rate', ascending=False))

# Model inputs: every column whose name starts with 'V', plus 'Time' and 'Amount'.
# NOTE(review): the prefix test also matches any other column beginning with 'V';
# confirm against the dataset schema that only the intended columns are selected.
feature_cols = [col for col in df.columns if col.startswith('V') or col in ['Time', 'Amount']]
features = df[feature_cols]
labels = df['Class']

print(f'\nUsing {len(feature_cols)} features for modeling')
print('Features:', feature_cols)

# Hold out the test set FIRST. Scaling and SMOTE must only ever see training rows;
# otherwise test-set statistics leak into the scaler and synthetic samples
# interpolated from training neighbours end up in the test set, inflating every
# metric. `stratify` keeps the (rare) fraud rate identical in both folds.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Preprocessing: fit the scaler on the training fold only, then apply it to both folds.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Handle class imbalance with SMOTE on the training fold only.
# The counts printed below therefore describe the training fold.
print('\nBefore SMOTE:')
print(f'Fraud cases: {y_train_raw.sum()} ({y_train_raw.mean():.3%})')

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train_scaled, y_train_raw)

print('\nAfter SMOTE:')
print(f'Total samples: {len(X_res)}')
print(f'Fraud cases: {y_res.sum()} ({y_res.mean():.3%})')

# The balanced, resampled training fold is what the models are fitted on.
# The test fold (X_test, y_test) keeps its original, imbalanced distribution.
X_train, y_train = X_res, y_res

# Candidate models, all seeded for reproducibility.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

# Fit each candidate and score it on the untouched test fold.
# results maps model name -> {'model': fitted estimator, 'auc': test ROC AUC}.
results = {}
for name, model in models.items():
    print(f'\nTraining {name}...')
    model.fit(X_train, y_train)

    # Hard predictions feed the classification report; the positive-class
    # probability (column 1 of predict_proba) feeds ROC AUC.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    auc = roc_auc_score(y_test, y_prob)

    print(f'{name} Results:')
    print(f'AUC: {auc:.4f}')
    print(classification_report(y_test, y_pred))

    results[name] = {'model': model, 'auc': auc}

# Select the model with the highest test AUC.
# NOTE(review): the same test fold is used for selection and reporting, so the
# winning AUC is slightly optimistic; a separate validation fold would avoid that.
best_model_name = max(results, key=lambda k: results[k]['auc'])
best_model = results[best_model_name]['model']

print(f'\nBest model: {best_model_name} (AUC: {results[best_model_name]["auc"]:.4f})')

# Persist the winning model together with the scaler it was trained behind;
# inference code must apply the same scaler before calling the model.
joblib.dump(best_model, '../backend/app/model/model.pkl')
joblib.dump(scaler, '../backend/app/model/scaler.pkl')

print(f'\nSaved {best_model_name} model and scaler for backend use!')
print('\nModel is now ready for real-world transaction analysis!')

Dataset shape: (50000, 37)

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 37 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   V1             50000 non-null  float64
 1   V2             50000 non-null  float64
 2   V3             50000 non-null  float64
 3   V4             50000 non-null  float64
 4   V5             50000 non-null  float64
 5   V6             50000 non-null  float64
 6   V7             50000 non-null  float64
 7   V8             50000 non-null  float64
 8   V9             50000 non-null  float64
 9   V10            50000 non-null  float64
 10  V11            50000 non-null  float64
 11  V12            50000 non-null  float64
 12  V13            50000 non-null  float64
 13  V14            50000 non-null  float64
 14  V15            50000 non-null  float64
 15  V16            50000 non-null  float64
 16  V17            50000 non-null  float64
 17  V18     