# Credit Card Fraud Detection Model Training

This notebook demonstrates how to train a credit card fraud detection model on an imbalanced dataset, using SMOTE to rebalance the training data and comparing Random Forest and Logistic Regression classifiers.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

## 1. Generate Synthetic Fraud Data

Since we don't have real credit card data, we'll generate synthetic data that mimics real fraud patterns.

In [None]:
def generate_fraud_data(n_samples=50000, random_state=42):
    """Generate a synthetic credit card transaction dataset with fraud labels.

    Every feature is drawn independently from a fixed distribution; the
    ``is_fraud`` label is then derived from rule-based conditions on those
    features plus a small random component.

    Args:
        n_samples: Number of transactions (rows) to generate.
        random_state: Seed for the function's private random generator.
            The default (42) yields the same values as seeding the global
            NumPy RNG with 42 and drawing in the same order.

    Returns:
        A pandas DataFrame with ten feature columns and a binary
        ``is_fraud`` column (0 = normal, 1 = fraud).
    """
    # A private RandomState keeps the draws reproducible without reseeding
    # the global NumPy RNG as a side effect of calling this function.
    rng = np.random.RandomState(random_state)

    # Generate base features (the draw order determines the values, so keep it)
    data = {
        'transaction_amount': rng.lognormal(4, 1.5, n_samples),
        'merchant_category': rng.choice(['grocery', 'gas', 'restaurant', 'retail', 'online', 'atm'], n_samples),
        'card_type': rng.choice(['credit', 'debit'], n_samples, p=[0.7, 0.3]),
        'transaction_type': rng.choice(['purchase', 'withdrawal', 'transfer'], n_samples, p=[0.8, 0.15, 0.05]),
        'hour_of_day': rng.randint(0, 24, n_samples),
        'day_of_week': rng.randint(0, 7, n_samples),
        'customer_age': rng.randint(18, 80, n_samples),
        'account_balance': rng.normal(5000, 2000, n_samples),
        'previous_transaction_amount': rng.lognormal(4, 1.2, n_samples),
        'transaction_frequency_24h': rng.poisson(3, n_samples)
    }

    df = pd.DataFrame(data)

    # Create fraud labels (imbalanced). NOTE: the rules below flag more than
    # 1% of the rows -- the top-5% amount rule alone marks about 5% -- so rely
    # on the measured fraud rate rather than on a nominal figure.
    # A negative account_balance (possible, since it is drawn from a normal
    # distribution) always satisfies the amount-vs-balance rule.
    # The parentheses make the precedence explicit: `&` binds tighter than `|`.
    fraud_conditions = (
        (df['transaction_amount'] > df['transaction_amount'].quantile(0.95)) |
        ((df['hour_of_day'].isin([2, 3, 4])) & (df['transaction_amount'] > 1000)) |
        ((df['transaction_frequency_24h'] > 10) & (df['transaction_amount'] > 500)) |
        (df['transaction_amount'] > df['account_balance'] * 0.8) |
        (rng.random_sample(n_samples) < 0.005)  # Random fraud
    )

    df['is_fraud'] = fraud_conditions.astype(int)

    return df

# Generate data
df = generate_fraud_data(50000)
print(f"Dataset shape: {df.shape}")
print(f"Fraud rate: {df['is_fraud'].mean():.3f}")
print(f"\nClass distribution:")
print(df['is_fraud'].value_counts())

## 2. Exploratory Data Analysis

Let's explore the data to understand fraud patterns.

In [None]:
# Basic statistics
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output.
df.info()
print("\nNumerical columns summary:")
print(df.describe())

In [None]:
# Visualize fraud patterns on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top-left panel: transaction amount density for each class
amount_ax = axes[0, 0]
for class_value, class_label in ((0, 'Normal'), (1, 'Fraud')):
    class_amounts = df.loc[df['is_fraud'] == class_value, 'transaction_amount']
    amount_ax.hist(class_amounts, bins=50, alpha=0.7, label=class_label, density=True)
amount_ax.set_xlabel('Transaction Amount')
amount_ax.set_ylabel('Density')
amount_ax.set_title('Transaction Amount Distribution')
amount_ax.legend()

# Remaining panels: fraud rate (fraud count / row count) per value of one column
rate_panels = [
    (axes[0, 1], 'hour_of_day', 'Hour of Day', 'Fraud Rate by Hour of Day'),
    (axes[1, 0], 'merchant_category', 'Merchant Category', 'Fraud Rate by Merchant Category'),
    (axes[1, 1], 'transaction_frequency_24h', 'Transaction Frequency (24h)', 'Fraud Rate by Transaction Frequency'),
]
rate_tables = {}
for panel_ax, column, x_label, panel_title in rate_panels:
    table = df.groupby(column)['is_fraud'].agg(['count', 'sum']).reset_index()
    table['fraud_rate'] = table['sum'] / table['count']
    panel_ax.bar(table[column], table['fraud_rate'])
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Fraud Rate')
    panel_ax.set_title(panel_title)
    rate_tables[column] = table

# Category names are long, so tilt them to keep the tick labels readable
axes[1, 0].tick_params(axis='x', rotation=45)

# Keep the per-column tables available under their usual names
fraud_by_hour = rate_tables['hour_of_day']
fraud_by_merchant = rate_tables['merchant_category']
fraud_by_freq = rate_tables['transaction_frequency_24h']

plt.tight_layout()
plt.show()

## 3. Data Preprocessing

Prepare the data for machine learning models.

In [None]:
def preprocess_data(df):
    """Turn the raw transaction frame into a feature matrix and a target.

    The input frame is copied first, so the caller's data is not modified.
    The three categorical columns are integer-encoded with one LabelEncoder
    each, and two ratio features are derived from the amount columns.

    Returns:
        A tuple ``(X, y, label_encoders)``: the feature DataFrame, the
        ``is_fraud`` Series, and a dict mapping each categorical column
        name to its fitted encoder.
    """
    frame = df.copy()

    # Fit one encoder per categorical column and keep it for later reuse
    label_encoders = {
        name: LabelEncoder().fit(frame[name])
        for name in ('merchant_category', 'card_type', 'transaction_type')
    }
    for name, encoder in label_encoders.items():
        frame[name] = encoder.transform(frame[name])

    # Derived ratios; the +1 in each denominator offsets the divisor by one
    amount = frame['transaction_amount']
    frame['amount_to_balance_ratio'] = amount / (frame['account_balance'] + 1)
    frame['amount_change_ratio'] = amount / (frame['previous_transaction_amount'] + 1)

    # Model inputs: the ten raw columns followed by the two derived ratios
    raw_features = [
        'transaction_amount', 'merchant_category', 'card_type', 'transaction_type',
        'hour_of_day', 'day_of_week', 'customer_age', 'account_balance',
        'previous_transaction_amount', 'transaction_frequency_24h',
    ]
    derived_features = ['amount_to_balance_ratio', 'amount_change_ratio']
    feature_cols = raw_features + derived_features

    return frame[feature_cols], frame['is_fraud'], label_encoders

# Preprocess data
X, y, encoders = preprocess_data(df)
print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

## 4. Handle Class Imbalance with SMOTE

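SMOTE (Synthetic Minority Over-sampling Technique) generates synthetic examples of the minority class by interpolating between existing minority samples. It is applied to the training split only, so the test set keeps its original class distribution.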

In [None]:
# Hold out 20% of the rows for testing, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit the scaler on the training split only, then reuse it for both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Oversample the minority class in the scaled training data; the test split
# is left untouched
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)

balanced_counts = pd.Series(y_train_balanced).value_counts()
print(f"Original training set: {y_train.value_counts()}")
print(f"Balanced training set: {balanced_counts}")
print(f"Test set: {y_test.value_counts()}")

## 5. Train Models

Train Random Forest and Logistic Regression models.

In [None]:
# Candidate classifiers, keyed by display name
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
    ),
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=42,
    ),
}

# Fit each candidate on the SMOTE-balanced training data
trained_models = {}
separator = "-" * 50
for name, model in models.items():
    print(f"Training {name}...")
    # fit() returns the estimator itself, so the fitted model is stored directly
    trained_models[name] = model.fit(X_train_balanced, y_train_balanced)
    print(f"{name} training completed!")
    print(separator)

## 6. Model Evaluation

Evaluate model performance on test set.

In [None]:
# Score every trained model on the held-out (scaled) test split
results = {}

for name, model in trained_models.items():
    # Hard class predictions plus the probability of the positive (fraud) class
    predicted_labels = model.predict(X_test_scaled)
    fraud_scores = model.predict_proba(X_test_scaled)[:, 1]

    # Keep everything later cells need under the model's display name
    results[name] = {
        'y_pred': predicted_labels,
        'y_pred_proba': fraud_scores,
        'auc_score': roc_auc_score(y_test, fraud_scores),
    }

    print(f"\n{name} Results:")
    print(f"AUC-ROC Score: {results[name]['auc_score']:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, predicted_labels))

In [None]:
# Draw one ROC curve per model on a shared set of axes
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

for name, result in results.items():
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, result['y_pred_proba'])
    curve_label = f"{name} (AUC = {result['auc_score']:.3f})"
    roc_ax.plot(false_pos_rate, true_pos_rate, label=curve_label, linewidth=2)

# Diagonal reference line: the expected curve of a random classifier
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves for Fraud Detection Models')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)
plt.show()

## 7. Feature Importance Analysis

Analyze which features are most important for fraud detection.

In [None]:
# Pair each feature name with the Random Forest's importance score, largest first
rf_model = trained_models['Random Forest']
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

# Horizontal bars; the y-axis is inverted so the top-ranked feature sits on top
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.barh(feature_importance['feature'], feature_importance['importance'])
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Feature Importance for Fraud Detection')
importance_ax.invert_yaxis()
plt.tight_layout()
plt.show()

print("Top 10 Most Important Features:")
print(feature_importance.head(10))

## 8. Save the Best Model

Save the trained model for deployment.

In [None]:
import joblib

# The Random Forest is the model persisted here. The choice is hard-coded,
# not computed: compare the AUC scores in `results` to confirm it is the best.
best_model = trained_models['Random Forest']
best_auc = results['Random Forest']['auc_score']

# Bundle the model with everything needed to preprocess new transactions
model_data = dict(
    model=best_model,
    scaler=scaler,
    label_encoders=encoders,
    feature_names=X.columns.tolist(),
    auc_score=best_auc,
)

joblib.dump(model_data, '../fraud_detection_model_rf.pkl')
print("Model saved successfully!")
print(f"Model AUC-ROC Score: {best_auc:.4f}")

## 9. Test the Model

Test the model with sample transactions.

In [None]:
# Reload the persisted bundle and pull out the pieces needed for inference
loaded_model_data = joblib.load('../fraud_detection_model_rf.pkl')
loaded_model, loaded_scaler, loaded_encoders = (
    loaded_model_data[key] for key in ('model', 'scaler', 'label_encoders')
)

# Two hand-written transactions: an everyday purchase and a suspicious one
test_transactions = [
    dict(  # Normal transaction
        transaction_amount=45.50,
        merchant_category='grocery',
        card_type='credit',
        transaction_type='purchase',
        hour_of_day=14,
        day_of_week=2,
        customer_age=35,
        account_balance=2500,
        previous_transaction_amount=32.20,
        transaction_frequency_24h=2,
    ),
    dict(  # Suspicious transaction
        transaction_amount=2500,
        merchant_category='online',
        card_type='credit',
        transaction_type='purchase',
        hour_of_day=3,
        day_of_week=1,
        customer_age=28,
        account_balance=3000,
        previous_transaction_amount=45.30,
        transaction_frequency_24h=8,
    ),
]

def predict_fraud(transaction, model, scaler, encoders, threshold=0.5):
    """Make a fraud prediction for a single transaction.

    The transaction goes through the same steps as the training data in
    ``preprocess_data``: categorical encoding, the two ratio features, and
    the same feature order, followed by scaling.

    Args:
        transaction: Dict holding the ten raw transaction fields.
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.
        encoders: Dict mapping each categorical column name to its fitted
            encoder (exposing ``transform``).
        threshold: Probability above which the transaction is flagged as
            fraud. Defaults to 0.5.

    Returns:
        Dict with ``fraud_probability`` (float) and ``is_fraud`` (bool).
    """
    frame = pd.DataFrame([transaction])

    # Encode categorical variables with the encoders supplied by the caller
    for col in ('merchant_category', 'card_type', 'transaction_type'):
        frame[col] = encoders[col].transform(frame[col])

    # Create additional features (same formulas as in preprocess_data)
    frame['amount_to_balance_ratio'] = frame['transaction_amount'] / (frame['account_balance'] + 1)
    frame['amount_change_ratio'] = frame['transaction_amount'] / (frame['previous_transaction_amount'] + 1)

    # Select features in the order used when the scaler and model were fitted
    feature_cols = [
        'transaction_amount', 'merchant_category', 'card_type', 'transaction_type',
        'hour_of_day', 'day_of_week', 'customer_age', 'account_balance',
        'previous_transaction_amount', 'transaction_frequency_24h',
        'amount_to_balance_ratio', 'amount_change_ratio'
    ]
    X_scaled = scaler.transform(frame[feature_cols])

    # Use the `model` argument rather than a module-level global, so the
    # function scores with whatever model the caller passes in.
    # Column 1 of predict_proba is taken as the fraud class; float() gives
    # callers a plain Python number.
    fraud_probability = float(model.predict_proba(X_scaled)[:, 1][0])

    return {
        'fraud_probability': fraud_probability,
        'is_fraud': bool(fraud_probability > threshold)
    }

# Test predictions
for i, transaction in enumerate(test_transactions):
    result = predict_fraud(transaction, loaded_model, loaded_scaler, loaded_encoders)
    print(f"\nTransaction {i+1}:")
    print(f"Amount: ${transaction['transaction_amount']:.2f}")
    print(f"Fraud Probability: {result['fraud_probability']:.3f}")
    print(f"Is Fraud: {result['is_fraud']}")

## Summary

This notebook demonstrated:
1. Generation of synthetic credit card transaction data with fraud patterns
2. Exploratory data analysis to understand fraud characteristics
3. Data preprocessing and feature engineering
4. Handling class imbalance using SMOTE
5. Training Random Forest and Logistic Regression models
6. Model evaluation with ROC curves and classification metrics
7. Feature importance analysis
8. Model saving and testing

The trained model can now be deployed as an API for real-time fraud detection.