In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

from imblearn.over_sampling import RandomOverSampler

import pickle

# Apply the 'seaborn-v0_8' matplotlib style sheet to every figure drawn below.
plt.style.use('seaborn-v0_8')

In [7]:
!pip install kagglehub --quiet

import kagglehub
import shutil
import os

# Download dataset (returns local download path)
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print("Dataset downloaded to:", path)

target_path = "/content/creditcardfraud"
shutil.copytree(path, target_path, dirs_exist_ok=True)

print("Files copied to:", target_path)
print("Contents:", os.listdir(target_path))


Using Colab cache for faster access to the 'creditcardfraud' dataset.
Dataset downloaded to: /kaggle/input/creditcardfraud
Files copied to: /content/creditcardfraud
Contents: ['creditcard.csv']


In [None]:
# Load the transactions from the directory populated by the download cell.
# A bare 'creditcard.csv' only resolves when a copy happens to sit in the
# current working directory, which breaks a fresh Restart & Run All.
df = pd.read_csv(os.path.join(target_path, 'creditcard.csv'))

# Quick structural overview: sample rows, dimensions and missing-value total.
print("First 5 rows of the dataset:")
print(df.head())
print("\n" + "="*50 + "\n")

print(f"Dataset Shape: {df.shape[0]} rows and {df.shape[1]} columns")
print("\n" + "="*50 + "\n")

print(f"Total Missing Values: {df.isnull().sum().sum()}")
print("\n" + "="*50 + "\n")


# Class balance: count rows per label and express the label-1 share in percent.
class_counts = df['Class'].value_counts()
fraud_percentage = (class_counts[1] / len(df)) * 100

print("Class Distribution:")
print(f"Normal Transactions (0): {class_counts[0]}")
print(f"Fraud Transactions (1):  {class_counts[1]}")
print(f"Fraud Percentage: {fraud_percentage:.2f}%")

# Bar chart of the two class counts.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Class', palette=['lightblue', 'red'])
plt.title('Transaction Class Distribution (0: Normal, 1: Fraud)')
plt.show()

In [9]:
# Separate the target column from the feature matrix.
y = df['Class']
X = df.drop(columns='Class')

# Stratified 80/20 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Original training set class counts: {np.bincount(y_train)}")

# Balance the classes by random oversampling. Only the training partition is
# resampled; the test partition is left untouched.
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

print(f"Resampled training set class counts: {np.bincount(y_train_resampled)}")

# Standardise features using statistics learned from the training data only,
# then apply the same transform to the held-out test features.
# NOTE(review): the scaler is fit on the *oversampled* training set, so its
# mean/variance reflect the balanced class mix rather than the original
# distribution. Confirm this is intended.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

Original training set class counts: [227451    394]
Resampled training set class counts: [227451 227451]


In [10]:
# Candidate classifiers, each seeded so repeated runs give the same results.
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is a deprecated XGBoost option that current releases
    # ignore with a "not used" warning, so it is no longer passed.
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss')
}

# Maps model name -> dict of hold-out metrics; later cells add to and read it.
model_results = {}

print("Training and Evaluating Models...\n")

for name, model in models.items():
    print(f"--- Training {name} ---")

    # Fit on the balanced, scaled training data.
    model.fit(X_train_scaled, y_train_resampled)

    # Hard labels feed the threshold-based metrics; the positive-class
    # probability (column 1) feeds ROC AUC.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    model_results[name] = {
        'accuracy': accuracy,
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'auc': auc,
    }

    print(f"{name} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}\n")

Training and Evaluating Models...

--- Training Logistic Regression ---
Logistic Regression - Accuracy: 0.9756, AUC: 0.9720

--- Training Random Forest ---
Random Forest - Accuracy: 0.9995, AUC: 0.9628

--- Training XGBoost ---
XGBoost - Accuracy: 0.9996, AUC: 0.9768



In [11]:
print("--- Training Hybrid Model (Random Forest + XGBoost) ---")
# Soft-voting ensemble: averages the predicted class probabilities of a fresh
# Random Forest and a fresh XGBoost classifier.
hybrid_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # `use_label_encoder` is a deprecated XGBoost option that current
        # releases ignore with a "not used" warning, so it is no longer passed.
        ('xgb', XGBClassifier(random_state=42, eval_metric='logloss'))
    ],
    voting='soft'
)

# Fit on the balanced, scaled training data.
hybrid_model.fit(X_train_scaled, y_train_resampled)

# Hard labels for the threshold-based metrics; positive-class probability
# (column 1) for ROC AUC.
y_pred_hybrid = hybrid_model.predict(X_test_scaled)
y_pred_proba_hybrid = hybrid_model.predict_proba(X_test_scaled)[:, 1]

accuracy_hybrid = accuracy_score(y_test, y_pred_hybrid)
auc_hybrid = roc_auc_score(y_test, y_pred_proba_hybrid)

# Record the ensemble's metrics in the shared results dict, using the same
# keys as the individual models.
model_results['Hybrid (RF + XGB)'] = {
    'accuracy': accuracy_hybrid,
    'precision': precision_score(y_test, y_pred_hybrid),
    'recall': recall_score(y_test, y_pred_hybrid),
    'f1_score': f1_score(y_test, y_pred_hybrid),
    'auc': auc_hybrid
}

print(f"Hybrid Model - Accuracy: {accuracy_hybrid:.4f}, AUC: {auc_hybrid:.4f}\n")

--- Training Hybrid Model (Random Forest + XGBoost) ---
Hybrid Model - Accuracy: 0.9995, AUC: 0.9776



In [12]:
# Print a per-model summary of every metric stored in model_results.
print("=" * 60)
print("COMPREHENSIVE MODEL PERFORMANCE REPORT")
print("=" * 60)

# (display label, model_results key) pairs in the order they are printed.
# Labels are left-justified to 11 characters so the values line up.
metric_rows = (
    ('Accuracy:', 'accuracy'),
    ('Precision:', 'precision'),
    ('Recall:', 'recall'),
    ('F1-Score:', 'f1_score'),
    ('AUC Score:', 'auc'),
)

for model_name, results in model_results.items():
    print(f"\n----- {model_name.upper()} -----")
    for label, key in metric_rows:
        value = results[key]
        # Show each metric as a 4-decimal fraction and as a percentage.
        print(f"  {label:<11}{value:.4f}  ({value*100:.2f}%)")

print("\n" + "=" * 60)
print("Report generated successfully.")

COMPREHENSIVE MODEL PERFORMANCE REPORT

----- LOGISTIC REGRESSION -----
  Accuracy:  0.9756  (97.56%)
  Precision: 0.0611  (6.11%)
  Recall:    0.9184  (91.84%)
  F1-Score:  0.1146  (11.46%)
  AUC Score: 0.9720  (97.20%)

----- RANDOM FOREST -----
  Accuracy:  0.9995  (99.95%)
  Precision: 0.9494  (94.94%)
  Recall:    0.7653  (76.53%)
  F1-Score:  0.8475  (84.75%)
  AUC Score: 0.9628  (96.28%)

----- XGBOOST -----
  Accuracy:  0.9996  (99.96%)
  Precision: 0.9022  (90.22%)
  Recall:    0.8469  (84.69%)
  F1-Score:  0.8737  (87.37%)
  AUC Score: 0.9768  (97.68%)

----- HYBRID (RF + XGB) -----
  Accuracy:  0.9995  (99.95%)
  Precision: 0.9000  (90.00%)
  Recall:    0.8265  (82.65%)
  F1-Score:  0.8617  (86.17%)
  AUC Score: 0.9776  (97.76%)

Report generated successfully.


In [13]:
# Persist the fitted ensemble and the fitted scaler as pickle files in the
# current working directory.
# NOTE(review): the message below calls this the "best" model, but the hybrid
# ensemble is saved unconditionally; nothing here compares model_results.
for filename, artifact in (('model.pkl', hybrid_model), ('scaler.pkl', scaler)):
    with open(filename, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Best model ('model.pkl') and scaler ('scaler.pkl') have been saved successfully!")

Best model ('model.pkl') and scaler ('scaler.pkl') have been saved successfully!
