<a href="https://colab.research.google.com/github/Steven256-debug/Credit-card-fraud-detection/blob/main/CREDIT_CARD_FRAUD_DETECTION_SYSTEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installation and import of the required libraries (including the Kaggle CLI)


In [None]:
# ===============================
# Credit Card Fraud Detection System
# Starter Notebook for Team Collaboration
# Team: Steven Tesla, Aaron, Kenzie, Kelvin, Fada Dem
# ===============================

# -------------------------------
# 1️⃣ Install Required Libraries
# -------------------------------
!pip install pandas numpy scikit-learn matplotlib seaborn imbalanced-learn xgboost lightgbm joblib kaggle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import joblib
import os


Kaggle API Setup and Dataset Download

In [None]:
# -------------------------------
# 2️⃣ Kaggle API Setup
# -------------------------------
# Step 1: Upload your kaggle.json API token
from google.colab import files
files.upload()  # Upload kaggle.json here

# Step 2: Move kaggle.json to correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Step 3: Download and unzip dataset
!kaggle datasets download -d mlg-ulb/creditcardfraud
!unzip -o creditcardfraud.zip -d data


Load the Dataset Downloaded from Kaggle

In [None]:
# -------------------------------
# 3️⃣ Load Dataset
# -------------------------------
# Read the unzipped Kaggle CSV into `df`, report (rows, columns),
# and preview the first five rows as the cell output.
df = pd.read_csv('data/creditcard.csv')
print("Dataset shape:", (len(df.index), len(df.columns)))
df.head(5)


Exploratory Data Analysis (EDA)

In [4]:
# -------------------------------
# 4️⃣ Exploratory Data Analysis (EDA)
# -------------------------------
# Part A plots the class balance of the raw transactions held in `df`.
# Part B scores a previously saved model on a cleaned dataset and draws a
# SHAP summary. Part B keeps its data under its own names (`cleaned_df`,
# `eval_model`, ...) so that `df` is NOT overwritten: the preprocessing cell
# reads the raw frame's 'Time' and 'Amount' columns. Part B is skipped with a
# message when its input files do not exist.
import os

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

RAW_CSV = 'data/creditcard.csv'
CLEANED_CSV = "../data/cleaned_data.csv"
SAVED_MODEL = "../models/fraud_model.pkl"
SHAP_SAMPLE_SIZE = 300

# --- Part A: class distribution of the raw data ---
# Load the raw data only if an earlier cell has not already defined `df`,
# so this cell can also be run on its own.
if 'df' not in globals():
    if not os.path.exists(RAW_CSV):
        raise FileNotFoundError(
            f"{RAW_CSV} not found - run the Kaggle download cell first."
        )
    df = pd.read_csv(RAW_CSV)

sns.countplot(x='Class', data=df)
plt.title('Fraud vs Non-Fraud Transactions')
plt.show()

# --- Part B: evaluation of a previously saved model ---
missing_inputs = [path for path in (CLEANED_CSV, SAVED_MODEL) if not os.path.exists(path)]

if missing_inputs:
    print("Skipping saved-model evaluation; missing file(s):", ", ".join(missing_inputs))
else:
    # Imported here so that a missing shap installation cannot break Part A.
    import joblib
    import shap

    cleaned_df = pd.read_csv(CLEANED_CSV)
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files from a trusted source.
    eval_model = joblib.load(SAVED_MODEL)

    eval_X = cleaned_df.drop(columns=["Class"])
    eval_y = cleaned_df["Class"]

    eval_pred = eval_model.predict(eval_X)
    eval_proba = eval_model.predict_proba(eval_X)[:, 1]

    print(classification_report(eval_y, eval_pred))
    print("ROC-AUC:", roc_auc_score(eval_y, eval_proba))
    print("PR-AUC:", average_precision_score(eval_y, eval_proba))

    # SHAP Summary (TreeExplainer assumes the saved model is tree-based)
    explainer = shap.TreeExplainer(eval_model)
    # Cap the sample size so frames with fewer rows than requested do not raise.
    sample = eval_X.sample(min(SHAP_SAMPLE_SIZE, len(eval_X)), random_state=42)
    shap_values = explainer.shap_values(sample)
    shap.summary_plot(shap_values, sample, plot_type="bar")

FileNotFoundError: [Errno 2] No such file or directory: 'data/creditcard.csv'

Preprocessing and Feature Engineering

In [None]:
# -------------------------------
# 5️⃣ Preprocessing
# -------------------------------
# Builds the train/test feature matrices from the raw frame `df`.
#   * `df` is left untouched, so this cell can be re-run without errors.
#   * The scaler is fit on the training rows only and then applied to the
#     test rows, so no statistics of the test set leak into training.
# Resulting feature columns: every column of `df` except 'Class', 'Time' and
# 'Amount', followed by 'norm_amount' and 'hour'.


def scale_amount(frame, fitted_scaler):
    """Return a copy of `frame` with 'Amount' replaced by the scaled 'norm_amount'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame that contains an 'Amount' column.
    fitted_scaler : StandardScaler
        Scaler that has already been fit on training 'Amount' values.

    Returns
    -------
    pandas.DataFrame
        New frame; the 'norm_amount' column occupies the position 'Amount' had.
    """
    scaled = frame.rename(columns={'Amount': 'norm_amount'})
    scaled['norm_amount'] = fitted_scaler.transform(
        frame['Amount'].values.reshape(-1, 1)
    ).ravel()
    return scaled


# Features that need no fitting: 0-23 bucket derived from 'Time'
# (assumes 'Time' is expressed in seconds - TODO confirm against the dataset docs).
features = df.drop(['Class', 'Time'], axis=1).assign(hour=(df['Time'] // 3600) % 24)

# Split features and labels
y = df['Class']

# Train-test split (stratified so both parts keep the class ratio)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, stratify=y, random_state=42
)

# Feature scaling: fit on the training rows only
scaler = StandardScaler()
scaler.fit(X_train_raw['Amount'].values.reshape(-1, 1))

X_train = scale_amount(X_train_raw, scaler)
X_test = scale_amount(X_test_raw, scaler)
X = scale_amount(features, scaler)  # full feature matrix, same column layout

# Handle class imbalance with SMOTE (applied to the training rows only)
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)
print("Resampled dataset shape:", X_res.shape)


Model Training

In [None]:
# -------------------------------
# 6️⃣ Model Training
# -------------------------------
# Fits each candidate on the SMOTE-resampled training data (X_res, y_res) and
# reports its metrics on the untouched test split (X_test, y_test).
# Fitted estimators are collected in `trained_models`, keyed by display name.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # random_state makes the forest reproducible between runs
    "Random Forest": RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    # use_label_encoder is deprecated in XGBoost and no longer has any effect, so it is not passed
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42)
}

trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_res, y_res)
    y_pred = model.predict(X_test)
    # ROC-AUC ranks by score, so it needs the positive-class probability;
    # feeding it the hard 0/1 predictions collapses the curve to a single point.
    y_score = model.predict_proba(X_test)[:, 1]
    print(f"{name} Classification Report:\n")
    print(classification_report(y_test, y_pred))
    print("ROC-AUC:", roc_auc_score(y_test, y_score))
    trained_models[name] = model


Confusion Matrix Visualization

In [None]:
# -------------------------------
# 7️⃣ Confusion Matrix Visualization
# -------------------------------
# Draws one confusion matrix per fitted model, evaluated on the test split.
# sklearn.metrics.plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay

for name, model in trained_models.items():
    cm_display = ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, cmap='Blues')
    cm_display.ax_.set_title(f'{name} Confusion Matrix')
    plt.show()


Save Best Model

In [None]:
# -------------------------------
# 8️⃣ Save Best Model
# -------------------------------
# Persists the fitted XGBoost estimator with joblib under ./models.
# NOTE(review): 'XGBoost' is hard-coded as the best model here rather than
# being chosen from the metrics printed during training.
best_model = trained_models['XGBoost']

output_dir, output_file = 'models', 'models/fraud_model.pkl'
os.makedirs(output_dir, exist_ok=True)  # create the folder on first run only
joblib.dump(best_model, output_file)

print("Best model saved as models/fraud_model.pkl")
