<h1 style="text-align: center;">Credit Card Fraud Detection using Machine Learning</h1>

<h2 style="text-align: center;">Import Necessary Libraries</h2>




In [None]:
# Install the Kaggle CLI into the environment of the running kernel
%pip install kaggle

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# -p keeps this cell re-runnable: no error if the directory already exists
! mkdir -p ~/.kaggle

In [None]:
! cp /content/drive/MyDrive/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [None]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d mlg-ulb/creditcardfraud

In [None]:
# -o overwrites previously extracted files without prompting, so a re-run does not block on user input
! unzip -o /content/creditcardfraud.zip

In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier, IsolationForest, StackingClassifier

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, roc_auc_score, roc_curve,
precision_recall_curve, average_precision_score)

In [None]:
# Create a requirements.txt file
! pip freeze > requirements.txt

<h2 style="text-align: center;">Data Preprocessing</h2>

#### Steps to prepare the credit card fraud dataset:
##### 1. Load the creditcard.csv dataset into a DataFrame.
##### 2. Check for and handle any missing/null values.
##### 3. Identify and remove duplicate records to avoid data leakage.



In [None]:
# Read creditcard.csv into a pandas dataframe
df = pd.read_csv('creditcard.csv')
df.head()

In [None]:
# Print the shape of the dataframe
df.shape

In [None]:
# Print the dataframe's information
df.info()

In [None]:
# Print the amount of null values in the dataframe
df.isnull().sum()

In [None]:
# Check for duplicated values in the dataframe
df.duplicated().any()

In [None]:
# Print the sum of the duplicated values
df.duplicated().sum()

In [None]:
# Removes the duplicated values in the dataframe
df = df.drop_duplicates()

In [None]:
# Print the shape of the dataframe
df.shape

<h2 style="text-align: center;">Data Visualization</h2>



#### Plot the following distributions:
##### 1. Normal vs fraudulent transactions to visualize the class imbalance in the dataset
##### 2. Fraud transactions over time (hours elapsed since the first transaction) to show when fraudulent activity is most frequent
##### 3. Transaction amount by class to show the difference in the amount range for fraud vs normal transactions

In [None]:
# Print the unique values in class column
df['Class'].unique()

In [None]:
# Print the value count for the class column
df['Class'].value_counts()

In [None]:
# Convert Time (seconds) to hours.
# Plain column arithmetic is vectorized and yields the same values as a per-row apply.
df['Hours'] = df['Time'] / 3600

In [None]:
# Count plot of the two transaction classes (tick 0 is labelled Normal, tick 1 Fraudulent),
# with the exact number of rows written above every bar

plt.figure(figsize = (6, 4))
normal_vs_fraud_plot = sns.countplot(data = df, x = 'Class')

# Annotate each non-empty bar with its thousands-separated count
for bar in normal_vs_fraud_plot.patches:
    bar_height = bar.get_height()
    if bar_height > 0:
        bar_center = bar.get_x() + bar.get_width() / 2
        normal_vs_fraud_plot.text(bar_center, bar_height + 0.01, f'{int(bar_height):,}', ha = 'center', va = 'bottom', fontsize = 9)

normal_vs_fraud_plot.set_title('Distribution of Normal vs Fraudulent Transactions')
normal_vs_fraud_plot.set_xticks([0, 1])
normal_vs_fraud_plot.set_xticklabels(['Normal', 'Fraudulent'])
normal_vs_fraud_plot.set_xlabel('Transaction Class')
normal_vs_fraud_plot.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of fraudulent transactions over the Hours column (48 bins plus a KDE curve),
# with the number of fraud cases written above every non-empty bar

# Keep only the rows labelled as fraud (Class == 1)
fraud_df = df.loc[df['Class'] == 1]

plt.figure(figsize = (16, 6))
fraud_over_time_plot = sns.histplot(data = fraud_df, x = 'Hours', bins = 48, kde = True)

# Write the count above each bar
for bar in fraud_over_time_plot.patches:
    bar_height = bar.get_height()
    if bar_height > 0:
        bar_center = bar.get_x() + bar.get_width() / 2
        fraud_over_time_plot.text(bar_center, bar_height + 0.01, int(bar_height), ha ='center', fontsize = 9)

fraud_over_time_plot.set_title('Distribution of Fraudulent Transactions over Time')
fraud_over_time_plot.set_xlabel('Time Elapsed since the First Transaction (Hours)')
fraud_over_time_plot.set_ylabel('Number of Fraudulent Transactions')
fraud_over_time_plot.set_xticks(range(0, 49))
plt.tight_layout()
plt.show()

In [None]:
# Violin plot comparing the Amount distribution of normal and fraudulent transactions
# NOTE(review): newer seaborn releases appear to rename `scale` to `density_norm` — confirm against the installed version

plt.figure(figsize = (10, 6))
transactions_by_amount_plot = sns.violinplot(data = df, x = 'Class', y = 'Amount', cut = 0, scale = 'width', inner = 'box')

transactions_by_amount_plot.set_title('Distribution of Transactions Amount by Class')
transactions_by_amount_plot.set_xticks([0, 1])
transactions_by_amount_plot.set_xticklabels(['Normal', 'Fraudulent'])
transactions_by_amount_plot.set_xlabel('Transaction Class')
transactions_by_amount_plot.set_ylabel('Transaction Amount')
transactions_by_amount_plot.grid(axis ='y', linestyle= '--')
plt.show()

In [None]:
# Save all the data visualization plots to drive
save_dir = '/content/drive/MyDrive/Credit Card Fraud Detection/assets'
os.makedirs(save_dir, exist_ok = True)

# Map each output file name to the Axes object returned by the corresponding plotting cell
figures_to_save = {
    'normal_vs_fraud_countplot.png': normal_vs_fraud_plot,
    'fraud_over_time_histplot.png': fraud_over_time_plot,  # stray space before ".png" removed
    'transactions_by_amount_violinplot.png': transactions_by_amount_plot,
}

for filename, ax in figures_to_save.items():
    filepath = os.path.join(save_dir, filename)
    # Save through the Figure that owns the Axes
    ax.figure.savefig(filepath, dpi = 300, bbox_inches = 'tight')
    print(f'Saved {filepath}')

<h2 style="text-align: center;">Model Development</h2>

#### The following was done to train, test and evaluate the models
##### 1. Normalize the Time and Amount features using StandardScaler to ensure they are on the same scale as the other PCA-transformed features.
##### 2. Separate features (x) and target variable (y), then split into training and testing sets
##### 3. Apply sampling techniques to handle class imbalance:
#####       - Random Under Sampling (RUS): This method randomly removes samples from the majority class to balance the class distribution
#####       - Random Over Sampling (ROS): This method randomly duplicates samples from the minority class to balance the class distribution
#####       - Synthetic Minority Oversampling Technique (SMOTE): This method generates synthetic samples for the minority class to balance the class distribution
##### 4. Train the following models on each sampled dataset:
#####       - Logistic Regression (LR)
#####       - Decision Tree (DT)
#####       - Random Forest (RF)
#####       - XGBoost (XGB)
#####       - Stacking Ensemble Model: Combines Random Forest and XGBoost with Logistic Regression as meta-learner
#####       - Multi-layer Perceptron (MLP)
#####       - Isolation Forest (unsupervised anomaly detection)
##### 5. Evaluate model performance using:
#####       - Classification report (precision, recall, F1-score)
#####       - Confusion matrix
#####       - ROC curve and AUC score
#####       - Precision-Recall curve
#####       - Average precision score


##### Note: Overfitting was observed in all models trained on ROS and SMOTE apart from Isolation Forest (Unsupervised) and Logistic Regression (supervised), while RUS produced more balanced performance.

In [None]:
# Standardize the values in the Amount and Time columns using StandardScaler.
# The derived Hours column is not scaled here; it is dropped before modelling.
# NOTE(review): the scaler is fitted on the whole dataframe, i.e. before any train/test split,
# so test rows influence the scaling statistics — consider fitting on the training split only.

df[['Amount', 'Time']] = StandardScaler().fit_transform(df[['Amount', 'Time']])

In [None]:
# Split the dataframe into the features (independent variables) and the target variable (dependent variable).
# 'Hours' is dropped together with 'Class' because it is derived from the 'Time' column, which stays in x.

x = df.drop(['Class', 'Hours'], axis = 1)
y = df['Class']

<h3 style="text-align: center;">Random Under Sampling</h3>

In [None]:
# Initialize RandomUnderSampler

RUS = RandomUnderSampler(random_state = 42)

In [None]:
# Apply RandomUnderSampler

x_under_sampled, y_under_sampled = RUS.fit_resample(x, y)

In [None]:
# Convert the resampled arrays back to a dataframe and series

x_under_sampled = pd.DataFrame(x_under_sampled, columns = x.columns)
y_under_sampled = pd.Series(y_under_sampled, name = 'Class')

In [None]:
# Print the value_count of the under_sampled series

y_under_sampled.value_counts()

In [None]:
# Split the ORIGINAL data into training and testing sets first, then under-sample only the
# training portion. The test set keeps the real class distribution, so the reported metrics
# describe performance on untouched transactions. stratify = y keeps the fraud ratio equal
# in both splits.
# (x_under_sampled / y_under_sampled computed on the full data are not used for modelling.)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

x_train, y_train = RUS.fit_resample(x_train, y_train)

# Keep pandas containers regardless of what the sampler returns
x_train = pd.DataFrame(x_train, columns = x.columns)
y_train = pd.Series(y_train, name = 'Class')

In [None]:
# Defines a dictionary of classification models along with their hyperparameters for GridSearchCV during training and evaluation.

def get_models(random_state = 42):
    """Build the classifiers to evaluate together with their GridSearchCV grids.

    Parameters
    ----------
    random_state : int, default 42
        Seed passed to every estimator so repeated runs give the same models.
        Previously only the tree-based scikit-learn models were seeded.

    Returns
    -------
    dict
        Maps a model name to ``{'model': estimator, 'params': grid}``. An empty
        ``params`` dict means the model is fitted without hyperparameter tuning.
    """
    models = dict()

    models['Decision_Tree'] = {
        'model': DecisionTreeClassifier(random_state = random_state),
        'params': {
            'max_depth': [3, 5, 10, None],
            'min_samples_split': [2, 5, 10]
        }
    }

    # liblinear supports both penalties in the grid below
    models['Logistic_Regression'] = {
        'model': LogisticRegression(solver = 'liblinear', max_iter = 1000, random_state = random_state),
        'params': {
            'C': [0.01, 0.1, 1, 10],
            'penalty': ['l1', 'l2']
        }
    }

    models['Random_Forest'] = {
        'model': RandomForestClassifier(random_state = random_state),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10],
            'min_samples_split': [2, 5]
        }
    }

    # NOTE(review): use_label_encoder may be ignored by newer xgboost releases — confirm against the installed version
    models['XGBoost'] = {
        'model': XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', random_state = random_state),
        'params': {
            'n_estimators': [50, 100],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 5]
        }
    }

    # Stacking ensemble: Random Forest + XGBoost feed a Logistic Regression meta-learner
    base_learners = [
        ('Random_Forest', RandomForestClassifier(n_estimators = 100, random_state = random_state)),
        ('XGBoost', XGBClassifier(use_label_encoder = False, eval_metric = 'logloss', random_state = random_state)),
    ]

    final_estimator = LogisticRegression(max_iter = 1000, random_state = random_state)

    models['Stacked_Model'] = {

        'model': StackingClassifier(estimators = base_learners, final_estimator = final_estimator, cv = 5, n_jobs = -1),
        'params': {} # No hyperparameter tuning applied to stacking models
    }

    return models

In [None]:
# Function to train and evaluate the traditional machine learning models (models are not saved; save_dir is currently unused)

def evaluate_models(x_train, y_train, x_test, y_test, save_dir ='saved_models'):
    """Tune, fit and evaluate every classifier returned by get_models().

    Models with a parameter grid are tuned with a 5-fold GridSearchCV scored on
    ROC AUC; models with an empty grid are fitted as-is. For each fitted model the
    function prints the classification report and draws the confusion matrix,
    ROC curve and precision-recall curve for the test set.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : test features and labels (reported as 'Normal' / 'Fraud').
    save_dir : str
        Currently unused: nothing is written to disk. Kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns Model, Best_Params, Accuracy, Precision,
        Recall, F1, ROC_AUC and Average_Precision.
    """
    labels = ['Normal', 'Fraud']
    results = []

    for name, mp in get_models().items():
        print(f'\nModel Name: {name}')

        if mp['params']:
            grid = GridSearchCV(mp['model'], mp['params'], scoring = 'roc_auc', cv = 5, n_jobs = -1)
            grid.fit(x_train, y_train)
            best_model = grid.best_estimator_
            best_params = grid.best_params_
            print('Best Parameters:', best_params)

        else:
            best_model = mp['model'].fit(x_train, y_train)
            best_params = None
            print('No hyperparameter tuning applied.')

        y_pred = best_model.predict(x_test)

        # Prefer continuous scores for the ranking metrics; fall back to hard labels last
        if hasattr(best_model, 'predict_proba'):
            y_probs = best_model.predict_proba(x_test)[:, 1]
        elif hasattr(best_model, 'decision_function'):
            y_probs = best_model.decision_function(x_test)
        else:
            y_probs = y_pred

        # Classification Report
        print('\n Classification Report:')
        print(classification_report(y_test, y_pred, target_names = labels))

        # Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels = labels, yticklabels = labels)
        plt.title(f'{name} - Confusion Matrix')
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.show()

        # ROC Curve
        fpr, tpr, _ = roc_curve(y_test, y_probs)
        auc_score = roc_auc_score(y_test, y_probs)

        plt.figure(figsize=(6, 5))
        plt.plot(fpr, tpr, label = f'AUC = {auc_score:.2f}')
        plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'{name} - ROC Curve')
        plt.legend(loc='lower right')
        plt.show()

        print(f'AUC Score: {auc_score:.2f}')

        # Precision-Recall Curve and Average Precision
        precision, recall, _ = precision_recall_curve(y_test, y_probs)
        avg_precision = average_precision_score(y_test, y_probs)

        # Plot the Precision-Recall Curve
        plt.figure(figsize=(6, 5))
        plt.plot(recall, precision, label=f'AP = {avg_precision:.2f}')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'{name} - Precision-Recall Curve')
        plt.legend(loc='lower left')
        plt.show()

        print(f'Average Precision (PR-AUC): {avg_precision:.2f}')

        # Collect the headline numbers so models can be compared side by side
        results.append({
            'Model': name,
            'Best_Params': best_params,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1': f1_score(y_test, y_pred),
            'ROC_AUC': auc_score,
            'Average_Precision': avg_precision,
        })

    return pd.DataFrame(results)

In [None]:
# Function to train and evaluate the Isolation Forest model (the model is not saved; save_dir is currently unused)

def isolation_forest_model(x_train, x_test, y_test, save_dir = 'saved_models'):
    """Fit an IsolationForest on x_train and compare its anomalies with y_test.

    Prints the classification report and the average precision, and draws the
    confusion matrix, ROC curve and precision-recall curve. The labels are not
    used for fitting. ``save_dir`` is accepted but not used. Returns None.
    """
    class_names = ['Normal', 'Fraud']

    detector = IsolationForest(random_state = 42)
    detector.fit(x_train)

    # predict() marks anomalies with -1; report those as 1 (fraud) and everything else as 0
    flagged = (detector.predict(x_test) == -1).astype(int)

    print('Model: Isolation_Forest')
    print('Classification Report:')
    print(classification_report(y_test, flagged, target_names = class_names))

    # Confusion matrix heatmap
    _, cm_ax = plt.subplots(figsize = (10, 8))
    sns.heatmap(
        confusion_matrix(y_test, flagged),
        annot = True,
        fmt = "d",
        cmap = "Blues",
        cbar = True,
        xticklabels = class_names,
        yticklabels = class_names,
        ax = cm_ax,
    )
    cm_ax.set_title('Isolation_Forest Confusion Matrix')
    cm_ax.set_xlabel('Predicted Labels')
    cm_ax.set_ylabel('True Labels')
    plt.show()

    # Negate decision_function so that larger values mean "more anomalous"
    anomaly_scores = -detector.decision_function(x_test)

    # ROC curve and its AUC
    auc_value = roc_auc_score(y_test, anomaly_scores)
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, anomaly_scores)

    _, roc_ax = plt.subplots()
    roc_ax.plot(false_pos_rate, true_pos_rate, label = f'ROC Curve (AUC = {auc_value:.2f})')
    roc_ax.plot([0, 1], [0, 1], 'k--')
    roc_ax.set_title('ROC Curve for Isolation Forest')
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.legend(loc = 'lower right')
    plt.show()

    # Precision-recall curve and average precision
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, anomaly_scores)
    avg_precision = average_precision_score(y_test, anomaly_scores)

    _, pr_ax = plt.subplots(figsize = (6, 5))
    pr_ax.plot(pr_recall, pr_precision, label = f'AP = {avg_precision:.2f}')
    pr_ax.set_xlabel('Recall')
    pr_ax.set_ylabel('Precision')
    pr_ax.set_title('Precision-Recall Curve')
    pr_ax.legend(loc = 'lower left')
    plt.show()

    print(f'Average Precision (PR-AUC): {avg_precision:.2f}')

In [None]:
# Function to train and evaluate an MLP model (the model is returned, not saved to disk)

def build_mlp(x_train, y_train, x_test, y_test, seed = 42):
    """Train a small fully connected network and evaluate it on the test set.

    The network has Dense layers of 64, 32 and 16 units and a sigmoid output, with
    batch normalization and dropout after the first two hidden layers. It is trained
    for 30 epochs with Adam (learning rate 1e-3) on binary cross-entropy, with 10%
    of the training data held out for validation (validation_split = 0.1).

    Prints the classification report and draws the confusion matrix, ROC curve and
    precision-recall curve for the test set, using a 0.5 decision threshold.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test, y_test : test features and labels (reported as 'Normal' / 'Fraud').
    seed : int or None, default 42
        Seed applied through tf.keras.utils.set_random_seed so that weight
        initialization, dropout and shuffling are repeatable. Pass None to skip seeding.

    Returns
    -------
    (model, history)
        The fitted Keras model and the History object returned by model.fit.
    """

    # Clears the background session before training a new model

    tf.keras.backend.clear_session()

    # Seed the random generators so repeated runs give comparable results

    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    # Input shape for the MLP model

    input_shape = (x_train.shape[1],)

    model = Sequential([

        Dense(64, kernel_regularizer=l2(0.001), activation = 'relu', input_shape = input_shape),
        BatchNormalization(),
        Dropout(0.3),
        Dense(32, activation = 'relu'),
        BatchNormalization(),
        Dropout(0.2),
        Dense(16, activation = 'relu'),
        Dense(1, activation = 'sigmoid')

    ])

    model.compile(optimizer = Adam(1e-3), loss = 'binary_crossentropy', metrics = ['accuracy'])


    history = model.fit(

        x_train,
        y_train,
        epochs = 30,
        batch_size = 256,
        validation_split = 0.1,
        verbose = 2
    )

    # y_pred holds the predicted probabilities, y_preds the 0/1 labels at a 0.5 threshold
    y_pred = model.predict(x_test).flatten()
    y_preds = (y_pred > 0.5).astype(int)

    # Printing the classification report
    print(classification_report(y_test, y_preds, target_names = ['Normal','Fraud']))

    # Print confusion matrix
    cm = confusion_matrix(y_test, y_preds)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap ='Blues', xticklabels=['Normal','Fraud'], yticklabels=['Normal','Fraud'])
    plt.title('MLP Classifier - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Compute the AUC score
    roc_auc = roc_auc_score(y_test, y_pred)

    # Calculate the ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred)

    # Plot the ROC curve
    plt.figure(figsize=(6, 5))
    plt.plot(fpr, tpr, label = f'ROC-AUC = {roc_auc:.2f}')
    plt.plot([0,1],[0,1],'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('MLP Classifier - ROC Curve')
    plt.legend()
    plt.show()

    # Precision-Recall Curve and Average Precision
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    ap = average_precision_score(y_test, y_pred)

    # Plot the Precision-Recall Curve
    plt.figure(figsize=(6, 5))
    plt.plot(recall, precision, label=f'AP={ap:.2f}')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('MLP Classifier - Precision-Recall Curve')
    plt.legend()
    plt.show()

    return model, history

In [None]:
# Apply the function evaluate_models to the under_sampled data
evaluate_models(x_train, y_train, x_test, y_test)

In [None]:
# Apply the function isolation_forest_model to the under_sampled data
isolation_forest_model(x_train, x_test, y_test)

In [None]:
# Apply the function build_mlp to the under_sampled data
mlp_model, mlp_history = build_mlp(x_train, y_train, x_test, y_test)
mlp_model, mlp_history

<h3 style="text-align: center;">Random Over Sampling</h3>

In [None]:
# Initialize RandomOverSampler
ROS = RandomOverSampler(random_state = 42)

In [None]:
# Apply RandomOverSampler
x_over_sampled, y_over_sampled = ROS.fit_resample(x, y)

In [None]:
# Convert the resampled arrays back to a dataframe and series
x_over_sampled = pd.DataFrame(x_over_sampled, columns = x.columns)
y_over_sampled = pd.Series(y_over_sampled, name = 'Class')

In [None]:
# Print the value_count of the over_sampled series
y_over_sampled.value_counts()

In [None]:
# Split the ORIGINAL data into training and testing sets first, then over-sample only the
# training portion. Over-sampling before the split would place copies of the same minority
# rows in both sets and inflate the test scores. stratify = y keeps the fraud ratio equal
# in both splits.
# (x_over_sampled / y_over_sampled computed on the full data are not used for modelling.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

x_train, y_train = ROS.fit_resample(x_train, y_train)

# Keep pandas containers regardless of what the sampler returns
x_train = pd.DataFrame(x_train, columns = x.columns)
y_train = pd.Series(y_train, name = 'Class')

In [None]:
# Apply the function evaluate_models to the over_sampled data
evaluate_models(x_train, y_train, x_test, y_test)

In [None]:
# Apply the function isolation_forest_model to the over_sampled data
isolation_forest_model(x_train, x_test, y_test)

In [None]:
# Apply the function build_mlp to the over_sampled data
mlp_model, mlp_history = build_mlp(x_train, y_train, x_test, y_test)
mlp_model, mlp_history

<h3 style="text-align: center;">SMOTE (Synthetic Minority Over Sampling)</h3>

In [None]:
# Initialize SMOTE
Smote = SMOTE(random_state = 42)

In [None]:
# Apply SMOTE
x_smote, y_smote = Smote.fit_resample(x, y)

In [None]:
# Convert the resampled arrays back to a dataframe and series
x_smote = pd.DataFrame(x_smote, columns = x.columns)
y_smote = pd.Series(y_smote, name = 'Class')

In [None]:
# Print the value counts of the SMOTE-resampled series
y_smote.value_counts()

In [None]:
# Split the ORIGINAL data into training and testing sets first, then apply SMOTE only to the
# training portion. Generating synthetic samples before the split would let points
# interpolated from test rows reach the training set and inflate the test scores.
# stratify = y keeps the fraud ratio equal in both splits.
# (x_smote / y_smote computed on the full data are not used for modelling.)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

x_train, y_train = Smote.fit_resample(x_train, y_train)

# Keep pandas containers regardless of what the sampler returns
x_train = pd.DataFrame(x_train, columns = x.columns)
y_train = pd.Series(y_train, name = 'Class')

In [None]:
# Apply the function evaluate_models to the smote data
evaluate_models(x_train, y_train, x_test, y_test)

In [None]:
# Apply the function isolation_forest_model to the smote data
isolation_forest_model(x_train, x_test, y_test)

In [None]:
# Apply the function build_mlp to the smote data
mlp_model, mlp_history = build_mlp(x_train, y_train, x_test, y_test)
mlp_model, mlp_history