In [2]:
import pandas as pd
import  numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc
from imblearn.over_sampling import RandomOverSampler
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the raw credit-card transactions into a DataFrame
credit = pd.read_csv('credit_fraud/creditcard.csv')

# Standardize the 'Amount' and 'Time' columns in place.
# Per the original author's note, the remaining feature columns are already normalized (PCA outputs).
# NOTE(review): this fit sees every row, including rows that later land in the test split.
scaler = StandardScaler()
raw_scale_columns = ['Amount', 'Time']
credit[raw_scale_columns] = scaler.fit_transform(credit[raw_scale_columns])

In [4]:
#Create the labels and feature splits

#the 'Class' column is the label, so it is removed from the feature matrix
X = credit.drop(columns='Class')
labels = credit['Class']

#split the data: 80/20, stratified on the label so both splits keep the same class ratio
features_train, features_test, labels_train, labels_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

# Standardize 'Amount' and 'Time' with statistics learned from the training rows only,
# so the mean/std applied to the held-out rows never include those rows (no test-set leakage).
# Standardization is an affine map, so this yields the train-only result whether or not
# these columns were already standardized on the full dataset beforehand.
train_scaler = StandardScaler()
train_scaled_columns = ['Amount', 'Time']

# Work on explicit copies so the assignments below never write into views of `credit`
features_train = features_train.copy()
features_test = features_test.copy()
features_train[train_scaled_columns] = train_scaler.fit_transform(features_train[train_scaled_columns])
features_test[train_scaled_columns] = train_scaler.transform(features_test[train_scaled_columns])



In [5]:
# Rebalance the training split to compensate for the large class disparity.
# Only the training data is resampled; the test split is not passed to the sampler.
oversampler = RandomOverSampler(random_state=42)
resampled_split = oversampler.fit_resample(features_train, labels_train)
features_train, labels_train = resampled_split


In [6]:
# Candidate classifiers to compare, keyed by the display name used when storing results.
# Every model is given the same random_state so reruns are reproducible.
models = {
    # max_iter is set explicitly to give the solver a larger iteration budget
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    # `use_label_encoder` is intentionally not passed: it is deprecated and ignored by
    # current XGBoost releases, where supplying it only triggers an "unused parameter" warning.
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

In [None]:
# Evaluation metrics per model, keyed by the same display names as `models`
results = {}

for model_name, model in models.items():
    # Train on the (resampled) training split
    model.fit(features_train, labels_train)

    # Hard class predictions, plus the second column of predict_proba
    # (one score per test row, for the class listed second in the model's classes)
    labels_pred = model.predict(features_test)
    labels_prob = model.predict_proba(features_test)[:, 1]

    # Build this model's metrics entry step by step
    model_metrics = {
        "Classification Report": classification_report(labels_test, labels_pred, output_dict=True)
    }
    model_metrics["ROC AUC"] = roc_auc_score(labels_test, labels_prob)

    # Area under the precision-recall curve (recall on the x-axis, precision on the y-axis)
    precision, recall, _ = precision_recall_curve(labels_test, labels_prob)
    model_metrics["PR AUC"] = auc(recall, precision)

    results[model_name] = model_metrics

ValueError: Found input variables with inconsistent numbers of samples: [56962, 1]