In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.utils import shuffle

# **Load the Data**

In [2]:
# Load the data and shuffle it
def load_dataset(file_path, random_state=42):
    """Read a CSV file and return its rows in a reproducibly shuffled order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.
    random_state : int or None, default 42
        Seed for the row permutation, so that a fresh kernel run yields the
        same order. Pass ``None`` to get a different order on every call.

    Returns
    -------
    pandas.DataFrame
        Every row of the file exactly once, in permuted order. The original
        index labels are kept (the index is not reset).
    """
    df = pd.read_csv(file_path)
    # frac=1 samples all rows without replacement, i.e. a seeded permutation.
    return df.sample(frac=1, random_state=random_state)

def explore_data(df):
    """Print a structural summary and the class balance, then return statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it must contain a ``'Class'`` column.

    Returns
    -------
    pandas.DataFrame
        The result of ``df.describe()``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'Class'`` column.
    """
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()
    print('Class Distribution:\n', df['Class'].value_counts())
    return df.describe()

# Load the CSV from the working directory. explore_data() is the last
# expression of the cell, so the describe() frame it returns is displayed
# as the cell output after the printed summary.
df = load_dataset('./creditcard.csv')
explore_data(df)

<class 'pandas.core.frame.DataFrame'>
Index: 284807 entries, 31575 to 70896
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     2848

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.176758e-15,3.385972e-16,-1.398098e-15,2.093455e-15,1.005114e-15,1.496244e-15,-5.638796e-16,1.140633e-16,-2.412393e-15,...,1.606229e-16,-3.507221e-16,2.636528e-16,4.472604e-15,5.14582e-16,1.685202e-15,-3.656286e-16,-1.22845e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


# **Preprocess the Data**

In [3]:
#Scale and split the data
def preprocess_data(data, target='Class', test_size=0.2, random_state=42):
    """Split ``data`` into stratified train/test sets and standardise the features.

    The split is done *before* scaling, and the scaler is fitted on the
    training rows only. Fitting it on the full dataset would let the test
    rows influence the mean/variance used to transform the training features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    target : str, default 'Class'
        Name of the label column; every other column is treated as a feature.
    test_size : float, default 0.2
        Fraction of rows held out for testing.
    random_state : int, default 42
        Seed for the split.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Standardised feature matrices, using statistics from the training rows.
    y_train, y_test : pandas.Series
        Labels matching the rows of ``X_train`` / ``X_test``.
    """
    # Separate features from the label.
    X = data.drop(target, axis=1)
    y = data[target]

    # Split first; stratify=y keeps the class ratio the same in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Learn scaling statistics from the training rows only, then apply the
    # same transform to the held-out rows.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# Build the train/test partitions from the loaded frame; the training and
# evaluation cells below consume these four names.
X_train, X_test, y_train, y_test = preprocess_data(df)

# **Train the Model**

In [4]:
#Train a RandomForestClassifier to detect fraud
def train_model(X_train, y_train):
    """Fit a random forest on the training data and return the fitted estimator.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier (100 trees, seed 42, ``class_weight='balanced'``).
    """
    forest_params = {
        'n_estimators': 100,
        'random_state': 42,
        'class_weight': 'balanced',
    }
    # fit() returns the estimator itself, so the fitted forest is returned directly.
    return RandomForestClassifier(**forest_params).fit(X_train, y_train)

# Fit on the training partition only; the test partition is not passed here.
model = train_model(X_train, y_train)

# **Evaluate the Model**

In [5]:
#Evaluate the model's performance using classification metrics
def evaluate_model(model, X_test, y_test):
    """Print the confusion matrix, classification report and ROC-AUC for ``model``.

    Parameters
    ----------
    model : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    X_test : array-like
        Feature matrix to score.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    None
        All results are written to stdout.
    """
    predicted_labels = model.predict(X_test)
    # Second column of predict_proba is used as the positive-class score
    # (assumes a binary target whose positive class is listed second).
    positive_scores = model.predict_proba(X_test)[:, 1]

    matrix = confusion_matrix(y_test, predicted_labels)
    print('Confusion matrix: \n', matrix)

    report = classification_report(y_test, predicted_labels)
    print('\nClassification Report: \n', report)

    auc = roc_auc_score(y_test, positive_scores)
    print('\nROC-AUC Score:', auc)

# Score the fitted model on the test partition produced by preprocess_data().
evaluate_model(model, X_test, y_test)

Confusion matrix: 
 [[56859     5]
 [   18    80]]

Classification Report: 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.82      0.87        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962


ROC-AUC Score: 0.968483610734671
