In [None]:
# Library Imports

import kagglehub
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, classification_report


import pickle

## Importing Dataset

In [None]:
# Fetch the latest version of the Kaggle credit-card fraud dataset.
# The returned value is the local directory that holds the dataset files.
dataset_handle = "mlg-ulb/creditcardfraud"
path = kagglehub.dataset_download(dataset_handle)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mlg-ulb/creditcardfraud?dataset_version_number=3...


100%|██████████| 66.0M/66.0M [00:00<00:00, 103MB/s]

Extracting files...





In [None]:
# Show where the dataset files were placed on disk
print(f"Path to dataset files: {path}")

Path to dataset files: /root/.cache/kagglehub/datasets/mlg-ulb/creditcardfraud/versions/3


In [None]:
# Load the transactions table from the downloaded dataset directory
csv_file = f"{path}/creditcard.csv"
data = pd.read_csv(csv_file)
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


# One-Class SVM
Accuracy: ~87% (fraud class: recall 0.84, precision 0.01 — accuracy alone is misleading on data this imbalanced)

## Preprocessing Data

The features V1–V28 are already PCA-transformed, so no further feature extraction is needed.
Of the two non-PCA columns, Amount is scaled (robust scaling) and Time is dropped.

In [None]:
# Largest per-column count of missing values; 0 means there are no nulls anywhere
missing_per_column = data.isna().sum()
missing_per_column.max()

0

In [None]:
# The classes are heavily imbalanced, so training will use a sub-sample
class_column = data['Class']
class_column.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [None]:
# Robust-scale Amount into a new column, then remove the raw Time/Amount columns

scaler = RobustScaler()

# The scaler expects a 2-D input, hence the single-column reshape
amount_2d = data['Amount'].to_numpy().reshape(-1, 1)
data['amount_s'] = scaler.fit_transform(amount_2d)

data = data.drop(columns=['Time', 'Amount'])

data.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Class', 'amount_s'],
      dtype='object')

## Good Training

In [None]:
# Build a more balanced training set (1 positive : 4 negatives)

# Positional 80/20 split: the first 4/5 of the rows are the training pool,
# the remaining 1/5 is held out for testing. The "half" names are historical;
# later cells reference them, so they are kept.
half = int(len(data) * 4/5)

# Not used by the cells below; retained in case other cells reference it.
negatives = data[data['Class'] == 0]

first_half = data.iloc[:half]
second_half = data.iloc[half:]

# Every positive row in the training pool is kept.
train_positives = first_half[first_half['Class'] == 1]
# len() instead of value_counts()[1]: no KeyError if the pool has no positives.
positive_count = len(train_positives)

print("Positives:", positive_count)

# Undersample the negatives to 4x the positives. The fixed seed makes the
# training set (and therefore every metric reported below) reproducible.
train_negatives = first_half[first_half['Class'] == 0].sample(
    positive_count * 4, random_state=42
)

# pd.concat is the public replacement for the private DataFrame._append;
# the original row index is preserved, as before.
X_train = pd.concat([train_positives, train_negatives])
y_train = X_train['Class']
X_train = X_train.drop(columns='Class')

Positives: 417


In [None]:
# Held-out test set: features are every column except the label
label_column = 'Class'
X_test = second_half.drop(columns=label_column)
y_test = second_half[label_column]

In [None]:
# Train
# scikit-learn documents the `y` argument of OneClassSVM.fit as ignored
# (the estimator is unsupervised); it is passed through unchanged here.
ocsvm = OneClassSVM(nu=0.25, gamma='auto', kernel='rbf')

ocsvm.fit(X=X_train, y=y_train)

In [None]:
# Test

# predict() yields +1 / -1; relabel to match the Class column:
# +1 -> 0 and -1 -> 1.
svm_raw = ocsvm.predict(X_test)
y_predict = (svm_raw == -1).astype(int)

print("Accuracy Score :", accuracy_score(y_test, y_predict), sep="\n")
print("Classification Report :", classification_report(y_test, y_predict), sep="\n")

Accuracy Score :
0.8680348302377023
Classification Report :
              precision    recall  f1-score   support

           0       1.00      0.87      0.93     56887
           1       0.01      0.84      0.02        75

    accuracy                           0.87     56962
   macro avg       0.50      0.85      0.47     56962
weighted avg       1.00      0.87      0.93     56962



# Isolation Forest

Accuracy: ~98.6% (fraud class: recall 0.20, precision 0.02 — the high accuracy mostly reflects the majority class)

In [None]:
# Reuses the train/test data prepared for the One-Class SVM section

# max_samples matches the size of the training set built earlier
# (positive_count positives + 4 * positive_count negatives).
iforest = IsolationForest(
    n_estimators=100,
    max_samples=positive_count * 5,
    contamination=0.1,
    random_state=42,
).fit(X_train, y_train)

# predict() yields +1 / -1; relabel to match the Class column:
# +1 -> 0 and -1 -> 1.
forest_raw = iforest.predict(X_test)
y_predict = (forest_raw == -1).astype(int)

print("Accuracy Score :", accuracy_score(y_test, y_predict), sep="\n")
print("Classification Report :", classification_report(y_test, y_predict), sep="\n")


Accuracy Score :
0.985657104736491
Classification Report :
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     56887
           1       0.02      0.20      0.04        75

    accuracy                           0.99     56962
   macro avg       0.51      0.59      0.51     56962
weighted avg       1.00      0.99      0.99     56962



# Exporting

In [None]:
# Persist the fitted Isolation Forest to disk as a binary pickle.
# NOTE(review): only the model is saved; the RobustScaler used to create
# `amount_s` is not exported — confirm whether consumers of this file need it.
with open("iforest.pkl", "wb") as model_file:
    pickle.dump(iforest, model_file)