In [1]:
# unbalanced data set 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import (ensemble, model_selection, preprocessing, tree)
from sklearn.metrics import (auc, confusion_matrix, roc_auc_score, roc_curve,
                             accuracy_score, classification_report
                             )
from sklearn.model_selection import (train_test_split, StratifiedKFold)
from sklearn.linear_model import LogisticRegression

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read creditcard.csv (path is relative to the working directory) into the
# raw frame `data`, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='creditcard.csv')
data.head(n=5)

In [None]:
# Display the column labels of the raw frame.
data.columns

In [None]:
# Display (number of rows, number of columns) of the raw frame.
data.shape

In [None]:
# Tally the rows per 'Class' label to see how unbalanced the target is.
data.Class.value_counts()

In [8]:
# Work on an independent copy: a plain `df = data` would only alias the raw
# frame, so the column overwrites performed on `df` further down (scaling of
# 'Amount' and 'Time') would silently change `data` as well.
df = data.copy()

In [None]:
# Display the column labels of the working frame.
df.columns

In [None]:
# Summary statistics of the working frame's columns.
df.describe()

In [52]:
# one-hot encoding
# df = pd.get_dummies(df, drop_first=True)
# df = df.drop(columns = ['AON', 'MSISDN'])

In [58]:
# df = df.dropna(axis=0, how='any')

In [21]:
# Separate the features from the 'Class' target, then hold out 20% of the
# rows as a test set (fixed seed so the split is repeatable).
X = df.drop('Class', axis=1)
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Count the rows per label in the full target series.
y.value_counts()

In [None]:
# X_train.isnull().sum()

In [None]:
# df.head(10)

In [None]:
# Baseline linear model: logistic regression with library-default settings,
# fitted on the training split. Print the first ten predicted test labels
# as a quick sanity check.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(y_pred[:10])

In [None]:
# Score the current predictions against the held-out labels: overall
# accuracy, per-class precision/recall/f1, and the raw confusion matrix.
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print("confusion matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

In [None]:
# Number of rows in the held-out test split.
len(y_test)

In [None]:
# Count the rows per label in the test split.
y_test.value_counts()

In [None]:
# Draw a 30-bin histogram per column of the working frame on one large
# (30 x 30) figure to inspect the value distributions.
df.hist(figsize=(30, 30), bins=30)

In [None]:
# Standardise the 'Amount' and 'Time' columns (subtract the mean, divide by
# the standard deviation, both measured on the training rows).
from sklearn.preprocessing import StandardScaler

scaled_cols = ['Amount', 'Time']

# Learn the scaling statistics from the training rows only, so that nothing
# about the held-out rows leaks into preprocessing. The row partition is
# reproduced with the same test_size / random_state that the X, y split cells
# use; keep these arguments in sync with them.
train_idx = train_test_split(df.index, test_size=0.2, random_state=42)[0]
scaler = StandardScaler().fit(df.loc[train_idx, scaled_cols])

# Rebind `df` to a new frame instead of overwriting its columns in place:
# `df` may share its data with the raw `data` frame, which must stay unscaled.
scaled = scaler.transform(df[scaled_cols])
df = df.assign(Amount=scaled[:, 0], Time=scaled[:, 1])
df.head()

In [32]:
# Rebuild the features / target and the 80/20 split from the current `df`,
# so the models below train on the rescaled 'Amount' and 'Time' columns.
# The seed is unchanged, so the split stays repeatable.
X = df.drop(columns='Class')
y = df['Class']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Refit the default logistic regression on the current training split and
# print the first ten predicted test labels.
lr = LogisticRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(y_pred[:10])

In [35]:
# Author's observation: the f1-score improved once the 'Time' and 'Amount'
# columns were scaled. Report accuracy, the per-class metrics and the
# confusion matrix for the current predictions.
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print("confusion matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy:  0.9991397773954567
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.60      0.71        98

    accuracy                           1.00     56962
   macro avg       0.93      0.80      0.85     56962
weighted avg       1.00      1.00      1.00     56962

confusion matrix: 
 [[56854    10]
 [   39    59]]


In [36]:
# Address the class imbalance with SMOTE over-sampling. Only the training
# split is resampled; the test split is left as it is.
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

In [40]:
# Check the class counts of the resampled training labels.
y_train_resampled.value_counts()

Class
0    227451
1    227451
Name: count, dtype: int64

In [41]:
# Fit the default logistic regression on the SMOTE-resampled training data,
# predict on the untouched test split, and print the first ten labels.
lr = LogisticRegression().fit(X_train_resampled, y_train_resampled)
y_pred = lr.predict(X_test)
print(y_pred[:10])

[1 0 0 0 0 0 0 0 0 0]


In [None]:
# Oversampling did not improve the f1-score. In the recorded run, compared
# with the model trained without resampling, class-1 recall rose
# (0.60 -> 0.93) but precision collapsed (0.86 -> 0.06), so f1 dropped
# (0.71 -> 0.11): many more class-0 rows are now flagged as class 1.
print("Accuracy: ", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print("confusion matrix: \n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Accuracy:  0.9745795442575752
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     56864
           1       0.06      0.93      0.11        98

    accuracy                           0.97     56962
   macro avg       0.53      0.95      0.55     56962
weighted avg       1.00      0.97      0.99     56962

confusion matrix: 
 [[55423  1441]
 [    7    91]]
