In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
import warnings

In [9]:
df = pd.read_csv('creditcardfraud/creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [10]:
from sklearn.model_selection import StratifiedShuffleSplit

sss = StratifiedShuffleSplit(n_splits = 10,test_size = 0.2,random_state = 42)

for train_index, test_index in sss.split(df,df['Class']):
    train_set,test_set = df.iloc[train_index], df.iloc[test_index]


In [11]:
X_train , X_test = train_set.drop('Class',axis =1), test_set.drop('Class',axis= 1)
Y_train, Y_test = train_set['Class'], test_set['Class']
print(X_train.shape, X_test.shape)
print(sum(Y_train == 1))
print(sum(Y_test == 1))

(227845, 30) (56962, 30)
394
98


In [12]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
ss.fit_transform(X_train)
ss.fit_transform(X_test)

array([[-0.43185767, -1.43676687,  1.38197685, ..., -2.75019997,
         0.3605243 , -0.32201245],
       [-0.82685066,  0.69210307,  0.30329832, ..., -0.01705376,
         0.00728771, -0.34762983],
       [-1.85018408,  0.61671435, -0.33337874, ..., -0.14879302,
        -0.06117607, -0.19585196],
       ...,
       [ 1.15566408,  1.05168515,  0.3507202 , ..., -0.16931227,
         0.00378995, -0.24003683],
       [ 0.54103137, -0.44971045,  0.22312743, ..., -0.52802222,
        -0.55612652, -0.35935235],
       [-0.46522013, -0.73749655,  0.15741126, ..., -0.19798302,
         0.13843414,  0.71424142]])

In [13]:
'''
def evaluate_model(true, predicted):
    r2 = r2_score(true,predicted)
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    
    return mae, mse, r2
'''
def evaluate_model(true,predicted):
    accuracy = accuracy_score(true, predicted)
    return accuracy

In [14]:
models = {"Logistic Regression": LogisticRegression(),
          "SVC": SVC(),
          "KNeighborsClassifier": KNeighborsClassifier(),
          "RidgeClassifier": RidgeClassifier(),
          "AdaBoostClassifier": AdaBoostClassifier(),
          "RandomForestClassifier": RandomForestClassifier(),
          "XGBClassifier": XGBClassifier(),
          "DecisionTreeClassifier": DecisionTreeClassifier()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,Y_train)

    Y_test_pred = model.predict(X_test)
    Y_train_pred = model.predict(X_train)

    # train_mae, train_mse, train_r2 = evaluate_model(Y_train_pred, Y_train)
    # test_mae, test_mse, test_r2 = evaluate_model(Y_test_pred, Y_test)
    train_acc = evaluate_model(Y_train_pred, Y_train)
    test_acc = evaluate_model(Y_test_pred, Y_test)
    
    print(list(models.keys())[i])
    
    print("Model Performance for training set")
    # print("Mean Absolute Error: ", train_mae)
    # print("Mean Squared Error: ", train_mse)
    # print("R2 Score: ", train_r2)
    print("Train Accuracy: ",train_acc)
    print("---------------------------------------")

    print("Model Performance for testing set")
    # print("Mean Absolute Error: ", test_mae)
    # print("Mean Squared Error: ", test_mse)
    # print("R2 Score: ", test_r2)
    print("Test Accuracy: ",test_acc)

    print("="*35)
    print('\n')

Logistic Regression
Model Performance for training set
Train Accuracy:  0.9988852070486515
---------------------------------------
Model Performance for testing set
Test Accuracy:  0.9989642217618764


SVC
Model Performance for training set
Train Accuracy:  0.9982707542408216
---------------------------------------
Model Performance for testing set
Test Accuracy:  0.9982795547909132


KNeighborsClassifier
Model Performance for training set
Train Accuracy:  0.99847264587768
---------------------------------------
Model Performance for testing set
Test Accuracy:  0.9982619992275552


RidgeClassifier
Model Performance for training set
Train Accuracy:  0.9988676512541421
---------------------------------------
Model Performance for testing set
Test Accuracy:  0.9988237772550121


AdaBoostClassifier
Model Performance for training set
Train Accuracy:  0.9992143781957032
---------------------------------------
Model Performance for testing set
Test Accuracy:  0.9992099996488887


RandomForest