# 0. SMOTE & ADASYN Models

**Model Options**
- Decision Trees
- Extra Trees
- Random Forest
- Logistic Regression
- knn
<br><br>
**Sampling Used**
- SMOTE
- ADASYN

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

pd.options.display.max_rows = 99
pd.options.display.max_columns = 99

from imblearn.over_sampling import SMOTE, ADASYN
from collections import Counter

%matplotlib inline

Using TensorFlow backend.


In [2]:
# Creating our baseline model
df = pd.read_csv('creditcard.csv')
X = df.drop(columns=['Time', 'Class', 'Amount'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [3]:
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
counter = Counter(y_train)
print(counter)

Counter({0: 213236, 1: 369})


# 1. SMOTE Models

In [5]:
# transform the dataset
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(counter)

Counter({0: 213236, 1: 213236})


In [6]:
def basic_analysis_functions(func):
    if func.lower() == 'logistic':
        pipe = Pipeline(
        [
         ('log_model', LogisticRegression(solver='liblinear'))   
        ]
        )

        pipe_params = {
            'log_model__penalty': ['l1', 'l2'],
            'log_model__C': [1, 10, 1000, 1e9]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Logistic Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'decision tree':
        pipe = Pipeline(
            [
             ('dct_model', DecisionTreeClassifier())   
            ]
        )

        pipe_params = {
            'dct_model__min_samples_leaf': [1, 2, 4],
            'dct_model__max_depth': [None, 100, 200]
        }
        
        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Decision Tree Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'knn':
        pipe = Pipeline(
            [
             ('knn_model', KNeighborsClassifier(n_jobs=4))   
            ]
        )

        pipe_params = {
            'knn_model__n_neighbors': [3, 5, 7, 9],
            'knn_model__p': [1, 2]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a k-NN Model:')
        print('')
        confusion_credit_matrix(gs)
    
    elif func.lower() == 'rf':
        pipe = Pipeline(
            [
             ('rf_model', RandomForestClassifier())   
            ]
        )

        pipe_params = {
            'rf_model__n_estimators': [100, 150, 200],
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Random Forest Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'et':
        pipe = Pipeline(
            [
             ('et_model', ExtraTreesClassifier())   
            ]
        )

        pipe_params = {
            'et_model__n_estimators': [100, 150, 200],
            'et_model__max_depth': [None, 100, 150, 200]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for an Extra Trees Model:')
        print('')
        confusion_credit_matrix(gs)
        
    else:
    
        return print("Logistic, Decision Tree, RF, ET or KNN")


In [7]:
def confusion_credit_matrix(model):
    y_preds = model.predict(X_test)
    y_tr_preds = model.predict(X_train)
    
    tn, fp, fn, tp = confusion_matrix(y_train, y_tr_preds).ravel()
    cm = confusion_matrix(y_train, y_tr_preds)
    cm_df_train = pd.DataFrame(cm, columns=['Predicted Non-Fraud', 'Predicted Fraud'], index=['Actual Non-Fraud', 'Actual Fraud'])
    tr_sens = (tp / (tp + fn))
    tr_spec = (tn / (tn + fp))
    tr_fnr = (fn / (fn + tp))
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    cm = confusion_matrix(y_test, y_preds)
    cm_df = pd.DataFrame(cm, columns=['Predicted Non-Fraud', 'Predicted Fraud'], index=['Actual Non-Fraud', 'Actual Fraud'])
    t_sens = (tp / (tp + fn))
    t_spec = (tn / (tn + fp))
    t_fnr = (fn / (fn + tp))


    print(f'Best Parameters for Model: {model.best_params_}')
    print(f'Training Score: {round(model.score(X_train, y_train), 4)}')
    print('')
    print(f'Testing Score: {round(model.score(X_test, y_test), 4)}')
    print('\n' * 2)
    print(f'Training Sensitivity: {round(tr_sens, 4)}')  
    print(f'Training Specificity: {round(tr_spec, 4)}')
    print(f'Training False Negative Rate: {round(tr_fnr, 4)}')
    print("Training Confusion Matrix")
    display(cm_df_train)
    print('')
    print(f'Testing Sensitivity: {round(t_sens, 4)}')  
    print(f'Testing Specificity: {round(t_spec, 4)}')
    print(f'Testing False Negative Rate: {round(t_fnr, 4)}')
    print('')
    print("Testing Confusion Matrix")
    return display(cm_df)

In [8]:
funky = ['logistic', 'decision tree', 'rf', 'et', 'knn']

In [9]:
# for funk in funky:
#     basic_analysis_functions(funk)
#     print('*' * 40)

In [10]:
basic_analysis_functions('logistic')

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1000, 'log_model__penalty': 'l2'}
Training Score: 0.9524

Testing Score: 0.9744



Training Sensitivity: 0.9296
Training Specificity: 0.9753
Training False Negative Rate: 0.0704
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,207966,5270
Actual Fraud,15013,198223



Testing Sensitivity: 0.8943
Testing Specificity: 0.9745
Testing False Negative Rate: 0.1057

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,69267,1812
Actual Fraud,13,110


In [11]:
basic_analysis_functions('decision tree')

Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': None, 'dct_model__min_samples_leaf': 1}
Training Score: 1.0

Testing Score: 0.9978



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213236



Testing Sensitivity: 0.7398
Testing Specificity: 0.9983
Testing False Negative Rate: 0.2602

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,70955,124
Actual Fraud,32,91


In [12]:
basic_analysis_functions('rf')

Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 150}
Training Score: 1.0

Testing Score: 0.9995



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213236



Testing Sensitivity: 0.8049
Testing Specificity: 0.9998
Testing False Negative Rate: 0.1951

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,71064,15
Actual Fraud,24,99


In [13]:
basic_analysis_functions('et')

Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 200, 'et_model__n_estimators': 100}
Training Score: 1.0

Testing Score: 0.9995



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213236



Testing Sensitivity: 0.813
Testing Specificity: 0.9998
Testing False Negative Rate: 0.187

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,71066,13
Actual Fraud,23,100


In [14]:
basic_analysis_functions('knn')

Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 3, 'knn_model__p': 2}
Training Score: 0.9997

Testing Score: 0.9987



Training Sensitivity: 1.0
Training Specificity: 0.9994
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213114,122
Actual Fraud,0,213236



Testing Sensitivity: 0.8374
Testing Specificity: 0.999
Testing False Negative Rate: 0.1626

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,71008,71
Actual Fraud,20,103


# 2. ADASYN Models

In [15]:
# Creating our baseline model (again)
df = pd.read_csv('creditcard.csv')
X = df.drop(columns=['Time', 'Class', 'Amount'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [16]:
counter = Counter(y_train)
print(counter)

Counter({0: 213236, 1: 369})


In [17]:
# transform the dataset
oversample = ADASYN()
X_train, y_train = oversample.fit_resample(X_train, y_train)
counter = Counter(y_train)
print(counter)

Counter({1: 213239, 0: 213236})


In [18]:
basic_analysis_functions('logistic')

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1, 'log_model__penalty': 'l1'}
Training Score: 0.9075

Testing Score: 0.9155



Training Sensitivity: 0.8973
Training Specificity: 0.9176
Training False Negative Rate: 0.1027
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,195673,17563
Actual Fraud,21900,191339



Testing Sensitivity: 0.9024
Testing Specificity: 0.9155
Testing False Negative Rate: 0.0976

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,65074,6005
Actual Fraud,12,111


In [19]:
basic_analysis_functions('decision tree')

Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': None, 'dct_model__min_samples_leaf': 1}
Training Score: 1.0

Testing Score: 0.9976



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213239



Testing Sensitivity: 0.748
Testing Specificity: 0.9981
Testing False Negative Rate: 0.252

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,70941,138
Actual Fraud,31,92


In [20]:
basic_analysis_functions('rf')

Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9994



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213239



Testing Sensitivity: 0.7886
Testing Specificity: 0.9998
Testing False Negative Rate: 0.2114

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,71065,14
Actual Fraud,26,97


In [21]:
basic_analysis_functions('et')

Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 100, 'et_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9995



Training Sensitivity: 1.0
Training Specificity: 1.0
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,213236,0
Actual Fraud,0,213239



Testing Sensitivity: 0.8049
Testing Specificity: 0.9998
Testing False Negative Rate: 0.1951

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,71067,12
Actual Fraud,24,99


In [22]:
basic_analysis_functions('knn')

Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 9, 'knn_model__p': 1}
Training Score: 0.9989

Testing Score: 0.9971



Training Sensitivity: 1.0
Training Specificity: 0.9979
Training False Negative Rate: 0.0
Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,212780,456
Actual Fraud,0,213239



Testing Sensitivity: 0.878
Testing Specificity: 0.9973
Testing False Negative Rate: 0.122

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,70887,192
Actual Fraud,15,108
