**Models Used**
- Decision Trees
- Extra Trees
- Random Forest
- Logistic Regression
- knn

<br><br>
**Additional Models to Use**
<br>
- Gradient Boosting<br>
- AdaBoost<br>
- Averaging<br>
- Majority Vote<br>
- Stacking<br>
- Neural Networks<br>
- SMOTE <br>
- ADASYN <br>

SMOTE:<br>
https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html
<br><br>

ADASYN:<br>
https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.ADASYN.html

#### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

pd.options.display.max_rows = 99
pd.options.display.max_columns = 99

from imblearn.over_sampling import SMOTE, ADASYN


%matplotlib inline

Using TensorFlow backend.


In [2]:
def confusion_credit_matrix(model):
    y_preds = model.predict(X_test)
    y_vals = model.predict(X_val)
    y_tr_preds = model.predict(X_train)
    
    tn, fp, fn, tp = confusion_matrix(y_train, y_tr_preds).ravel()
    cm = confusion_matrix(y_train, y_tr_preds)
    cm_df_train = pd.DataFrame(cm, columns=['Predicted Non-Fraud', 'Predicted Fraud'], index=['Actual Non-Fraud', 'Actual Fraud'])
    
    tn, fp, fn, tp = confusion_matrix(y_test, y_preds).ravel()
    cm = confusion_matrix(y_test, y_preds)
    cm_df = pd.DataFrame(cm, columns=['Predicted Non-Fraud', 'Predicted Fraud'], index=['Actual Non-Fraud', 'Actual Fraud'])
    t_sens = (tp / (tp + fn))
    t_spec = (tn / (tn + fp))
    t_fnr = (fn / (fn + tp))
    
    tn, fp, fn, tp = confusion_matrix(y_val, y_vals).ravel()
    cm = confusion_matrix(y_val, y_vals)
    cm_df_v = pd.DataFrame(cm, columns=['Predicted Non-Fraud', 'Predicted Fraud'], index=['Actual Non-Fraud', 'Actual Fraud'])
    val_sens = (tp / (tp + fn))
    val_spec = (tn / (tn + fp))
    val_fnr = (fn / (fn + tp))

    print(f'Best Parameters for Model: {model.best_params_}')
    print(f'Training Score: {round(model.score(X_train, y_train), 4)}')
    print('')
    print(f'Testing Score: {round(model.score(X_test, y_test), 4)}')
    print('')
    print(f'Validation Score: {round(model.score(X_val, y_val), 4)}')
    print('\n' * 2)
    print("Training Confusion Matrix")
    display(cm_df_train)
    print('')
    print(f'Testing Sensitivity: {round(t_sens, 4)}')  
    print(f'Testing Specificity: {round(t_spec, 4)}')
    print(f'Testing False Negative Rate: {round(t_fnr, 4)}')
    print('')
    print("Testing Confusion Matrix")
    display(cm_df)
    print('')
    print(f'Validation Sensitivity: {round(val_sens, 4)}') 
    print(f'Validation Specificity: {round(val_spec, 4)}')
    print(f'Validation False Negative Rate: {round(val_fnr, 4)}')
    print('')
    print("Validation Confusion Matrix")
    return display(cm_df_v)

In [3]:
df = pd.read_csv('datasets/oversample_credit_training.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,63578.0,-0.639191,-0.085595,1.265452,1.401166,-0.260542,1.009795,1.301999,-0.135258,-0.431521,-0.091353,-0.921052,-0.613816,0.198913,0.016087,2.206264,-0.527141,-0.057279,0.448656,1.164435,0.861307,0.340331,0.76017,0.353377,-0.778893,-0.070681,-0.033355,-0.061962,-0.062687,290.18,1
1,166028.0,-0.95639,2.361594,-3.171195,1.970759,0.474761,-1.902598,-0.055178,0.277831,-1.745854,-2.516628,0.874052,-2.513104,0.021575,-3.565119,0.461153,-2.015713,-1.731413,-0.465815,0.52762,0.190877,0.473211,0.7194,0.122458,-0.25565,-0.619259,-0.48428,0.683535,0.443299,39.9,1
2,56806.0,0.016828,2.400826,-4.22036,3.462217,-0.624142,-1.294303,-2.986028,0.751883,-1.606672,-5.974925,3.264922,-5.095032,0.307808,-10.018106,0.273283,-3.562534,-4.377106,-1.792635,0.080281,0.590418,0.285832,-0.771508,-0.2652,-0.873077,0.939776,-0.219085,0.874494,0.470434,1.0,1
3,8614.0,-2.169929,3.639654,-4.508498,2.730668,-2.122693,-2.341017,-4.235253,1.703538,-1.305279,-6.71672,6.353612,-8.601648,0.44993,-7.506169,-0.438082,-3.694516,-6.304753,-1.267587,0.357987,0.500779,0.645103,-0.503529,-0.000523,0.071696,0.092007,0.308498,0.552591,0.298954,1.0,1
4,101051.0,-1.465316,-1.093377,-0.059768,1.064785,11.095089,-5.430971,-9.378025,-0.446456,1.99211,1.785922,1.368585,-1.471697,-0.724759,3.442422,-0.957403,-1.626129,1.418215,-1.417917,-1.651766,-1.45761,1.160623,-1.259697,-15.981649,-0.88367,-3.536716,-0.592965,0.675525,0.424849,0.92,1


In [4]:
X = df.drop(columns=['Time', 'Class', 'Amount'])
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

In [5]:
X_train.shape, X_test.shape

((341163, 28), (113721, 28))

In [6]:
df_test = pd.read_csv('datasets/oversample_credit_testing.csv')
df_test.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
1,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,1.017614,0.83639,1.006844,-0.443523,0.150219,0.739453,-0.54098,0.476677,0.451773,0.203711,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0
2,10.0,1.449044,-1.176339,0.91386,-1.375667,-1.971383,-0.629152,-1.423236,0.048456,-1.720408,1.626659,1.199644,-0.67144,-0.513947,-0.095045,0.23093,0.031967,0.253415,0.854344,-0.221365,-0.387226,-0.009302,0.313894,0.02774,0.500512,0.251367,-0.129478,0.04285,0.016253,7.8,0
3,13.0,-0.436905,0.918966,0.924591,-0.727219,0.915679,-0.127867,0.707642,0.087962,-0.665271,-0.73798,0.324098,0.277192,0.252624,-0.291896,-0.18452,1.143174,-0.928709,0.68047,0.025436,-0.047021,-0.194796,-0.672638,-0.156858,-0.888386,-0.342413,-0.049027,0.079692,0.131024,0.89,0
4,23.0,1.322707,-0.174041,0.434555,0.576038,-0.836758,-0.831083,-0.264905,-0.220982,-1.071425,0.868559,-0.641506,-0.111316,0.361485,0.171945,0.782167,-1.355871,-0.216935,1.271765,-1.240622,-0.522951,-0.284376,-0.323357,-0.03771,0.347151,0.559639,-0.280158,0.042335,0.028822,16.0,0


In [7]:
X_val = df_test.drop(columns=['Time', 'Class', 'Amount'])
y_val = df_test['Class']

#### 1. Logistic Regression Model

In [8]:
pipe = Pipeline(
    [
     ('log_model', LogisticRegression(solver='liblinear'))   
    ]
)

pipe_params = {
    'log_model__penalty': ['l1', 'l2'],
    'log_model__C': [1, 10, 1000, 1e9]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [9]:
results = gs.fit(X_train, y_train)

In [10]:
confusion_credit_matrix(gs)

Best Parameters for Model: {'log_model__C': 10, 'log_model__penalty': 'l1'}
Training Score: 0.9497

Testing Score: 0.949

Validation Score: 0.9753



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,166422,4159
Actual Fraud,13018,157564



Testing Sensitivity: 0.9233
Testing Specificity: 0.9747
Testing False Negative Rate: 0.0767

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55424,1437
Actual Fraud,4364,52496



Validation Sensitivity: 0.9318
Validation Specificity: 0.9754
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55472,1401
Actual Fraud,6,82


#### 2. DecisionTrees Model

In [11]:
pipe = Pipeline(
    [
     ('dct_model', DecisionTreeClassifier())   
    ]
)

pipe_params = {
    'dct_model__min_samples_leaf': [1, 2, 4],
    'dct_model__max_depth': [None, 100, 200]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [12]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('dct_model',
                                        DecisionTreeClassifier(ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features=None,
                                                               max_leaf_nodes=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                                               min_samples_leaf=1,
                                                               min_samples_split=2,
                                   

In [13]:
#Making Sure validation is highly imbalanced
df_test['Class'].value_counts(normalize=True)

0    0.998455
1    0.001545
Name: Class, dtype: float64

In [14]:
confusion_credit_matrix(gs)

Best Parameters for Model: {'dct_model__max_depth': 200, 'dct_model__min_samples_leaf': 1}
Training Score: 1.0

Testing Score: 0.9998

Validation Score: 0.9991



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,170581,0
Actual Fraud,0,170582



Testing Sensitivity: 1.0
Testing Specificity: 0.9996
Testing False Negative Rate: 0.0

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56839,22
Actual Fraud,0,56860



Validation Sensitivity: 0.7614
Validation Specificity: 0.9995
Validation False Negative Rate: 0.2386

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56843,30
Actual Fraud,21,67


#### 3. $k$-NN Model

In [15]:
pipe = Pipeline(
    [
     ('knn_model', KNeighborsClassifier(n_jobs=4))   
    ]
)

pipe_params = {
    'knn_model__n_neighbors': [3, 5, 7, 9],
    'knn_model__p': [1, 2]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [16]:
results = gs.fit(X_train, y_train)

In [17]:
confusion_credit_matrix(gs)

Best Parameters for Model: {'knn_model__n_neighbors': 3, 'knn_model__p': 1}
Training Score: 0.9999

Testing Score: 0.9997

Validation Score: 0.9993



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,170535,46
Actual Fraud,0,170582



Testing Sensitivity: 1.0
Testing Specificity: 0.9994
Testing False Negative Rate: 0.0

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56828,33
Actual Fraud,0,56860



Validation Sensitivity: 0.8636
Validation Specificity: 0.9995
Validation False Negative Rate: 0.1364

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56843,30
Actual Fraud,12,76


#### 4. Random Forest Model

In [18]:
pipe = Pipeline(
    [
     ('rf_model', RandomForestClassifier())   
    ]
)

pipe_params = {
    'rf_model__n_estimators': [100, 150, 200],
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [19]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('rf_model',
                                        RandomForestClassifier(bootstrap=True,
                                                               ccp_alpha=0.0,
                                                               class_weight=None,
                                                               criterion='gini',
                                                               max_depth=None,
                                                               max_features='auto',
                                                               max_leaf_nodes=None,
                                                               max_samples=None,
                                                               min_impurity_decrease=0.0,
                                                               min_impurity_split=None,
                                         

In [20]:
confusion_credit_matrix(gs)

Best Parameters for Model: {'rf_model__n_estimators': 150}
Training Score: 1.0

Testing Score: 0.9999

Validation Score: 0.9996



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,170581,0
Actual Fraud,0,170582



Testing Sensitivity: 1.0
Testing Specificity: 0.9999
Testing False Negative Rate: 0.0

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56854,7
Actual Fraud,0,56860



Validation Sensitivity: 0.8182
Validation Specificity: 0.9999
Validation False Negative Rate: 0.1818

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56866,7
Actual Fraud,16,72


#### 5. Extra Trees Model

In [21]:
pipe = Pipeline(
    [
     ('et_model', ExtraTreesClassifier())   
    ]
)

pipe_params = {
    'et_model__n_estimators': [100, 150, 200],
    'et_model__max_depth': [None, 100, 150, 200]
}

gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')

In [22]:
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('et_model',
                                        ExtraTreesClassifier(bootstrap=False,
                                                             ccp_alpha=0.0,
                                                             class_weight=None,
                                                             criterion='gini',
                                                             max_depth=None,
                                                             max_features='auto',
                                                             max_leaf_nodes=None,
                                                             max_samples=None,
                                                             min_impurity_decrease=0.0,
                                                             min_impurity_split=None,
                                                            

In [23]:
confusion_credit_matrix(gs)

Best Parameters for Model: {'et_model__max_depth': 150, 'et_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 1.0

Validation Score: 0.9996



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,170581,0
Actual Fraud,0,170582



Testing Sensitivity: 1.0
Testing Specificity: 0.9999
Testing False Negative Rate: 0.0

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56856,5
Actual Fraud,0,56860



Validation Sensitivity: 0.8182
Validation Specificity: 0.9999
Validation False Negative Rate: 0.1818

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56867,6
Actual Fraud,16,72


In [24]:
def basic_analysis_functions(func):
    if func.lower() == 'logistic':
        pipe = Pipeline(
        [
         ('log_model', LogisticRegression(solver='liblinear'))   
        ]
        )

        pipe_params = {
            'log_model__penalty': ['l1', 'l2'],
            'log_model__C': [1, 10, 1000, 1e9]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Logistic Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'decision tree':
        pipe = Pipeline(
            [
             ('dct_model', DecisionTreeClassifier())   
            ]
        )

        pipe_params = {
            'dct_model__min_samples_leaf': [1, 2, 4],
            'dct_model__max_depth': [None, 100, 200]
        }
        
        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Decision Tree Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'knn':
        pipe = Pipeline(
            [
             ('knn_model', KNeighborsClassifier(n_jobs=4))   
            ]
        )

        pipe_params = {
            'knn_model__n_neighbors': [3, 5, 7, 9],
            'knn_model__p': [1, 2]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a k-NN Model:')
        print('')
        confusion_credit_matrix(gs)
    
    elif func.lower() == 'rf':
        pipe = Pipeline(
            [
             ('rf_model', RandomForestClassifier())   
            ]
        )

        pipe_params = {
            'rf_model__n_estimators': [100, 150, 200],
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for a Random Forest Model:')
        print('')
        confusion_credit_matrix(gs)
        
    elif func.lower() == 'et':
        pipe = Pipeline(
            [
             ('et_model', ExtraTreesClassifier())   
            ]
        )

        pipe_params = {
            'et_model__n_estimators': [100, 150, 200],
            'et_model__max_depth': [None, 100, 150, 200]
        }

        gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring = 'accuracy')
        gs.fit(X_train, y_train)
        print('Printing Results for an Extra Trees Model:')
        print('')
        confusion_credit_matrix(gs)
        
    else:
    
        return print("Logistic, Decision Tree, RF, ET or KNN")
        
 

In [25]:
basic_analysis_functions('Logistic')

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1000, 'log_model__penalty': 'l1'}
Training Score: 0.9497

Testing Score: 0.949

Validation Score: 0.9753



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,166424,4157
Actual Fraud,13018,157564



Testing Sensitivity: 0.9233
Testing Specificity: 0.9748
Testing False Negative Rate: 0.0767

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55427,1434
Actual Fraud,4364,52496



Validation Sensitivity: 0.9318
Validation Specificity: 0.9753
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55471,1402
Actual Fraud,6,82
