# 0. Running Multiple Models on Multiple Undersampling Sets

**Model Options**
- Decision Trees
- Extra Trees
- Random Forest
- Logistic Regression
- $k$-NN

<br><br>
**Combining Models**
- Averaging
- Majority Vote
- Stacking

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline

# Raise pandas' display limits to 99 rows / 99 columns for rendered frames.
pd.options.display.max_rows = 99
pd.options.display.max_columns = 99

from imblearn.over_sampling import SMOTE, ADASYN


%matplotlib inline

Using TensorFlow backend.


In [2]:
# Creating our Validation set
# X_val / y_val are notebook-level names: confusion_credit_matrix reads them
# directly when it scores a fitted model on the validation data.
# (A bare `df_test.head()` used to sit mid-cell; only a cell's last expression
# is rendered, so it displayed nothing and has been dropped.)
df_test = pd.read_csv('datasets/creditcard.csv')
X_val = df_test.drop(columns=['Time', 'Class', 'Amount'])
y_val = df_test['Class']

In [3]:
def _labelled_confusion(y_true, y_pred):
    """Return (labelled confusion-matrix DataFrame, (tn, fp, fn, tp)) for one split."""
    cm = confusion_matrix(y_true, y_pred)
    # Unpacking raises ValueError unless the matrix is 2x2, exactly as before.
    tn, fp, fn, tp = cm.ravel()
    cm_df = pd.DataFrame(
        cm,
        columns=['Predicted Non-Fraud', 'Predicted Fraud'],
        index=['Actual Non-Fraud', 'Actual Fraud'],
    )
    return cm_df, (tn, fp, fn, tp)


def _rates(tn, fp, fn, tp):
    """Return (sensitivity, specificity, false-negative rate) from raw counts."""
    return tp / (tp + fn), tn / (tn + fp), fn / (fn + tp)


def confusion_credit_matrix(model):
    """Print scores, rates and confusion matrices for a fitted search object.

    Reads the notebook-level names X_train/y_train, X_test/y_test and
    X_val/y_val rather than taking them as arguments, so those must be
    defined (and hold the intended split) before this is called.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``, ``score`` and
        ``best_params_`` (e.g. a fitted ``GridSearchCV``).

    Returns
    -------
    The result of ``display`` on the validation confusion matrix
    (None in a standard IPython session).
    """
    # Each confusion matrix is now computed once per split; the original
    # computed every matrix twice and unpacked training counts it never used.
    cm_df_train, _ = _labelled_confusion(y_train, model.predict(X_train))
    cm_df, test_counts = _labelled_confusion(y_test, model.predict(X_test))
    cm_df_v, val_counts = _labelled_confusion(y_val, model.predict(X_val))

    t_sens, t_spec, t_fnr = _rates(*test_counts)
    val_sens, val_spec, val_fnr = _rates(*val_counts)

    print(f'Best Parameters for Model: {model.best_params_}')
    # model.score is kept (rather than deriving accuracy from the counts) so the
    # reported score follows whatever scoring the search object was built with.
    print(f'Training Score: {round(model.score(X_train, y_train), 4)}')
    print('')
    print(f'Testing Score: {round(model.score(X_test, y_test), 4)}')
    print('')
    print(f'Validation Score: {round(model.score(X_val, y_val), 4)}')
    print('\n' * 2)
    print("Training Confusion Matrix")
    display(cm_df_train)
    print('')
    print(f'Testing Sensitivity: {round(t_sens, 4)}')
    print(f'Testing Specificity: {round(t_spec, 4)}')
    print(f'Testing False Negative Rate: {round(t_fnr, 4)}')
    print('')
    print("Testing Confusion Matrix")
    display(cm_df)
    print('')
    print(f'Validation Sensitivity: {round(val_sens, 4)}')
    print(f'Validation Specificity: {round(val_spec, 4)}')
    print(f'Validation False Negative Rate: {round(val_fnr, 4)}')
    print('')
    print("Validation Confusion Matrix")
    return display(cm_df_v)

In [4]:
def basic_analysis_functions(func, random_state=42):
    """Grid-search one model family on the current training split and report it.

    Fits a 3-fold, accuracy-scored ``GridSearchCV`` on the notebook-level
    ``X_train`` / ``y_train`` and hands the fitted search to
    ``confusion_credit_matrix`` for printing.

    Parameters
    ----------
    func : str
        Case-insensitive model key: 'logistic', 'decision tree', 'knn',
        'rf' or 'et'. Any other value prints a hint listing the options.
    random_state : int or None, default 42
        Seed passed to the estimators that accept one (logistic regression,
        decision tree, random forest, extra trees) so a re-run reproduces the
        same fit. 42 matches the seed used for the train/test splits in this
        notebook; pass None for the previous unseeded behaviour.

    Returns
    -------
    None
    """
    # key -> (report banner, pipeline step name, estimator factory, param grid).
    # Factories are lambdas so only the requested estimator is instantiated.
    model_specs = {
        'logistic': (
            'Printing Results for a Logistic Model:',
            'log_model',
            lambda: LogisticRegression(solver='liblinear', random_state=random_state),
            {
                'log_model__penalty': ['l1', 'l2'],
                'log_model__C': [1, 10, 1000, 1e9],
            },
        ),
        'decision tree': (
            'Printing Results for a Decision Tree Model:',
            'dct_model',
            lambda: DecisionTreeClassifier(random_state=random_state),
            {
                'dct_model__min_samples_leaf': [1, 2, 4],
                'dct_model__max_depth': [None, 100, 200],
            },
        ),
        'knn': (
            'Printing Results for a k-NN Model:',
            'knn_model',
            # k-NN has no random component, hence no seed.
            lambda: KNeighborsClassifier(n_jobs=4),
            {
                'knn_model__n_neighbors': [3, 5, 7, 9],
                'knn_model__p': [1, 2],
            },
        ),
        'rf': (
            'Printing Results for a Random Forest Model:',
            'rf_model',
            lambda: RandomForestClassifier(random_state=random_state),
            {
                'rf_model__n_estimators': [100, 150, 200],
            },
        ),
        'et': (
            'Printing Results for an Extra Trees Model:',
            'et_model',
            lambda: ExtraTreesClassifier(random_state=random_state),
            {
                'et_model__n_estimators': [100, 150, 200],
                'et_model__max_depth': [None, 100, 150, 200],
            },
        ),
    }

    spec = model_specs.get(func.lower())
    if spec is None:
        # Unknown key: keep the original best-effort hint instead of raising.
        return print("Logistic, Decision Tree, RF, ET or KNN")

    banner, step_name, make_estimator, pipe_params = spec

    # The single-step Pipeline is kept so the reported best_params_ keep their
    # '<step>__<param>' names.
    pipe = Pipeline([(step_name, make_estimator())])
    gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, scoring='accuracy')
    gs.fit(X_train, y_train)

    print(banner)
    print('')
    confusion_credit_matrix(gs)

In [5]:
def combined_analysis(dataset, func, random_state=42):
    """Load one dataset, split it, and run a single model analysis on it.

    Parameters
    ----------
    dataset : str
        File name inside the ``datasets/`` directory.
    func : str
        Model key forwarded to ``basic_analysis_functions``.
    random_state : int or None, default 42
        Seed for the stratified 75/25 split; 42 matches the seed the
        hand-written split cells in this notebook use. Pass None for an
        unseeded split.

    Returns
    -------
    Whatever ``basic_analysis_functions`` returns.

    Side effects
    ------------
    Rebinds the notebook-level ``X_train``, ``X_test``, ``y_train`` and
    ``y_test``.
    """
    # basic_analysis_functions / confusion_credit_matrix read the split from
    # notebook-level names. Binding the split to locals (as before) meant the
    # freshly loaded dataset was ignored and whichever split happened to be in
    # the notebook namespace was fitted instead.
    global X_train, X_test, y_train, y_test

    df = pd.read_csv('datasets/' + dataset)
    X = df.drop(columns=['Time', 'Class', 'Amount'])
    y = df['Class']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=random_state
    )

    print(X_train.shape)

    return basic_analysis_functions(func)

In [6]:
# Build the notebook-level split from `credit_undersample_42.csv`; the analysis
# helpers read X_train / X_test / y_train / y_test as globals.
df = pd.read_csv('datasets/credit_undersample_42.csv')
y = df['Class']
X = df.drop(columns=['Time', 'Class', 'Amount'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [7]:
# Model keys in run order; each is a key basic_analysis_functions recognises.
funky = [
    'logistic',
    'decision tree',
    'rf',
    'et',
    'knn',
]

# 1. Model for `credit_undersample_42.csv`

<br><br>
**Models run**
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Extra Trees
5. $k$-NN

In [8]:
# Fit and report every model key in `funky` against the current notebook-level
# split, printing a 40-asterisk separator after each report.
for funk in funky:
    basic_analysis_functions(funk)
    print('*' * 40)

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1, 'log_model__penalty': 'l2'}
Training Score: 0.9553

Testing Score: 0.9472

Validation Score: 0.9578



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,362,7
Actual Fraud,26,343



Testing Sensitivity: 0.9431
Testing Specificity: 0.9512
Testing False Negative Rate: 0.0569

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,117,6
Actual Fraud,7,116



Validation Sensitivity: 0.9318
Validation Specificity: 0.9578
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,54475,2398
Actual Fraud,6,82


****************************************
Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': 100, 'dct_model__min_samples_leaf': 2}
Training Score: 0.9878

Testing Score: 0.9146

Validation Score: 0.9225



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,9,360



Testing Sensitivity: 0.9268
Testing Specificity: 0.9024
Testing False Negative Rate: 0.0732

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,111,12
Actual Fraud,9,114



Validation Sensitivity: 0.9659
Validation Specificity: 0.9224
Validation False Negative Rate: 0.0341

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,52462,4411
Actual Fraud,3,85


****************************************
Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 150}
Training Score: 1.0

Testing Score: 0.9431

Validation Score: 0.9715



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.9268
Testing Specificity: 0.9593
Testing False Negative Rate: 0.0732

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,118,5
Actual Fraud,9,114



Validation Sensitivity: 0.9773
Validation Specificity: 0.9715
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55250,1623
Actual Fraud,2,86


****************************************
Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': None, 'et_model__n_estimators': 100}
Training Score: 1.0

Testing Score: 0.9512

Validation Score: 0.9818



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.9187
Testing Specificity: 0.9837
Testing False Negative Rate: 0.0813

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,10,113



Validation Sensitivity: 0.9773
Validation Specificity: 0.9819
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55841,1032
Actual Fraud,2,86


****************************************
Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 5, 'knn_model__p': 1}
Training Score: 0.9526

Testing Score: 0.9472

Validation Score: 0.9762



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,364,5
Actual Fraud,30,339



Testing Sensitivity: 0.9187
Testing Specificity: 0.9756
Testing False Negative Rate: 0.0813

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,10,113



Validation Sensitivity: 0.9545
Validation Specificity: 0.9762
Validation False Negative Rate: 0.0455

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55522,1351
Actual Fraud,4,84


****************************************


# 2. Model for `credit_undersample_79.csv`

<br><br>
**Models run**
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Extra Trees
5. $k$-NN

In [9]:
# Rebuild the notebook-level split from `credit_undersample_79.csv`; the
# analysis helpers read X_train / X_test / y_train / y_test as globals.
df = pd.read_csv('datasets/credit_undersample_79.csv')
y = df['Class']
X = df.drop(columns=['Time', 'Class', 'Amount'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Run every model key on this split, with an asterisk separator after each.
for funk in funky:
    basic_analysis_functions(funk)
    print('*' * 40)

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1, 'log_model__penalty': 'l1'}
Training Score: 0.9499

Testing Score: 0.9634

Validation Score: 0.9678



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,362,7
Actual Fraud,30,339



Testing Sensitivity: 0.9431
Testing Specificity: 0.9837
Testing False Negative Rate: 0.0569

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,7,116



Validation Sensitivity: 0.9318
Validation Specificity: 0.9678
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55043,1830
Actual Fraud,6,82


****************************************
Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': 100, 'dct_model__min_samples_leaf': 4}
Training Score: 0.9797

Testing Score: 0.8902

Validation Score: 0.915



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,365,4
Actual Fraud,11,358



Testing Sensitivity: 0.8943
Testing Specificity: 0.8862
Testing False Negative Rate: 0.1057

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,109,14
Actual Fraud,13,110



Validation Sensitivity: 0.9659
Validation Specificity: 0.915
Validation False Negative Rate: 0.0341

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,52036,4837
Actual Fraud,3,85


****************************************
Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9553

Validation Score: 0.9666



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.935
Testing Specificity: 0.9756
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,8,115



Validation Sensitivity: 0.9773
Validation Specificity: 0.9666
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,54971,1902
Actual Fraud,2,86


****************************************
Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 100, 'et_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9553

Validation Score: 0.9792



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.935
Testing Specificity: 0.9756
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,8,115



Validation Sensitivity: 0.9773
Validation Specificity: 0.9792
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55691,1182
Actual Fraud,2,86


****************************************
Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 5, 'knn_model__p': 1}
Training Score: 0.9417

Testing Score: 0.935

Validation Score: 0.9737



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,358,11
Actual Fraud,32,337



Testing Sensitivity: 0.9106
Testing Specificity: 0.9593
Testing False Negative Rate: 0.0894

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,118,5
Actual Fraud,11,112



Validation Sensitivity: 0.9318
Validation Specificity: 0.9737
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55380,1493
Actual Fraud,6,82


****************************************


***********************************************************

# 3. Model for `credit_undersample_804.csv`

<br><br>
**Models run**
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Extra Trees
5. $k$-NN

In [10]:
# Rebuild the notebook-level split from `credit_undersample_804.csv`; the
# analysis helpers read X_train / X_test / y_train / y_test as globals.
df = pd.read_csv('datasets/credit_undersample_804.csv')
y = df['Class']
X = df.drop(columns=['Time', 'Class', 'Amount'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Run every model key on this split, with an asterisk separator after each.
for funk in funky:
    basic_analysis_functions(funk)
    print('*' * 40)

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 10, 'log_model__penalty': 'l1'}
Training Score: 0.9526

Testing Score: 0.9634

Validation Score: 0.9638



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,361,8
Actual Fraud,27,342



Testing Sensitivity: 0.935
Testing Specificity: 0.9919
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,122,1
Actual Fraud,8,115



Validation Sensitivity: 0.9318
Validation Specificity: 0.9639
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,54819,2054
Actual Fraud,6,82


****************************************
Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': None, 'dct_model__min_samples_leaf': 2}
Training Score: 0.9892

Testing Score: 0.9228

Validation Score: 0.9146



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,367,2
Actual Fraud,6,363



Testing Sensitivity: 0.9268
Testing Specificity: 0.9187
Testing False Negative Rate: 0.0732

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,113,10
Actual Fraud,9,114



Validation Sensitivity: 0.9545
Validation Specificity: 0.9145
Validation False Negative Rate: 0.0455

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,52013,4860
Actual Fraud,4,84


****************************************
Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9593

Validation Score: 0.9738



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.935
Testing Specificity: 0.9837
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,8,115



Validation Sensitivity: 0.9773
Validation Specificity: 0.9738
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55383,1490
Actual Fraud,2,86


****************************************
Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 200, 'et_model__n_estimators': 150}
Training Score: 1.0

Testing Score: 0.9593

Validation Score: 0.9817



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.935
Testing Specificity: 0.9837
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,8,115



Validation Sensitivity: 0.9773
Validation Specificity: 0.9817
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55832,1041
Actual Fraud,2,86


****************************************
Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 3, 'knn_model__p': 1}
Training Score: 0.9648

Testing Score: 0.9553

Validation Score: 0.9659



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,364,5
Actual Fraud,21,348



Testing Sensitivity: 0.935
Testing Specificity: 0.9756
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,8,115



Validation Sensitivity: 0.9432
Validation Specificity: 0.9659
Validation False Negative Rate: 0.0568

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,54935,1938
Actual Fraud,5,83


****************************************


*********************************************************************

# 4. Model for `credit_undersample_20200202.csv`

<br><br>
**Models run**
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Extra Trees
5. $k$-NN

In [11]:
# Rebuild the notebook-level split from `credit_undersample_20200202.csv`; the
# analysis helpers read X_train / X_test / y_train / y_test as globals.
df = pd.read_csv('datasets/credit_undersample_20200202.csv')
y = df['Class']
X = df.drop(columns=['Time', 'Class', 'Amount'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Run every model key on this split, with an asterisk separator after each.
for funk in funky:
    basic_analysis_functions(funk)
    print('*' * 40)

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 10, 'log_model__penalty': 'l1'}
Training Score: 0.9472

Testing Score: 0.9593

Validation Score: 0.9641



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,358,11
Actual Fraud,28,341



Testing Sensitivity: 0.9431
Testing Specificity: 0.9756
Testing False Negative Rate: 0.0569

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,7,116



Validation Sensitivity: 0.9318
Validation Specificity: 0.9642
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,54836,2037
Actual Fraud,6,82


****************************************
Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': None, 'dct_model__min_samples_leaf': 2}
Training Score: 0.9919

Testing Score: 0.9228

Validation Score: 0.9099



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,6,363



Testing Sensitivity: 0.9268
Testing Specificity: 0.9187
Testing False Negative Rate: 0.0732

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,113,10
Actual Fraud,9,114



Validation Sensitivity: 0.9659
Validation Specificity: 0.9098
Validation False Negative Rate: 0.0341

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,51745,5128
Actual Fraud,3,85


****************************************
Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 100}
Training Score: 1.0

Testing Score: 0.9593

Validation Score: 0.9811



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.935
Testing Specificity: 0.9837
Testing False Negative Rate: 0.065

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,8,115



Validation Sensitivity: 0.9773
Validation Specificity: 0.9811
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55799,1074
Actual Fraud,2,86


****************************************
Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 100, 'et_model__n_estimators': 100}
Training Score: 1.0

Testing Score: 0.9472

Validation Score: 0.9853



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.9106
Testing Specificity: 0.9837
Testing False Negative Rate: 0.0894

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,11,112



Validation Sensitivity: 0.9773
Validation Specificity: 0.9854
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,56040,833
Actual Fraud,2,86


****************************************
Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 3, 'knn_model__p': 1}
Training Score: 0.958

Testing Score: 0.9472

Validation Score: 0.972



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,365,4
Actual Fraud,27,342



Testing Sensitivity: 0.9106
Testing Specificity: 0.9837
Testing False Negative Rate: 0.0894

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,11,112



Validation Sensitivity: 0.9205
Validation Specificity: 0.972
Validation False Negative Rate: 0.0795

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55283,1590
Actual Fraud,7,81


****************************************


*********************************************************************

# 5. Model for `credit_undersample_20200304.csv`

<br><br>
**Models run**
1. Logistic Regression
2. Decision Tree
3. Random Forest
4. Extra Trees
5. $k$-NN

In [12]:
# Rebuild the notebook-level split from `credit_undersample_20200304.csv`; the
# analysis helpers read X_train / X_test / y_train / y_test as globals.
df = pd.read_csv('datasets/credit_undersample_20200304.csv')
y = df['Class']
X = df.drop(columns=['Time', 'Class', 'Amount'])

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Run every model key on this split, with an asterisk separator after each.
for funk in funky:
    basic_analysis_functions(funk)
    print('*' * 40)

Printing Results for a Logistic Model:

Best Parameters for Model: {'log_model__C': 1, 'log_model__penalty': 'l1'}
Training Score: 0.9363

Testing Score: 0.9512

Validation Score: 0.973



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,357,12
Actual Fraud,35,334



Testing Sensitivity: 0.9268
Testing Specificity: 0.9756
Testing False Negative Rate: 0.0732

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,120,3
Actual Fraud,9,114



Validation Sensitivity: 0.9318
Validation Specificity: 0.973
Validation False Negative Rate: 0.0682

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55339,1534
Actual Fraud,6,82


****************************************
Printing Results for a Decision Tree Model:

Best Parameters for Model: {'dct_model__max_depth': 100, 'dct_model__min_samples_leaf': 2}
Training Score: 0.9905

Testing Score: 0.8862

Validation Score: 0.889



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,368,1
Actual Fraud,6,363



Testing Sensitivity: 0.8699
Testing Specificity: 0.9024
Testing False Negative Rate: 0.1301

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,111,12
Actual Fraud,16,107



Validation Sensitivity: 0.9659
Validation Specificity: 0.8889
Validation False Negative Rate: 0.0341

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,50553,6320
Actual Fraud,3,85


****************************************
Printing Results for a Random Forest Model:

Best Parameters for Model: {'rf_model__n_estimators': 150}
Training Score: 1.0

Testing Score: 0.9431

Validation Score: 0.9774



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.9187
Testing Specificity: 0.9675
Testing False Negative Rate: 0.0813

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,119,4
Actual Fraud,10,113



Validation Sensitivity: 0.9773
Validation Specificity: 0.9774
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55588,1285
Actual Fraud,2,86


****************************************
Printing Results for an Extra Trees Model:

Best Parameters for Model: {'et_model__max_depth': 150, 'et_model__n_estimators': 200}
Training Score: 1.0

Testing Score: 0.9512

Validation Score: 0.9822



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,369,0
Actual Fraud,0,369



Testing Sensitivity: 0.9187
Testing Specificity: 0.9837
Testing False Negative Rate: 0.0813

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,121,2
Actual Fraud,10,113



Validation Sensitivity: 0.9773
Validation Specificity: 0.9822
Validation False Negative Rate: 0.0227

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55859,1014
Actual Fraud,2,86


****************************************
Printing Results for a k-NN Model:

Best Parameters for Model: {'knn_model__n_neighbors': 3, 'knn_model__p': 1}
Training Score: 0.9634

Testing Score: 0.939

Validation Score: 0.9685



Training Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,363,6
Actual Fraud,21,348



Testing Sensitivity: 0.9106
Testing Specificity: 0.9675
Testing False Negative Rate: 0.0894

Testing Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,119,4
Actual Fraud,11,112



Validation Sensitivity: 0.9545
Validation Specificity: 0.9686
Validation False Negative Rate: 0.0455

Validation Confusion Matrix


Unnamed: 0,Predicted Non-Fraud,Predicted Fraud
Actual Non-Fraud,55085,1788
Actual Fraud,4,84


****************************************
