In [582]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import re
from datetime import timedelta, datetime
import datetime
from sklearn.model_selection import learning_curve
from sklearn.cluster import KMeans
import sklearn.preprocessing
import explore
import prepare
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from kmodes.kmodes import KModes

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Acquire:

In [583]:
df = pd.read_csv('accident_data.csv')

# Prepare:

In [584]:
from prepare import clean_collision_data
df = clean_collision_data()
#cross validation on train

-------

In [585]:
train, test = prepare.collision_data()

In [586]:
train.shape, test.shape

((21336, 47), (5334, 47))

----

# Modeling: 

In [587]:
train.columns

Index(['crash_date', 'crash_day', 'crash_hour', 'crash_id', 'crash_latitude',
       'crash_longitude', 'crash_occupant_count', 'crash_vehicle_count',
       'damage_air', 'damage_airbag', 'damage_burned', 'damage_concentrated',
       'damage_distributed', 'damage_rollover', 'damage_zone', 'dl_cdl',
       'dl_class_a', 'dl_class_b', 'dl_class_m', 'dl_state', 'dl_unlicensed',
       'driver_age', 'driver_age_bin', 'driver_male', 'driver_race',
       'factors_road', 'factors_spd_lmt_mph', 'factors_weather', 'fault_class',
       'fault_distraction', 'fault_fatigue', 'fault_intoxication',
       'fault_maneuver', 'fault_narrative', 'fault_speed', 'fault_yield',
       'injury_class', 'injury_crash_total', 'speed_speed_lm',
       'speed_yield_occu', 'vehicle_color', 'vehicle_id', 'vehicle_make',
       'vehicle_occupant_count', 'vehicle_type', 'vehicle_year',
       'vehicle_year_bin'],
      dtype='object')

In [588]:
train.injury_class.value_counts()

0    17852
1     3484
Name: injury_class, dtype: int64

In [589]:
train.shape

(21336, 47)

In [590]:
#splitting the dataset into train features and target
X_train = train.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_train = train.injury_class

X_train.shape, y_train.shape

((21336, 32), (21336,))

In [591]:
#splitting the dataset into test features and target
X_test = test.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_test = test.injury_class

X_test.shape, y_test.shape

((5334, 32), (5334,))

In [592]:
#utilizing dummy classifier to create baseline
dummy = DummyClassifier(strategy ='most_frequent')
#fitting on X_train, y_train
dummy.fit(X_train, y_train)
#creating the baseline
baseline = pd.Series(dummy.predict(X_train), index = X_train.index)

-----

----

# Creating the Baseline

In [12]:
#setting the baseline prediciton to no injury
train['baseline_prediction'] = 0

In [13]:
#baseline accuracy calculation
baseline_accuracy = (train.baseline_prediction == y_train).mean()
baseline_accuracy

0.8367079115110612

In [14]:
#utilizing smote 
smote = SMOTE(random_state = 19)
# fitting on train
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [15]:
#need to scale the data
scaler = StandardScaler()

In [16]:
#fit the scalar to train
scaler.fit(X_smote)

StandardScaler()

In [17]:
#will need to scale the X test dataset
X_smote = pd.DataFrame(scaler.transform(X_smote), columns = X_smote.columns, index = X_smote.index)
#scaling the test dataset
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

# SMOTE

In [14]:
#utilizing smote 
smote = SMOTE(random_state = 19)
# fitting on train
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [15]:
#need to scale the data
scaler = StandardScaler()

In [16]:
#fit the scalar to train
scaler.fit(X_smote)

StandardScaler()

In [17]:
#will need to scale the X test dataset
X_smote = pd.DataFrame(scaler.transform(X_smote), columns = X_smote.columns, index = X_smote.index)
#scaling the test dataset
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

## Decision Tree

In [458]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19)

In [459]:
# fitting our thing
clf = clf.fit(X_smote, y_smote)
#creating the prediction
y_pred = clf.predict(X_smote)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_smote)

NameError: name 'X_smote' is not defined

In [239]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_smote, y_smote)))

Accuracy of Decision Tree classifier on training set: 0.73


In [158]:
labels = sorted(y_smote.unique())

pd.DataFrame(confusion_matrix(y_smote, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,12292,5560
1,4109,13743


In [159]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.69      0.72     17852
           1       0.71      0.77      0.74     17852

    accuracy                           0.73     35704
   macro avg       0.73      0.73      0.73     35704
weighted avg       0.73      0.73      0.73     35704



## Random Forest

In [42]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='gini',
                            min_samples_leaf=6,
                            n_estimators=150,
                            max_depth=12, 
                            random_state=19)

In [43]:
#fit our thing
rf.fit(X_smote, y_smote)

RandomForestClassifier(class_weight='balanced', max_depth=12,
                       min_samples_leaf=6, n_estimators=150, random_state=19)

In [44]:
y_pred = rf.predict(X_smote)
y_pred_proba = rf.predict_proba(X_smote)

In [45]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_smote, y_smote)))

Accuracy of random forest classifier on training set: 0.86


In [46]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.86     17852
           1       0.88      0.84      0.86     17852

    accuracy                           0.86     35704
   macro avg       0.86      0.86      0.86     35704
weighted avg       0.86      0.86      0.86     35704



In [47]:
y_pred = rf.predict(X_test_scaled)
y_pred_proba = rf.predict_proba(X_test_scaled)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test_scaled, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.78
[[3879  584]
 [ 588  283]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4463
           1       0.33      0.32      0.33       871

    accuracy                           0.78      5334
   macro avg       0.60      0.60      0.60      5334
weighted avg       0.78      0.78      0.78      5334



## KNN

In [342]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=3)

In [343]:
#fit our thing
knn.fit(X_smote, y_smote)

KNeighborsClassifier(n_neighbors=3)

In [344]:
#create the prediction
y_pred = knn.predict(X_smote)

In [345]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_smote)

In [346]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_smote, y_smote)))

Accuracy of KNN classifier on training set: 0.90


In [347]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89     17852
           1       0.88      0.92      0.90     17852

    accuracy                           0.90     35704
   macro avg       0.90      0.90      0.90     35704
weighted avg       0.90      0.90      0.90     35704



## Logistical Regression

In [34]:
#create out thing
logit = LogisticRegression(C=1, random_state=19, intercept_scaling=1, solver='lbfgs', class_weight = 'balanced')

In [35]:
#fit the thing
logit.fit(X_smote, y_smote)

LogisticRegression(C=1, class_weight='balanced', random_state=19)

In [36]:
#make the predicition
y_pred = logit.predict(X_smote)

In [37]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_smote)

In [38]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_smote, y_smote)))

Accuracy of Logistic Regression classifier on training set: 0.75


In [39]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     17852
           1       0.74      0.77      0.76     17852

    accuracy                           0.75     35704
   macro avg       0.75      0.75      0.75     35704
weighted avg       0.75      0.75      0.75     35704



## Gradient Booster Classifier

In [452]:
#creating the object
clf = GradientBoostingClassifier(learning_rate=1.0, max_depth=6, random_state=19).fit(X_resampled, y_resampled)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [453]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 1.00


In [454]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,3482,2
1,5,3479


In [455]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3484
           1       1.00      1.00      1.00      3484

    accuracy                           1.00      6968
   macro avg       1.00      1.00      1.00      6968
weighted avg       1.00      1.00      1.00      6968



In [456]:
########## Test Eval #############

In [457]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.65
[[2920 1543]
 [ 302  569]]
              precision    recall  f1-score   support

           0       0.91      0.65      0.76      4463
           1       0.27      0.65      0.38       871

    accuracy                           0.65      5334
   macro avg       0.59      0.65      0.57      5334
weighted avg       0.80      0.65      0.70      5334



## Evaluate on Test

In [41]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.69
[[3321 1142]
 [ 508  363]]
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      4463
           1       0.24      0.42      0.31       871

    accuracy                           0.69      5334
   macro avg       0.55      0.58      0.55      5334
weighted avg       0.77      0.69      0.72      5334



----

-----

-----

In [1039]:
train.injury_class.value_counts()

0    17852
1     3484
Name: injury_class, dtype: int64

# Undersampling

## Random Under

In [593]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=19,)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [594]:
X_resampled.shape , y_resampled.shape

((6968, 32), (6968,))

In [595]:
#need to scale the data
#scaler = StandardScaler()

In [596]:
#fit the scalar to train
#scaler.fit(X_resampled)

----

## Gradient Booster Classifier

In [1040]:
#creating the object
clf = GradientBoostingClassifier(learning_rate= 0.5, max_depth=6, random_state=19).fit(X_resampled, y_resampled)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [1041]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.99


In [1042]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,3473,11
1,30,3454


In [1043]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3484
           1       1.00      0.99      0.99      3484

    accuracy                           0.99      6968
   macro avg       0.99      0.99      0.99      6968
weighted avg       0.99      0.99      0.99      6968



In [1044]:
########## Test Eval #############

In [1045]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.64
[[2845 1618]
 [ 298  573]]
              precision    recall  f1-score   support

           0       0.91      0.64      0.75      4463
           1       0.26      0.66      0.37       871

    accuracy                           0.64      5334
   macro avg       0.58      0.65      0.56      5334
weighted avg       0.80      0.64      0.69      5334



## Decision Tree

In [1046]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19,criterion = 'entropy')
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [1047]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.67


In [1048]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2398,1086
1,1216,2268


In [1049]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.69      0.68      3484
           1       0.68      0.65      0.66      3484

    accuracy                           0.67      6968
   macro avg       0.67      0.67      0.67      6968
weighted avg       0.67      0.67      0.67      6968



In [1050]:
########## Test Eval #############

In [1051]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.68
[[3066 1397]
 [ 314  557]]
              precision    recall  f1-score   support

           0       0.91      0.69      0.78      4463
           1       0.29      0.64      0.39       871

    accuracy                           0.68      5334
   macro avg       0.60      0.66      0.59      5334
weighted avg       0.81      0.68      0.72      5334



----

## Random Forest

In [1052]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                         
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=200,
                            max_depth=8, 
                            random_state=19)

#creating the prediction
y_pred = clf.predict(X_resampled)

In [1053]:
#fit our thing
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(max_depth=8, min_samples_leaf=5, n_estimators=200,
                       random_state=19)

In [1054]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(rf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.72


In [1055]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2398,1086
1,1216,2268


In [1056]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.69      0.68      3484
           1       0.68      0.65      0.66      3484

    accuracy                           0.67      6968
   macro avg       0.67      0.67      0.67      6968
weighted avg       0.67      0.67      0.67      6968



In [1057]:
########## Test Eval #############

In [1058]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.68
[[3066 1397]
 [ 314  557]]
              precision    recall  f1-score   support

           0       0.91      0.69      0.78      4463
           1       0.29      0.64      0.39       871

    accuracy                           0.68      5334
   macro avg       0.60      0.66      0.59      5334
weighted avg       0.81      0.68      0.72      5334



In [1059]:
########## Test Eval #############

In [1060]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.71
[[3222 1241]
 [ 306  565]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.81      4463
           1       0.31      0.65      0.42       871

    accuracy                           0.71      5334
   macro avg       0.61      0.69      0.61      5334
weighted avg       0.82      0.71      0.74      5334



--------

## KNN

In [1061]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=10, leaf_size = 20)

In [1062]:
#fit our thing
knn.fit(X_resampled, y_resampled)

KNeighborsClassifier(leaf_size=20, n_neighbors=10)

In [1063]:
#create the prediction
y_pred = knn.predict(X_resampled)

In [1064]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_resampled)

In [1065]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_resampled, y_resampled)))

Accuracy of KNN classifier on training set: 0.69


In [1066]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.82      0.72      3484
           1       0.76      0.56      0.64      3484

    accuracy                           0.69      6968
   macro avg       0.70      0.69      0.68      6968
weighted avg       0.70      0.69      0.68      6968



In [1067]:
########## Test Eval #############

In [1068]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(knn.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.57
[[2671 1792]
 [ 496  375]]
              precision    recall  f1-score   support

           0       0.84      0.60      0.70      4463
           1       0.17      0.43      0.25       871

    accuracy                           0.57      5334
   macro avg       0.51      0.51      0.47      5334
weighted avg       0.73      0.57      0.63      5334



-----

## Logistical Regression

In [1069]:
#create out thing
logit = LogisticRegression(C=1, random_state=19)

In [1070]:
#fit the thing
logit.fit(X_resampled, y_resampled)

LogisticRegression(C=1, random_state=19)

In [1071]:
#make the predicition
y_pred = logit.predict(X_resampled)

In [1072]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_resampled)

In [1073]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_resampled, y_resampled)))

Accuracy of Logistic Regression classifier on training set: 0.50


In [1074]:
print(confusion_matrix(y_resampled, y_pred))

[[3484    0]
 [3484    0]]


In [1075]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      3484
           1       0.00      0.00      0.00      3484

    accuracy                           0.50      6968
   macro avg       0.25      0.50      0.33      6968
weighted avg       0.25      0.50      0.33      6968



In [1076]:
########## Test Eval #############

In [1077]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)


print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.84
[[4463    0]
 [ 871    0]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      4463
           1       0.00      0.00      0.00       871

    accuracy                           0.84      5334
   macro avg       0.42      0.50      0.46      5334
weighted avg       0.70      0.84      0.76      5334



------

------

----

----

# NearMiss

In [1078]:
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(version=3)
X_resampled, y_resampled = nm1.fit_resample(X_train, y_train)


## Decision Tree

In [1079]:
#creating the object
clf = DecisionTreeClassifier(max_depth = 8, random_state=19)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [1080]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.73


In [1081]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2744,740
1,1164,2320


In [1082]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      3484
           1       0.76      0.67      0.71      3484

    accuracy                           0.73      6968
   macro avg       0.73      0.73      0.73      6968
weighted avg       0.73      0.73      0.73      6968



In [1083]:
########## Test Eval #############

In [1084]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.71
[[3270 1193]
 [ 375  496]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81      4463
           1       0.29      0.57      0.39       871

    accuracy                           0.71      5334
   macro avg       0.60      0.65      0.60      5334
weighted avg       0.80      0.71      0.74      5334



----

## Random Forest

In [1085]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                         
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=19)

#creating the prediction
y_pred = clf.predict(X_resampled)

In [1086]:
#fit our thing
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(max_depth=8, min_samples_leaf=5, random_state=19)

In [1087]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(rf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.72


In [1088]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2744,740
1,1164,2320


In [1089]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      3484
           1       0.76      0.67      0.71      3484

    accuracy                           0.73      6968
   macro avg       0.73      0.73      0.73      6968
weighted avg       0.73      0.73      0.73      6968



In [1090]:
########## Test Eval #############

In [1091]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.71
[[3270 1193]
 [ 375  496]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81      4463
           1       0.29      0.57      0.39       871

    accuracy                           0.71      5334
   macro avg       0.60      0.65      0.60      5334
weighted avg       0.80      0.71      0.74      5334



In [1092]:
########## Test Eval #############

In [1093]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.71
[[3207 1256]
 [ 304  567]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      4463
           1       0.31      0.65      0.42       871

    accuracy                           0.71      5334
   macro avg       0.61      0.68      0.61      5334
weighted avg       0.82      0.71      0.74      5334



--------

## KNN

In [1094]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=5)

In [1095]:
#fit our thing
knn.fit(X_resampled, y_resampled)

KNeighborsClassifier()

In [1096]:
#create the prediction
y_pred = knn.predict(X_resampled)

In [1097]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_resampled)

In [1098]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_resampled, y_resampled)))

Accuracy of KNN classifier on training set: 0.76


In [1099]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.89      0.79      3484
           1       0.85      0.63      0.72      3484

    accuracy                           0.76      6968
   macro avg       0.78      0.76      0.76      6968
weighted avg       0.78      0.76      0.76      6968



In [1100]:
########## Test Eval #############

In [1101]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(knn.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.55
[[2531 1932]
 [ 454  417]]
              precision    recall  f1-score   support

           0       0.85      0.57      0.68      4463
           1       0.18      0.48      0.26       871

    accuracy                           0.55      5334
   macro avg       0.51      0.52      0.47      5334
weighted avg       0.74      0.55      0.61      5334



-----

## Logistical Regression

In [1102]:
#create out thing
logit = LogisticRegression(C=1, random_state=19, intercept_scaling=1, solver='liblinear', class_weight = 'balanced',)

In [1103]:
#fit the thing
logit.fit(X_resampled, y_resampled)

LogisticRegression(C=1, class_weight='balanced', random_state=19,
                   solver='liblinear')

In [1104]:
#make the predicition
y_pred = logit.predict(X_resampled)

In [1105]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_resampled)

In [1106]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_resampled, y_resampled)))

Accuracy of Logistic Regression classifier on training set: 0.54


In [1107]:
print(confusion_matrix(y_resampled, y_pred))

[[1922 1562]
 [1633 1851]]


In [1108]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.54      0.55      0.55      3484
           1       0.54      0.53      0.54      3484

    accuracy                           0.54      6968
   macro avg       0.54      0.54      0.54      6968
weighted avg       0.54      0.54      0.54      6968



In [1109]:
########## Test Eval #############

In [1110]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)


print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.55
[[2497 1966]
 [ 435  436]]
              precision    recall  f1-score   support

           0       0.85      0.56      0.68      4463
           1       0.18      0.50      0.27       871

    accuracy                           0.55      5334
   macro avg       0.52      0.53      0.47      5334
weighted avg       0.74      0.55      0.61      5334



## Gradient Booster Classifier

In [1111]:
#creating the object
clf = GradientBoostingClassifier(max_depth=3, random_state=19, n_estimators = 100, min_samples_leaf=6,learning_rate = 0.05).fit(X_resampled, y_resampled)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [1112]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.70


In [1113]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2516,968
1,1128,2356


In [1114]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.72      0.71      3484
           1       0.71      0.68      0.69      3484

    accuracy                           0.70      6968
   macro avg       0.70      0.70      0.70      6968
weighted avg       0.70      0.70      0.70      6968



In [1115]:
########## Test Eval #############

In [1116]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.69
[[3131 1332]
 [ 298  573]]
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      4463
           1       0.30      0.66      0.41       871

    accuracy                           0.69      5334
   macro avg       0.61      0.68      0.60      5334
weighted avg       0.81      0.69      0.73      5334



----

----