In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import re
from datetime import timedelta, datetime
import datetime
from sklearn.model_selection import learning_curve
from sklearn.cluster import KMeans
import sklearn.preprocessing
import explore
import prepare
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from kmodes.kmodes import KModes

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Acquire:

In [104]:
#acquiring the dataset
df = pd.read_csv('accident_data.csv')

# Prepare:

In [105]:
from prepare import clean_collision_data
df = clean_collision_data()
#cross validation on train

-------

In [106]:
train, test = prepare.collision_data()

In [107]:
train.shape, test.shape

((21336, 47), (5334, 47))

----

# Modeling: 

In [108]:
train.columns

Index(['crash_date', 'crash_day', 'crash_hour', 'crash_id', 'crash_latitude',
       'crash_longitude', 'crash_occupant_count', 'crash_vehicle_count',
       'damage_air', 'damage_airbag', 'damage_burned', 'damage_concentrated',
       'damage_distributed', 'damage_rollover', 'damage_zone', 'dl_cdl',
       'dl_class_a', 'dl_class_b', 'dl_class_m', 'dl_state', 'dl_unlicensed',
       'driver_age', 'driver_age_bin', 'driver_male', 'driver_race',
       'factors_road', 'factors_spd_lmt_mph', 'factors_weather', 'fault_class',
       'fault_distraction', 'fault_fatigue', 'fault_intoxication',
       'fault_maneuver', 'fault_narrative', 'fault_speed', 'fault_yield',
       'injury_class', 'injury_crash_total', 'speed_speed_lm',
       'speed_yield_occu', 'vehicle_color', 'vehicle_id', 'vehicle_make',
       'vehicle_occupant_count', 'vehicle_type', 'vehicle_year',
       'vehicle_year_bin'],
      dtype='object')

In [109]:
train.injury_class.value_counts()

0    17852
1     3484
Name: injury_class, dtype: int64

In [110]:
train.shape

(21336, 47)

In [111]:
#splitting the dataset into train features and target
X_train = train.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_train = train.injury_class

X_train.shape, y_train.shape

((21336, 32), (21336,))

In [112]:
#splitting the dataset into test features and target
X_test = test.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_test = test.injury_class

X_test.shape, y_test.shape

((5334, 32), (5334,))

In [113]:
#utilizing dummy classifier to create baseline
dummy = DummyClassifier(strategy ='most_frequent')
#fitting on X_train, y_train
dummy.fit(X_train, y_train)
#creating the baseline
baseline = pd.Series(dummy.predict(X_train), index = X_train.index)

-----

----

# Creating the Baseline

In [114]:
#setting the baseline prediciton to no injury
train['baseline_prediction'] = 0

In [115]:
#baseline accuracy calculation
baseline_accuracy = (train.baseline_prediction == y_train).mean()
baseline_accuracy

0.8367079115110612

In [116]:
#utilizing smote 
smote = SMOTE(random_state = 19)
# fitting on train
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [117]:
#need to scale the data
scaler = StandardScaler()

In [118]:
#fit the scalar to train
scaler.fit(X_smote)

StandardScaler()

In [119]:
#will need to scale the X test dataset
X_smote = pd.DataFrame(scaler.transform(X_smote), columns = X_smote.columns, index = X_smote.index)
#scaling the test dataset
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

# SMOTE

In [120]:
#utilizing smote 
smote = SMOTE(random_state = 19)
# fitting on train
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [121]:
#need to scale the data
scaler = StandardScaler()

In [122]:
#fit the scalar to train
scaler.fit(X_smote)

StandardScaler()

In [123]:
#will need to scale the X test dataset
X_smote = pd.DataFrame(scaler.transform(X_smote), columns = X_smote.columns, index = X_smote.index)
#scaling the test dataset
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

## Decision Tree

In [124]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19)

In [125]:
# fitting our thing
clf = clf.fit(X_smote, y_smote)
#creating the prediction
y_pred = clf.predict(X_smote)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_smote)

In [126]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_smote, y_smote)))

Accuracy of Decision Tree classifier on training set: 0.73


In [127]:
labels = sorted(y_smote.unique())

pd.DataFrame(confusion_matrix(y_smote, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,12292,5560
1,4109,13743


In [128]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.69      0.72     17852
           1       0.71      0.77      0.74     17852

    accuracy                           0.73     35704
   macro avg       0.73      0.73      0.73     35704
weighted avg       0.73      0.73      0.73     35704



## Random Forest

In [129]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='gini',
                            min_samples_leaf=6,
                            n_estimators=150,
                            max_depth=12, 
                            random_state=19)

In [130]:
#fit our thing
rf.fit(X_smote, y_smote)

RandomForestClassifier(class_weight='balanced', max_depth=12,
                       min_samples_leaf=6, n_estimators=150, random_state=19)

In [131]:
y_pred = rf.predict(X_smote)
y_pred_proba = rf.predict_proba(X_smote)

In [132]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_smote, y_smote)))

Accuracy of random forest classifier on training set: 0.86


In [133]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.86     17852
           1       0.88      0.83      0.86     17852

    accuracy                           0.86     35704
   macro avg       0.86      0.86      0.86     35704
weighted avg       0.86      0.86      0.86     35704



In [134]:
y_pred = rf.predict(X_test_scaled)
y_pred_proba = rf.predict_proba(X_test_scaled)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test_scaled, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.78
[[3874  589]
 [ 600  271]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4463
           1       0.32      0.31      0.31       871

    accuracy                           0.78      5334
   macro avg       0.59      0.59      0.59      5334
weighted avg       0.78      0.78      0.78      5334



## KNN

In [135]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=3)

In [136]:
#fit our thing
knn.fit(X_smote, y_smote)

KNeighborsClassifier(n_neighbors=3)

In [137]:
#create the prediction
y_pred = knn.predict(X_smote)

In [138]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_smote)

In [139]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_smote, y_smote)))

Accuracy of KNN classifier on training set: 0.90


In [140]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89     17852
           1       0.88      0.92      0.90     17852

    accuracy                           0.90     35704
   macro avg       0.90      0.90      0.90     35704
weighted avg       0.90      0.90      0.90     35704



## Logistical Regression

In [141]:
#create out thing
logit = LogisticRegression(C=1, random_state=19, intercept_scaling=1, solver='lbfgs', class_weight = 'balanced')

In [142]:
#fit the thing
logit.fit(X_smote, y_smote)

LogisticRegression(C=1, class_weight='balanced', random_state=19)

In [143]:
#make the predicition
y_pred = logit.predict(X_smote)

In [144]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_smote)

In [145]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_smote, y_smote)))

Accuracy of Logistic Regression classifier on training set: 0.75


In [146]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     17852
           1       0.75      0.77      0.76     17852

    accuracy                           0.75     35704
   macro avg       0.75      0.75      0.75     35704
weighted avg       0.75      0.75      0.75     35704



## Gradient Booster Classifier

In [237]:
#creating the object
clf = GradientBoostingClassifier(max_depth=6, random_state=19).fit(X_smote, y_smote)
# fitting our thing
clf = clf.fit(X_smote, y_smote)
#creating the prediction
y_pred = clf.predict(X_smote)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_smote)

In [238]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_smote, y_smote)))

Accuracy of Decision Tree classifier on training set: 0.89


In [239]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_smote, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,16934,918
1,2896,14956


In [240]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90     17852
           1       0.94      0.84      0.89     17852

    accuracy                           0.89     35704
   macro avg       0.90      0.89      0.89     35704
weighted avg       0.90      0.89      0.89     35704



In [None]:
########## Test Eval #############

In [None]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

## Evaluate on Test

In [None]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

----

In [None]:
train.injury_class.value_counts()

# Undersampling

## Random Under

In [148]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=19,)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [149]:
X_resampled.shape , y_resampled.shape

((6968, 32), (6968,))

In [150]:
#need to scale the data
#scaler = StandardScaler()

In [151]:
#fit the scalar to train
#scaler.fit(X_resampled)

----

## Gradient Booster Classifier

In [152]:
#creating the object
clf = GradientBoostingClassifier(learning_rate= 0.5, max_depth=6, random_state=19).fit(X_resampled, y_resampled)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [153]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.99


In [154]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,3461,23
1,44,3440


In [155]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3484
           1       0.99      0.99      0.99      3484

    accuracy                           0.99      6968
   macro avg       0.99      0.99      0.99      6968
weighted avg       0.99      0.99      0.99      6968



In [156]:
########## Test Eval #############

In [157]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.66
[[2969 1494]
 [ 307  564]]
              precision    recall  f1-score   support

           0       0.91      0.67      0.77      4463
           1       0.27      0.65      0.39       871

    accuracy                           0.66      5334
   macro avg       0.59      0.66      0.58      5334
weighted avg       0.80      0.66      0.70      5334



## Decision Tree

In [158]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19,criterion = 'entropy')
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [159]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.67


In [160]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2456,1028
1,1304,2180


In [161]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.70      0.68      3484
           1       0.68      0.63      0.65      3484

    accuracy                           0.67      6968
   macro avg       0.67      0.67      0.66      6968
weighted avg       0.67      0.67      0.66      6968



In [162]:
########## Test Eval #############

In [163]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.70
[[3207 1256]
 [ 336  535]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      4463
           1       0.30      0.61      0.40       871

    accuracy                           0.70      5334
   macro avg       0.60      0.67      0.60      5334
weighted avg       0.81      0.70      0.74      5334



----

## Random Forest

In [164]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                         
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=19)

#creating the prediction
y_pred = clf.predict(X_resampled)

In [165]:
#fit our thing
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(max_depth=8, min_samples_leaf=5, random_state=19)

In [166]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(rf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.73


In [167]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2456,1028
1,1304,2180


In [168]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.70      0.68      3484
           1       0.68      0.63      0.65      3484

    accuracy                           0.67      6968
   macro avg       0.67      0.67      0.66      6968
weighted avg       0.67      0.67      0.66      6968



In [169]:
########## Test Eval #############

In [170]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.70
[[3207 1256]
 [ 336  535]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      4463
           1       0.30      0.61      0.40       871

    accuracy                           0.70      5334
   macro avg       0.60      0.67      0.60      5334
weighted avg       0.81      0.70      0.74      5334



In [171]:
########## Test Eval #############

In [172]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.70
[[3138 1325]
 [ 300  571]]
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      4463
           1       0.30      0.66      0.41       871

    accuracy                           0.70      5334
   macro avg       0.61      0.68      0.60      5334
weighted avg       0.81      0.70      0.73      5334



--------

## KNN

In [173]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=10, leaf_size = 20)

In [174]:
#fit our thing
knn.fit(X_resampled, y_resampled)

KNeighborsClassifier(leaf_size=20, n_neighbors=10)

In [175]:
#create the prediction
y_pred = knn.predict(X_resampled)

In [176]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_resampled)

In [177]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_resampled, y_resampled)))

Accuracy of KNN classifier on training set: 0.65


In [178]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.71      0.67      3484
           1       0.67      0.59      0.63      3484

    accuracy                           0.65      6968
   macro avg       0.65      0.65      0.65      6968
weighted avg       0.65      0.65      0.65      6968



In [179]:
########## Test Eval #############

In [180]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(knn.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.59
[[2745 1718]
 [ 479  392]]
              precision    recall  f1-score   support

           0       0.85      0.62      0.71      4463
           1       0.19      0.45      0.26       871

    accuracy                           0.59      5334
   macro avg       0.52      0.53      0.49      5334
weighted avg       0.74      0.59      0.64      5334



-----

## Logistical Regression

In [181]:
#create out thing
logit = LogisticRegression(C=1, random_state=19)

In [182]:
#fit the thing
logit.fit(X_resampled, y_resampled)

LogisticRegression(C=1, random_state=19)

In [183]:
#make the predicition
y_pred = logit.predict(X_resampled)

In [184]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_resampled)

In [185]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_resampled, y_resampled)))

Accuracy of Logistic Regression classifier on training set: 0.50


In [186]:
print(confusion_matrix(y_resampled, y_pred))

[[3484    0]
 [3484    0]]


In [187]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67      3484
           1       0.00      0.00      0.00      3484

    accuracy                           0.50      6968
   macro avg       0.25      0.50      0.33      6968
weighted avg       0.25      0.50      0.33      6968



In [188]:
########## Test Eval #############

In [189]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)


print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.84
[[4463    0]
 [ 871    0]]
              precision    recall  f1-score   support

           0       0.84      1.00      0.91      4463
           1       0.00      0.00      0.00       871

    accuracy                           0.84      5334
   macro avg       0.42      0.50      0.46      5334
weighted avg       0.70      0.84      0.76      5334



------

# NearMiss

In [190]:
from imblearn.under_sampling import NearMiss
nm1 = NearMiss(version=3)
X_resampled, y_resampled = nm1.fit_resample(X_train, y_train)


## Decision Tree

In [191]:
#creating the object
clf = DecisionTreeClassifier(max_depth = 8, random_state=19)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [192]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.73


In [193]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2764,720
1,1186,2298


In [194]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      3484
           1       0.76      0.66      0.71      3484

    accuracy                           0.73      6968
   macro avg       0.73      0.73      0.73      6968
weighted avg       0.73      0.73      0.73      6968



In [195]:
########## Test Eval #############

In [196]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.70
[[3263 1200]
 [ 377  494]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81      4463
           1       0.29      0.57      0.39       871

    accuracy                           0.70      5334
   macro avg       0.59      0.65      0.60      5334
weighted avg       0.80      0.70      0.74      5334



----

## Random Forest

In [197]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                         
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=100,
                            max_depth=8, 
                            random_state=19)

#creating the prediction
y_pred = clf.predict(X_resampled)

In [198]:
#fit our thing
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(max_depth=8, min_samples_leaf=5, random_state=19)

In [199]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(rf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.72


In [200]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2764,720
1,1186,2298


In [201]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.79      0.74      3484
           1       0.76      0.66      0.71      3484

    accuracy                           0.73      6968
   macro avg       0.73      0.73      0.73      6968
weighted avg       0.73      0.73      0.73      6968



In [202]:
########## Test Eval #############

In [203]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.70
[[3263 1200]
 [ 377  494]]
              precision    recall  f1-score   support

           0       0.90      0.73      0.81      4463
           1       0.29      0.57      0.39       871

    accuracy                           0.70      5334
   macro avg       0.59      0.65      0.60      5334
weighted avg       0.80      0.70      0.74      5334



In [204]:
########## Test Eval #############

In [205]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.71
[[3230 1233]
 [ 304  567]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.81      4463
           1       0.32      0.65      0.42       871

    accuracy                           0.71      5334
   macro avg       0.61      0.69      0.62      5334
weighted avg       0.82      0.71      0.75      5334



--------

## KNN

In [206]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=5)

In [207]:
#fit our thing
knn.fit(X_resampled, y_resampled)

KNeighborsClassifier()

In [208]:
#create the prediction
y_pred = knn.predict(X_resampled)

In [209]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_resampled)

In [210]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_resampled, y_resampled)))

Accuracy of KNN classifier on training set: 0.76


In [211]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.89      0.79      3484
           1       0.85      0.63      0.72      3484

    accuracy                           0.76      6968
   macro avg       0.78      0.76      0.75      6968
weighted avg       0.78      0.76      0.75      6968



In [212]:
########## Test Eval #############

In [213]:
y_pred = knn.predict(X_test)
y_pred_proba = knn.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(knn.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.55
[[2533 1930]
 [ 455  416]]
              precision    recall  f1-score   support

           0       0.85      0.57      0.68      4463
           1       0.18      0.48      0.26       871

    accuracy                           0.55      5334
   macro avg       0.51      0.52      0.47      5334
weighted avg       0.74      0.55      0.61      5334



-----

## Logistical Regression

In [214]:
#create out thing
logit = LogisticRegression(C=1, random_state=19, intercept_scaling=1, solver='liblinear', class_weight = 'balanced',)

In [215]:
#fit the thing
logit.fit(X_resampled, y_resampled)

LogisticRegression(C=1, class_weight='balanced', random_state=19,
                   solver='liblinear')

In [216]:
#make the predicition
y_pred = logit.predict(X_resampled)

In [217]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_resampled)

In [218]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_resampled, y_resampled)))

Accuracy of Logistic Regression classifier on training set: 0.65


In [219]:
print(confusion_matrix(y_resampled, y_pred))

[[2287 1197]
 [1222 2262]]


In [220]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.66      0.65      3484
           1       0.65      0.65      0.65      3484

    accuracy                           0.65      6968
   macro avg       0.65      0.65      0.65      6968
weighted avg       0.65      0.65      0.65      6968



In [221]:
########## Test Eval #############

In [222]:
y_pred = logit.predict(X_test)
y_pred_proba = logit.predict_proba(X_test)


print('Accuracy: {:.2f}'.format(logit.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.66
[[2950 1513]
 [ 305  566]]
              precision    recall  f1-score   support

           0       0.91      0.66      0.76      4463
           1       0.27      0.65      0.38       871

    accuracy                           0.66      5334
   macro avg       0.59      0.66      0.57      5334
weighted avg       0.80      0.66      0.70      5334



## Gradient Booster Classifier

In [229]:
#creating the object
clf = GradientBoostingClassifier(max_depth=3, random_state=19, n_estimators = 100, min_samples_leaf=6,learning_rate = 0.05).fit(X_resampled, y_resampled)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [230]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.70


In [231]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2500,984
1,1125,2359


In [232]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.69      0.72      0.70      3484
           1       0.71      0.68      0.69      3484

    accuracy                           0.70      6968
   macro avg       0.70      0.70      0.70      6968
weighted avg       0.70      0.70      0.70      6968



In [233]:
########## Test Eval #############

In [234]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.69
[[3129 1334]
 [ 298  573]]
              precision    recall  f1-score   support

           0       0.91      0.70      0.79      4463
           1       0.30      0.66      0.41       871

    accuracy                           0.69      5334
   macro avg       0.61      0.68      0.60      5334
weighted avg       0.81      0.69      0.73      5334



----

----