In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import re
from datetime import timedelta, datetime
import datetime
from sklearn.model_selection import learning_curve
from sklearn.cluster import KMeans
import sklearn.preprocessing
import explore
import prepare
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from kmodes.kmodes import KModes

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Acquire:

In [2]:
df = pd.read_csv('accident_data.csv')

# Prepare:

In [3]:
from prepare import clean_collision_data
df = clean_collision_data()
#cross validation on train

-------

In [4]:
train, test = prepare.collision_data()

In [5]:
train.shape, test.shape

((21336, 47), (5334, 47))

----

# Modeling: 

In [6]:
train.columns

Index(['crash_date', 'crash_day', 'crash_hour', 'crash_id', 'crash_latitude',
       'crash_longitude', 'crash_occupant_count', 'crash_vehicle_count',
       'damage_air', 'damage_airbag', 'damage_burned', 'damage_concentrated',
       'damage_distributed', 'damage_rollover', 'damage_zone', 'dl_cdl',
       'dl_class_a', 'dl_class_b', 'dl_class_m', 'dl_state', 'dl_unlicensed',
       'driver_age', 'driver_age_bin', 'driver_male', 'driver_race',
       'factors_road', 'factors_spd_lmt_mph', 'factors_weather', 'fault_class',
       'fault_distraction', 'fault_fatigue', 'fault_intoxication',
       'fault_maneuver', 'fault_narrative', 'fault_speed', 'fault_yield',
       'injury_class', 'injury_crash_total', 'speed_speed_lm',
       'speed_yield_occu', 'vehicle_color', 'vehicle_id', 'vehicle_make',
       'vehicle_occupant_count', 'vehicle_type', 'vehicle_year',
       'vehicle_year_bin'],
      dtype='object')

In [7]:
train.injury_class.value_counts()

0    17852
1     3484
Name: injury_class, dtype: int64

In [8]:
train.shape

(21336, 47)

In [9]:
#splitting the dataset into train features and target
X_train = train.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_train = train.injury_class

X_train.shape, y_train.shape

((21336, 32), (21336,))

In [10]:
#splitting the dataset into test features and target
X_test = test.select_dtypes(np.number).drop(columns = ['injury_class', 'injury_crash_total'])
y_test = test.injury_class

X_test.shape, y_test.shape

((5334, 32), (5334,))

In [11]:
#utilizing dummy classifier to create baseline
dummy = DummyClassifier(strategy ='most_frequent')
#fitting on X_train, y_train
dummy.fit(X_train, y_train)
#creating the baseline
baseline = pd.Series(dummy.predict(X_train), index = X_train.index)

-----

----

# Creating the Baseline

In [12]:
#setting the baseline prediciton to no injury
train['baseline_prediction'] = 0

In [13]:
#baseline accuracy calculation
baseline_accuracy = (train.baseline_prediction == y_train).mean()
baseline_accuracy

0.8367079115110612

In [14]:
#utilizing smote 
smote = SMOTE(random_state = 19)
# fitting on train
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [15]:
#need to scale the data
scaler = StandardScaler()

In [16]:
#fit the scalar to train
scaler.fit(X_smote)

StandardScaler()

In [17]:
#will need to scale the X test dataset
X_smote = pd.DataFrame(scaler.transform(X_smote), columns = X_smote.columns, index = X_smote.index)
#scaling the test dataset
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns, index = X_test.index)

-----

-----

# Undersampling

In [18]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=19)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

In [19]:
X_resampled.shape , y_resampled.shape

((6968, 32), (6968,))

In [20]:
#need to scale the data
#scaler = StandardScaler()

In [21]:
#fit the scalar to train
#scaler.fit(X_resampled)

## Decision Tree

In [22]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19)
# fitting our thing
clf = clf.fit(X_resampled, y_resampled)
#creating the prediction
y_pred = clf.predict(X_resampled)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_resampled)

In [23]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_resampled, y_resampled)))

Accuracy of Decision Tree classifier on training set: 0.67


In [24]:
labels = sorted(y_resampled.unique())

pd.DataFrame(confusion_matrix(y_resampled, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,2456,1028
1,1304,2180


In [25]:
print(classification_report(y_resampled, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.70      0.68      3484
           1       0.68      0.63      0.65      3484

    accuracy                           0.67      6968
   macro avg       0.67      0.67      0.66      6968
weighted avg       0.67      0.67      0.66      6968



In [26]:
########## Test Eval #############

In [27]:
y_pred = clf.predict(X_test)

print('Accuracy: {:.2f}'.format(clf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Accuracy: 0.70
[[3207 1256]
 [ 336  535]]
              precision    recall  f1-score   support

           0       0.91      0.72      0.80      4463
           1       0.30      0.61      0.40       871

    accuracy                           0.70      5334
   macro avg       0.60      0.67      0.60      5334
weighted avg       0.81      0.70      0.74      5334



----

# Random Forest

In [42]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='gini',
                            min_samples_leaf=5,
                            n_estimators=200,
                            max_depth=8, 
                            random_state=19)

In [43]:
#fit our thing
rf.fit(X_resampled, y_resampled)

RandomForestClassifier(class_weight='balanced', max_depth=8, min_samples_leaf=5,
                       n_estimators=200, random_state=19)

In [44]:
y_pred = rf.predict(X_resampled)

In [39]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_resampled, y_resampled)))

Accuracy of random forest classifier on training set: 0.73


In [45]:
print(classification_report(X_resampled, y_resampled))

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [46]:
########## Test Eval #############

In [47]:
y_pred = rf.predict(X_test)
y_pred_proba = rf.predict_proba(X_test)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.70
[[3154 1309]
 [ 305  566]]
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      4463
           1       0.30      0.65      0.41       871

    accuracy                           0.70      5334
   macro avg       0.61      0.68      0.60      5334
weighted avg       0.81      0.70      0.73      5334



------

------

# SMOTE

## Decision Tree

In [155]:
#creating the object
clf = DecisionTreeClassifier(max_depth=4, random_state=19)

In [156]:
# fitting our thing
clf = clf.fit(X_smote, y_smote)
#creating the prediction
y_pred = clf.predict(X_smote)
#creating prediction probabaility
y_pred_proba = clf.predict_proba(X_smote)

In [239]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(clf.score(X_smote, y_smote)))

Accuracy of Decision Tree classifier on training set: 0.73


In [158]:
labels = sorted(y_smote.unique())

pd.DataFrame(confusion_matrix(y_smote, y_pred), index=labels, columns=labels)

Unnamed: 0,0,1
0,12292,5560
1,4109,13743


In [159]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.69      0.72     17852
           1       0.71      0.77      0.74     17852

    accuracy                           0.73     35704
   macro avg       0.73      0.73      0.73     35704
weighted avg       0.73      0.73      0.73     35704



# Random Forest

In [42]:
#create our thing
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight= 'balanced', 
                            criterion='gini',
                            min_samples_leaf=6,
                            n_estimators=150,
                            max_depth=12, 
                            random_state=19)

In [43]:
#fit our thing
rf.fit(X_smote, y_smote)

RandomForestClassifier(class_weight='balanced', max_depth=12,
                       min_samples_leaf=6, n_estimators=150, random_state=19)

In [44]:
y_pred = rf.predict(X_smote)
y_pred_proba = rf.predict_proba(X_smote)

In [45]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_smote, y_smote)))

Accuracy of random forest classifier on training set: 0.86


In [46]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.89      0.86     17852
           1       0.88      0.84      0.86     17852

    accuracy                           0.86     35704
   macro avg       0.86      0.86      0.86     35704
weighted avg       0.86      0.86      0.86     35704



In [47]:
y_pred = rf.predict(X_test_scaled)
y_pred_proba = rf.predict_proba(X_test_scaled)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(rf.score(X_test_scaled, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.78
[[3879  584]
 [ 588  283]]
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4463
           1       0.33      0.32      0.33       871

    accuracy                           0.78      5334
   macro avg       0.60      0.60      0.60      5334
weighted avg       0.78      0.78      0.78      5334



# KNN

In [342]:
#create our thing 
knn = KNeighborsClassifier(n_neighbors=3)

In [343]:
#fit our thing
knn.fit(X_smote, y_smote)

KNeighborsClassifier(n_neighbors=3)

In [344]:
#create the prediction
y_pred = knn.predict(X_smote)

In [345]:
#estmating the probability
y_pred_proba = knn.predict_proba(X_smote)

In [346]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn.score(X_smote, y_smote)))

Accuracy of KNN classifier on training set: 0.90


In [347]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89     17852
           1       0.88      0.92      0.90     17852

    accuracy                           0.90     35704
   macro avg       0.90      0.90      0.90     35704
weighted avg       0.90      0.90      0.90     35704



# Logistical Regression

In [34]:
#create out thing
logit = LogisticRegression(C=1, random_state=19, intercept_scaling=1, solver='lbfgs', class_weight = 'balanced')

In [35]:
#fit the thing
logit.fit(X_smote, y_smote)

LogisticRegression(C=1, class_weight='balanced', random_state=19)

In [36]:
#make the predicition
y_pred = logit.predict(X_smote)

In [37]:
#estimate the probability
y_pred_proba = logit.predict_proba(X_smote)

In [38]:
print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
     .format(logit.score(X_smote, y_smote)))

Accuracy of Logistic Regression classifier on training set: 0.75


In [39]:
print(classification_report(y_smote, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75     17852
           1       0.74      0.77      0.76     17852

    accuracy                           0.75     35704
   macro avg       0.75      0.75      0.75     35704
weighted avg       0.75      0.75      0.75     35704



# Evaluate on Test

In [41]:
y_pred = logit.predict(X_test_scaled)
y_pred_proba = logit.predict_proba(X_test_scaled)

print("Model 1: solver = lbfgs, c = 1")

print('Accuracy: {:.2f}'.format(logit.score(X_test_scaled, y_test)))

print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

Model 1: solver = lbfgs, c = 1
Accuracy: 0.69
[[3321 1142]
 [ 508  363]]
              precision    recall  f1-score   support

           0       0.87      0.74      0.80      4463
           1       0.24      0.42      0.31       871

    accuracy                           0.69      5334
   macro avg       0.55      0.58      0.55      5334
weighted avg       0.77      0.69      0.72      5334

