In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# so values can be viewed as scrollable element
np.set_printoptions(threshold=sys.maxsize)


In [10]:
data_filepath = ".\\modified_data\\cleaned_data.csv"
df = pd.read_csv(data_filepath)

# using drop Reported_Location since we can't use it
df = df.drop(columns=['Reported_Location'])
df["Injury Type"] = df["Injury Type"].map({0: 0, 1: 1, 2: 1, 3: 1})
print(df.head())

   Month  Day  Weekend?  Hour  Injury Type   Latitude  Longitude
0      1    5         0     0            0  39.159207 -86.525874
1      1    6         0    15            0  39.161440 -86.534848
2      1    6         1    23            1  39.149780 -86.568890
3      1    7         1     9            1  39.165655 -86.575956
4      1    7         1    11            0  39.164848 -86.579625


# Creating Model

We determined through EDA that a simple model will not be sufficient to predict injury type. We will instead use more complicated models (Decision trees, naive bayes, SVM). If these models don't work, we will increase complexity even more to random forest and neural networks.

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# decision tree
X = df.drop(columns=['Injury Type'])
Y = df['Injury Type']

# splitting data: Since we have a lot of data, we can use a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# create decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# train classifier
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.6535594483786806
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      8265
           1       0.26      0.28      0.27      2467

    accuracy                           0.65     10732
   macro avg       0.52      0.52      0.52     10732
weighted avg       0.66      0.65      0.66     10732



There's a big disparity between 0 values and 1 values which is leading to innacuracies for predicting 1 values. I will attempt to remedy by oversampling.

In [15]:
from imblearn.over_sampling import RandomOverSampler
print(y_train.value_counts())
ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_train, y_train)
print(y_os.value_counts())

# train classifier
clf.fit(x_os, y_os)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))



Injury Type
0    33091
1     9837
Name: count, dtype: int64
Injury Type
1    33091
0    33091
Name: count, dtype: int64
Accuracy: 0.6448005963473723
              precision    recall  f1-score   support

           0       0.78      0.76      0.77      8265
           1       0.25      0.27      0.26      2467

    accuracy                           0.64     10732
   macro avg       0.51      0.51      0.51     10732
weighted avg       0.66      0.64      0.65     10732



Oversampling barely affected the model performance; this indicates: our data is not good (either needs further cleaning + feature selection or injury type is random and can't be predicted), decision trees are not good for modeling this data, or we need to try different normalization. For now we will try other models.

# Naive Bayes Attempt

Issue: Getting 0 predictions for 1-label so I must oversample as did previously to attempt to balance the number of 0 and 1 values in the original dataset.

In [20]:
from sklearn.naive_bayes import CategoricalNB

# only use categorical attributes
X_nb = df.drop(columns=['Injury Type', 'Latitude', 'Longitude'])
Y_nb = df['Injury Type']

# splitting data:
X_nb_train, X_nb_test, y_nb_train, y_nb_test = train_test_split(X_nb, Y_nb, test_size=0.2, random_state=5)

print(y_nb_train.value_counts())
ros = RandomOverSampler()
x_nb_os, y_nb_os = ros.fit_resample(X_nb_train, y_nb_train)
print(y_nb_os.value_counts())

clf_cat = CategoricalNB()
clf_cat.fit(x_nb_os, np.asarray(y_nb_os))
print(classification_report(y_nb_test, clf_cat.predict(X_nb_test)))

# trying on training dataset
print(classification_report(y_nb_os, clf_cat.predict(x_nb_os)))

Injury Type
0    33091
1     9837
Name: count, dtype: int64
Injury Type
1    33091
0    33091
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.80      0.50      0.61      8265
           1       0.26      0.59      0.36      2467

    accuracy                           0.52     10732
   macro avg       0.53      0.54      0.49     10732
weighted avg       0.68      0.52      0.55     10732

              precision    recall  f1-score   support

           0       0.54      0.49      0.51     33091
           1       0.53      0.58      0.56     33091

    accuracy                           0.54     66182
   macro avg       0.54      0.54      0.54     66182
weighted avg       0.54      0.54      0.54     66182



**Still have poor performance**

Let's try an SVM model

# SVM Model

We want to one-hot encode categorical data, try different kernels, and then try oversampling

In [22]:
from sklearn.svm import SVC
from sklearn import preprocessing

cats = ['Month', 'Day', 'Weekend?', 'Hour']
nums = [col for col in df.drop(columns=['Injury Type']).columns if col not in cats]

df_svm = df.copy()

df_svm = pd.get_dummies(df_svm, columns=cats)
df_svm = df_svm.astype(float)

# split data
svm_train, svm_test = train_test_split(df_svm, test_size=0.2)
X_svm_train, y_svm_train = svm_train.drop(columns=['Injury Type']), svm_train['Injury Type']
X_svm_test, y_svm_test = svm_test.drop(columns=['Injury Type']), svm_test['Injury Type']

In [23]:
svc_li = SVC(kernel='linear')

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)

Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svc_li.fit(Z_svm_train, np.asarray(y_svm_train))

print('Linear Kernel')
print(classification_report(y_svm_test, svc_li.predict(Z_svm_test)))

svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(Z_svm_train, np.asarray(y_svm_train))
print('rbf Kernel')
print(classification_report(y_svm_test, svc_rbf.predict(Z_svm_test)))

Linear Kernel


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87      8226
         1.0       0.00      0.00      0.00      2506

    accuracy                           0.77     10732
   macro avg       0.38      0.50      0.43     10732
weighted avg       0.59      0.77      0.67     10732

rbf Kernel
              precision    recall  f1-score   support

         0.0       0.77      1.00      0.87      8226
         1.0       0.00      0.00      0.00      2506

    accuracy                           0.77     10732
   macro avg       0.38      0.50      0.43     10732
weighted avg       0.59      0.77      0.67     10732



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
# due to class imbalance, we had no predictions for 1. Will attempt to remedy with oversampling
print(y_svm_train.value_counts())
svc_li = SVC(kernel='linear')
svc_rbf = SVC(kernel='rbf')
ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_svm_train, y_svm_train)
print(y_os.value_counts())

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)
Z_x_os = scaler.transform(x_os)

Z_x_test = scaler.transform(X_svm_test)

svc_li.fit(Z_x_os, y_os)
svc_rbf.fit(Z_x_os, y_os)

# svc rbf.fit(x os, y os)
# Predict and print the classification report
y_pred_rbf = svc_rbf.predict(Z_x_test)
print(classification_report(y_svm_test, y_pred_rbf))
y_pred_li = svc_li.predict(Z_x_test)
print(classification_report(y_svm_test, y_pred_li))

Injury Type
0.0    33130
1.0     9798
Name: count, dtype: int64
Injury Type
1.0    33130
0.0    33130
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.78      0.53      0.63      8226
         1.0       0.25      0.51      0.34      2506

    accuracy                           0.53     10732
   macro avg       0.52      0.52      0.49     10732
weighted avg       0.66      0.53      0.56     10732

              precision    recall  f1-score   support

         0.0       0.80      0.44      0.57      8226
         1.0       0.26      0.63      0.36      2506

    accuracy                           0.48     10732
   macro avg       0.53      0.54      0.47     10732
weighted avg       0.67      0.48      0.52     10732

