In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# Raise NumPy's print threshold to the largest possible value so arrays are
# printed in full instead of being summarized with "..."; long outputs can then
# be viewed as a scrollable element in the notebook.
np.set_printoptions(threshold=sys.maxsize)


In [10]:
# Relative path written with forward slashes so it resolves on Windows, macOS
# and Linux alike (a backslash-separated path only works on Windows).
data_filepath = "./modified_data/cleaned_data.csv"
df = pd.read_csv(data_filepath)

# Drop Reported_Location since we can't use it as a feature.
df = df.drop(columns=['Reported_Location'])

# Collapse the injury codes into a binary target: 0 stays 0, codes 1-3 become 1.
df["Injury Type"] = df["Injury Type"].map({0: 0, 1: 1, 2: 1, 3: 1})

# Series.map yields NaN for any value missing from the mapping; fail here with
# a clear message rather than later inside model fitting.
assert df["Injury Type"].notna().all(), "Unexpected 'Injury Type' code outside {0, 1, 2, 3}"

print(df.head())

   Month  Day  Weekend?  Hour  Injury Type   Latitude  Longitude
0      1    5         0     0            0  39.159207 -86.525874
1      1    6         0    15            0  39.161440 -86.534848
2      1    6         1    23            1  39.149780 -86.568890
3      1    7         1     9            1  39.165655 -86.575956
4      1    7         1    11            0  39.164848 -86.579625


# Creating Model

We determined through EDA that a simple model will not be sufficient to predict injury type. We will instead use more complex models (decision trees, Naive Bayes, SVM). If these models don't work, we will increase complexity further with random forests and neural networks.

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline model: decision tree trained on every remaining column.
target_column = 'Injury Type'
Y = df[target_column]
X = df.drop(columns=[target_column])

# Since we have a lot of data, a 70-30 train/test split is affordable.
# The seed keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=5,
)

# Build the tree and fit it on the training portion (fit returns the estimator).
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1 on the test split.
print(classification_report(y_test, y_pred))

Accuracy: 0.6511367871785315
              precision    recall  f1-score   support

           0       0.78      0.76      0.77     12409
           1       0.26      0.28      0.27      3689

    accuracy                           0.65     16098
   macro avg       0.52      0.52      0.52     16098
weighted avg       0.66      0.65      0.66     16098



There's a big disparity between the number of 0 values and 1 values, which is leading to inaccuracies when predicting 1 values. I will attempt to remedy this by oversampling.

In [13]:
from imblearn.over_sampling import RandomOverSampler
# Class balance of the training labels before resampling.
print(y_train.value_counts())

# Randomly duplicate minority-class rows of the training split only; the test
# split is left untouched. The sampler is seeded so the resampled set, and
# therefore the scores printed below, are identical on every run.
ros = RandomOverSampler(random_state=42)
x_os, y_os = ros.fit_resample(X_train, y_train)

# Class balance after resampling.
print(y_os.value_counts())

# Refit the classifier on the oversampled training data.
clf.fit(x_os, y_os)

# Make predictions on the (original, non-resampled) test data.
y_pred = clf.predict(X_test)

# Calculate accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report.
print(classification_report(y_test, y_pred))



Injury Type
0    28947
1     8615
Name: count, dtype: int64
Injury Type
0    28947
1    28947
Name: count, dtype: int64
Accuracy: 0.653373089824823
              precision    recall  f1-score   support

           0       0.78      0.76      0.77     12409
           1       0.26      0.28      0.27      3689

    accuracy                           0.65     16098
   macro avg       0.52      0.52      0.52     16098
weighted avg       0.66      0.65      0.66     16098



Oversampling barely affected the model performance. This indicates one of the following: our data is not good (it either needs further cleaning and feature selection, or injury type is random and can't be predicted), decision trees are not a good fit for modeling this data, or we need to try a different normalization. For now we will try other models.