In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

# so values can be viewed as scrollable element
np.set_printoptions(threshold=sys.maxsize)


In [2]:
data_filepath = ".\\modified_data\\cleaned_data.csv"
df = pd.read_csv(data_filepath)
df = df.drop(columns=['Reported_Location'])

df["Injury Type"] = df["Injury Type"].map({0: 0, 1: 1, 2: 1, 3: 1})
print(df.head())

   Month  Day  Weekend?  Hour  Collision Type  Injury Type   Latitude  \
0      1    5         0     0               4            0  39.159207   
1      1    6         0    15               4            0  39.161440   
2      1    6         1    23               4            1  39.149780   
3      1    7         1     9               4            1  39.165655   
4      1    7         1    11               4            0  39.164848   

   Longitude  
0 -86.525874  
1 -86.534848  
2 -86.568890  
3 -86.575956  
4 -86.579625  


# Creating Model

We determined through EDA that a simple model will not be sufficient to predict injury type. We will instead use more complicated models (Decision trees, naive bayes, SVM). If these models don't work, we will increase complexity even more to random forest and neural networks.

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# decision tree
X = df.drop(columns=['Injury Type'])
Y = df['Injury Type']

# splitting data: Since we have a lot of data, we can use a 70-30 split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=5)

# create decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# train classifier
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))

Accuracy: 0.682931829830856
              precision    recall  f1-score   support

           0       0.80      0.79      0.79      7511
           1       0.32      0.34      0.33      2244

    accuracy                           0.68      9755
   macro avg       0.56      0.56      0.56      9755
weighted avg       0.69      0.68      0.69      9755



There's a big disparity between 0 values and 1 values which is leading to innacuracies for predicting 1 values. I will attempt to remedy by oversampling.

In [4]:
from imblearn.over_sampling import RandomOverSampler
print(y_train.value_counts())
ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_train, y_train)
print(y_os.value_counts())

# train classifier
clf.fit(x_os, y_os)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print(classification_report(y_test, y_pred))



Injury Type
0    29771
1     9249
Name: count, dtype: int64
Injury Type
0    29771
1    29771
Name: count, dtype: int64
Accuracy: 0.6948231676063558
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      7511
           1       0.34      0.35      0.35      2244

    accuracy                           0.69      9755
   macro avg       0.57      0.58      0.57      9755
weighted avg       0.70      0.69      0.70      9755



Oversampling barely affected the model performance; this indicates: our data is not good (either needs further cleaning + feature selection or injury type is random and can't be predicted), decision trees are not good for modeling this data, or we need to try different normalization. For now we will try other models.

# Naive Bayes Attempt

Issue: Getting 0 predictions for 1-label so I must oversample as did previously to attempt to balance the number of 0 and 1 values in the original dataset.

In [5]:
from sklearn.naive_bayes import CategoricalNB

# only use categorical attributes
X_nb = df.drop(columns=['Injury Type', 'Latitude', 'Longitude'])
Y_nb = df['Injury Type']

# splitting data:
X_nb_train, X_nb_test, y_nb_train, y_nb_test = train_test_split(X_nb, Y_nb, test_size=0.2, random_state=5)

print(y_nb_train.value_counts())
ros = RandomOverSampler()
x_nb_os, y_nb_os = ros.fit_resample(X_nb_train, y_nb_train)
print(y_nb_os.value_counts())

clf_cat = CategoricalNB()
clf_cat.fit(x_nb_os, np.asarray(y_nb_os))
print(classification_report(y_nb_test, clf_cat.predict(X_nb_test)))

# trying on training dataset
print(classification_report(y_nb_os, clf_cat.predict(x_nb_os)))

Injury Type
0    29771
1     9249
Name: count, dtype: int64
Injury Type
0    29771
1    29771
Name: count, dtype: int64
              precision    recall  f1-score   support

           0       0.81      0.86      0.84      7511
           1       0.42      0.35      0.38      2244

    accuracy                           0.74      9755
   macro avg       0.62      0.60      0.61      9755
weighted avg       0.72      0.74      0.73      9755

              precision    recall  f1-score   support

           0       0.57      0.86      0.69     29771
           1       0.72      0.35      0.47     29771

    accuracy                           0.61     59542
   macro avg       0.64      0.61      0.58     59542
weighted avg       0.64      0.61      0.58     59542



**Still have poor performance**

Let's try an SVM model

# SVM Model

We want to one-hot encode categorical data, try different kernels, and then try oversampling

In [6]:
from sklearn.svm import SVC
from sklearn import preprocessing

cats = ['Month', 'Day', 'Weekend?', 'Hour']
nums = [col for col in df.drop(columns=['Injury Type']).columns if col not in cats]

df_svm = df.copy()

df_svm = pd.get_dummies(df_svm, columns=cats)
df_svm = df_svm.astype(float)

# split data
svm_train, svm_test = train_test_split(df_svm, test_size=0.2)
X_svm_train, y_svm_train = svm_train.drop(columns=['Injury Type']), svm_train['Injury Type']
X_svm_test, y_svm_test = svm_test.drop(columns=['Injury Type']), svm_test['Injury Type']

In [7]:
svc_li = SVC(kernel='linear')

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)

Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svc_li.fit(Z_svm_train, np.asarray(y_svm_train))

print('Linear Kernel')
print(classification_report(y_svm_test, svc_li.predict(Z_svm_test)))

svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(Z_svm_train, np.asarray(y_svm_train))
print('rbf Kernel')
print(classification_report(y_svm_test, svc_rbf.predict(Z_svm_test)))

Linear Kernel
              precision    recall  f1-score   support

         0.0       0.79      0.99      0.88      7498
         1.0       0.87      0.13      0.23      2257

    accuracy                           0.79      9755
   macro avg       0.83      0.56      0.55      9755
weighted avg       0.81      0.79      0.73      9755

rbf Kernel
              precision    recall  f1-score   support

         0.0       0.79      0.99      0.88      7498
         1.0       0.87      0.14      0.25      2257

    accuracy                           0.80      9755
   macro avg       0.83      0.57      0.56      9755
weighted avg       0.81      0.80      0.74      9755



In [8]:
# due to class imbalance, we had no predictions for 1. Will attempt to remedy with oversampling
print(y_svm_train.value_counts())
svc_li = SVC(kernel='linear')
svc_rbf = SVC(kernel='rbf')
ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_svm_train, y_svm_train)
print(y_os.value_counts())

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)
Z_x_os = scaler.transform(x_os)

Z_x_test = scaler.transform(X_svm_test)

svc_li.fit(Z_x_os, y_os)
svc_rbf.fit(Z_x_os, y_os)

# svc rbf.fit(x os, y os)
# Predict and print the classification report
y_pred_rbf = svc_rbf.predict(Z_x_test)
print(classification_report(y_svm_test, y_pred_rbf))
y_pred_li = svc_li.predict(Z_x_test)
print(classification_report(y_svm_test, y_pred_li))

Injury Type
0.0    29784
1.0     9236
Name: count, dtype: int64
Injury Type
0.0    29784
1.0    29784
Name: count, dtype: int64
              precision    recall  f1-score   support

         0.0       0.81      0.75      0.78      7498
         1.0       0.34      0.41      0.37      2257

    accuracy                           0.67      9755
   macro avg       0.57      0.58      0.58      9755
weighted avg       0.70      0.67      0.69      9755

              precision    recall  f1-score   support

         0.0       0.81      0.79      0.80      7498
         1.0       0.35      0.37      0.36      2257

    accuracy                           0.70      9755
   macro avg       0.58      0.58      0.58      9755
weighted avg       0.70      0.70      0.70      9755



# Random Forest attempt

In [9]:
from sklearn.ensemble import RandomForestClassifier

X = df.drop(columns=['Injury Type'])
y = df['Injury Type']

# split into test/train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_train, y_train)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=23)
# train classifier
rf_classifier.fit(x_os, y_os)
# predict test data
y_pred = rf_classifier.predict(X_test)

# determine accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

Accuracy: 0.7503844182470528
              precision    recall  f1-score   support

           0       0.80      0.89      0.85      7451
           1       0.45      0.28      0.35      2304

    accuracy                           0.75      9755
   macro avg       0.63      0.59      0.60      9755
weighted avg       0.72      0.75      0.73      9755



Accuracy is decent so we'll try different combinations of parameters to maximize

In [15]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=23)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='recall')

# Perform grid search on the training data
grid_search.fit(x_os, y_os)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.9729484307536062


In [18]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=23)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='f1')

# Perform grid search on the training data
grid_search.fit(x_os, y_os)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.9170307955456217


In [19]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# determine accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))


Accuracy: 0.7504869297796002
              precision    recall  f1-score   support

           0       0.80      0.89      0.85      7451
           1       0.45      0.28      0.35      2304

    accuracy                           0.75      9755
   macro avg       0.63      0.59      0.60      9755
weighted avg       0.72      0.75      0.73      9755



In [11]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 400],
    'max_depth': [10, 20, 40],
    'min_samples_split': [10, 20, 40]
}

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=23)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=3, scoring='accuracy', verbose=2)

# Perform grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV] END max_depth=10, min_samples_split=10, n_estimators=100; total time=   2.9s
[CV] END max_depth=10, min_samples_split=10, n_estimators=100; total time=   2.6s
[CV] END max_depth=10, min_samples_split=10, n_estimators=100; total time=   2.7s
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time=   5.5s
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time=   5.8s
[CV] END max_depth=10, min_samples_split=10, n_estimators=200; total time=   6.0s
[CV] END max_depth=10, min_samples_split=10, n_estimators=400; total time=  12.6s
[CV] END max_depth=10, min_samples_split=10, n_estimators=400; total time=  13.1s
[CV] END max_depth=10, min_samples_split=10, n_estimators=400; total time=  10.8s
[CV] END max_depth=10, min_samples_split=20, n_estimators=100; total time=   2.8s
[CV] END max_depth=10, min_samples_split=20, n_estimators=100; total time=   2.8s
[CV] END max_depth=10, min_samples_sp

KeyboardInterrupt: 

# Neural Network

In [77]:
from sklearn.preprocessing import OneHotEncoder

X = df.drop(columns=['Injury Type', 'Weekend?', 'Latitude', 'Longitude'])
y = df['Injury Type']

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

# perform one hot encoding
categories = [[0, 1]]
encoder = OneHotEncoder(categories=categories, sparse=False)
y = encoder.fit_transform(y.values.reshape(-1,1))

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# to oversample, we must recombine y_train (one-hot encoded) too a single column
y_train_cat = y_train[1]
ros = RandomOverSampler()
x_os, y_os = ros.fit_resample(X_train, y_train_cat)



In [78]:
num_nodes = (23, 17, 13)
learning_rate = 0.4
epochs = 500

mlp = MLPClassifier(solver = 'sgd',
                    random_state = 0,
                    activation = 'logistic',
                    learning_rate_init = learning_rate,
                    batch_size = 100,
                    hidden_layer_sizes = num_nodes,
                    max_iter = epochs)
mlp

In [79]:
mlp.fit(x_os, y_train)

pred = mlp.predict(X_test)
print("Accuracy : ", accuracy_score(y_test, pred))
print("Mean Square Error : ", mean_squared_error(y_test, pred))

print("Classification Report : ")
print(classification_report(y_test, pred))

Accuracy :  0.7873910814966684
Mean Square Error :  0.21260891850333163
Classification Report : 
              precision    recall  f1-score   support

           0       0.78      0.99      0.88      7389
           1       0.86      0.15      0.25      2366

   micro avg       0.79      0.79      0.79      9755
   macro avg       0.82      0.57      0.56      9755
weighted avg       0.80      0.79      0.72      9755
 samples avg       0.79      0.79      0.79      9755



In [87]:
# Predict probabilities for each class on the test data
proba_predictions = mlp.predict_proba(X_test)

# Set the threshold for classification
threshold = 0.138  # Example threshold value

# Classify instances based on the threshold
binary_predictions = (proba_predictions[:, 1] > threshold).astype(int)

y_test_binary = np.argmax(y_test, axis=1)
print("Classification Report : ")
print(classification_report(y_test_binary, binary_predictions))

Classification Report : 
              precision    recall  f1-score   support

           0       0.81      0.12      0.20      7389
           1       0.25      0.92      0.39      2366

    accuracy                           0.31      9755
   macro avg       0.53      0.52      0.30      9755
weighted avg       0.67      0.31      0.25      9755

