# Classification Model of Accident Severity in NYC

In [66]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
from pathlib import Path

In [67]:
# Get File Path
dir_path = Path().resolve().parent
data_path = dir_path / Path('data_files/clean_data')

In [68]:
location_df = pd.read_csv(str(data_path / 'location_data.csv'))
location_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY
0,4486555,2021-12-14,17:05:00,40.709183,-73.956825,0,0,Passing Too Closely,Unspecified,No Injury
1,4487074,2021-12-14,21:10:00,40.67172,-73.8971,0,0,Driver Inexperience,Unspecified,No Injury
2,4486519,2021-12-14,14:58:00,40.75144,-73.97397,0,0,Passing Too Closely,Unspecified,No Injury
3,4486934,2021-12-13,00:34:00,40.701275,-73.88887,0,0,Passing or Lane Usage Improper,Unspecified,No Injury
4,4487127,2021-12-14,16:50:00,40.675884,-73.75577,0,0,Turning Improperly,Unspecified,No Injury


In [69]:
location_df.shape

(1085722, 10)

In [70]:
location_df.columns

Index(['COLLISION_ID', 'CRASH_DATE', 'CRASH_TIME', 'LATITUDE', 'LONGITUDE',
       'NUMBER_OF_PERSONS_INJURED', 'NUMBER_OF_PERSONS_KILLED',
       'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2',
       'ACCIDENT_SEVERITY'],
      dtype='object')

In [71]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

label_encoder_severity = LabelEncoder()
location_df['ACCIDENT_SEVERITY_ENCODED'] = label_encoder_severity.fit_transform((location_df['ACCIDENT_SEVERITY']))
label_encoder_factor_1 = LabelEncoder()
location_df['CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED'] = label_encoder_factor_1.fit_transform((location_df['CONTRIBUTING_FACTOR_VEHICLE_1']))
label_encoder_factor_2 = LabelEncoder()
location_df['CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED'] = label_encoder_factor_2.fit_transform((location_df['CONTRIBUTING_FACTOR_VEHICLE_2']))
location_df.head(20)

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY,ACCIDENT_SEVERITY_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED
0,4486555,2021-12-14,17:05:00,40.709183,-73.956825,0,0,Passing Too Closely,Unspecified,No Injury,3,31,51
1,4487074,2021-12-14,21:10:00,40.67172,-73.8971,0,0,Driver Inexperience,Unspecified,No Injury,3,9,51
2,4486519,2021-12-14,14:58:00,40.75144,-73.97397,0,0,Passing Too Closely,Unspecified,No Injury,3,31,51
3,4486934,2021-12-13,00:34:00,40.701275,-73.88887,0,0,Passing or Lane Usage Improper,Unspecified,No Injury,3,32,51
4,4487127,2021-12-14,16:50:00,40.675884,-73.75577,0,0,Turning Improperly,Unspecified,No Injury,3,48,51
5,4486635,2021-12-14,23:10:00,40.66684,-73.78941,2,0,Reaction to Uninvolved Vehicle,Unspecified,Minor Injury,2,39,51
6,4486604,2021-12-14,17:58:00,40.68158,-73.97463,0,0,Passing Too Closely,Unspecified,No Injury,3,31,51
7,4486537,2021-12-14,14:30:00,40.783268,-73.82485,0,0,Following Too Closely,Unspecified,No Injury,3,17,51
8,4486905,2021-12-11,04:45:00,40.748917,-73.993546,0,0,Following Too Closely,Unspecified,No Injury,3,17,51
9,4487122,2021-12-14,05:46:00,40.744644,-73.77041,1,0,Other Vehicular,Other Vehicular,Minor Injury,2,27,27


In [72]:
location_df.columns

Index(['COLLISION_ID', 'CRASH_DATE', 'CRASH_TIME', 'LATITUDE', 'LONGITUDE',
       'NUMBER_OF_PERSONS_INJURED', 'NUMBER_OF_PERSONS_KILLED',
       'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2',
       'ACCIDENT_SEVERITY', 'ACCIDENT_SEVERITY_ENCODED',
       'CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED',
       'CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED'],
      dtype='object')

In [73]:
def seasons(row):
    month = row['MONTH']
    match month:
        case month if month <= 3 : 
            return 1
        case month if month <= 6:
            return 2
        case month if month <= 9:
            return 3
        case month if month <= 12:
            return 4
    

In [74]:
location_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1085722 entries, 0 to 1085721
Data columns (total 13 columns):
 #   Column                                 Non-Null Count    Dtype  
---  ------                                 --------------    -----  
 0   COLLISION_ID                           1085722 non-null  int64  
 1   CRASH_DATE                             1085722 non-null  object 
 2   CRASH_TIME                             1085722 non-null  object 
 3   LATITUDE                               1085722 non-null  float64
 4   LONGITUDE                              1085722 non-null  float64
 5   NUMBER_OF_PERSONS_INJURED              1085722 non-null  int64  
 6   NUMBER_OF_PERSONS_KILLED               1085722 non-null  int64  
 7   CONTRIBUTING_FACTOR_VEHICLE_1          1085722 non-null  object 
 8   CONTRIBUTING_FACTOR_VEHICLE_2          1085722 non-null  object 
 9   ACCIDENT_SEVERITY                      1085722 non-null  object 
 10  ACCIDENT_SEVERITY_ENCODED              108

In [75]:
import datetime as dt
# Extract year, month, day of the week, and whether it was a weekend
location_df['CRASH_DATE'] = pd.to_datetime(location_df['CRASH_DATE'])
location_df['CRASH_TIME'] = pd.to_datetime(location_df['CRASH_TIME'], format='%H:%M:%S').dt.time

location_df['YEAR'] = location_df['CRASH_DATE'].dt.year
location_df['MONTH'] = location_df['CRASH_DATE'].dt.month
location_df['DAY_OF_WEEK'] = location_df['CRASH_DATE'].dt.weekday  # Monday=0, Sunday=6
location_df['IS_WEEKEND'] = location_df['DAY_OF_WEEK'].apply(lambda x: 1 if x >= 5 else 0)  # 1 for weekend, 0 for weekday

location_df['SEASON'] = location_df.apply(seasons, axis=1)

In [76]:
location_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY,ACCIDENT_SEVERITY_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED,YEAR,MONTH,DAY_OF_WEEK,IS_WEEKEND,SEASON
0,4486555,2021-12-14,17:05:00,40.709183,-73.956825,0,0,Passing Too Closely,Unspecified,No Injury,3,31,51,2021,12,1,0,4
1,4487074,2021-12-14,21:10:00,40.67172,-73.8971,0,0,Driver Inexperience,Unspecified,No Injury,3,9,51,2021,12,1,0,4
2,4486519,2021-12-14,14:58:00,40.75144,-73.97397,0,0,Passing Too Closely,Unspecified,No Injury,3,31,51,2021,12,1,0,4
3,4486934,2021-12-13,00:34:00,40.701275,-73.88887,0,0,Passing or Lane Usage Improper,Unspecified,No Injury,3,32,51,2021,12,0,0,4
4,4487127,2021-12-14,16:50:00,40.675884,-73.75577,0,0,Turning Improperly,Unspecified,No Injury,3,48,51,2021,12,1,0,4


## Model for Predicting an Accident's Severity Given Coordinates and Contributing Factors

In [77]:
# Split the data
x = location_df.drop(columns=['COLLISION_ID', 'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2', 'ACCIDENT_SEVERITY', 'CRASH_TIME', 'CRASH_DATE', 'ACCIDENT_SEVERITY_ENCODED'])
y = location_df['ACCIDENT_SEVERITY_ENCODED']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [78]:
# Predict the type of severity of accident
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [79]:
# Decode the predictions back to the original accident severity
y_pred_decoded = label_encoder_severity.inverse_transform(y_pred)

# Decode the true labels (y_test) as well
y_test_decoded = label_encoder_severity.inverse_transform(y_test)

In [80]:
from sklearn.metrics import classification_report, confusion_matrix

print("Classification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

#  Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_decoded, y_pred_decoded))

Classification Report:
              precision    recall  f1-score   support

       Fatal       1.00      1.00      1.00       180
Major Injury       1.00      0.85      0.92      4551
Minor Injury       0.98      1.00      0.99     41168
   No Injury       1.00      1.00      1.00    171246

    accuracy                           1.00    217145
   macro avg       1.00      0.96      0.98    217145
weighted avg       1.00      1.00      1.00    217145

Confusion Matrix:
[[   180      0      0      0]
 [     0   3858    693      0]
 [     0      0  41168      0]
 [     0      0      0 171246]]


In [81]:
location_df.to_csv(str(dir_path / Path('data_files/clean_data/location_model_data.csv')), index=False)

## Predict the Severity of Accident Given Contributing Factor of Alcohol

In [82]:
location_df.columns

Index(['COLLISION_ID', 'CRASH_DATE', 'CRASH_TIME', 'LATITUDE', 'LONGITUDE',
       'NUMBER_OF_PERSONS_INJURED', 'NUMBER_OF_PERSONS_KILLED',
       'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2',
       'ACCIDENT_SEVERITY', 'ACCIDENT_SEVERITY_ENCODED',
       'CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED',
       'CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED', 'YEAR', 'MONTH', 'DAY_OF_WEEK',
       'IS_WEEKEND', 'SEASON'],
      dtype='object')

In [83]:
alcohol_df =  location_df.query("CONTRIBUTING_FACTOR_VEHICLE_1 == 'Alcohol Involvement' | CONTRIBUTING_FACTOR_VEHICLE_2 == 'Alcohol Involvement'").copy()

In [84]:
# Split the data
x = alcohol_df.drop(columns=['COLLISION_ID', 'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2', 'ACCIDENT_SEVERITY', 'CRASH_TIME', 'CRASH_DATE', 'ACCIDENT_SEVERITY_ENCODED'])
y = alcohol_df['ACCIDENT_SEVERITY_ENCODED']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [85]:
# Predict the type of severity of accident
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [86]:
# Decode the predictions back to the original accident severity
y_pred_decoded = label_encoder_severity.inverse_transform(y_pred)

# Decode the true labels (y_test) as well
y_test_decoded = label_encoder_severity.inverse_transform(y_test)

In [87]:
print("Classification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

#  Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_decoded, y_pred_decoded))

Classification Report:
              precision    recall  f1-score   support

       Fatal       1.00      1.00      1.00        11
Major Injury       1.00      0.97      0.98       151
Minor Injury       0.99      1.00      1.00       966
   No Injury       1.00      1.00      1.00      2518

    accuracy                           1.00      3646
   macro avg       1.00      0.99      1.00      3646
weighted avg       1.00      1.00      1.00      3646

Confusion Matrix:
[[  11    0    0    0]
 [   0  146    5    0]
 [   0    0  966    0]
 [   0    0    0 2518]]


In [88]:
alcohol_df.to_csv(str(dir_path / Path('data_files/clean_data/alcohol_data.csv')), index=False)


## Predict the Severity of Accident Given Contributing Factor of Cell Phone Use


In [89]:
cell_phone_rows = ['Cell Phone (Hand-Held)', 'Cell Phone (Hands-Free)', 'Texting']
cell_phone_df = location_df.query(f"CONTRIBUTING_FACTOR_VEHICLE_1 in {cell_phone_rows} | CONTRIBUTING_FACTOR_VEHICLE_2 in {cell_phone_rows}")

In [90]:
cell_phone_df.head()

Unnamed: 0,COLLISION_ID,CRASH_DATE,CRASH_TIME,LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_INJURED,NUMBER_OF_PERSONS_KILLED,CONTRIBUTING_FACTOR_VEHICLE_1,CONTRIBUTING_FACTOR_VEHICLE_2,ACCIDENT_SEVERITY,ACCIDENT_SEVERITY_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_1_ENCODED,CONTRIBUTING_FACTOR_VEHICLE_2_ENCODED,YEAR,MONTH,DAY_OF_WEEK,IS_WEEKEND,SEASON
2744,4410194,2021-04-23,15:28:00,40.68789,-73.936035,1,0,Cell Phone (Hands-Free),Unspecified,Minor Injury,2,7,51,2021,4,4,0,2
4558,4457664,2021-09-15,19:53:00,40.67069,-73.91703,1,0,Cell Phone (Hand-Held),Traffic Control Device Improper/Non-Working,Minor Injury,2,6,46,2021,9,2,0,3
4754,4458056,2021-09-13,08:17:00,40.773148,-73.950554,0,0,Cell Phone (Hand-Held),Unspecified,No Injury,3,6,51,2021,9,0,0,3
5292,4412097,2021-04-30,22:30:00,40.74875,-73.90077,0,0,Cell Phone (Hand-Held),Unspecified,No Injury,3,6,51,2021,4,4,0,2
6446,4413583,2021-04-28,17:45:00,40.650806,-73.94958,1,0,Cell Phone (Hand-Held),Unspecified,Minor Injury,2,6,51,2021,4,2,0,2


In [100]:
# Split the data
x = cell_phone_df.drop(columns=['COLLISION_ID', 'CONTRIBUTING_FACTOR_VEHICLE_1', 'CONTRIBUTING_FACTOR_VEHICLE_2', 'ACCIDENT_SEVERITY', 'CRASH_TIME', 'CRASH_DATE', 'ACCIDENT_SEVERITY_ENCODED'])
y = cell_phone_df['ACCIDENT_SEVERITY_ENCODED']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3)

In [101]:
# Predict the type of severity of accident
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [102]:
# Decode the predictions back to the original accident severity
y_pred_decoded = label_encoder_severity.inverse_transform(y_pred)

# Decode the true labels (y_test) as well
y_test_decoded = label_encoder_severity.inverse_transform(y_test)

In [103]:
print("Classification Report:")
print(classification_report(y_test_decoded, y_pred_decoded))

#  Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_decoded, y_pred_decoded))

Classification Report:
              precision    recall  f1-score   support

Major Injury       1.00      0.25      0.40         4
Minor Injury       0.96      1.00      0.98        72
   No Injury       1.00      1.00      1.00       178

    accuracy                           0.99       254
   macro avg       0.99      0.75      0.79       254
weighted avg       0.99      0.99      0.98       254

Confusion Matrix:
[[  1   3   0]
 [  0  72   0]
 [  0   0 178]]


In [104]:
cell_phone_df.to_csv(str(dir_path / Path('data_files/clean_data/cell_phone_data.csv')), index=False)