In [1]:
import numpy as np
import pandas as pd

In [2]:
file = 'C:\\Users\\reill\\Downloads\\Traffic_Crashes_-_Crashes.csv'
df = pd.DataFrame(pd.read_csv(file))

In [3]:
#Drop high NaN count columns
dropcols = []
for col in df:
    if (df[col].isna().sum() / len(df[col])) > 0.6:
        dropcols.append(col)
df.drop(dropcols, axis='columns', inplace=True)

#Drop unnecessary columns
df.drop(['CRASH_RECORD_ID', 'REPORT_TYPE', 'DATE_POLICE_NOTIFIED', 'LOCATION', 'CRASH_DATE'], axis='columns', inplace=True)
df.replace('UNKNOWN', np.nan, inplace=True)

df = df.dropna(axis='index').reset_index(drop=True)

In [4]:
categorical = ['TRAFFIC_CONTROL_DEVICE',
       'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
       'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT',
       'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE',
       'SEC_CONTRIBUTORY_CAUSE', 'MOST_SEVERE_INJURY']
#Removed street name and direction

In [5]:
numerical = ['POSTED_SPEED_LIMIT', 'STREET_NO', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS', 'CRASH_HOUR',
       'CRASH_DAY_OF_WEEK', 'CRASH_MONTH', 'LATITUDE', 'LONGITUDE']

#This leaves out the datetime column

In [6]:
nonfatal = df[df['INJURIES_FATAL'] == 0.0]
hasfatal = df[df['INJURIES_FATAL'] != 0.0]

In [7]:
from sklearn.utils import resample

nonfatal_down = resample(nonfatal, replace=False, n_samples=1000)

In [8]:
down_df = pd.concat([nonfatal_down, hasfatal])

In [26]:
down_df = pd.concat([down_df[down_df['INJURIES_FATAL']>2], down_df]).drop_duplicates(keep=False)

In [27]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df = down_df
scaled_df[numerical] = scaler.fit_transform(scaled_df[numerical])
scaled_df.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE
91300,0.158237,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,2.0,0.0,-0.079263,0.441634,-1.466203,0.147287,-0.586048
416537,0.158237,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,1.0,0.0,1.575488,0.441634,-0.568635,1.43445,-1.162902
536255,0.158237,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,2.0,0.0,1.124192,0.936843,-1.466203,-1.528532,0.108236
413203,0.158237,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,2.0,0.0,0.071169,-1.043994,0.628123,1.279079,0.350411
422069,-2.538631,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,ALLEY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,2.0,0.0,0.372033,1.432052,0.328933,0.783239,-0.495152


In [28]:
scaled_df = pd.get_dummies(scaled_df)

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [29]:
x = scaled_df.drop(['INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN'], axis='columns')
y = scaled_df['INJURIES_FATAL']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [30]:
from sklearn.metrics import confusion_matrix, f1_score

forest_fatal = RandomForestClassifier(n_jobs=16)
forest_fatal.fit(x_train, y_train)

y_pred = forest_fatal.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

[[198   0   0]
 [  0 135   0]
 [  0   9   0]]
0.9609507640067911


In [33]:
from sklearn.model_selection import GridSearchCV
rf = RandomForestClassifier()
cross = GridSearchCV(rf, {'n_estimators':[50, 100, 200, 400, 600],
                          'max_depth':[1, 2, 4, None]}, scoring='f1_weighted', n_jobs=16)
cross.fit(x_train, y_train)

In [34]:
print(cross.best_estimator_)
print(cross.best_score_)

RandomForestClassifier(n_estimators=50)
0.9631075300898442


In [35]:
forest_final = RandomForestClassifier(n_estimators=50, n_jobs=16)
forest_final.fit(x_train, y_train)

y_pred = forest_final.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(f1_score(y_test, y_pred, average='weighted'))

[[198   0   0]
 [  0 135   0]
 [  0   9   0]]
0.9609507640067911


In [47]:
big_test = df
big_test[numerical] = scaler.fit_transform(big_test[numerical])
big_test = pd.get_dummies(big_test)

x = big_test.drop(['INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN'], axis='columns')
y = big_test['INJURIES_FATAL']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

forest_big = RandomForestClassifier(n_estimators=50, n_jobs=16)
forest_big.fit(x_train, y_train)

y_pred = forest_big.predict(big_test.drop(['INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN'], axis='columns'))
print(confusion_matrix(big_test['INJURIES_FATAL'], y_pred))
print(f1_score(big_test['INJURIES_FATAL'], y_pred, average='weighted'))

[[613782      0      0      0      0]
 [    37    630      0      0      0]
 [     4      9     30      0      0]
 [     0      3      0      4      0]
 [     0      0      0      0      1]]
0.9999104412450324


In [48]:
forest_big.feature_importances_

array([0.00883861, 0.0288476 , 0.024007  , ..., 0.01223665, 0.01204374,
       0.00904311])

In [50]:
forest_importances = pd.Series(forest_big.feature_importances_, index=x_test.columns)

In [52]:
forest_importances.sort_values(ascending=False)

MOST_SEVERE_INJURY_FATAL    0.495183
LONGITUDE                   0.030971
LATITUDE                    0.029000
STREET_NO                   0.028848
BEAT_OF_OCCURRENCE          0.024007
                              ...   
STREET_NAME_HAMPDEN CT      0.000000
STREET_NAME_HARBOR DR       0.000000
STREET_NAME_HARPER CT       0.000000
STREET_NAME_HART ST         0.000000
STREET_NAME_GARLAND CT      0.000000
Length: 1744, dtype: float64