In [31]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('../data/preprocessed_data.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
df.head()

Unnamed: 0,severity,num_vehicles,num_casualties,doy,Speed_limit,urb_or_rur,police_presence,hazard_Any animal (except a ridden horse),hazard_Dislodged vehicle load in carriageway,hazard_Involvement with previous accident,...,Zebra crossing,non-junction pedestrian crossing,rtype_Dual carriageway,rtype_One way street,rtype_Roundabout,rtype_Single carriageway,rtype_Slip road,rtype_Unknown,weekend,night
0,3,2,1,5,0.428571,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,3,2,1,4,0.428571,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,2,1,3,0.428571,0,1,0,0,0,...,0,1,0,1,0,0,0,0,0,0
3,3,1,1,4,0.428571,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,3,1,1,3,0.428571,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
df.columns

Index(['severity', 'num_vehicles', 'num_casualties', 'doy', 'Speed_limit',
       'urb_or_rur', 'police_presence',
       'hazard_Any animal (except a ridden horse)',
       'hazard_Dislodged vehicle load in carriageway',
       'hazard_Involvement with previous accident',
       'hazard_Other object in carriageway',
       'hazard_Pedestrian in carriageway (not injured)',
       'road_Flood (Over 3cm of water)', 'road_Frost/Ice', 'road_Snow',
       'road_Wet/Damp', 'weather_Fine with high winds',
       'weather_Fine without high winds', 'weather_Fog or mist',
       'weather_Other', 'weather_Raining with high winds',
       'weather_Raining without high winds', 'weather_Snowing with high winds',
       'weather_Snowing without high winds', 'weather_Unknown',
       'Darkeness: No street lighting', 'Darkness: Street lighting unknown',
       'Darkness: Street lights present and lit',
       'Darkness: Street lights present but unlit', 'Central refuge',
       'Footbridge or subway', 

In [5]:
df.severity.value_counts()

3    1275940
2     204118
1      19415
Name: severity, dtype: int64

### Creating new df with only the variables that we want to run through the PCA. We will not use num_vehicles and num_casualties since it's an information that it's only available AFTER a car accident. We will also drop doy(day of week) since we have the weekend categorical column.

### Also, given that our categories aren't balanced at all we will proceed with a downsampling of the rows with category 2 and 3 so that we can have a balanced dataset to test our models on.

In [6]:
df_downsample = df[df.severity == 1]
df_downsample = pd.concat([df_downsample, df[df.severity == 2].sample(len(df[df.severity == 1])), 
          df[df.severity == 3].sample(len(df[df.severity == 1]))])

In [7]:
target = df_downsample.severity

In [15]:
df_X = df_downsample.drop(['num_vehicles', 'num_casualties', 'doy', 'severity'], axis=1)

### Selecting features manually at first

In [16]:
X = df_X[['urb_or_rur', 'weekend', 'night', 'road_Flood (Over 3cm of water)', 'road_Frost/Ice', 'road_Snow',
          'road_Wet/Damp', 'weather_Fine without high winds', 'weather_Fog or mist',
          'weather_Other', 'weather_Raining with high winds',
          'weather_Raining without high winds', 'weather_Snowing with high winds',
          'weather_Snowing without high winds', 'weather_Unknown']]

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.25, random_state=42)

### Random Forest Classifier

Using RandomizedSearchCV for hyperparameter setting

In [32]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
rfclass = RandomForestClassifier()
rf_randomsearch = RandomizedSearchCV(estimator = rfclass, param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_randomsearch.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
feature_importances = pd.DataFrame(rf.feature_importances_,
                                  index = X_train.columns,
                                   columns=[‘importance’]).sort_values(‘importance’,                                                                 ascending=False)

### Using PCA for feature selection

In [None]:
pca_selection = PCA().fit(df_pca)

In [None]:
for x in zip(df_pca.columns, pca_selection.explained_variance_ratio_):
    print(x)