In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Data gathering and accessing

In [2]:
df = pd.read_csv('UK_Traffic_Accidents_2015.csv')
df.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(140056, 32)

In [3]:
df.head()

Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Human_Control,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location
0,201501BS70001,525130.0,180050.0,-0.198465,51.505538,1,3,1,1,12/01/2015,...,0,0,4,1,1,0,0,1,1,E01002825
1,201501BS70002,526530.0,178560.0,-0.178838,51.491836,1,3,1,1,12/01/2015,...,0,0,1,1,1,0,0,1,1,E01002820
2,201501BS70004,524610.0,181080.0,-0.20559,51.51491,1,3,1,1,12/01/2015,...,0,1,4,2,2,0,0,1,1,E01002833
3,201501BS70005,524420.0,181080.0,-0.208327,51.514952,1,3,1,1,13/01/2015,...,0,0,1,1,2,0,0,1,2,E01002874
4,201501BS70008,524630.0,179040.0,-0.206022,51.496572,1,2,2,1,09/01/2015,...,0,5,1,2,2,0,0,1,2,E01002814


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140056 entries, 0 to 140055
Data columns (total 32 columns):
 #   Column                                       Non-Null Count   Dtype  
---  ------                                       --------------   -----  
 0   Accident_Index                               140056 non-null  object 
 1   Location_Easting_OSGR                        140029 non-null  float64
 2   Location_Northing_OSGR                       140029 non-null  float64
 3   Longitude                                    140029 non-null  float64
 4   Latitude                                     140029 non-null  float64
 5   Police_Force                                 140056 non-null  int64  
 6   Accident_Severity                            140056 non-null  int64  
 7   Number_of_Vehicles                           140056 non-null  int64  
 8   Number_of_Casualties                         140056 non-null  int64  
 9   Date                                         140056 non-nul

### Data Cleaning and feature selection

In [5]:
# Drop rows with missing values
df = df[df.LSOA_of_Accident_Location.notnull()]

In [6]:
# Rename columns properly
df.columns = [labels.lower() for labels in df.columns]
df.columns = [labels.replace('-', '_') for labels in df.columns]
# test
df.columns

Index(['accident_index', 'location_easting_osgr', 'location_northing_osgr',
       'longitude', 'latitude', 'police_force', 'accident_severity',
       'number_of_vehicles', 'number_of_casualties', 'date', 'day_of_week',
       'time', 'local_authority_(district)', 'local_authority_(highway)',
       '1st_road_class', '1st_road_number', 'road_type', 'speed_limit',
       'junction_detail', 'junction_control', '2nd_road_class',
       '2nd_road_number', 'pedestrian_crossing_human_control',
       'pedestrian_crossing_physical_facilities', 'light_conditions',
       'weather_conditions', 'road_surface_conditions',
       'special_conditions_at_site', 'carriageway_hazards',
       'urban_or_rural_area', 'did_police_officer_attend_scene_of_accident',
       'lsoa_of_accident_location'],
      dtype='object')

 We drop unwanted columns(index, date and location columns) which include accident_index,location_easting_osgr, location_northing_osgr,longitude, latitude, date,day_of_week, time, local_authority_(district), local_authority_(highway), lsoa_of_accident_location

In [7]:
df = df.drop(columns= ['accident_index', 'location_easting_osgr', 'location_northing_osgr','longitude', 'latitude','date', 'day_of_week',
       'time', 'local_authority_(district)', 'local_authority_(highway)','lsoa_of_accident_location', 'did_police_officer_attend_scene_of_accident'])

In [8]:
df.head()

Unnamed: 0,police_force,accident_severity,number_of_vehicles,number_of_casualties,1st_road_class,1st_road_number,road_type,speed_limit,junction_detail,junction_control,2nd_road_class,2nd_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
0,1,3,1,1,5,0,6,30,3,4,6,0,0,0,4,1,1,0,0,1
1,1,3,1,1,6,0,6,30,3,4,3,3218,0,0,1,1,1,0,0,1
2,1,3,1,1,4,415,6,30,2,4,6,0,0,1,4,2,2,0,0,1
3,1,3,1,1,4,450,6,30,6,4,6,0,0,0,1,1,2,0,0,1
4,1,2,2,1,3,315,6,30,6,2,3,3220,0,5,1,2,2,0,0,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 131159 entries, 0 to 131591
Data columns (total 20 columns):
 #   Column                                   Non-Null Count   Dtype
---  ------                                   --------------   -----
 0   police_force                             131159 non-null  int64
 1   accident_severity                        131159 non-null  int64
 2   number_of_vehicles                       131159 non-null  int64
 3   number_of_casualties                     131159 non-null  int64
 4   1st_road_class                           131159 non-null  int64
 5   1st_road_number                          131159 non-null  int64
 6   road_type                                131159 non-null  int64
 7   speed_limit                              131159 non-null  int64
 8   junction_detail                          131159 non-null  int64
 9   junction_control                         131159 non-null  int64
 10  2nd_road_class                           131159 non-null

In [10]:
df.describe()

Unnamed: 0,police_force,accident_severity,number_of_vehicles,number_of_casualties,1st_road_class,1st_road_number,road_type,speed_limit,junction_detail,junction_control,2nd_road_class,2nd_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area
count,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0,131159.0
mean,25.683156,2.836092,1.847994,1.331788,4.099864,993.790895,5.167522,37.973605,2.296426,1.813234,2.761595,373.122714,0.006267,0.83918,1.949992,1.496062,1.285615,0.096577,0.059973,1.336843
std,19.570807,0.39917,0.706834,0.79398,1.433879,1784.160242,1.650387,13.796987,2.484445,2.345673,3.200451,1275.61442,0.117242,1.933939,1.65023,1.507155,0.541564,0.690085,0.567379,0.472633
min,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0,1.0
25%,6.0,3.0,1.0,1.0,3.0,0.0,6.0,30.0,0.0,-1.0,-1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
50%,22.0,3.0,2.0,1.0,4.0,128.0,6.0,30.0,3.0,3.0,3.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0
75%,44.0,3.0,2.0,1.0,6.0,674.0,6.0,40.0,3.0,4.0,6.0,0.0,0.0,0.0,4.0,1.0,2.0,0.0,0.0,2.0
max,63.0,3.0,37.0,38.0,6.0,9914.0,9.0,70.0,9.0,4.0,6.0,9999.0,2.0,8.0,7.0,9.0,5.0,7.0,7.0,2.0


In [11]:
# Missing data has been coded with -1 for some variable. So we drop all rows with  -1

for label in df.columns:
    df = df[df[label] != -1]
    

In [12]:
# test
df[df.pedestrian_crossing_physical_facilities== -1]

Unnamed: 0,police_force,accident_severity,number_of_vehicles,number_of_casualties,1st_road_class,1st_road_number,road_type,speed_limit,junction_detail,junction_control,2nd_road_class,2nd_road_number,pedestrian_crossing_human_control,pedestrian_crossing_physical_facilities,light_conditions,weather_conditions,road_surface_conditions,special_conditions_at_site,carriageway_hazards,urban_or_rural_area


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78791 entries, 0 to 131590
Data columns (total 20 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   police_force                             78791 non-null  int64
 1   accident_severity                        78791 non-null  int64
 2   number_of_vehicles                       78791 non-null  int64
 3   number_of_casualties                     78791 non-null  int64
 4   1st_road_class                           78791 non-null  int64
 5   1st_road_number                          78791 non-null  int64
 6   road_type                                78791 non-null  int64
 7   speed_limit                              78791 non-null  int64
 8   junction_detail                          78791 non-null  int64
 9   junction_control                         78791 non-null  int64
 10  2nd_road_class                           78791 non-null  int64
 11  2

In [14]:
# Separate features from target
features = df.drop(columns= 'accident_severity').values
target = df.accident_severity.values

In [15]:
# Split the dataset into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size= 0.3, random_state= 42)

### Feature Scaling

In [16]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### Building models
#### SVM

In [17]:
#import module and initialize model
from sklearn.svm import SVC
clf = SVC(kernel = 'linear', random_state = 0)

In [18]:
# Train model useing our train set and make prediction using test set
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [19]:
y_pred

array([3, 3, 3, ..., 3, 3, 3], dtype=int64)

In [20]:
# check for model accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8658938996531009

In [21]:
# print out confusion matrix
from sklearn.metrics import confusion_matrix
cfm = confusion_matrix(y_test, y_pred)
cfm

array([[    0,     0,   141],
       [    0,     0,  3029],
       [    0,     0, 20468]], dtype=int64)

In [22]:
# print out classification report
from sklearn.metrics import classification_report
clf_report = classification_report(y_test, y_pred)
clf_report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


'              precision    recall  f1-score   support\n\n           1       0.00      0.00      0.00       141\n           2       0.00      0.00      0.00      3029\n           3       0.87      1.00      0.93     20468\n\n    accuracy                           0.87     23638\n   macro avg       0.29      0.33      0.31     23638\nweighted avg       0.75      0.87      0.80     23638\n'

In [23]:
print(clf_report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638



In [24]:
def train_predict_svm(kernel, c, gamma):
    """
    The function train an SVM model for various hyperparameter combination, predict values for 
    the target variable and print out a classification report.
    INPUTS:
    kernel: String values for any SVM kernel
    c: integer value for SVM C
    gamma: integer value for SVM gamma

    """
    #initialize model
    clf = SVC(kernel= kernel, C= c, gamma= gamma)
    # Train model useing our train set and make prediction using test set
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # check for model accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    # print out classification report
    clf_report = classification_report(y_test, y_pred, zero_division = 0)
    print(clf_report)

In [25]:
# run the train_predict_svm function for different values of c
val_c = [2,3]
for value in val_c:
    print('For the C value of {} using a linear kernel and a gamma value of  1'.format(value))
    train_predict_svm(kernel= 'linear', c = value, gamma= 1)

For the C value of 1 using a linear kernel and a gamma value of  1
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For the C value of 2 using a linear kernel and a gamma value of  1
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For the C value of 3 using a linear kernel and a gamma value of  1

In [26]:
# run the train_predict_svm function for different values of gamma
val_gamma = [10, 20, 30]
for value in val_gamma:
    print('For the gamma value of {} using a linear kernel and a c value of  1'.format(value))
    train_predict_svm(kernel= 'linear', c= 1, gamma= value)

For the gamma value of 10 using a linear kernel and a c value of  1
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For the gamma value of 20 using a linear kernel and a c value of  1
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For the gamma value of 30 using a linear kernel and a c value of

In [27]:
# run the train_predict_svm for different kernel
val_ker = ['rbf', 'sigmoid']
for value in val_ker:
    print('For the gamma value of 1 using a {} kernel and a c value of  1'.format(value))
    train_predict_svm(kernel= value, c= 1, gamma= 1)


For the gamma value of 1 using a rbf kernel and a c value of  1
0.8643286234029952
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.25      0.01      0.01      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.86     23638
   macro avg       0.37      0.33      0.31     23638
weighted avg       0.78      0.86      0.80     23638

For the gamma value of 1 using a sigmoid kernel and a c value of  1
0.766139267281496
              precision    recall  f1-score   support

           1       0.02      0.05      0.03       141
           2       0.12      0.10      0.11      3029
           3       0.86      0.87      0.87     20468

    accuracy                           0.77     23638
   macro avg       0.33      0.34      0.33     23638
weighted avg       0.76      0.77      0.76     23638



In [29]:
# run the train_predict_svm function for different values of gamma
val_gamma = [10, 20, 30]
for value in val_gamma:
    print('For the gamma value of {} using a sigmid kernel and a c value of  1'.format(value))
    train_predict_svm(kernel= 'sigmoid', c= 1, gamma= value)

For the gamma value of 10 using a sigmid kernel and a c value of  1
0.768085286403249
              precision    recall  f1-score   support

           1       0.01      0.01      0.01       141
           2       0.13      0.13      0.13      3029
           3       0.87      0.87      0.87     20468

    accuracy                           0.77     23638
   macro avg       0.34      0.34      0.34     23638
weighted avg       0.77      0.77      0.77     23638

For the gamma value of 20 using a sigmid kernel and a c value of  1
0.765123952957103
              precision    recall  f1-score   support

           1       0.01      0.01      0.01       141
           2       0.12      0.12      0.12      3029
           3       0.87      0.87      0.87     20468

    accuracy                           0.77     23638
   macro avg       0.33      0.33      0.33     23638
weighted avg       0.77      0.77      0.77     23638

For the gamma value of 30 using a sigmid kernel and a c value of  

In [30]:
# run the train_predict_svm function for different values of c
val_c = [10,100]
for value in val_c:
    print('For the C value of {} using a sigmoid kernel and a gamma value of  1'.format(value))
    train_predict_svm(kernel= 'sigmoid', c = value, gamma= 1)

For the C value of 10 using a sigmoid kernel and a gamma value of  1
0.766943057788307
              precision    recall  f1-score   support

           1       0.01      0.02      0.01       141
           2       0.12      0.10      0.11      3029
           3       0.86      0.87      0.87     20468

    accuracy                           0.77     23638
   macro avg       0.33      0.33      0.33     23638
weighted avg       0.76      0.77      0.77     23638

For the C value of 100 using a sigmoid kernel and a gamma value of  1
0.7665623149166596
              precision    recall  f1-score   support

           1       0.01      0.03      0.02       141
           2       0.12      0.10      0.11      3029
           3       0.86      0.87      0.87     20468

    accuracy                           0.77     23638
   macro avg       0.33      0.33      0.33     23638
weighted avg       0.76      0.77      0.77     23638



#### Random forest

In [32]:
# import, instantiate and train model
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators= 50, max_features= 'auto', random_state= 44,)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=50, random_state=44)

In [33]:
# make prediction on feature test
y_pred = rf_clf.predict(X_test)
y_pred

array([3, 2, 3, ..., 3, 3, 3], dtype=int64)

In [34]:
# prediction probability
pred_prob = rf_clf.predict_proba(X_test)
pred_prob

array([[0.        , 0.04      , 0.96      ],
       [0.        , 0.53      , 0.47      ],
       [0.        , 0.06      , 0.94      ],
       ...,
       [0.        , 0.16      , 0.84      ],
       [0.        , 0.10020847, 0.89979153],
       [0.        , 0.        , 1.        ]])

In [35]:
# check for model accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8582367374566376

In [36]:
# print out confusion matrix
from sklearn.metrics import confusion_matrix
cfm = confusion_matrix(y_test, y_pred)
cfm

array([[    0,     4,   137],
       [    1,    59,  2969],
       [    4,   236, 20228]], dtype=int64)

In [37]:
# print out classification report
from sklearn.metrics import classification_report
clf_report = classification_report(y_test, y_pred, zero_division=0)
print(clf_report)

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.20      0.02      0.04      3029
           3       0.87      0.99      0.92     20468

    accuracy                           0.86     23638
   macro avg       0.35      0.34      0.32     23638
weighted avg       0.78      0.86      0.80     23638



In [38]:
def train_predict_rf(n_est, max_depth):
    """
    The function train a random forest model for various hyperparameter combination, predict values for 
    the target variable and print out a classification report.
    INPUTS:
    n_est: integer value for the number of tree of a random forest model ie n_estimator
    max_depth: integer value for the tree depth of a random forest model

    """
    #initialize model
    rf_clf = RandomForestClassifier(n_estimators= n_est, max_features= 'auto', random_state= 44,max_depth= max_depth)
    # Train model useing our train set and make prediction using test set
    rf_clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # check for model accuracy
    r_accuracy = accuracy_score(y_test, y_pred)
    print(r_accuracy)
    # print out classification report
    rf_clf_report = classification_report(y_test, y_pred, zero_division = 0)
    print(rf_clf_report)

In [39]:
# run train_predict_rf for different values of n_estimators
n_trees = [100, 200, 300]
for n in n_trees:
    print('For a n_estimator of {} while max_depth is 80'.format(n))
    train_predict_rf(n_est= n, max_depth= 80)

For a n_estimator of 100 while max_depth is 80
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For a n_estimator of 200 while max_depth is 80
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For a n_estimator of 300 while max_depth is 80
0.8658938996531009
              precision    recall  f1-sc

In [41]:
# run train_predict_rf for different values of max_depth
n_depth = [100, 150, 200, 250]
for n in n_depth:
    print('For a n_estimator of 100 while max_depth is {}'.format(n))
    train_predict_rf(n_est= 100, max_depth= n)

For a n_estimator of 100 while max_depth is 100
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For a n_estimator of 100 while max_depth is 150
0.8658938996531009
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       141
           2       0.00      0.00      0.00      3029
           3       0.87      1.00      0.93     20468

    accuracy                           0.87     23638
   macro avg       0.29      0.33      0.31     23638
weighted avg       0.75      0.87      0.80     23638

For a n_estimator of 100 while max_depth is 200
0.8658938996531009
              precision    recall  f1