In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
#split into train and test
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN

from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    recall_score,
    confusion_matrix,
    classification_report
)

import warnings
warnings.filterwarnings("ignore")
plt.rcParams["figure.autolayout"] = True

In [2]:
accident_data = pd.read_csv('accident_data.csv',encoding = "ISO-8859-1").sort_values(by='Accident_Index')
accident_data.head()

Unnamed: 0,Accident_Index,1st_Road_Class,1st_Road_Number,2nd_Road_Class,2nd_Road_Number,Accident_Severity,Carriageway_Hazards,Date,Day_of_Week,Did_Police_Officer_Attend_Scene_of_Accident,...,Police_Force,Road_Surface_Conditions,Road_Type,Special_Conditions_at_Site,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Year,InScotland
27261,2005030000000.0,A,591.0,,0.0,Slight,,08/04/2005,Friday,1.0,...,Cumbria,Dry,Single carriageway,,60,16:39,Rural,Fine no high winds,2005,No
28085,2005030000000.0,,0.0,,0.0,Slight,,01/09/2005,Thursday,2.0,...,Cumbria,Dry,Single carriageway,,60,14:15,Rural,Fine no high winds,2005,No
28084,2005030000000.0,A,6.0,A,590.0,Slight,,18/09/2005,Sunday,1.0,...,Cumbria,Wet or damp,Single carriageway,,60,00:20,Rural,Other,2005,No
28083,2005030000000.0,A,591.0,Unclassified,0.0,Slight,,19/09/2005,Monday,1.0,...,Cumbria,Dry,Dual carriageway,,70,15:50,Rural,Fine no high winds,2005,No
28082,2005030000000.0,A,69.0,,0.0,Slight,,15/09/2005,Thursday,1.0,...,Cumbria,Wet or damp,Single carriageway,,60,09:17,Rural,Raining no high winds,2005,No


In [3]:
accident_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1048575 entries, 27261 to 1048574
Data columns (total 34 columns):
 #   Column                                       Non-Null Count    Dtype  
---  ------                                       --------------    -----  
 0   Accident_Index                               1048575 non-null  object 
 1   1st_Road_Class                               742986 non-null   object 
 2   1st_Road_Number                              1048573 non-null  float64
 3   2nd_Road_Class                               608751 non-null   object 
 4   2nd_Road_Number                              1037772 non-null  float64
 5   Accident_Severity                            1048575 non-null  object 
 6   Carriageway_Hazards                          1048546 non-null  object 
 7   Date                                         1048575 non-null  object 
 8   Day_of_Week                                  1048575 non-null  object 
 9   Did_Police_Officer_Attend_Scene_of_Acciden

In [4]:
accident_data.Day_of_Week.unique()

array(['Friday', 'Thursday', 'Sunday', 'Monday', 'Wednesday', 'Saturday',
       'Tuesday'], dtype=object)

In [5]:
# Removing exact duplicates
accident_data.drop(columns=['Accident_Index'], inplace=True, axis=1)
remove_data = accident_data.duplicated(keep=False)

accident_data = accident_data[~remove_data]
print(len(accident_data))

1048537


In [6]:
accident_data.Accident_Severity.value_counts()

Slight     895845
Serious    138192
Fatal       14500
Name: Accident_Severity, dtype: int64

In [7]:
# Removing all garbage values as Nan
accident_data['Urban_or_Rural_Area'] = accident_data['Urban_or_Rural_Area'].apply(lambda x : np.nan if x == 'Unallocated' \
                                                                                  else x).copy()
accident_data['Light_Conditions'] = accident_data['Light_Conditions'].apply(lambda x :np.nan \
                                                                            if x =='Darkness - lighting unknown' \
                                                                            else x).copy()

accident_data['Junction_Control'] = accident_data['Junction_Control'].apply(lambda x :np.nan \
                                                                            if x =='Data missing or out of range' \
                                                                            else x).copy()
accident_data['Junction_Detail'] = accident_data['Junction_Detail'].apply(lambda x :np.nan \
                                                                            if x =='Data missing or out of range' \
                                                                            else x).copy()
accident_data['2nd_Road_Class'] = accident_data['2nd_Road_Class'].apply(lambda x: np.nan if x == 'Unclassified' else x)

# Correcting the misspelled words
accident_data['Junction_Control'] = accident_data['Junction_Control'].apply(lambda x :'Auto traffic signal' \
                                                                            if x =='Auto traffic sigl' \
                                                                            else x).copy()
accident_data['Special_Conditions_at_Site'] = accident_data['Special_Conditions_at_Site'].apply(lambda x :\
                                                                                                'Auto traffic signal - out' \
                                                                                                if x =='Auto traffic sigl - out' \
                                                                                                else x).copy()
accident_data['Special_Conditions_at_Site'] = accident_data['Special_Conditions_at_Site'].apply(lambda x :\
                                                                                                'Auto signal part defective' \
                                                                                                if x =='Auto sigl part defective' \
                                                                                                else x).copy()




In [8]:
accident_data.drop(columns=['Local_Authority_(District)', 'Local_Authority_(Highway)', \
                            'Location_Easting_OSGR', 'Location_Northing_OSGR', \
                            '2nd_Road_Number', '1st_Road_Number', 'LSOA_of_Accident_Location', \
                            'Police_Force'], inplace=True, axis = 1)# 'Date', 'Time', 'Year', 'Day_of_Week',\
null_data = accident_data[accident_data.isna().sum(axis=1)>=1].copy()
accident_data.drop(list(null_data.index), axis=0, inplace = True)
accident_data.shape, null_data.shape

((168549, 25), (879988, 25))

In [9]:
# Converting Date and Time column to datetime object 
accident_data['Date'] = accident_data.Date.apply(lambda x: dt.datetime.strptime(x,"%d/%m/%Y"))
accident_data['Time'] = accident_data.Time.apply(lambda x: dt.datetime.strptime(str(x),"%H:%M"))


In [10]:

# Converting time column in a day into 4 parts i.e. Morning - 1, afternoon - 2, evening - 3, night - 0


def divideTimeintoPeriod(hour):
    if hour < 6:
        return 0 # Since its time when morning working period has not yet started in the road thus categorizing it as night
    elif hour < 12:
        return 1 # As per normal understanding after 12 noon begins 
    elif hour < 5 :
        return 2
    elif hour < 8 :
        return 3
    else:
        return 0 # this is the period of night time 
accident_data['Time_int'] = accident_data.Time.dt.hour.apply(lambda x: divideTimeintoPeriod(x))

In [11]:
accident_data['Month_int'] = pd.DatetimeIndex(accident_data['Date']).month
accident_data['Date_int'] = pd.DatetimeIndex(accident_data['Date']).day
#accident_data[['Month_int', 'Date_int']].head()

In [12]:
def convertweekDaysToNum(day_of_week):
    if day_of_week in ['Friday', 'Thursday', 'Monday', 'Wednesday','Tuesday']:
        return 0 # Denoting Weekdays
    else:
        return 1 # Denoting Weekend
accident_data['Day_of_Week_int'] = accident_data['Day_of_Week'].apply(lambda x:convertweekDaysToNum(x))
accident_data.drop(columns=["Date", "Time", 'Day_of_Week',], inplace=True, axis = 1)

In [13]:
cat_columns = []
for c in accident_data.columns:
    if accident_data[c].dtype=='object':
        if c not in ['Accident_Index', 'Accident_Severity', 'Date', 'Time']:
            cat_columns.append(c)

In [14]:
le = preprocessing.LabelEncoder()
for c in cat_columns:
    accident_data[c+'_int'] = le.fit_transform(accident_data[c])
def numerateTheSeverity(acc_severity):
    if str(acc_severity).lower() == 'serious':
        return 1
    elif str(acc_severity).lower() == 'fatal':
        return 2
    else:
        return 0
accident_data['Accident_Severity'] = accident_data['Accident_Severity'].apply(lambda x: numerateTheSeverity(x))

In [15]:
training_cols = []
for c in accident_data.columns:
    if accident_data[c].dtype in ['float64', 'int64', 'int32']:
        training_cols.append(c)
training_cols.remove('Accident_Severity')
#training_cols.extend(["Date", "Time"])
print(training_cols)

['Did_Police_Officer_Attend_Scene_of_Accident', 'Latitude', 'Longitude', 'Number_of_Casualties', 'Number_of_Vehicles', 'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities', 'Speed_limit', 'Year', 'Time_int', 'Month_int', 'Date_int', 'Day_of_Week_int', '1st_Road_Class_int', '2nd_Road_Class_int', 'Carriageway_Hazards_int', 'Junction_Control_int', 'Junction_Detail_int', 'Light_Conditions_int', 'Road_Surface_Conditions_int', 'Road_Type_int', 'Special_Conditions_at_Site_int', 'Urban_or_Rural_Area_int', 'Weather_Conditions_int', 'InScotland_int']


In [33]:
X = accident_data[training_cols]
Y = accident_data['Accident_Severity']
print(len(accident_data),"\n",Y.value_counts())

168549 
 0    149354
1     17828
2      1367
Name: Accident_Severity, dtype: int64


In [17]:
sm = SMOTE(
    sampling_strategy='auto',  # samples only the minority class
    random_state=0,  # for reproducibility
    k_neighbors=5,
    n_jobs=4
)

X_sm, Y_sm = sm.fit_resample(X, Y)


In [18]:
print(len(X_sm), len(Y_sm))

448062 448062


In [19]:

X_train, X_test, Y_train, Y_test=train_test_split(X_sm, Y_sm,test_size=0.2,random_state=40)
# Performing Feature Scaling to the training and testing input data 
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


In [20]:
rfc = RandomForestClassifier(n_estimators=100, criterion='gini')
rfc.fit(X_train_scaled, Y_train)
Y_pred = rfc.predict(X_test_scaled)
rfc_cfmatrix = confusion_matrix(Y_test, Y_pred)
rfc_report = classification_report(Y_test, Y_pred)

In [21]:
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(Y_test, Y_pred))
print('Balanced accuracy, Random Forest test:', balanced_accuracy_score(Y_test, Y_pred))
print('Recall Score', recall_score(Y_test, Y_pred , average=None))
print('Confusion Matrix \n', rfc_cfmatrix)
print('Classification Report \n', rfc_report)

ACCURACY OF THE MODEL:  0.8516175108522201
Balanced accuracy, Random Forest test: 0.85227072988804
Recall Score [0.63440717 0.92716227 0.99524275]
Confusion Matrix 
 [[19043 10832   142]
 [ 2093 27775    89]
 [  108    33 29498]]
Classification Report 
               precision    recall  f1-score   support

           0       0.90      0.63      0.74     30017
           1       0.72      0.93      0.81     29957
           2       0.99      1.00      0.99     29639

    accuracy                           0.85     89613
   macro avg       0.87      0.85      0.85     89613
weighted avg       0.87      0.85      0.85     89613



In [22]:
rfc_cfmatrix

array([[19043, 10832,   142],
       [ 2093, 27775,    89],
       [  108,    33, 29498]], dtype=int64)

In [23]:
enn = EditedNearestNeighbours(
    sampling_strategy='auto',  # undersamples only the majority class
    n_neighbors=3, # the number of neighbours to examine
    kind_sel='all',  # all neighbours need to have the same label as the observation examined
    n_jobs=4)  



In [24]:
smenn = SMOTEENN(
    sampling_strategy='auto',  # samples only the minority class
    random_state=0,  # for reproducibility
    smote=sm,
    enn=enn,
    n_jobs=2
)

X_smenn, Y_smenn = smenn.fit_resample(X, Y)
print(Y_smenn.value_counts())
print(len(X_smenn), len(Y_smenn))

0    149354
2    149333
1    145965
Name: Accident_Severity, dtype: int64
444652 444652


In [25]:
X_train, X_test, Y_train, Y_test=train_test_split(X_smenn, Y_smenn,test_size=0.2,random_state=40)
# Performing Feature Scaling to the training and testing input data 
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)


In [26]:
rfc = RandomForestClassifier(n_estimators=100, criterion='gini')
rfc.fit(X_train_scaled, Y_train)
Y_pred = rfc.predict(X_test_scaled)
rfc_cfmatrix = confusion_matrix(Y_test, Y_pred)
rfc_report = classification_report(Y_test, Y_pred)

In [27]:
print("ACCURACY OF THE MODEL: ", metrics.accuracy_score(Y_test, Y_pred))
print('Balanced accuracy, Random Forest test:', balanced_accuracy_score(Y_test, Y_pred))
print('Recall Score', recall_score(Y_test, Y_pred , average=None))
print('Confusion Matrix \n', rfc_cfmatrix)
print('Classification Report \n', rfc_report)

ACCURACY OF THE MODEL:  0.85196388211085
Balanced accuracy, Random Forest test: 0.8521864421330121
Recall Score [0.63267846 0.92822269 0.99565817]
Confusion Matrix 
 [[18807 10794   125]
 [ 2029 27377    88]
 [   90    39 29582]]
Classification Report 
               precision    recall  f1-score   support

           0       0.90      0.63      0.74     29726
           1       0.72      0.93      0.81     29494
           2       0.99      1.00      0.99     29711

    accuracy                           0.85     88931
   macro avg       0.87      0.85      0.85     88931
weighted avg       0.87      0.85      0.85     88931

