In [None]:
#import relevant libraries for the data analysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score, classification_report
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline
print('libraries imported')

In [None]:
# Read the collision records CSV from its hosted location into a DataFrame.
website_url='https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv'
df = pd.read_csv(filepath_or_buffer=website_url)

# Column names, dtypes and non-null counts of the raw frame.
df.info()


In [None]:
## Exploratory data analysis
# Keep only the columns needed for the exploratory charts and drop rows that
# are incomplete *within that subset*. Calling dropna() on the full frame
# instead would discard every row with a gap in any unrelated column and
# would also throw away the column selection.
eda_columns = ['SEVERITYCODE','COLLISIONTYPE','JUNCTIONTYPE','ADDRTYPE','OBJECTID','PERSONCOUNT','PEDCOUNT','PEDCYLCOUNT','VEHCOUNT']
df_1 = df[eda_columns].dropna()
df_1.info()

In [None]:
# Number of collisions per (severity code, collision type).
incidents_type = (
    df_1.groupby(['SEVERITYCODE', 'COLLISIONTYPE'])['OBJECTID']
    .count()
    .reset_index(name='count')
)
f1 = incidents_type[incidents_type.SEVERITYCODE == 1]
f2 = incidents_type[incidents_type.SEVERITYCODE == 2]

# Align both severity codes on the union of categories so that each pair of
# bars really belongs to the same collision type; a category missing from one
# severity code is plotted as 0 instead of shifting the remaining bars.
labels = pd.Index(f1['COLLISIONTYPE']).union(f2['COLLISIONTYPE'])
s1 = f1.set_index('COLLISIONTYPE')['count'].reindex(labels, fill_value=0)
s2 = f2.set_index('COLLISIONTYPE')['count'].reindex(labels, fill_value=0)

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(x, s1, width, color='orange', label='Code 1')
ax.barh(x + width, s2, width, color='green', label='Code 2')

# Bars are centred on x and x + width, so the tick goes midway between them.
ax.set(
    yticks=x + width / 2,
    yticklabels=labels,
    ylim=[2*width - 1, len(labels)],
    xlabel='Number of collisions',
    ylabel='Collision type',
    title='Collisions by collision type and severity code',
)
ax.legend()

plt.show()

In [None]:
# Number of collisions per (severity code, weather condition).
weather_type = (
    df.groupby(['SEVERITYCODE', 'WEATHER'])['OBJECTID']
    .count()
    .reset_index(name='count')
)
w1 = weather_type[weather_type.SEVERITYCODE == 1]
w2 = weather_type[weather_type.SEVERITYCODE == 2]

# Align both severity codes on the union of categories so that each pair of
# bars really belongs to the same weather condition; a category missing from
# one severity code is plotted as 0 instead of shifting the remaining bars.
labels = pd.Index(w1['WEATHER']).union(w2['WEATHER'])
q1 = w1.set_index('WEATHER')['count'].reindex(labels, fill_value=0)
q2 = w2.set_index('WEATHER')['count'].reindex(labels, fill_value=0)

x = np.arange(len(labels))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(x, q1, width, color='purple', label='Code 1')
ax.barh(x + width, q2, width, color='blue', label='Code 2')

# Bars are centred on x and x + width, so the tick goes midway between them.
ax.set(
    yticks=x + width / 2,
    yticklabels=labels,
    ylim=[2*width - 1, len(labels)],
    xlabel='Number of collisions',
    ylabel='Weather',
    title='Collisions by weather and severity code',
)
ax.legend()

plt.show()

In [None]:
# Number of collisions per (severity code, junction type).
junction_type = (
    df.groupby(['SEVERITYCODE', 'JUNCTIONTYPE'])['OBJECTID']
    .count()
    .reset_index(name='count')
)
j1 = junction_type[junction_type.SEVERITYCODE == 1]
j2 = junction_type[junction_type.SEVERITYCODE == 2]

# Align both severity codes on the union of categories so that each pair of
# bars really belongs to the same junction type; a category missing from one
# severity code is plotted as 0 instead of shifting the remaining bars.
label3 = pd.Index(j1['JUNCTIONTYPE']).union(j2['JUNCTIONTYPE'])
jf1 = j1.set_index('JUNCTIONTYPE')['count'].reindex(label3, fill_value=0)
jf2 = j2.set_index('JUNCTIONTYPE')['count'].reindex(label3, fill_value=0)

x = np.arange(len(label3))  # the label locations
width = 0.4  # the width of the bars

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(x, jf1, width, color='blueviolet', label='Code 1')
ax.barh(x + width, jf2, width, color='burlywood', label='Code 2')

# Bars are centred on x and x + width, so the tick goes midway between them.
ax.set(
    yticks=x + width / 2,
    yticklabels=label3,
    ylim=[2*width - 1, len(label3)],
    xlabel='Number of collisions',
    ylabel='Junction type',
    title='Collisions by junction type and severity code',
)
ax.legend()

plt.show()

In [None]:
# Number of collisions per (severity code, road condition).
ROADCOND = (
    df.groupby(['SEVERITYCODE', 'ROADCOND'])['OBJECTID']
    .count()
    .reset_index(name='count')
)
r1 = ROADCOND[ROADCOND.SEVERITYCODE == 1]
r2 = ROADCOND[ROADCOND.SEVERITYCODE == 2]

# Align both severity codes on the union of categories so that each pair of
# bars really belongs to the same road condition; a category missing from one
# severity code is plotted as 0 instead of shifting the remaining bars.
label4 = pd.Index(r1['ROADCOND']).union(r2['ROADCOND'])
rf1 = r1.set_index('ROADCOND')['count'].reindex(label4, fill_value=0)
rf2 = r2.set_index('ROADCOND')['count'].reindex(label4, fill_value=0)

x = np.arange(len(label4))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(x, rf1, width, color='cornflowerblue', label='Code 1')
ax.barh(x + width, rf2, width, color='crimson', label='Code 2')

# The y-limit must follow this chart's own label count (label4); using the
# junction chart's label count would crop or pad the axis.
# Bars are centred on x and x + width, so the tick goes midway between them.
ax.set(
    yticks=x + width / 2,
    yticklabels=label4,
    ylim=[2*width - 1, len(label4)],
    xlabel='Number of collisions',
    ylabel='Road condition',
    title='Collisions by road condition and severity code',
)
ax.legend()

plt.show()

In [None]:
# Number of collisions per (severity code, light condition).
LIGHTCOND = (
    df.groupby(['SEVERITYCODE', 'LIGHTCOND'])['OBJECTID']
    .count()
    .reset_index(name='count')
)
l1 = LIGHTCOND[LIGHTCOND.SEVERITYCODE == 1]
l2 = LIGHTCOND[LIGHTCOND.SEVERITYCODE == 2]

# Align both severity codes on the union of categories so that each pair of
# bars really belongs to the same light condition; a category missing from one
# severity code is plotted as 0 instead of shifting the remaining bars.
label5 = pd.Index(l1['LIGHTCOND']).union(l2['LIGHTCOND'])
lf1 = l1.set_index('LIGHTCOND')['count'].reindex(label5, fill_value=0)
lf2 = l2.set_index('LIGHTCOND')['count'].reindex(label5, fill_value=0)

x = np.arange(len(label5))  # the label locations
width = 0.4  # the width of the bars

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(x, lf1, width, color='darkorange', label='Code 1')
ax.barh(x + width, lf2, width, color='darkslateblue', label='Code 2')

# The y-limit must follow this chart's own label count (label5); using the
# junction chart's label count would crop or pad the axis.
# Bars are centred on x and x + width, so the tick goes midway between them.
ax.set(
    yticks=x + width / 2,
    yticklabels=label5,
    ylim=[2*width - 1, len(label5)],
    xlabel='Number of collisions',
    ylabel='Light condition',
    title='Collisions by light condition and severity code',
)
ax.legend()

plt.show()

In [None]:
# Build a boolean mask of missing cells, then report for every column how
# many entries are present (False) and how many are missing (True).
missing_data = df.isna()

for column, is_missing in missing_data.items():
    print(column)
    print(is_missing.value_counts())
    print("")


In [None]:
# Columns that are not used in the rest of the analysis, including the
# duplicated 'SEVERITYCODE.1' column.
irrelevant_columns = [
    'OBJECTID', 'INCKEY', 'LOCATION', 'COLDETKEY', 'REPORTNO', 'STATUS',
    'INTKEY', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'SEVERITYDESC', 'INCDATE',
    'SDOT_COLCODE', 'SDOT_COLDESC', 'SDOTCOLNUM', 'ST_COLCODE', 'ST_COLDESC',
    'SEGLANEKEY', 'CROSSWALKKEY', 'INCDTTM',
    'SEVERITYCODE.1',
]
df = df.drop(columns=irrelevant_columns)

# Give the coordinate columns self-explanatory names.
df = df.rename(columns={'X':'Longitude','Y':'Latitude'})
df.info()

In [None]:
# Drop the columns with too many missing values.
df = df.drop(columns=["INATTENTIONIND","PEDROWNOTGRNT","SPEEDING"])

# Treat blank / whitespace-only strings as missing. replace() returns a new
# frame, so the result has to be assigned back; otherwise the call is a no-op.
df = df.replace(r'^\s*$', np.nan, regex=True)

# Treat the uninformative categories "Unknown" and "Other" as missing as well.
# NOTE(review): this applies to every column, so a legitimate "Other"
# category in any column is also discarded; confirm this is intended.
df = df.replace(["Unknown", "Other"], np.nan)

# Drop the rows that still have a missing value in any of the columns below.
df = df.dropna(subset=["Longitude","Latitude","COLLISIONTYPE","JUNCTIONTYPE","UNDERINFL","WEATHER","ROADCOND","LIGHTCOND"], axis=0)

df.info()

In [None]:
# Class sizes before balancing (printed explicitly: a bare expression in the
# middle of a cell is not displayed).
print(df['SEVERITYCODE'].value_counts())

# Shuffle the rows reproducibly.
df = df.sample(frac=1, random_state=0, replace=False)

# Split the data by severity code.
df_code1 = df.loc[df['SEVERITYCODE'] == 1]
df_code2 = df.loc[df['SEVERITYCODE'] == 2]

# Down-sample the larger class to the size of the smaller one. The size is
# derived from the data instead of being hard-coded, so the classes stay
# balanced even if the cleaning steps above change the row counts.
n_per_class = min(len(df_code1), len(df_code2))
if len(df_code1) > n_per_class:
    df_code1 = df_code1.sample(n=n_per_class, random_state=42)
if len(df_code2) > n_per_class:
    df_code2 = df_code2.sample(n=n_per_class, random_state=42)

# Combine both classes and shuffle again so they are interleaved.
df_balanced = pd.concat([df_code1, df_code2])
df_balanced = df_balanced.sample(frac=1, random_state=0, replace=False)

# UNDERINFL mixes two encodings ('0'/'1' and 'N'/'Y'); normalise to 'N'/'Y'.
df_balanced['UNDERINFL'] = df_balanced['UNDERINFL'].replace({'0': 'N', '1': 'Y'})

# Check that the data set is balanced.
df_balanced.info()
df_balanced['SEVERITYCODE'].value_counts()

In [None]:
# Target: the severity code, kept as a single-column DataFrame.
B = df_balanced[['SEVERITYCODE']]

# Features: every column after the first one, with the categorical columns
# one-hot encoded.
# NOTE(review): the positional slice assumes SEVERITYCODE is the first column
# of df_balanced; confirm.
categorical_columns = [
    'ADDRTYPE', 'COLLISIONTYPE', 'JUNCTIONTYPE', 'WEATHER',
    'ROADCOND', 'LIGHTCOND', 'UNDERINFL', 'HITPARKEDCAR',
]
A = pd.get_dummies(data=df_balanced.iloc[:, 1:], columns=categorical_columns)
A.info()

# Hold out 20% of the rows for testing (fixed seed for reproducibility).
A_train, A_test, B_train, B_test = train_test_split(
    A, B, test_size=0.2, random_state=0
)

In [None]:
# Standardise the numeric feature columns.
numeric_columns = ['Longitude','Latitude','PERSONCOUNT','PEDCOUNT','PEDCYLCOUNT','VEHCOUNT']

# Work on explicit copies so the column assignments below modify independent
# frames and do not trigger SettingWithCopyWarning.
A_train = A_train.copy()
A_test = A_test.copy()

# Learn mean and standard deviation from the training data only ...
scaler = StandardScaler()
A_train[numeric_columns] = scaler.fit_transform(A_train[numeric_columns])

# ... and apply those same statistics to the test data. Re-fitting a second
# scaler on the test set would put train and test on different scales and
# leak test-set statistics into the evaluation.
A_test[numeric_columns] = scaler.transform(A_test[numeric_columns])

A_train.info()

In [None]:
# Put the target next to the features column-wise (axis=1). The default
# axis=0 would stack them row-wise, leaving the target rows NaN in every
# feature column and making all target/feature correlations NaN.
df_corr = pd.concat([B_train, A_train], axis=1).corr()

# Colour-coded correlation table with two decimals. Styler.format is used
# because Styler.set_precision is no longer available in recent pandas.
df_corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# Draw the annotated correlation heatmap on a large canvas and save it to disk.
fig, ax = plt.subplots(figsize=(50, 50))
seaborn.heatmap(df_corr, annot=True, cmap='coolwarm', ax=ax)
fig.savefig('correlation.png')


In [None]:
# Machine learning models
## Logistic regression

# Fit and predict. The target is passed as a 1-D array (as in the KNN cell);
# passing the single-column DataFrame makes scikit-learn emit a
# DataConversionWarning.
lr = LogisticRegression(random_state = 0)
lr.fit(A_train, B_train.values.ravel())
lr_predictions = lr.predict(A_test)

# Confusion matrix
lr_cm = confusion_matrix(B_test,lr_predictions)
print(lr_cm,'\n')

# Classification report
lr_cr = classification_report(B_test,lr_predictions)
print(lr_cr,'\n')

# Accuracy
acc = accuracy_score(B_test,lr_predictions)
print(acc,'\n')

# Accuracy per model, keyed by a short model name; the later model cells add
# their own entries to this dictionary.
accDict = {}
accDict['LR'] = acc

In [None]:
## K-nearest neighbours, tuned with a cross-validated grid search

# Candidate neighbour counts and Minkowski powers to search over.
knn = KNeighborsClassifier()
params = {
    'n_neighbors': [3, 4, 5, 6, 7],
    'p': [1, 2],
}

# Fit the grid search on the training data and predict the test set with the
# best estimator it found.
knn1 = GridSearchCV(estimator=knn, param_grid=params)
knn1.fit(A_train, B_train.to_numpy().ravel())
knn_predictions = knn1.predict(A_test)

print('Best Hyperparameter KNN : ',knn1.best_params_)

# Confusion matrix
knn_cm = confusion_matrix(B_test, knn_predictions)
print(knn_cm,'\n')

# Classification report
knn_cr = classification_report(B_test, knn_predictions)
print(knn_cr,'\n')

# Accuracy, also recorded in the shared per-model dictionary
acc = accuracy_score(B_test, knn_predictions)
print(acc,'\n')
accDict['KNN'] = acc


In [None]:
## Gaussian naive Bayes

# Fit and predict. The target is passed as a 1-D array (as in the KNN cell);
# passing the single-column DataFrame makes scikit-learn emit a
# DataConversionWarning.
nb = GaussianNB()
nb.fit(A_train, B_train.values.ravel())
nb_predictions = nb.predict(A_test)

# Confusion matrix
nb_cm=confusion_matrix(B_test,nb_predictions)
print(nb_cm,'\n')

# Classification report
nb_cr = classification_report(B_test,nb_predictions)
print(nb_cr,'\n')

# Accuracy, also recorded in the shared per-model dictionary
acc = accuracy_score(B_test,nb_predictions)
print(acc,'\n')
accDict['NB'] = acc

In [None]:
## Support vector classifier, tuned with a cross-validated grid search

# Search over the kernel type; random_state is pinned through the grid so it
# is reported together with the best parameters.
svc = SVC()
params = {'kernel':['linear','rbf'], 
          'random_state':[0]}
svc1 = GridSearchCV(svc, param_grid=params)

# The target is passed as a 1-D array (as in the KNN cell); passing the
# single-column DataFrame makes scikit-learn emit a DataConversionWarning.
svc1.fit(A_train, B_train.values.ravel())
svc_predictions = svc1.predict(A_test)
print('Best Hyperparameter SVM : ',svc1.best_params_)

# Confusion matrix
svc_cm=confusion_matrix(B_test,svc_predictions)
print(svc_cm,'\n')

# Classification report
svc_cr = classification_report(B_test,svc_predictions)
print(svc_cr,'\n')

# Accuracy, also recorded in the shared per-model dictionary
acc = accuracy_score(B_test,svc_predictions)
print(acc,'\n')
accDict['SVC'] = acc