# Capstone
### This project will be used for coursera Capstone course 

In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn import preprocessing

In [144]:
df_original = pd.read_csv('Data-Collisions.csv')
df = df_original.copy()

  interactivity=interactivity, compiler=compiler, result=result)


# Preprocessing

In [145]:
columns_to_drop = ['HITPARKEDCAR', 'JUNCTIONTYPE', 'STATUS', 'VEHCOUNT', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEGLANEKEY', 'INCDTTM', 'SEVERITYDESC', 'SEVERITYCODE.1', 'X', 'Y','SDOTCOLNUM', 'ST_COLDESC', 'SDOT_COLDESC', 'COLDETKEY', 'INCKEY', 'OBJECTID', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'INCDATE', 'INATTENTIONIND', 'PEDROWNOTGRNT', 'SPEEDING']

In [146]:
# unnessesary columns dropped
df.drop(columns = columns_to_drop, inplace = True)

# dropping duplicates
df.drop_duplicates(subset = 'REPORTNO', keep = 'first', inplace=True)
df.drop(columns = ['REPORTNO'], inplace = True)

In [147]:
# dropping unknown
df.drop(index = df[df['ST_COLCODE'] == ' '].index, inplace = True)
df.dropna(axis = 0, inplace = True)

In [148]:
# filling nan values with Unknown 
df['COLLISIONTYPE'].fillna('Unknown', inplace = True)

In [149]:
# converting data in columns from str to numeric
df['ADDRTYPE'].replace(to_replace=['Block', 'Intersection', 'Alley'], 
                       value = [0, 1, 2], 
                       inplace = True)

df['COLLISIONTYPE'].replace(to_replace=['Parked Car', 'Angles', 'Rear Ended', 'Other', 'Sideswipe', 'Left Turn', 'Pedestrian', 'Cycles', 'Right Turn', 'Head On'], 
                      value=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 
                      inplace=True)

df['ST_COLCODE'] = pd.to_numeric(df['ST_COLCODE'])

In [102]:
# getting dummies for each str column
columns = df[['SEVERITYCODE', 'STATUS', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 
           'VEHCOUNT', 'SDOT_COLCODE', 'UNDERINFL', 'ST_COLCODE', 
           'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']]
df = pd.concat([columns, 
                pd.get_dummies(df['ADDRTYPE']), 
                pd.get_dummies(df['COLLISIONTYPE']), 
                pd.get_dummies(df['JUNCTIONTYPE']), 
                pd.get_dummies(df['WEATHER']), 
                pd.get_dummies(df['ROADCOND']), 
                pd.get_dummies(df['LIGHTCOND'])], axis=1)

In [36]:
df['JUNCTIONTYPE'].value_counts().index

Index(['Mid-Block (not related to intersection)',
       'At Intersection (intersection related)',
       'Mid-Block (but intersection related)', 'Driveway Junction',
       'At Intersection (but not related to intersection)', 'Ramp Junction',
       'Unknown'],
      dtype='object')

# Down-sample Majority Class

In [150]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 187947 entries, 0 to 194672
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype
---  ------         --------------   -----
 0   SEVERITYCODE   187947 non-null  int64
 1   ADDRTYPE       187947 non-null  int64
 2   COLLISIONTYPE  187947 non-null  int64
 3   PERSONCOUNT    187947 non-null  int64
 4   PEDCOUNT       187947 non-null  int64
 5   PEDCYLCOUNT    187947 non-null  int64
 6   SDOT_COLCODE   187947 non-null  int64
 7   ST_COLCODE     187947 non-null  int64
 8   CROSSWALKKEY   187947 non-null  int64
dtypes: int64(9)
memory usage: 14.3 MB


In [151]:
# Separate majority and minority classes
df_majority = df[df['SEVERITYCODE'] == 1]
df_minority = df[df['SEVERITYCODE'] == 2]

In [152]:
# Downsample majority class
length = len(df_minority['SEVERITYCODE'])
df_majority_downsampled = resample(df_majority, 
                                 replace = False,    # sample without replacement
                                 n_samples = length,     # to match minority class
                                 random_state = 0) # reproducible results

In [153]:
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Training

In [154]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [155]:
#deviding into x and y
y = df_downsampled['SEVERITYCODE']
X = df_downsampled[df_downsampled.columns[1:]]

X = preprocessing.StandardScaler().fit(X).transform(X)

In [156]:
#train test split and model fitting
X_train, X_test, y_train, y_test = train_test_split(X, y)

clf = GradientBoostingClassifier().fit(X_train, y_train)

In [157]:
tree = DecisionTreeClassifier(criterion="entropy", max_depth = 7).fit(X_train, y_train)

In [158]:
reg = LogisticRegression(C=0.5, solver='liblinear').fit(X_train,y_train)

In [159]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score

index = ['Gradient Boost', 'Decision Tree', 'LogisticRegression']

report_clf = [metrics.accuracy_score(y_train, clf.predict(X_train)), 
              metrics.accuracy_score(y_test, clf.predict(X_test)), 
              jaccard_similarity_score(y_test, clf.predict(X_test)), 
              f1_score(y_test, clf.predict(X_test), average='weighted')]

report_tree = [metrics.accuracy_score(y_train, tree.predict(X_train)), 
               metrics.accuracy_score(y_test, tree.predict(X_test)), 
               jaccard_similarity_score(y_test, tree.predict(X_test)), 
               f1_score(y_test, tree.predict(X_test), average='weighted')]

report_reg = [metrics.accuracy_score(y_train, reg.predict(X_train)), 
              metrics.accuracy_score(y_test, reg.predict(X_test)), 
              jaccard_similarity_score(y_test, reg.predict(X_test)), 
              f1_score(y_test, reg.predict(X_test), average='weighted')]



In [160]:
report = pd.DataFrame([report_clf, report_tree, report_reg], 
                      index = index, 
                      columns = ['Train set Accuracy', 'Test set Accuracy', 'Jaccard', 'F1-score'])

In [161]:
report

Unnamed: 0,Train set Accuracy,Test set Accuracy,Jaccard,F1-score
Gradient Boost,0.717699,0.71911,0.71911,0.717043
Decision Tree,0.715499,0.716723,0.716723,0.715555
LogisticRegression,0.650383,0.649888,0.649888,0.649637


In [84]:
report

Unnamed: 0,Train set Accuracy,Test set Accuracy,Jaccard,F1-score
Gradient Boost,0.717027,0.705955,0.705955,0.703494
Decision Tree,0.712695,0.702423,0.702423,0.698738
LogisticRegression,0.660984,0.652444,0.652444,0.65229


In [88]:
report

Unnamed: 0,Train set Accuracy,Test set Accuracy,Jaccard,F1-score
Gradient Boost,0.717027,0.705955,0.705955,0.703494
Decision Tree,0.712695,0.702423,0.702423,0.698738
LogisticRegression,0.659665,0.651243,0.651243,0.651116


In [94]:
report

Unnamed: 0,Train set Accuracy,Test set Accuracy,Jaccard,F1-score
Gradient Boost,0.717027,0.705955,0.705955,0.703494
Decision Tree,0.712695,0.702423,0.702423,0.698738
LogisticRegression,0.660984,0.652444,0.652444,0.65229


In [142]:
df_downsampled.corr()

Unnamed: 0,SEVERITYCODE,ADDRTYPE,COLLISIONTYPE,PERSONCOUNT,PEDCOUNT,PEDCYLCOUNT,JUNCTIONTYPE,SDOT_COLCODE,ST_COLCODE,CROSSWALKKEY,HITPARKEDCAR
SEVERITYCODE,1.0,0.199868,0.261013,0.126854,0.217089,0.189525,0.110143,0.162395,-0.167216,0.15384,-0.112609
ADDRTYPE,0.199868,1.0,0.197349,0.041952,0.158857,0.084928,0.207435,-0.05869,-0.197837,0.199926,-0.117026
COLLISIONTYPE,0.261013,0.197349,1.0,-0.028345,0.356358,0.427172,0.114241,0.359609,0.038857,0.276874,-0.172787
PERSONCOUNT,0.126854,0.041952,-0.028345,1.0,-0.042746,-0.060168,0.049361,-0.169015,-0.197164,-0.048852,-0.043407
PEDCOUNT,0.217089,0.158857,0.356358,-0.042746,1.0,-0.035761,0.013429,0.298971,-0.310253,0.566796,-0.035484
PEDCYLCOUNT,0.189525,0.084928,0.427172,-0.060168,-0.035761,1.0,0.036806,0.449375,0.262021,0.096818,-0.030896
JUNCTIONTYPE,0.110143,0.207435,0.114241,0.049361,0.013429,0.036806,1.0,-0.043146,-0.133172,0.02874,-0.107903
SDOT_COLCODE,0.162395,-0.05869,0.359609,-0.169015,0.298971,0.449375,-0.043146,1.0,0.319313,0.210428,-0.060241
ST_COLCODE,-0.167216,-0.197837,0.038857,-0.197164,-0.310253,0.262021,-0.133172,0.319313,1.0,-0.160762,0.106608
CROSSWALKKEY,0.15384,0.199926,0.276874,-0.048852,0.566796,0.096818,0.02874,0.210428,-0.160762,1.0,-0.027191


In [None]:
'STATUS', 'VEHCOUNT', 'UNDERINFL', 'WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEGLANEKEY', 