# Capstone
### This project will be used for coursera Capstone course 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn import preprocessing

In [2]:
df_original = pd.read_csv('Data-Collisions.csv')
df = df_original.copy()
df_original.info()

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 38 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   SEVERITYCODE    194673 non-null  int64  
 1   X               189339 non-null  float64
 2   Y               189339 non-null  float64
 3   OBJECTID        194673 non-null  int64  
 4   INCKEY          194673 non-null  int64  
 5   COLDETKEY       194673 non-null  int64  
 6   REPORTNO        194673 non-null  object 
 7   STATUS          194673 non-null  object 
 8   ADDRTYPE        192747 non-null  object 
 9   INTKEY          65070 non-null   float64
 10  LOCATION        191996 non-null  object 
 11  EXCEPTRSNCODE   84811 non-null   object 
 12  EXCEPTRSNDESC   5638 non-null    object 
 13  SEVERITYCODE.1  194673 non-null  int64  
 14  SEVERITYDESC    194673 non-null  object 
 15  COLLISIONTYPE   189769 non-null  object 
 16  PERSONCOUNT     194673 non-null  int64  
 17  PEDCOUNT  

# Preprocessing

In [3]:
columns_to_drop = ['INCDTTM', 'SEVERITYDESC', 'SEVERITYCODE.1', 'X', 'Y','SDOTCOLNUM', 'ST_COLDESC', 'SDOT_COLDESC', 'COLDETKEY', 'INCKEY', 'OBJECTID', 'INTKEY', 'LOCATION', 'EXCEPTRSNCODE', 'EXCEPTRSNDESC', 'INCDATE', 'INATTENTIONIND', 'PEDROWNOTGRNT', 'SPEEDING']

In [4]:
# unnessesary columns dropped
df.drop(columns = columns_to_drop, inplace = True)

# dropping duplicates
df.drop_duplicates(subset = 'REPORTNO', keep = 'first', inplace=True)
df.drop(columns = ['REPORTNO'], inplace = True)

In [5]:
# coverted column with date and time of accident to datetime format
#df['INCDTTM'] = pd.to_datetime(df['INCDTTM'])

In [6]:
# dropping unknown
df.drop(index = df[df['ST_COLCODE'] == ' '].index, inplace = True)
df.dropna(axis = 0, inplace = True)

In [7]:
# converting data in columns from str to numeric
df['UNDERINFL'].replace(to_replace=['N','Y'], value=[0,1],inplace=True)
df['UNDERINFL'] = pd.to_numeric(df['UNDERINFL'])

df['HITPARKEDCAR'].replace(to_replace=['N','Y'], value=[0,1],inplace=True)
df['STATUS'].replace(to_replace=['Unmatched','Matched'], value=[0,1],inplace=True)

df['ST_COLCODE'] = pd.to_numeric(df['ST_COLCODE'])

In [8]:
# filling nan values with Unknown 
df['WEATHER'].fillna('Unknown', inplace = True)
df['ROADCOND'].fillna('Unknown', inplace = True)
df['LIGHTCOND'].fillna('Unknown', inplace = True)
df['JUNCTIONTYPE'].fillna('Unknown', inplace = True)
df['COLLISIONTYPE'].fillna('Unknown', inplace = True)

In [9]:
# getting dummies for each str column
columns = df[['SEVERITYCODE', 'STATUS', 'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 
           'VEHCOUNT', 'SDOT_COLCODE', 'UNDERINFL', 'ST_COLCODE', 
           'SEGLANEKEY', 'CROSSWALKKEY', 'HITPARKEDCAR']]
df = pd.concat([columns, 
                pd.get_dummies(df['ADDRTYPE']), 
                pd.get_dummies(df['COLLISIONTYPE']), 
                pd.get_dummies(df['JUNCTIONTYPE']), 
                pd.get_dummies(df['WEATHER']), 
                pd.get_dummies(df['ROADCOND']), 
                pd.get_dummies(df['LIGHTCOND'])], axis=1)

# Down-sample Majority Class

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 182892 entries, 0 to 194672
Data columns (total 61 columns):
 #   Column                                             Non-Null Count   Dtype
---  ------                                             --------------   -----
 0   SEVERITYCODE                                       182892 non-null  int64
 1   STATUS                                             182892 non-null  int64
 2   PERSONCOUNT                                        182892 non-null  int64
 3   PEDCOUNT                                           182892 non-null  int64
 4   PEDCYLCOUNT                                        182892 non-null  int64
 5   VEHCOUNT                                           182892 non-null  int64
 6   SDOT_COLCODE                                       182892 non-null  int64
 7   UNDERINFL                                          182892 non-null  int64
 8   ST_COLCODE                                         182892 non-null  int64
 9   SEGLANEKEY     

In [11]:
# Separate majority and minority classes
df_majority = df[df['SEVERITYCODE'] == 1]
df_minority = df[df['SEVERITYCODE'] == 2]

In [12]:
# Downsample majority class
length = len(df_minority['SEVERITYCODE'])
df_majority_downsampled = resample(df_majority, 
                                 replace = False,    # sample without replacement
                                 n_samples = length,     # to match minority class
                                 random_state = 0) # reproducible results

In [13]:
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Training

In [14]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [15]:
#deviding into x and y
y = df_downsampled['SEVERITYCODE']
X = df_downsampled[df_downsampled.columns[1:]]

X = preprocessing.StandardScaler().fit(X).transform(X)

In [16]:
#train test split and model fitting
X_train, X_test, y_train, y_test = train_test_split(X, y)

clf = GradientBoostingClassifier().fit(X_train, y_train)
tree = DecisionTreeClassifier(criterion="entropy", max_depth = 5).fit(X_train, y_train)
reg = LogisticRegression(C=0.4, solver='liblinear').fit(X_train,y_train)

Gradient Train set Accuracy:  0.7150493324228224
Gradient Test set Accuracy:  0.7165159649618537


In [None]:
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import f1_score

index = ['Gradient Boost', 'Decision Tree', 'LogisticRegression']

report_clf = [metrics.accuracy_score(y_train, clf.predict(X_train)), 
              metrics.accuracy_score(y_test, clf.predict(X_test)), 
              jaccard_similarity_score(y_t, clf.predict(X_t)), 
              f1_score(y_t, clf.predict(X_t), average='weighted')]

report_tree = [metrics.accuracy_score(y_train, tree.predict(X_train)), 
               metrics.accuracy_score(y_test, tree.predict(X_test)), 
               jaccard_similarity_score(y_t, tree.predict(X_t)), 
               f1_score(y_t, tree.predict(X_t), average='weighted')]

report_reg = [metrics.accuracy_score(y_train, reg.predict(X_train)), 
              metrics.accuracy_score(y_test, reg.predict(X_test)), 
              jaccard_similarity_score(y_t, reg.predict(X_t)), 
              f1_score(y_t, reg.predict(X_t), average='weighted')]

In [None]:
report = pd.DataFrame([report_clf, report_tree, report_reg], 
                      index = index, 
                      columns = ['Train set Accuracy', 'Test set Accuracy', 'Jaccard', 'F1-score'])

In [None]:
report