In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [2]:
df = pd.read_csv('~/Documents/Capstone/Data-Collisions.csv')
df.dtypes


SEVERITYCODE     int64
WEATHER         object
ROADCOND        object
LIGHTCOND       object
dtype: object

In [3]:
objList = df.select_dtypes(include = "object").columns
print (objList)

#Label Encoding for object to numeric conversion
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

for feat in objList:
    df[feat] = le.fit_transform(df[feat].astype(str))

print (df.info())
df





Index(['WEATHER', 'ROADCOND', 'LIGHTCOND'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194673 entries, 0 to 194672
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   SEVERITYCODE  194673 non-null  int64
 1   WEATHER       194673 non-null  int64
 2   ROADCOND      194673 non-null  int64
 3   LIGHTCOND     194673 non-null  int64
dtypes: int64(4)
memory usage: 5.9 MB
None


Unnamed: 0,SEVERITYCODE,WEATHER,ROADCOND,LIGHTCOND
0,2,4,8,5
1,1,6,8,2
2,1,4,0,5
3,1,1,0,5
4,2,6,8,5
...,...,...,...,...
194668,2,1,0,5
194669,1,6,8,5
194670,2,1,0,5
194671,2,1,0,6


In [5]:
df["SEVERITYCODE"].value_counts()

1    136485
2     58188
Name: SEVERITYCODE, dtype: int64

In [6]:
from sklearn.utils import resample

df_hi = df[df.SEVERITYCODE==1]
df_lo = df[df.SEVERITYCODE==2]

df_hi_fi = resample(df_hi, replace=False, n_samples=58188, random_state=123)
df_bal = pd.concat([df_hi_fi,df_lo])

df_bal.SEVERITYCODE.value_counts()

2    58188
1    58188
Name: SEVERITYCODE, dtype: int64

In [7]:
#Define X and y 

X = np.asarray(df[['WEATHER','LIGHTCOND','ROADCOND']])
X[0:5]

array([[4, 5, 8],
       [6, 2, 8],
       [4, 5, 0],
       [1, 5, 0],
       [6, 5, 8]])

In [8]:
y = np.asarray(df['SEVERITYCODE'])
y[0:5]

array([2, 1, 1, 1, 2])

In [9]:
#Normalising Data

from sklearn import preprocessing
X=preprocessing.StandardScaler().fit(X).transform(X)
X[0:5]

array([[ 0.22981187,  0.25900713,  1.39847224],
       [ 0.87758556, -1.36653782,  1.39847224],
       [ 0.22981187,  0.25900713, -0.73846749],
       [-0.74184867,  0.25900713, -0.73846749],
       [ 0.87758556,  0.25900713,  1.39847224]])

In [12]:
#Train/test split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 123)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)


(136271, 3) (136271,)
(58402, 3) (58402,)


# KNN Method

In [28]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

k = 6
knn = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)

knn_y_pred = knn.predict(X_test)
knn_y_pred[0:5]


print(accuracy_score(y_test, knn_y_pred))
print(classification_report(y_test, knn_y_pred))
print(confusion_matrix(y_test, knn_y_pred))


0.6995308379850005
              precision    recall  f1-score   support

           1       0.70      1.00      0.82     40892
           2       0.30      0.00      0.00     17510

    accuracy                           0.70     58402
   macro avg       0.50      0.50      0.41     58402
weighted avg       0.58      0.70      0.58     58402

[[40826    66]
 [17482    28]]


# Decision Tree

In [41]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(criterion="entropy", max_depth = 6)
dt
dt.fit(X_train,y_train)

dt_y_pred = dt.predict(X_test)

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, dt_y_pred))
print(classification_report(y_test, dt_y_pred))


[[40879    13]
 [17506     4]]
              precision    recall  f1-score   support

           1       0.70      1.00      0.82     40892
           2       0.24      0.00      0.00     17510

    accuracy                           0.70     58402
   macro avg       0.47      0.50      0.41     58402
weighted avg       0.56      0.70      0.58     58402



# Logistic Regression

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

logisticRegr = LogisticRegression(C=6,solver = 'lbfgs')

logisticRegr.fit(X_train, y_train)


lr_y_pred = lr.predict_proba(X_test)
lr_y_pred

score = logisticRegr.score(X_test, y_test)
print(score)




0.7001815006335399
