In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import  date
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

%matplotlib inline
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')
df=pd.read_csv('US_accidents_for_5_states.csv')

Data Augmentation - Extracting time of Accident and splitting into different fields

In [3]:
df = pd.read_csv("US_accidents_for_5_states.csv")
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df['End_Time'] = pd.to_datetime(df['End_Time'], errors='coerce')

# Extract year, month, day, hour and weekday
df['Year']=df['Start_Time'].dt.year
df['Month']=df['Start_Time'].dt.strftime('%b')
df['Day']=df['Start_Time'].dt.day
df['Hour']=df['Start_Time'].dt.hour
df['Weekday']=df['Start_Time'].dt.strftime('%a')

# Extract the amount of time in the unit of minutes for each accident, round to the nearest integer
td='Time_Duration(min)'
df[td]=round((df['End_Time']-df['Start_Time'])/np.timedelta64(1,'m'))
df.info()


dd=df.copy()


dd=dd[dd['State']=='KY']

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 993530 entries, 0 to 993529
Data columns (total 56 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Unnamed: 0             993530 non-null  int64         
 1   ID                     993530 non-null  object        
 2   Source                 993530 non-null  object        
 3   TMC                    623281 non-null  float64       
 4   Severity               993530 non-null  int64         
 5   Start_Time             993530 non-null  datetime64[ns]
 6   End_Time               993530 non-null  datetime64[ns]
 7   Start_Lat              993530 non-null  float64       
 8   Start_Lng              993530 non-null  float64       
 9   End_Lat                370249 non-null  float64       
 10  End_Lng                370249 non-null  float64       
 11  Distance(mi)           993530 non-null  float64       
 12  Description            993530 non-null  obje

We drop the columns that have a more than half NULL values

In [4]:
print(len(dd))

print(len(dd.columns))
cols = dd.columns[dd.isnull().mean()>0.5]
dd.drop(cols, axis=1,inplace=True)

print(dd.shape)

22553
56
(22553, 53)


In [5]:
unwanted_cols=['Turning_Loop','Civil_Twilight','Nautical_Twilight','Astronomical_Twilight','Weather_Timestamp','TMC']
dd.drop(unwanted_cols, axis=1,inplace=True)

print(dd.shape)
print(dd.isnull().sum())
print(len(dd))

(22553, 47)
Unnamed: 0                0
ID                        0
Source                    0
Severity                  0
Start_Time                0
End_Time                  0
Start_Lat                 0
Start_Lng                 0
Distance(mi)              0
Description               0
Street                    0
Side                      0
City                      0
County                    0
State                     0
Zipcode                   0
Country                   0
Timezone                  0
Airport_Code              0
Temperature(F)          114
Wind_Chill(F)          9782
Humidity(%)             128
Pressure(in)            100
Visibility(mi)          125
Wind_Direction          136
Wind_Speed(mph)        1953
Precipitation(in)     10516
Weather_Condition       122
Amenity                   0
Bump                      0
Crossing                  0
Give_Way                  0
Junction                  0
No_Exit                   0
Railway                   0
Roundabo

In [6]:
print(dd.isnull().sum())

Unnamed: 0                0
ID                        0
Source                    0
Severity                  0
Start_Time                0
End_Time                  0
Start_Lat                 0
Start_Lng                 0
Distance(mi)              0
Description               0
Street                    0
Side                      0
City                      0
County                    0
State                     0
Zipcode                   0
Country                   0
Timezone                  0
Airport_Code              0
Temperature(F)          114
Wind_Chill(F)          9782
Humidity(%)             128
Pressure(in)            100
Visibility(mi)          125
Wind_Direction          136
Wind_Speed(mph)        1953
Precipitation(in)     10516
Weather_Condition       122
Amenity                   0
Bump                      0
Crossing                  0
Give_Way                  0
Junction                  0
No_Exit                   0
Railway                   0
Roundabout          

In [7]:
#Filling the temperature column missing values with its mean
dd_temperature=dd["Temperature(F)"]
#approximately follows normal distribution
print(dd["Temperature(F)"].isnull().sum())
#filling the missing values with the median
median_temperature=dd_temperature.median()
mean_temperature=dd_temperature.mean()
print(mean_temperature,median_temperature)

#filling the median temperature into the missing values since we are keeping the outliers
dd["Temperature(F)"].fillna(median_temperature, inplace=True)
#ensuring that there are no more null values in temperature
print(dd['Temperature(F)'].isnull().sum())

114
58.45944115156626 60.1
0


In [8]:
#Filling the humidity column missing values with its mean
dd_humidity=dd["Humidity(%)"]

#approximately follows normal distribution
print(dd["Humidity(%)"].isnull().sum())
#filling the missing values with the mean
median_humidity=dd_humidity.median()
mean_humidity=dd_humidity.mean()
print(mean_humidity,median_humidity)
#calculating the median and mean humidity and it is left-skewed ditribution

#filling the median temperature into the missing values since we are keeping the outliers
dd["Humidity(%)"].fillna(mean_humidity, inplace=True)
#ensuring that there are no more null values in temperature
print(dd['Humidity(%)'].isnull().sum())

128
69.93502787068005 73.0
0


In [9]:
#Filling the Pressure column missing values with its mean
dd_Pressure=dd["Pressure(in)"]

#approximately follows normal distribution
print(dd["Pressure(in)"].isnull().sum())
#filling the missing values with the concept of flooring and capping
median_Pressure=dd_Pressure.median()
mean_Pressure=dd_Pressure.mean()
print(mean_Pressure,median_Pressure)

new_median_Pressure=dd_Pressure.median()
new_mean_Pressure=dd_Pressure.mean()
print(new_mean_Pressure,new_median_Pressure)
dd["Pressure(in)"].fillna(new_median_Pressure, inplace=True)
#ensuring that there are no more null values in pressure
print(dd['Pressure(in)'].isnull().sum())

100
29.783977196811236 29.91
29.783977196811236 29.91
0


In [10]:
#Filling the Visibility column missing values with its mean
dd_Visibility=dd["Visibility(mi)"]

#approximately follows normal distribution
print(dd["Visibility(mi)"].isnull().sum())
#filling the missing values with the concept of flooring and capping
median_Visibility=dd_Visibility.median()
mean_Visibility=dd_Visibility.mean()
print(mean_Visibility,median_Visibility)

new_median_Visibility=dd_Visibility.median()
new_mean_Visibility=dd_Visibility.mean()
print(new_mean_Visibility,new_median_Visibility)
dd["Visibility(mi)"].fillna(new_median_Visibility, inplace=True)
#ensuring that there are no more null values in visibility
print(dd['Visibility(mi)'].isnull().sum())

print(dd.isnull().sum())

125
8.803699393615124 10.0
8.803699393615124 10.0
0
Unnamed: 0                0
ID                        0
Source                    0
Severity                  0
Start_Time                0
End_Time                  0
Start_Lat                 0
Start_Lng                 0
Distance(mi)              0
Description               0
Street                    0
Side                      0
City                      0
County                    0
State                     0
Zipcode                   0
Country                   0
Timezone                  0
Airport_Code              0
Temperature(F)            0
Wind_Chill(F)          9782
Humidity(%)               0
Pressure(in)              0
Visibility(mi)            0
Wind_Direction          136
Wind_Speed(mph)        1953
Precipitation(in)     10516
Weather_Condition       122
Amenity                   0
Bump                      0
Crossing                  0
Give_Way                  0
Junction                  0
No_Exit                 

In [11]:
#using the concept of flooring and capping
quartile_10=dd['Wind_Speed(mph)'].quantile(0.10) #0
quartile_90=dd['Wind_Speed(mph)'].quantile(0.90) #13.8
print(dd['Wind_Speed(mph)'].skew())  #38.513

0.8149940758927152


In [12]:
#Filling the Visibility column missing values with its mean
dd_Windspeed=dd["Wind_Speed(mph)"]

#approximately follows normal distribution
print(dd["Wind_Speed(mph)"].isnull().sum())
#filling the missing values with the concept of flooring and capping
median_Windspeed=dd_Windspeed.median()
mean_Windspeed=dd_Windspeed.mean()
print(mean_Windspeed,median_Windspeed)

#new mean and median

new_median_Windspeed=dd_Windspeed.median()
new_mean_Windspeed=dd_Windspeed.mean()
print(new_mean_Windspeed,new_median_Windspeed)
dd["Wind_Speed(mph)"].fillna(new_median_Windspeed, inplace=True)
#ensuring that there are no more null values in visibility
print(dd['Wind_Speed(mph)'].isnull().sum())

1953
8.340451456310856 8.0
8.340451456310856 8.0
0


In [13]:
#Wind_Direction

#filling the category with the mode 
mode_found=(dd['Wind_Direction'].mode())
print(mode_found[0])
dd["Wind_Direction"].fillna(mode_found[0], inplace=True)
print(dd['Wind_Direction'].isnull().sum())

SSW
0


In [14]:
print(dd.isnull().sum())



#Weather_Condition
#filling the category with the mode 
mode_found=(dd['Weather_Condition'].mode())
print(mode_found[0])
dd["Weather_Condition"].fillna(mode_found[0], inplace=True)
print(dd['Weather_Condition'].isnull().sum())

Unnamed: 0                0
ID                        0
Source                    0
Severity                  0
Start_Time                0
End_Time                  0
Start_Lat                 0
Start_Lng                 0
Distance(mi)              0
Description               0
Street                    0
Side                      0
City                      0
County                    0
State                     0
Zipcode                   0
Country                   0
Timezone                  0
Airport_Code              0
Temperature(F)            0
Wind_Chill(F)          9782
Humidity(%)               0
Pressure(in)              0
Visibility(mi)            0
Wind_Direction            0
Wind_Speed(mph)           0
Precipitation(in)     10516
Weather_Condition       122
Amenity                   0
Bump                      0
Crossing                  0
Give_Way                  0
Junction                  0
No_Exit                   0
Railway                   0
Roundabout          

In [15]:
#City
#filling the category with the mode 
mode_found=(dd['City'].mode())
print(mode_found[0])
dd["City"].fillna(mode_found[0], inplace=True)
print(dd['City'].isnull().sum())

#Sunrise_sunset
#filling the category with the mode 
mode_found=(dd['Sunrise_Sunset'].mode())
print(mode_found[0])
dd["Sunrise_Sunset"].fillna(mode_found[0], inplace=True)
print(dd['Sunrise_Sunset'].isnull().sum())
#dropping the cols 'Zipcode','Timezone','Airport_Code' 
unwanted_cols_2=['Zipcode','Timezone','Airport_Code','Wind_Chill(F)','Precipitation(in)']
dd.drop(unwanted_cols_2, axis=1,inplace=True)

print(dd.isnull().mean())
print(dd.info())

Louisville
0
Day
0
Unnamed: 0            0.0
ID                    0.0
Source                0.0
Severity              0.0
Start_Time            0.0
End_Time              0.0
Start_Lat             0.0
Start_Lng             0.0
Distance(mi)          0.0
Description           0.0
Street                0.0
Side                  0.0
City                  0.0
County                0.0
State                 0.0
Country               0.0
Temperature(F)        0.0
Humidity(%)           0.0
Pressure(in)          0.0
Visibility(mi)        0.0
Wind_Direction        0.0
Wind_Speed(mph)       0.0
Weather_Condition     0.0
Amenity               0.0
Bump                  0.0
Crossing              0.0
Give_Way              0.0
Junction              0.0
No_Exit               0.0
Railway               0.0
Roundabout            0.0
Station               0.0
Stop                  0.0
Traffic_Calming       0.0
Traffic_Signal        0.0
Sunrise_Sunset        0.0
Year                  0.0
Month              

In [16]:
# #correlation plots for numerical features
# numerical_features=['Start_Lat','Start_Lng','Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)']
# df_numerical=dd[numerical_features].copy()
# print(df_numerical.shape)
# sns.heatmap(df_numerical.corr(), annot = True)
# print(df_numerical.corr())




#removing the unwanted columns
filtered_columns=['Start_Time','End_Time','Description','Street','State','Country','Unnamed: 0','ID']
dd.drop(filtered_columns, axis=1,inplace=True)
print(dd.shape)

(22553, 34)


In [17]:
#Performing a standardisation on numerical columns such that mean is 0 and variance is 1
# numerical features
num_cols = ['Temperature(F)','Humidity(%)','Pressure(in)','Visibility(mi)','Wind_Speed(mph)','Distance(mi)']
# apply standardization on numerical features
for i in num_cols:
    # fit on training data column
    scale = StandardScaler().fit(dd[[i]])
    # transform the training data column
    dd[i] = scale.transform(dd[[i]])




#Approach using one hot encoding/pd.get_dummies for few columns
#one hot encoding approach



#Copying the dataframe into ds
ds=dd.copy()

In [18]:
ds

Unnamed: 0,Source,Severity,Start_Lat,Start_Lng,Distance(mi),Side,City,County,Temperature(F),Humidity(%),...,Stop,Traffic_Calming,Traffic_Signal,Sunrise_Sunset,Year,Month,Day,Hour,Weekday,Time_Duration(min)
120455,MapQuest,2,38.766727,-84.188011,0.563767,L,Foster,Bracken,0.454244,-0.200616,...,False,False,False,Night,2016,Mar,27,22,Sun,75.0
120456,MapQuest,2,38.502827,-82.775070,-0.299735,R,Greenup,Greenup,0.850255,-0.965348,...,False,False,False,Day,2016,May,10,14,Tue,45.0
120457,MapQuest,1,38.534069,-82.834595,-0.307805,R,Greenup,Greenup,0.130235,1.175901,...,False,False,False,Day,2016,May,11,8,Wed,90.0
152408,MapQuest,2,38.453114,-82.673744,-0.299735,L,Ashland,Boyd,0.027374,0.666080,...,False,False,True,Day,2017,Feb,22,17,Wed,30.0
152409,MapQuest,2,38.453114,-82.673744,-0.299735,L,Ashland,Boyd,-0.132059,1.022955,...,False,False,True,Night,2017,Feb,22,19,Wed,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993229,Bing,4,38.222800,-85.510840,-0.016474,R,Louisville,Jefferson,1.210265,0.054295,...,False,False,False,Day,2019,Aug,20,9,Tue,29.0
993230,Bing,2,38.222726,-85.515250,0.087630,R,Louisville,Jefferson,1.210265,0.054295,...,False,False,False,Day,2019,Aug,20,9,Tue,28.0
993289,Bing,2,38.021666,-84.492849,-0.173034,R,Lexington,Fayette,1.518846,-0.863384,...,False,False,True,Day,2019,Aug,21,20,Wed,30.0
993290,Bing,4,38.028528,-84.314463,0.518575,R,Lexington,Fayette,0.901685,0.615098,...,False,False,False,Night,2019,Aug,22,1,Thu,29.0


In [19]:
#using pd_getdummies i.e one hot encoding
features_converted=['Side','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Sunrise_Sunset','Year','Month','Day','Hour','Weekday']
for i in features_converted:
    
    ds = pd.concat([ds,pd.get_dummies(ds[i], prefix=i)],axis=1)
    ds.drop([i],axis=1, inplace=True)




#using label encoding to the rest of the columns having object datatype
features_label_encoding=['Source','City','County','Wind_Direction','Weather_Condition']
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column

for i in features_label_encoding:
    ds[i] = labelencoder.fit_transform(ds[i])

In [20]:
target='Severity'
# Create arrays for the features and the response variable
# set X and y
y = ds[target]
X = ds.drop(target, axis=1)

# Split the data set into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
lr = LogisticRegression(random_state=0)
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)

# Get the accuracy score
acc=accuracy_score(y_test, y_pred)
print(acc)

NameError: name 'LogisticRegression' is not defined

In [None]:
#using PCA
from sklearn.decomposition import PCA
# put none to n_componenets to create explained variance vector 
# ( contain the percentage of variance explained by each of the principal components that we extracted here.)
pca = PCA(n_components=2) 
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
expained_variance = pca.explained_variance_ratio_

#Fitting logistic Regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train_pca, y_train)

#Prdicting the test set results
y_pred = classifier.predict(X_test_pca)
acc=accuracy_score(y_test, y_pred)
print(acc)

In [None]:
#Approach using labelencoding for all few columns
#labelencoding approach



#Copying the dataframe into dr

dr=dd.copy()
features_label_encoding=['Source','Side','City','County','Wind_Direction','Weather_Condition','Bump','Crossing','Give_Way','Junction','No_Exit','Railway','Roundabout','Station','Stop','Traffic_Calming','Traffic_Signal','Sunrise_Sunset','Year','Month','Day','Hour','Weekday']
labelencoder = LabelEncoder()
# Assigning numerical values and storing in another column

for i in features_label_encoding:
    dr[i] = labelencoder.fit_transform(dr[i])


In [None]:
target='Severity'
# Create arrays for the features and the response variable
# set X and y
y = dr[target]
X = dr.drop(target, axis=1)
# Split the data set into training and testing data sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
lr = LogisticRegression(random_state=0)
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)
# Get the accuracy score
acc=accuracy_score(y_test, y_pred)
print(acc)

In [None]:
#using PCA
from sklearn.decomposition import PCA
# put none to n_componenets to create explained variance vector 
# ( contain the percentage of variance explained by each of the principal components that we extracted here.)
pca = PCA(n_components=2) 
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
expained_variance = pca.explained_variance_ratio_

#Fitting logistic Regression to the training set
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train_pca, y_train)

#Prdicting the test set results
y_pred = classifier.predict(X_test_pca)
acc=accuracy_score(y_test, y_pred)
print(acc)