## Study

In [None]:
# Data Processing
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.basemap import Basemap

# Machine Learning
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [None]:
# Plot Settings: ggplot style sheet with seaborn's pastel palette
plt.style.use('ggplot')
sns.set_palette('pastel')

# Notebook-wide defaults for figure size and tick label sizes
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

# Colors: one named matplotlib color per severity key (1 to 4)
severity_cols = dict(zip(
    (1, 2, 3, 4),
    ('palegreen', 'papayawhip', 'lightsalmon', 'tomato'),
))

## Load Data

In [None]:
# Read the accidents csv into raw_df; every later cell works from this frame.
# The %time line magic prints how long the read took.
%time raw_df = pd.read_csv('../input/us-accidents/US_Accidents_June20.csv')

In [None]:
# Update col type: parse both timestamps, store Severity and State as categoricals
for time_col in ('Start_Time', 'End_Time'):
    raw_df[time_col] = pd.to_datetime(raw_df[time_col])

for cat_col in ('Severity', 'State'):
    raw_df[cat_col] = raw_df[cat_col].astype('category')

In [None]:
# Column categories

# Columns describing road elements near the accident
road_params = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction',
    'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal', 'Turning_Loop',
]

# Columns describing the weather observation attached to the accident
weather_params = [
    'Weather_Timestamp', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
    'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 'Wind_Speed(mph)',
    'Precipitation(in)', 'Weather_Condition',
]

## Data Wrangling

In [None]:
# Missing values

# Preprocessing: per-column fraction (0 to 1) of rows holding a missing value
missing_values = raw_df.isna().sum() / len(raw_df)

# Visualization
# The axis is labelled in percent, so scale the fractions by 100 for display.
# Title and label are set on the axes *after* plotting so that the labelling
# pandas performs while drawing the bars cannot replace them.
fig, ax = plt.subplots(figsize=(14, 14))
(missing_values * 100).plot(kind='barh', ax=ax)
ax.set_title('Missing Data')
ax.set_xlabel('% of rows with missing data');

## Exploratory Data Analysis

In [None]:
# Accident coordinates taken from the start-point columns
lon = raw_df['Start_Lng'].values
lat = raw_df['Start_Lat'].values

# Map window: corner longitudes (left/right) and latitudes (bot/top)
left, right = -140, -40
bot, top = 22, 52

# Draw a map: 'merc' projection over the window above
m = Basemap(projection='merc',
            resolution='c',
            lat_ts=20,
            llcrnrlon=left, llcrnrlat=bot,
            urcrnrlon=right, urcrnrlat=top)
m.drawcoastlines()
m.drawmapboundary()

# One color per accident, looked up from its severity value
vcol = [severity_cols[level] for level in raw_df['Severity']]

# Convert lon/lat through the Basemap instance, then draw size-1 markers
x, y = m(lon, lat)
m.scatter(x, y, 1, marker='o', color=vcol)
plt.title('Accidents representation map by severity level')
plt.show()

In [None]:
# Accident count per state, sorted from fewest to most
accidents_by_state = raw_df.groupby('State').size().sort_values()
accidents_by_state.plot(kind='bar', title='Accidents by State');

In [None]:
# Share of accidents per severity value, colored with the severity_cols colors
severity_counts = raw_df.groupby('Severity').size()
severity_counts.plot(kind='pie',
                     title='Severity level proportion',
                     colors=severity_cols.values(),
                     autopct='%1.0f%%');

In [None]:
# Tag every accident with a '<year>_<2-digit month>' code built from Start_Time,
# then plot how many accidents fall under each code
year_str = raw_df['Start_Time'].dt.year.astype(str)
month_str = raw_df['Start_Time'].dt.month.astype(str).str.zfill(2)
raw_df['Month_Code'] = year_str + '_' + month_str

raw_df.groupby('Month_Code').size().plot(title='Number of accidents by month');

In [None]:
# Keep only the accidents whose Start_Time falls in 2017, 2018 or 2019
accident_year = raw_df['Start_Time'].dt.year
df_17_20 = raw_df[accident_year.between(2017, 2019)]

df_17_20.groupby('Month_Code').size().plot(title='Number of accidents by month');

In [None]:
# Accidents per calendar month (1-12), summed over the rows of df_17_20
start_month = df_17_20['Start_Time'].dt.month
df_17_20.groupby(start_month).size().plot(
    kind='bar',
    title='Number of accidents by month - Total from January 2017 to December 2019');

In [None]:
# Number of accidents by hour of the day, counted over every row of raw_df
accidents_by_hour = raw_df.groupby(raw_df['Start_Time'].dt.hour).size()

ax = accidents_by_hour.plot(kind='bar')
ax.set_title('Number of accidents by hour of the day')
# Label the x axis after plotting: the pandas bar plot labels it with the
# index name ('Start_Time'), which would replace a label set beforehand.
ax.set_xlabel('Hour of the accident');

In [None]:
# Accident counts per (state, calendar month) over df_17_20,
# drawn as a heatmap with months as rows and states as columns
start_month = df_17_20['Start_Time'].dt.month
state_and_month = df_17_20.groupby(['State', start_month]).size().unstack()

plt.figure(figsize=(20, 6))
sns.heatmap(state_and_month.T, cmap="YlGnBu", linewidths=.5, square=True);

In [None]:
# Share of accidents for which each road element column is set.
# NOTE: despite its name, road_param_percent holds fractions between 0 and 1;
# it is left that way so any other use of the variable keeps working.
road_param_percent = raw_df.loc[:, road_params].sum() / len(raw_df)

# The axis is labelled in percent, so scale by 100 for display.
# Title and label are set on the axes after plotting so that the labelling
# pandas performs while drawing the bars cannot replace them.
ax = (road_param_percent * 100).sort_values().plot(kind='barh')
ax.set_title('Presence of road element near accidents')
ax.set_xlabel('% of total of accidents');

In [None]:
# Fraction (0 to 1) of all accidents recorded under each Weather_Condition value
acc_by_weather_condition = raw_df.groupby('Weather_Condition').size() / len(raw_df)
# Keep only the conditions present in more than 0.5% of all accidents
acc_by_weather_condition = acc_by_weather_condition[acc_by_weather_condition > 0.005]

# The axis is labelled in percent, so scale by 100 for display.
# Title and label are set on the axes after plotting so that the labelling
# pandas performs while drawing the bars cannot replace them.
ax = (acc_by_weather_condition * 100).sort_values().plot(kind='barh')
ax.set_title('Presence of weather condition during accidents')
ax.set_xlabel('% of total of accidents');

## Predictions

In [None]:
# Feature Engineering

# Duration between Start_Time and End_Time, in minutes.
# total_seconds() is required here: .dt.seconds returns only the seconds
# component of each timedelta (0-86399) and silently drops whole days, so any
# duration of 24 hours or more would be under-reported. As a consequence, an
# End_Time earlier than its Start_Time now shows up as a negative duration.
raw_df['Duration'] = (raw_df['End_Time'] - raw_df['Start_Time']).dt.total_seconds() / 60
raw_df['Start_Hour'] = raw_df['Start_Time'].dt.hour

# Model inputs: road element columns, hour of the accident and state
predictors = road_params + ['Start_Hour'] + ['State']

# Design matrix: drop rows with a missing predictor, then one-hot encode
Xl = raw_df.loc[:, predictors].dropna()
Xl = pd.get_dummies(Xl)

# Target restricted to the rows that survived dropna()
yl = raw_df.loc[Xl.index, 'Duration']

In [None]:
# Distribution of Duration; values above 600 are clipped to 600 before binning
plt.figure(figsize=(16, 8))
clipped_duration = raw_df['Duration'].clip(upper=600)
clipped_duration.plot(kind='hist',
                      bins=100,
                      title='Histogram of duration of impact on traffic (in minutes)')
plt.xlabel('Time in minutes')
plt.xticks(range(0, 601, 60));

In [None]:
# Create Linear Model: fit Duration (yl) on the encoded predictors (Xl)
regr = LinearRegression()
regr.fit(X=Xl, y=yl)
model1 = regr  # fit() returns the estimator itself

# Score on the same data the model was fitted on
model1.score(X=Xl, y=yl)

In [None]:
# Create Bayesian Model: same inputs and target as the linear model
baye = BayesianRidge()
baye.fit(X=Xl, y=yl)
model2 = baye  # fit() returns the estimator itself

# Score on the same data the model was fitted on
model2.score(X=Xl, y=yl)

In [None]:
# Create Random Forest
clf = RandomForestClassifier(n_estimators=10,
                            random_state=0, 
                             n_jobs=-1)

df_resampled = raw_df.groupby('Severity').apply(lambda x: x.sample(25000)).reset_index(drop=True)

Xrf = raw_df.loc[:, predictors].dropna()
%time Xrf = pd.get_dummies(Xrf)
yrf = raw_df.loc[Xrf.index, 'Severity']

Xrfrs = df_resampled.loc[:, predictors].dropna()
Xrfrs = pd.get_dummies(Xrfrs)
yrfrs = df_resampled.loc[Xrfrs.index, 'Severity']


model3 = clf.fit(Xrfrs, yrfrs)
print(model3.score(Xrfrs, yrfrs))
print(model3.score(Xrf, yrf))

In [None]:
# Confusion matrix on the resampled data the forest was fitted on
yp = model3.predict(X=Xrfrs)
conf_mat = confusion_matrix(y_true=yrfrs, y_pred=yp)
print(conf_mat)

In [None]:
# Confusion matrix on the full (non-resampled) data; replaces the previous conf_mat
yp = model3.predict(X=Xrf)
conf_mat = confusion_matrix(y_true=yrf, y_pred=yp)
print(conf_mat)

In [None]:
# Heatmap of the most recently computed confusion matrix.
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would render large counts in scientific notation.
ax = sns.heatmap(conf_mat, cmap="YlGnBu",
                 annot=True, fmt='d', square=True,
                 yticklabels=range(1, 5),
                 xticklabels=range(1, 5))
# confusion_matrix was called as (true, predicted): rows are true labels,
# columns are predicted labels
ax.set_xlabel('Predicted severity')
ax.set_ylabel('True severity')
ax.set_title('Confusion matrix of accidents’ severity level prediction');