# <font color=Green>**Start Project**</font> #

Import the modules used throughout this notebook.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objects as go 
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression

import folium
from folium.plugins import HeatMap

plt.style.use('fivethirtyeight')

In [None]:
# Load the raw accident records; both timestamp columns are parsed as datetimes.
US_Accidents_df = pd.read_csv(
    '../input/us-accidents/US_Accidents_Dec20_Updated.csv',
    parse_dates=['Start_Time', 'End_Time'],
)  # , index_col='ID')

#### <font color=orange>**Add some New Columns:**</font> ####

In [None]:
# Calendar features taken from the accident start time.
US_Accidents_df['Month'] = US_Accidents_df.Start_Time.dt.month
US_Accidents_df['Year'] = US_Accidents_df.Start_Time.dt.year
US_Accidents_df['Hour'] = US_Accidents_df.Start_Time.dt.hour
US_Accidents_df['Weekday'] = US_Accidents_df.Start_Time.dt.dayofweek
# "Impact" is the accident duration (End_Time - Start_Time) expressed in minutes.
US_Accidents_df['Impact'] = (
    US_Accidents_df['End_Time']
    .sub(US_Accidents_df['Start_Time'])
    .dt.total_seconds()
    .div(60)
)

#### <font color=orange>**Clean Data:**</font> ####

In [None]:
# Keep only accidents whose impact lasted more than zero minutes and less than
# one week, then drop repeated reports of the same event.
oneweek = 7 * 24 * 60  # minutes in one week
US_Accidents_df_clean = (
    US_Accidents_df
    .loc[US_Accidents_df['Impact'].gt(0) & US_Accidents_df['Impact'].lt(oneweek)]
    .drop_duplicates(
        subset=['Start_Time', 'End_Time', 'City', 'Street', 'Number', 'Description']
    )
)

#### <font color=orange>**Display the first 5 rows of the dataset:**</font> ####

In [None]:
# Preview the first five rows of the cleaned frame.
US_Accidents_df_clean.head(5)

In [None]:
# Quick structural summary of the cleaned frame: size, column names,
# total number of missing cells, and distinct values per column.
print(f'Rows     : {US_Accidents_df_clean.shape[0]}')
print(f'Columns  : {US_Accidents_df_clean.shape[1]}')
print(f'\nFeatures :\n     : {US_Accidents_df_clean.columns.tolist()}')
print(f'\nMissing values    : {US_Accidents_df_clean.isnull().values.sum()}')
print(f'\nUnique values :  \n {US_Accidents_df_clean.nunique()}')

In [None]:
# List the columns that remain after excluding the 'int' and 'float' dtypes.
US_Accidents_df_clean.select_dtypes(exclude=['int','float']).columns

In [None]:
# Peek at the first five free-text accident descriptions.
US_Accidents_df_clean['Description'].head(5)

In [None]:
# print(US_Accidents_df_clean['Source'].unique())
# Show the distinct values of a few selected columns, one array per line.
print(
    *(
        US_Accidents_df_clean[column].unique()
        for column in ('Description', 'Timezone', 'Amenity')
    ),
    sep='\n',
)

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Tabulate the number of missing values per column, keep only the columns that
# actually have gaps, and order them from fewest to most missing.
missing_df = US_Accidents_df_clean.isna().sum(axis=0).reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = (
    missing_df
    .loc[missing_df['missing_count'].gt(0)]
    .sort_values(by='missing_count')
)

# Horizontal bar chart: one bar per incomplete column.
ind = np.arange(len(missing_df))
width = 0.5
fig, ax = plt.subplots(figsize=(12, 18))
rects = ax.barh(ind, missing_df['missing_count'].values, color='blue')
ax.set_yticks(ind)
ax.set_yticklabels(missing_df['column_name'].values, rotation='horizontal')
ax.set_xlabel("Count of missing values")
ax.set_title("Number of missing values in each column")
plt.show()

In [None]:
# Seaborn theme used by the count plots that follow.
sns.set_style('whitegrid')
sns.set_context('paper')
sns.set_palette('GnBu_d')

# Number of accidents per year, restricted to records dated 2020 or earlier.
a = sns.catplot(
    data=US_Accidents_df_clean.loc[US_Accidents_df_clean['Year'].le(2020)],
    x='Year',
    kind='count',
)
a.fig.suptitle('Yearly Accidents Cases(2016-2020)', y=1.03)
a.set(xlabel='Year', ylabel='Yearly Cases')
# plt.savefig('Yearly_accidents_cases.png', dpi=1200, bbox_inches='tight')
plt.show()
# Observation: the yearly number of accident cases shows a growing trend.

In [None]:
# Time-series analysis: keep only the columns needed for the temporal plots.
df1 = US_Accidents_df_clean.loc[
    :,
    ['Country', 'Start_Time', 'End_Time', 'Year', 'Month',
     'Weekday', 'Hour', 'Impact', 'Severity'],
]

In [None]:
# Yearly trend: count accidents per calendar year before 2020, fit a linear
# model on those counts, and extrapolate the 2020 total.
# After .count() every column holds the number of non-null values in the bin,
# so 'Country' is used as the yearly case count.
dfA = df1[df1['Year'] < 2020].set_index('Start_Time').resample('A').count()
# Take the year from the resampled index rather than a hard-coded list, so the
# labels always match the bins that are actually present in the data.
dfA['YEAR'] = dfA.index.year
plt.scatter(dfA.YEAR, dfA.Country)

# Use linear regression and the scatter plot to check for a linear relationship.
lrModel = LinearRegression()
x = dfA['YEAR'].values.reshape(-1, 1)
y = dfA.Country
# Author's recorded result: r^2 = 0.915, i.e. a strong linear relationship
# between year and accident cases. A fit on quarterly increases gave
# r^2 = 0.74, so the yearly model is the better fit.
lrModel.fit(x, y)
print(lrModel.score(x, y))

# Predict the 2020 accident count with the fitted model. predict() returns a
# length-1 array for a single sample; store it as a plain float.
forecast_2020 = float(lrModel.predict(np.array([[2020]]))[0])
forecast_label = pd.Timestamp('2020-12-31')
dfA.loc[forecast_label, 'Country'] = forecast_2020
dfA.loc[forecast_label, 'YEAR'] = 2020
# Adding the row turns YEAR into floats (NaN placeholder); restore integers so
# the bar labels read 2016 ... 2020 instead of 2016.0 ... 2020.0.
dfA['YEAR'] = dfA['YEAR'].astype(int)

# print(dfA[['YEAR','Country']])

# catplot creates its own figure, so no separate plt.figure() call is needed.
sns.set_context('talk')
p = sns.catplot(x='YEAR', y='Country', data=dfA, kind='bar')
p.fig.suptitle('Yearly accidents cases(2016-2020)', y=1.03)
p.set(ylabel='yearly cases', xlabel='year')
plt.show()

In [None]:
sns.set_context('paper')
# Accidents per hour of day, drawn on an A4-proportioned canvas.
h = sns.catplot(
    data=US_Accidents_df_clean,
    x='Hour',
    kind='count',
    height=8.27,
    aspect=11.7 / 8.27,
)
h.fig.suptitle('Hourly Accidents Cases', y=1.03)
h.set(xlabel='Hour', ylabel='Hourly Cases')

# Plain text labels for the two peaks and the overnight low.
plt.annotate('Morning Peak', xy=(6, 330000))
plt.annotate('Afternoon Peak', xy=(15, 270000))
plt.annotate('bottom', xy=(1, 25000))
# Arrows pointing at the commuting hours on the x axis.
plt.annotate('go to work', xy=(7.5, 0), xytext=(1, 125000),
             arrowprops={'arrowstyle': 'fancy'})
plt.annotate('get off work', xy=(17.5, 0), xytext=(19, 150000),
             arrowprops={'arrowstyle': 'fancy'})
# plt.savefig('Hourly_Accidents_Cases.png', dpi=1200, bbox_inches='tight');
plt.show()
# Most accidents happen during the daytime, with two peaks at 7-8 and 16-17,
# when people commute between home and the workplace.
# Between 23:00 and 03:00, before dawn, case numbers are at their lowest
# because most people are asleep.

In [None]:
# Count accidents per state in a single grouped pass. sort=False keeps the
# states in order of first appearance, i.e. the order Series.unique() yields.
accidents_per_state = US_Accidents_df_clean.groupby('State', sort=False).size()

# print(US_Accidents_df_clean[US_Accidents_df_clean['State']=='CA'][0])
states = np.array(accidents_per_state.index, dtype='str')
count_by_state = accidents_per_state.values.astype('int32')

states_accidents_df = pd.DataFrame({'State': states, 'Accidents': count_by_state})

plt.figure(figsize=(16, 10))
# Order the states from most to fewest accidents; later cells reuse this order.
states_accidents_df = states_accidents_df.sort_values(by=['Accidents'], ascending=False)
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x='State', y='Accidents', data=states_accidents_df);
plt.savefig('Accidents_Cases_bt_State.png', dpi=1200, bbox_inches='tight');
plt.show()

In [None]:
# The ten states with the most recorded accidents.
df_st = (
    US_Accidents_df_clean
    .groupby('State')
    .size()
    .reset_index(name='Counts')
    .sort_values('Counts', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(12, 8))
b = sns.barplot(data=df_st, x='Counts', y='State')
b.set_title("10 States With The Highest Accident Rates")

plt.show()
# These states are consistent with the most populous states in the U.S.

In [None]:
# Annotated correlation heat map of severity against location, distance and
# weather measurements; the figure is resized after drawing.
fig = sns.heatmap(
    US_Accidents_df_clean[
        ['Severity', 'Start_Lat', 'End_Lat', 'Distance(mi)', 'Temperature(F)',
         'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
         'Wind_Speed(mph)']
    ].corr(),
    annot=True,
    cmap='RdYlGn',
    linewidths=0.2,
    annot_kws={'size': 15},
).get_figure()
fig.set_size_inches(15, 7)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# f,ax=plt.subplots(1,2,figsize=(18,8))
# US_Accidents_df_clean['Source'].value_counts().plot.pie(explode=[0,0.1,0.1],autopct='%1.1f%%',ax=ax[0],shadow=True)
# ax[0].set_title('Share of Sources')
# ax[0].set_ylabel('Count')
# sns.countplot('Source', data=US_Accidents_df_clean, ax=ax[1], order=US_Accidents_df_clean['Source'].value_counts().index)
# ax[1].set_title('Count of Source')
# plt.show()

In [None]:
# Severity overview in three stacked panels: share (pie), raw counts, and
# percentage per level.
import matplotlib.ticker as mtick

# plt.style.use('fast')
plt.style.use('fivethirtyeight')
f, ax = plt.subplots(3, 1, figsize=(8, 20))

# Panel 1: pie chart. Every slice except the largest is pulled out slightly;
# the explode list is sized from the data so it works for any number of levels.
severity_counts = US_Accidents_df_clean['Severity'].value_counts()
severity_counts.plot.pie(
    explode=[0] + [0.1] * (len(severity_counts) - 1),
    autopct='%1.1f%%',
    ax=ax[0],
)  # ,shadow=True)
ax[0].set_title('Percentage Severity Distribution', fontsize=15)
ax[0].set_ylabel('Severity', fontsize=13)

# Panel 2: raw counts, most frequent level first.
# x is passed by keyword: seaborn >= 0.12 no longer accepts it positionally.
sns.countplot(x='Severity', data=US_Accidents_df_clean, ax=ax[1], order=severity_counts.index)
ax[1].set_title('Count of Severity', fontsize=15)
ax[1].set_xlabel('Severity', fontsize=12)
ax[1].set_ylabel('Count', fontsize=12)
ax[1].ticklabel_format(axis='y', style='plain')  # no scientific notation

# Panel 3: percentage of accidents per severity level, ordered by level.
(US_Accidents_df_clean.Severity.value_counts(normalize=True).sort_index() * 100).plot.bar(ax=ax[2])
ax[2].set_title('Severity Percentage', fontsize=15)
ax[2].set_xlabel('Severity', fontsize=12)
ax[2].set_ylabel('Percentage', fontsize=12)
ax[2].yaxis.set_major_formatter(mtick.PercentFormatter())
ax[2].set_ylim(0, 100)

# Apply the layout before saving so the exported PNG matches what is shown.
plt.tight_layout()
plt.savefig('Percentage_Severity_Distribution.png', dpi=1200, bbox_inches='tight');
plt.show();

In [None]:
# Mean accident severity per state, ordered from lowest to highest mean.
df_top_Severity_State = (
    US_Accidents_df_clean
    .groupby('State')
    .agg({'Severity': 'mean'})
    .sort_values('Severity')
    .reset_index()
)

plt.figure(figsize=(23, 8))
# Only the first 49 rows of the sorted frame are drawn.
sns.barplot(data=df_top_Severity_State.iloc[:49], x="State", y="Severity")
plt.title("Mean Severity by State", fontsize=14)
plt.xlabel("State", fontsize=12)
plt.ylabel("Severity", fontsize=12)
plt.ioff()
# plt.savefig('Mean_Severity_by_State.png', dpi=1200, bbox_inches='tight');
plt.show()

In [None]:
plt.figure(figsize=(14, 8))
# plt.style.use('fivethirtyeight')
plt.style.use('fast')
# Pie chart of the ten most frequent weather conditions; the 3rd and 9th
# slices are pulled out of the pie.
(
    US_Accidents_df_clean
    .groupby('Weather_Condition')
    .size()
    .sort_values(ascending=False)
    .head(10)
    .plot.pie(explode=[0, 0, 0.1, 0, 0, 0, 0, 0, 0.3, 0], autopct='%1.1f%%')  # ,shadow=True)
)
plt.ioff()
plt.ylabel("");
plt.title("Weather Condition in Accidents", fontsize=13)
# plt.savefig('Weather_Condition_in_Accidents.png', dpi=1200, bbox_inches='tight');
plt.show();

In [None]:
# Bar chart of the 15 most frequent weather conditions.
counts = US_Accidents_df_clean["Weather_Condition"].value_counts().head(15)
plt.figure(figsize=(23, 8))

sns.set_context('notebook')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=counts.index, y=counts.values)
plt.title("Histogram Distribution of the Top 15 Weather Conditions")
plt.xlabel("Weather Condition")
plt.ylabel("Value")
# plt.savefig('Histogram_weather_conditions.png', dpi=1200, bbox_inches='tight');
plt.show()

#### <font color=orange>**Accidents by Weekday:**</font> ####

In [None]:
# Accidents per weekday name. 'Start_Time' is already a datetime column (its
# .dt accessor is used when the calendar features are derived), so no
# re-parsing is required.
counts = US_Accidents_df_clean['Start_Time'].dt.day_name().value_counts()
weekdays = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

plt.figure(figsize=(20, 8))
sns.set_context('talk')
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. `order` forces Monday-to-Sunday ordering of the bars.
sns.barplot(x=counts.index, y=counts.values, order=weekdays)
plt.title("Number of Accidents for each Weekday")
plt.xlabel("Weekday")
plt.ylabel("Value")
# plt.savefig('Number_Accidents_Weekday.png', dpi=1200, bbox_inches='tight');
plt.show();

#### <font color=orange>**Monthly Accidents:**</font> ####

In [None]:
sns.set_context('talk')
# Accidents per calendar month, using only records dated before 2020.
m = sns.catplot(
    data=df1.loc[df1['Year'].lt(2020)],
    x='Month',
    kind='count',
    height=8.27,
    aspect=20 / 8.27,
)
m.fig.suptitle('Monthly Accidents Cases(2016-2019)', y=1.03)
m.set(ylabel='Monthly Cases')
# plt.savefig('Monthly_Accident_Cases.png', dpi=1200, bbox_inches='tight');
plt.show()
# There were more cases during months 8-12 than in other months (2020 excluded).
# Hypothesis: bad weather conditions are more frequent in winter.

In [None]:
def plot_map1(LatLong, city=None):
    """Build a folium heat map of accident start locations.

    Parameters
    ----------
    LatLong : sequence
        ``[latitude, longitude]`` used as the initial centre of the map.
    city : str, optional
        When given, only rows of ``US_Accidents_df_clean`` whose ``City``
        equals this value are plotted; otherwise every row is plotted.

    Returns
    -------
    folium.Map
        The map with a ``HeatMap`` layer added.
    """
    accident_map = folium.Map(location=LatLong,
                              tiles="Stamen Toner",
                              zoom_start=10)
    if city is not None:
        data_heatmap = US_Accidents_df_clean[US_Accidents_df_clean["City"] == city]
    else:
        # No city filter: use the whole cleaned dataset.
        data_heatmap = US_Accidents_df_clean
    # Convert the two coordinate columns to a list of [lat, lng] pairs in one
    # vectorised step instead of iterating row by row.
    points = data_heatmap[['Start_Lat', 'Start_Lng']].values.tolist()
    HeatMap(points, radius=10).add_to(accident_map)
    return accident_map

In [None]:
# Heat map of accidents in New York, centred on the given [latitude, longitude].
plot_map1([40.712776,-74.005974], city='New York')

In [None]:
def plot_map2(city):
    """Build a folium heat map of the accidents recorded for one city.

    The map is centred on the start coordinates of the first matching record.

    Parameters
    ----------
    city : str
        Value matched against the ``City`` column of ``US_Accidents_df_clean``.

    Returns
    -------
    folium.Map
        The map with a ``HeatMap`` layer added.

    Raises
    ------
    IndexError
        If no record matches ``city``.
    """
    data_heatmap = US_Accidents_df_clean[US_Accidents_df_clean["City"] == city]
    if data_heatmap.empty:
        # Same exception type as the bare .iloc[0] would raise, with a message
        # that names the actual problem.
        raise IndexError("no accident records found for city {!r}".format(city))
    lat = data_heatmap['Start_Lat'].iloc[0]
    long = data_heatmap['Start_Lng'].iloc[0]
    LatLong = [lat, long]
    accident_map = folium.Map(location=LatLong,
                              tiles="cartodbpositron",
                              zoom_start=10)

    # Vectorised conversion to a list of [lat, lng] pairs.
    points = data_heatmap[['Start_Lat', 'Start_Lng']].values.tolist()
    HeatMap(points, radius=10).add_to(accident_map)
    return accident_map

In [None]:
# Heat map of the accidents whose City is "New York".
plot_map2(city = "New York")

In [None]:
# Heat map of the accidents whose City is "Los Angeles".
plot_map2(city = "Los Angeles")

In [None]:
# Accident counts per severity level for every state, aligned with the row
# order of `states_accidents_df`. A single grouped pass builds a
# State x Severity table instead of filtering the whole frame four times per
# state; combinations that never occur are reported as 0.
severity_by_state = (
    US_Accidents_df_clean
    .groupby(['State', 'Severity'])
    .size()
    .unstack(fill_value=0)
    .reindex(
        index=states_accidents_df['State'].tolist(),
        columns=[1, 2, 3, 4],
        fill_value=0,
    )
)
severity_1_by_state = severity_by_state[1].tolist()
severity_2_by_state = severity_by_state[2].tolist()
severity_3_by_state = severity_by_state[3].tolist()
severity_4_by_state = severity_by_state[4].tolist()

In [None]:
plt.style.use('fast')
plt.figure(figsize=(30, 15))

# The four series are drawn on the same x positions, so the bars overlay each
# other (they are not stacked); the draw order is 2, 3, 4, then 1 on top.
plt.bar(x=states_accidents_df['State'], height=severity_2_by_state, label='Severity 2')
plt.bar(x=states_accidents_df['State'], height=severity_3_by_state, label='Severity 3')
plt.bar(x=states_accidents_df['State'], height=severity_4_by_state, label='Severity 4')
plt.bar(x=states_accidents_df['State'], height=severity_1_by_state, label='Severity 1')
# sns.barplot(states_accidents_df['State'], severity_2_by_state, label='Severity 2', palette="bright")
# sns.barplot(states_accidents_df['State'], severity_3_by_state, label='Severity 3', palette="bright")
# sns.barplot(states_accidents_df['State'], severity_4_by_state, label='Severity 4', palette="bright")
# sns.barplot(states_accidents_df['State'], severity_1_by_state, label='Severity 1', palette="bright")
plt.legend();
plt.title("Total States Accidents Based on Severity", fontsize=15)
plt.xlabel("State", fontsize=12)
plt.ylabel("Num. Accidents", fontsize=12)
plt.grid(color='pink', linestyle='-', linewidth=.3)
# plt.savefig('Total_States_Accidents_Based_on_Severity.png', dpi=1200, bbox_inches='tight');
plt.show();

In [None]:
# import dateutil.parser

# US_Accidents_df_clean["Time_added"] = US_Accidents_df_clean["Start_Time"].apply(lambda x: dateutil.parser.parse(x))

# a4_dims = (10, 7)
# fig, ax = plt.subplots(figsize=a4_dims)
# sns.countplot(x=US_Accidents_df_clean["Time_added"].apply(lambda timestamp: timestamp.month))

In [None]:
# Street classification
def str_type(text):
    """Classify a street name as ``'Highway'`` or ``'others'``.

    A name counts as a highway when it contains a hyphen or one of the
    substrings ``Fwy``, ``Expy``, ``Highway`` or ``Hwy`` (case-sensitive).

    Parameters
    ----------
    text : str
        Street name. Non-string values (for example NaN for a missing street)
        are classified as ``'others'`` instead of raising ``TypeError``.

    Returns
    -------
    str
        ``'Highway'`` or ``'others'``.
    """
    if not isinstance(text, str):
        return 'others'
    highway_markers = ('-', 'Fwy', 'Expy', 'Highway', 'Hwy')
    if any(marker in text for marker in highway_markers):
        return 'Highway'
    return 'others'

# Label every accident with the street class returned by str_type.
US_Accidents_df_clean['Street_Type'] = US_Accidents_df_clean['Street'].map(str_type)

In [None]:
# Number of accidents per street type.
fig, ax = plt.subplots(figsize=(5, 6))
sns.set_style('whitegrid')
e = sns.countplot(data=US_Accidents_df_clean, x='Street_Type')
e.set_title('Accident Rate VS. Street Type', fontsize=14)
plt.xticks(rotation=90)

e.set_xlabel("Street Type")
e.set_ylabel("Count")
# ax.ticklabel_format(useOffset=False, style='plain')
e.ticklabel_format(style='plain', axis='y')  # no scientific notation

# plt.savefig('Accident_Rate_Street_Type.png', dpi=1200, bbox_inches='tight');
plt.show();
# Given that highway mileage is much smaller than that of other roads, this
# plot indicates a higher probability of accidents occurring on highways.

In [None]:
# Accident counts per street type, split by severity level.
fig, ax = plt.subplots(figsize=(8, 6))
sns.set_style('whitegrid')
sns.countplot(data=US_Accidents_df_clean, x='Street_Type', hue='Severity',
              palette="bright", ax=ax)
# Legend entries are relabelled by position.
ax.legend(['Severity-1', 'Severity-2', 'Severity-3', 'Severity-4'])  # (['0', '1'], loc='upper right', prop={'size': 10})
ax.set_title("Accidents Severity by Street Type", fontsize=14)
ax.set_xlabel("Street Type", fontsize=12)
ax.set_ylabel("Count", fontsize=12)
ax.ticklabel_format(style='plain', axis='y')
# plt.savefig('Accidents_Severity_by_Street_Type.png', dpi=1200, bbox_inches='tight');
plt.show()

In [None]:
# Weather subset: keep four columns and drop every row with a missing value
# in any of them, then confirm that no missing values remain.
df_weather = (
    US_Accidents_df_clean
    .loc[:, ['Month', 'Weather_Condition', 'Impact', 'Severity']]
    .dropna(how='any')
)
df_weather.isnull().sum()

In [None]:
# Lookup table that collapses raw 'Weather_Condition' strings into six coarse
# groups: Rain, Fair, Cloudy, Ice, Fog and Thunder.
# Conditions that are not listed here have no entry in the table.
# NOTE(review): the key 'Light Snow / Windy ' (with a trailing space) appears
# in addition to 'Light Snow / Windy'; the trailing-space variant looks like a
# typo - confirm against the raw data.
weatherDict = {'Light Rain':'Rain','Rain':'Rain','Clear':'Fair','Fair':'Fair','Mostly Cloudy':'Cloudy','Overcast':'Cloudy',
        'Partly Cloudy':'Cloudy','Cloudy':'Cloudy','Scattered Clouds':'Cloudy','Light Snow':'Ice','Haze':'Fog',
       'Fog':'Fog','Heavy Rain':'Rain','Light Drizzle':'Rain','Fair / Windy':'Fair','Snow':'Ice',
        'Light Thunderstorms and Rain':'Thunder','Thunderstorm':'Thunder','Mostly Cloudy / Windy':'Cloudy','Cloudy / Windy':'Cloudy',
       'T-Storm':'Thunder','Smoke':'Fog','Thunder in the Vicinity':'Thunder','Light Rain with Thunder':'Thunder','Partly Cloudy / Windy':'Cloudy',
      'Patches of Fog':'Fog','Drizzle':'Rain','Heavy Thunderstorms and Rain':'Thunder','Mist':'Fog','Thunder':'Thunder',
       'Thunderstorms and Rain':'Thunder','Light Freezing Rain':'Ice','Light Rain / Windy':'Rain','Heavy T-Storm':'Thunder',
       'Wintry Mix':'Ice','Heavy Snow':'Ice','Shallow Fog':'Fog','Light Snow / Windy ':'Ice','Light Freezing Fog':'Ice',
       'Light Freezing Drizzle':'Ice','Rain / Windy':'Rain','N/A Precipitation':'Fair','Showers in the Vicinity':'Rain',
       'Blowing Snow':'Ice','Heavy Rain / Windy':'Rain','Heavy Drizzle':'Rain','Light Ice Pellets':'Ice','Heavy T-Storm / Windy':'Thunder',
       'T-Storm / Windy':'Thunder','Haze / Windy':'Fog','Light Rain Showers':'Rain','Widespread Dust':'Fog','Light Rain Shower':'Rain',
       'Drizzle and Fog':'Fog','Snow / Windy':'Ice','Rain Showers':'Rain','Blowing Dust / Windy':'Fog','Thunder / Windy':'Thunder',
       'Ice Pellets':'Ice','Fog / Windy':'Fog','Blowing Snow / Windy':'Ice','Heavy Snow / Windy':'Ice','Wintry Mix / Windy':'Ice',
       'Small Hail':'Ice','Sand / Dust Whirlwinds':'Fog','Squalls':'Cloudy','Light Snow Showers':'Ice','Light Thunderstorms and Snow':'Thunder',
       'Volcanic Ash':'Fog','Partial Fog':'Fog','Freezing Rain':'Ice','Rain Shower':'Rain','Light Snow / Windy':'Ice',
       'Blowing Dust':'Fog','Light Drizzle / Windy':'Rain','Light Snow and Sleet':'Ice','Light Sleet':'Ice','Snow and Sleet':'Ice',
       'Funnel Cloud':'Cloudy','Smoke / Windy':'Fog','Light Rain Shower / Windy':'Rain','Squalls / Windy':'Cloudy','Light Haze':'Fog'}

In [None]:
# Translate each raw weather description into its coarse group; descriptions
# without an entry in weatherDict become NaN.
df_weather.loc[:, 'Condition'] = df_weather['Weather_Condition'].map(weatherDict)

In [None]:
# Missing values per column before and after dropping the rows whose weather
# description could not be mapped to a group.
print(df_weather.isnull().sum())
df_weather_sort = df_weather.dropna(how='any')
print(df_weather_sort.isnull().sum())

# Share of accidents falling in each coarse weather group.
df_weather_sort['Condition'].value_counts(normalize=True)

In [None]:
sns.set_context('notebook')

# Point plot of the mean traffic impact (minutes) per coarse weather group.
sns.catplot(
    data=df_weather_sort,
    x='Condition',
    y='Impact',
    kind='point',
    ci=None,
    order=['Fog', 'Thunder', 'Rain', 'Fair', 'Cloudy', 'Ice'],
    height=6,
    aspect=11.7 / 8.27,
    linestyles=["-"],
    dodge=True,
);

sns.set_style('whitegrid')

# plt.ylabel("Traffic Impact (Min)") #,fontsize=10)
# w.set(title='The impact time in different weather condition',
#       xlabel= 'Weather Condition',ylabel='Traffic Impact (Min)');
plt.title("The impact time in different weather condition")
plt.xlabel("Weather Condition")
plt.ylabel("Traffic Impact (Min)")

# plt.savefig('impact_time_in_different_weather_condition.png', dpi=1200, bbox_inches='tight')
# Author's observation: under icy conditions accidents affect traffic the
# longest (more than 100 minutes); the impact under the other weather
# conditions is nearly the same.

In [None]:
sns.set_context('notebook')

# Point plot of the mean accident severity per coarse weather group.
s = sns.catplot(
    data=df_weather_sort,
    x='Condition',
    y='Severity',
    kind='point',
    ci=None,
    color='c',
    order=['Fog', 'Fair', 'Cloudy', 'Rain', 'Thunder', 'Ice'],
    height=6,
    aspect=11.7 / 8.27,
)
s.set(
    title='The Accident Severity in different Weather Condition',
    xlabel='Weather Condition',
    ylabel='Severity',
)
# Author's observation: under extreme weather such as ice and thunder the
# severity is much higher.
# plt.savefig('accident_severity_in_different_weather_condition.png', dpi=1200, bbox_inches='tight')

In [None]:
# Report by cities:
# the ten cities with the largest summed severity.
# Select the 'Severity' column before aggregating so only that column is
# summed; sum()'s first positional parameter is `numeric_only`, not a column.
df_city = US_Accidents_df_clean.groupby('City')['Severity'].sum().to_frame()
df_city = df_city.reset_index().sort_values('Severity', ascending=False)[:10]

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(12, 8))
c = sns.barplot(x='Severity', y='City', data=df_city)
c.set_title("Top 10 Cities with Highest Severity")

plt.show()

# Most of these cities are large cities.

In [None]:
# Show the df_city frame as the cell output.
df_city

In [None]:
# The ten cities with the most recorded accidents.
df_ci_cnt = (
    US_Accidents_df_clean
    .groupby('City')
    .size()
    .reset_index(name='Count_city')
    .sort_values('Count_city', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(figsize=(12, 8))
b = sns.barplot(data=df_ci_cnt, x='Count_city', y='City')

b.set_title("Top 10 Cities With The Highest Accident Rates")
b.set_xlabel("Num of Accidents")
# plt.savefig('Top_10_Cities_With_Highest_Accident_Rates.png', dpi=1200, bbox_inches='tight')

plt.show()

In [None]:
# Map of accidents
def plot_map_of_accidents():
    """Scatter every accident's start coordinates, coloured by state.

    Reads ``US_Accidents_df_clean`` and shows the figure; returns nothing.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.scatterplot(
        data=US_Accidents_df_clean,
        x='Start_Lng',
        y='Start_Lat',
        hue='State',
        s=20,
        legend=False,
        ax=ax,
    )
    ax.set_title("Accident Occurance Place Scatter Plot")
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    # plt.savefig('Accident_Occurance_Place_Scatter_Plot.png', dpi=1200, bbox_inches='tight')
    plt.show();
# plot_map_of_accidents()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import Contours, Histogram2dContour, Marker, Scatter

# Accidents per state, most frequent first. Uses the Series method; the
# top-level pd.value_counts function is deprecated in recent pandas releases.
df_st_ct = US_Accidents_df_clean['State'].value_counts()

def configure_plotly_browser_state():
    """Emit an HTML snippet that loads RequireJS and registers the plotly CDN path.

    The snippet is displayed as cell output; nothing is returned.
    """
    # Import the public display API explicitly instead of relying on the
    # notebook-injected `display` builtin and the IPython.core.display path.
    from IPython.display import HTML, display

    display(HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))

def plot_num_of_accidents_by_state():
    """Show a US choropleth coloured by the per-state accident counts in ``df_st_ct``."""
    configure_plotly_browser_state()
    init_notebook_mode(connected=False)

    state_choropleth = go.Choropleth(
        locations=df_st_ct.index,             # state codes
        z=df_st_ct.values.astype(float),      # data to be colour-coded
        locationmode='USA-states',            # how `locations` entries are matched
        colorscale='YlOrRd',
        colorbar_title="Count",
    )
    fig = go.Figure(data=state_choropleth)

    # Limit the map scope to the USA.
    fig.update_layout(title_text='US Accidents by State', geo_scope='usa')

    fig.show()

In [None]:
import plotly.offline as py
py.init_notebook_mode(connected=False)
import plotly.graph_objects as go

#average accident severity of states
# st_sev = US_Accidents_df_clean.groupby('State').mean('Severity')[['Severity']]

# configure_plotly_browser_state()
# init_notebook_mode(connected=False)

# fig = go.Figure(data=go.Choropleth( 
#     locations=list(st_sev.index),
#     z = st_sev['Severity'].astype(float),  
#     locationmode = 'USA-states', 
#     colorscale = 'Reds', 
#     colorbar_title = "Average value of severity", 
# )) 

# fig.update_layout( 
#     title_text = 'Accident Severity of Each State', 
#     geo_scope='usa', 
    
# )

# py.iplot(fig,filename = 'Severity_Map.html')

# SD & WY have very few accident records, but their average severity is much higher than in other states;
# this is probably because these two states have smaller populations and fewer cars, are mountainous, and most of their roads are highways or mountain roads.
# Apart from these two states, accident severity is overall higher in the eastern US than in the western US.

In [None]:
# main_dataset['timestamp'] = pd.to_datetime(main_dataset['Weather_Timestamp'], errors='coerce')
# main_dataset['Hour'] = US_Accidents_df_clean['timestamp'] .dt.hour
# main_dataset['Minute'] = US_Accidents_df_clean['timestamp'] .dt.minute
# Accidents per hour of day: group once and reuse the result for both the
# x positions and the counts.
hourly_counts = US_Accidents_df_clean.groupby('Hour')['ID'].count()
hours = hourly_counts.index.tolist()
plt.plot(hours, hourly_counts.values)
plt.xticks(hours)
plt.xlabel('Hour')
plt.ylabel('Number of accidents')
plt.grid(True)
plt.show()

In [None]:
# Datetime copy of the start time, reused by the per-year and per-weekday histograms.
US_Accidents_df_clean['time'] = pd.to_datetime(
    US_Accidents_df_clean['Start_Time'], format='%Y-%m-%d %H:%M:%S'
)

colors = ["gold", "gold", "aqua", "magenta", "darkorange", "springgreen"]

# 2x2 grid: one month-of-year histogram for each year from 2017 to 2020.
plt.subplots(2, 2, figsize=(15, 10))
for i, s in enumerate(np.arange(2017, 2021), start=1):
    plt.subplot(2, 2, i)
    plt.hist(
        US_Accidents_df_clean.loc[US_Accidents_df_clean["Year"] == s, 'time'].dt.month,
        bins=list(range(1, 14)),  # one bin per month, left-aligned
        align='left',
        rwidth=0.8,
        color=colors[i],
    )
    plt.title("Accident Count by Each Month of Year " + str(s), fontsize=14)
    plt.xlabel("Month", fontsize=12)
    plt.ylabel("Accident Count", fontsize=12)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
plt.tight_layout()
plt.savefig('Accident_Count_by_Month_with_Severity.png', dpi=1200, bbox_inches='tight')
plt.show()

In [None]:
# Day of week as an integer (Monday = 0), taken from the 'time' column.
US_Accidents_df_clean['DayOfWeek'] = US_Accidents_df_clean['time'].dt.weekday
# 2x2 grid: one day-of-week histogram per severity level 1-4.
plt.subplots(2, 2, figsize=(15, 10))
for s in range(1, 5):
    plt.subplot(2, 2, s)
    plt.hist(
        US_Accidents_df_clean.loc[US_Accidents_df_clean["Severity"] == s, 'DayOfWeek'],
        bins=list(range(8)),  # one bin per weekday, left-aligned
        align='left',
        rwidth=0.8,
        color=colors[s],
    )
    plt.title("Accident Count by Day with Severity " + str(s), fontsize=16)
    plt.xlabel("Day", fontsize=16)
    plt.ylabel("Accident Count", fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
plt.tight_layout()
plt.savefig('Accident_Count_by_Day_with_Severity.png', dpi=1200, bbox_inches='tight')
plt.show()

In [None]:
# Columns compared in the 2x2 grid below.
period_features = ['Sunrise_Sunset', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight']
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(22, 15))

plt.subplots_adjust(wspace=0.5)
# One panel per column: accident counts per category, split by severity.
for i, feature in enumerate(period_features, 1):
    ax = plt.subplot(2, 2, i)
    sns.countplot(data=US_Accidents_df_clean, x=feature, hue='Severity',
                  palette="bright", ax=ax)

    ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
    ax.set_ylabel('Accident Count', size=12, labelpad=3)
    ax.tick_params(axis='x', labelsize=12)
    ax.tick_params(axis='y', labelsize=12)

    ax.legend()  # (['0', '1'], loc='upper right', prop={'size': 10})
    ax.set_title('Count of Severity in\n{} Feature'.format(feature), size=13)  # , y=1.05)
fig.suptitle('Count of Accidents by Period-of-Day (resampled data)', fontsize=16)  # y=1.0,
# plt.savefig('Count_of_Accidents_by_Day_Night.png', dpi=1200, bbox_inches='tight')
plt.show()

### **Clean the data for plotting left- and right-side accidents:**

In [None]:
# Row labels of the records whose 'Side' value is a single blank character.
indexNames = US_Accidents_df_clean.index[US_Accidents_df_clean['Side'] == ' ']
# New frame without those rows; the cleaned frame itself is left untouched.
us_left_right = US_Accidents_df_clean.drop(index=indexNames)

In [None]:
# Accident counts per street side, split by severity.
fig, ax = plt.subplots(figsize=(12, 8))

# Fix the category order explicitly and derive the tick labels from that same
# order, so a label can never be attached to the wrong bar group.
side_names = {'R': 'Right', 'L': 'Left'}
side_order = us_left_right['Side'].dropna().unique().tolist()
sns.countplot(x='Side', hue='Severity', data=us_left_right, palette="bright", order=side_order)

plt.title("Count of Accidents by Side", size=16, y=1.03)
plt.legend()
# Codes without a known long name are shown unchanged.
ax.set_xticklabels([side_names.get(code, code) for code in side_order])
plt.ticklabel_format(style='plain', axis='y')
plt.xlabel("Side", fontsize=13)
plt.ylabel("Count", fontsize=13)
# plt.savefig('Count_of_Accidents_by_Side.png', dpi=1200, bbox_inches='tight')
plt.show()

In [None]:
# Binary flag: 1 when the accident has severity 4, otherwise 0.
US_Accidents_df_clean['Severity4'] = US_Accidents_df_clean['Severity'].eq(4).astype('int64')
US_Accidents_df_clean['Severity4'].value_counts()

In [None]:
def plot_level4_accidents_us_map():
    """Plot all accident locations and overlay those flagged ``Severity4 == 1``.

    Reads ``US_Accidents_df_clean`` and shows the figure; returns nothing.
    """
    df_4 = US_Accidents_df_clean.loc[US_Accidents_df_clean['Severity4'].eq(1)]

    fig, ax = plt.subplots(figsize=(15, 10))

    # Marker-only drawing (no connecting line) shared by both layers.
    dot_style = dict(linestyle='', marker='o', markersize=1)
    ax.plot('Start_Lng', 'Start_Lat', data=US_Accidents_df_clean,
            color="teal", alpha=0.2, label='All Accidents', **dot_style)
    ax.plot('Start_Lng', 'Start_Lat', data=df_4,
            color="coral", alpha=0.3, label='Accidents with Serverity Level 4', **dot_style)
    ax.legend(markerscale=8)  # enlarge the 1-pt markers in the legend
    ax.set_xlabel('Longitude', size=12, labelpad=3)
    ax.set_ylabel('Latitude', size=12, labelpad=3)
    ax.set_title('Map of Accidents', size=16, y=1.05)
    # plt.savefig('Map_of_Accidents_vs_Level4_Severity.png', dpi=1200, bbox_inches='tight')
    plt.show()

In [None]:
from scipy.stats import boxcox



In [None]:
# resample again
# df_bl = pd.concat([df[df['Severity4']==1].sample(40000, replace = True), 
#                    df[df['Severity4']==0].sample(40000)], axis=0)
def plot_weather_density():
    """Draw violin plots of five weather features against accident severity.

    Side effect: adds the Box-Cox transformed columns ``Pressure_bc``,
    ``Visibility_bc`` and ``Wind_Speed_bc`` to ``US_Accidents_df_clean``.
    Shows the figure; returns nothing.
    """
    # Box-Cox transforms with fixed lambdas, applied to the +1-shifted values.
    US_Accidents_df_clean['Pressure_bc'] = boxcox(US_Accidents_df_clean['Pressure(in)'] + 1, lmbda=6)
    US_Accidents_df_clean['Visibility_bc'] = boxcox(US_Accidents_df_clean['Visibility(mi)'] + 1, lmbda=0.1)
    US_Accidents_df_clean['Wind_Speed_bc'] = boxcox(US_Accidents_df_clean['Wind_Speed(mph)'] + 1, lmbda=-0.2)

    # Plot from a copy without the raw columns, with severity as a categorical
    # so each level gets its own violin.
    US_Accidents_df_clean2 = US_Accidents_df_clean.drop(['Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)'], axis=1)
    US_Accidents_df_clean2['Severity'] = US_Accidents_df_clean2['Severity'].astype('category')
    num_features = ['Temperature(F)', 'Humidity(%)', 'Pressure_bc', 'Visibility_bc', 'Wind_Speed_bc']

    # Create the grid once (2 rows x 3 columns) and draw into those axes, so
    # no second, differently shaped set of subplots is laid over the figure.
    fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
    plt.subplots_adjust(hspace=0.4, wspace=0.2)
    for ax, feature in zip(axs.flat, num_features):
        sns.violinplot(x=feature, y="Severity", data=US_Accidents_df_clean2, palette="Set3", ax=ax)

        ax.set_xlabel('{}'.format(feature), size=12, labelpad=3)
        ax.set_ylabel('Severity', size=12, labelpad=3)
        ax.tick_params(axis='x', labelsize=12)
        ax.tick_params(axis='y', labelsize=12)

        if feature == 'Pressure_bc':
            # Third panel: clip the x range of the transformed pressure.
            ax.set_xlim(0, 200000000)

        ax.set_title('{} Feature by Severity'.format(feature), size=14, y=1.05)

    # Hide the grid cells that have no feature to show.
    for unused_ax in axs.flat[len(num_features):]:
        unused_ax.set_visible(False)

    fig.suptitle('Density of Accidents by Weather Features (resampled data)', fontsize=18)
    # plt.savefig('Density_of_Accidents_by_Weather_Features.png', dpi=600, bbox_inches='tight')
    plt.show()

In [None]:
# show distinctive weather conditions
import re

# Join all distinct weather descriptions with '!' and split them again on '!',
# on the connectors " / ", " and ", " with ", and on the modifier prefixes, to
# obtain the sorted list of distinct tokens.
weather = '!'.join(US_Accidents_df_clean['Weather_Condition'].dropna().unique().tolist())
# Raw string: "\s" in a normal string literal is an invalid escape sequence.
weather = np.unique(np.array(re.split(
    r"!|\s/\s|\sand\s|\swith\s|Partly\s|Mostly\s|Blowing\s|Freezing\s", weather))).tolist()
print("Weather Conditions: ", weather)

In [None]:
# US_Accidents_df_clean['Clear'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Clear', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Cloud'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Cloud|Overcast', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Rain'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Rain|storm', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Heavy_Rain'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Heavy Rain|Rain Shower|Heavy T-Storm|Heavy Thunderstorms', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Snow'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Snow|Sleet|Ice', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Heavy_Snow'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Heavy Snow|Heavy Sleet|Heavy Ice Pellets|Snow Showers|Squalls', case=False, na = False), 1, 0)
# US_Accidents_df_clean['Fog'] = np.where(US_Accidents_df_clean['Weather_Condition'].str.contains('Fog', case=False, na = False), 1, 0)

# # Assign NA to created weather features where 'Weather_Condition' is null.
# weather = ['Clear','Cloud','Rain','Heavy_Rain','Snow','Heavy_Snow','Fog']
# for i in weather:
#   US_Accidents_df_clean.loc[US_Accidents_df_clean['Weather_Condition'].isnull(),i] = US_Accidents_df_clean.loc[US_Accidents_df_clean['Weather_Condition'].isnull(),'Weather_Condition']

# US_Accidents_df_clean.loc[:,['Weather_Condition'] + weather]

# US_Accidents_df_clean = US_Accidents_df_clean.drop(['Weather_Condition'], axis=1)

In [None]:
def plot_weather_condition_severity():
    """Plot accident counts per weather feature, split by severe vs. other.

    Reads the notebook-level ``US_Accidents_df_clean`` frame and the
    notebook-level ``weather`` sequence of column names. As a side effect a
    ``Severity34`` column is written into ``US_Accidents_df_clean``: 1 where
    ``Severity`` is 3 or 4, 0 otherwise. One count plot is drawn per entry of
    ``weather`` on a 3x3 grid, so at most nine entries fit.
    """
    # Binary flag: severity 3 or 4 -> 1, anything else -> 0.
    is_severe = US_Accidents_df_clean['Severity'].isin([3, 4])
    US_Accidents_df_clean['Severity34'] = 0
    US_Accidents_df_clean.loc[is_severe, 'Severity34'] = 1

    fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(22, 15))
    plt.subplots_adjust(wspace=0.6, hspace=0.4)

    legend_labels = ['Others', 'Severity - 3 & 4']
    for position, feature in enumerate(weather, start=1):
        # Select the grid cell for this feature and draw into it.
        panel = plt.subplot(3, 3, position)
        sns.countplot(
            data=US_Accidents_df_clean,
            x=feature,
            hue='Severity34',
            palette="bright",
            ax=panel,
        )

        panel.set_xlabel(f'{feature}', size=12, labelpad=3)
        panel.set_ylabel('Accident Count', size=12, labelpad=3)
        for axis_name in ('x', 'y'):
            panel.tick_params(axis=axis_name, labelsize=12)

        panel.legend(legend_labels, loc='upper right', prop={'size': 10})
        panel.set_title(f'Count of Severity in \n {feature} Feature', size=12)
        # Plain (non-scientific) numbers on the count axis.
        panel.ticklabel_format(style='plain', axis='y')

    fig.suptitle('Count of Accidents by Weather Features (resampled data)', fontsize=15)
    # plt.savefig('Count_of_Accidents_by_Weather_Features.png', dpi=600, bbox_inches='tight')
    plt.show()

In [None]:
def plot_traffic_density():
    """Draw one violin plot per selected feature against accident severity.

    Reads the notebook-level ``US_Accidents_df_clean`` frame and plots the
    columns listed in ``traffic_feature`` (y axis) against ``Severity``
    (x axis) on a 2x3 grid of panels, then shows the figure.
    """
    traffic_feature = ['Amenity', 'Wind_Chill(F)', 'Crossing', 'Junction', 'Traffic_Signal']

    # Create the grid once and draw into those same axes. Creating a 3x2 grid
    # and then addressing panels as a 2x3 grid would stack a second set of
    # axes on top of the (empty) first set.
    fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
    fig.subplots_adjust(hspace=0.4, wspace=0.2)
    panels = list(axs.flat)

    for ax, feature in zip(panels, traffic_feature):
        sns.violinplot(x="Severity", y=feature, data=US_Accidents_df_clean, palette="Set2", ax=ax)

        ax.set_ylabel('{}'.format(feature), size=12, labelpad=3)
        ax.set_xlabel('Severity', size=12, labelpad=3)
        ax.set_title('{} Feature by Severity'.format(feature), size=14, y=1.03)

    # Five features on a six-cell grid: hide whatever panels are left over.
    for unused_ax in panels[len(traffic_feature):]:
        unused_ax.set_visible(False)

    fig.suptitle('Density of Accidents by Traffic Features (resampled data)', fontsize=18)
    # plt.savefig('Density_of_Accidents_by_Traffic_Features.png', dpi=600, bbox_inches='tight')
    plt.show()

In [None]:
# Box plot of impact time for each street type (outlier symbols suppressed via sym='').
sns.catplot(
    data=US_Accidents_df_clean,
    x='Street_Type',
    y='Impact',
    kind='box',
    sym='',
    palette='tab10',
)
plt.gca().set_title("Impact Time VS. Street Type", y=1.03, fontsize=15)
# Street-type names are long, so print them vertically.
plt.xticks(rotation=90)
plt.gca().set(xlabel="Street Type", ylabel="Impact")
# plt.savefig('Impact_Time_Street_Type.png', dpi=1200, bbox_inches='tight')
plt.show()

# Author's observation: accidents on highways have a much longer impact time
# than accidents on other road types, which the author relates to accident severity.

In [None]:
# Number of accidents and mean severity per weekday.
# 'Weekday' comes from Start_Time.dt.weekday, where pandas numbers Monday as 0.
cases_w = US_Accidents_df_clean.groupby('Weekday')['Impact'].count()
severity_w = US_Accidents_df_clean.groupby('Weekday')['Severity'].mean()
plt.style.use('fast')
fig, ax = plt.subplots(figsize=(10, 8))

# Left axis: accident counts. The axis label uses the line's colour so the
# reader can tell which series belongs to which axis.
ax.plot(cases_w, color='springgreen', label='cases number', marker='o', lw=3)
ax.set_xlabel('weekday')
ax.set_ylabel('cases in a week', color='springgreen')
ax.legend(loc='center left')

# Right axis: mean severity. The marker is given once (keyword only); passing
# it both as a format string and as a keyword makes matplotlib warn.
ax2 = ax.twinx()
ax2.plot(severity_w, color='darkorange', label='severity', marker='o', lw=3)
ax2.set_ylabel('average accidents severity in a week ', color='darkorange')
ax2.legend(loc='center right')

ax.set_title('Weekday Accidents Cases & Severity')

# plt.savefig('Weekday_Accidents_Cases_Severity.png', dpi=1200, bbox_inches='tight')
plt.show()
# Author's observation: although the number of cases drops a lot on the weekend,
# the average severity of weekend cases is higher than on working days.
# A possible (unverified) explanation is slower reaction of police and other
# departments on weekends.

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15, 8))

# Accident count per state; value_counts() already sorts descending, so the
# first ten entries are the ten states with the most accidents.
count = US_Accidents_df_clean['State'].value_counts()
top10 = count.iloc[:10]

# Left panel: horizontal bars, smallest at the bottom and largest at the top.
clr = ("aqua", "forestgreen", "gold", "red", "purple", 'cadetblue', 'hotpink', 'orange', 'darksalmon', 'brown')
top10.sort_values().plot(kind='barh', color=clr, ax=ax[0])
ax[0].set_title("Top 10 Accident Prone States", size=17)
# In a horizontal bar chart the x axis carries the counts and the y axis the states.
ax[0].set_xlabel('Accident Count', size=18)
ax[0].set_ylabel('States', size=18)

# Right panel: share of each top-10 state plus one 'Other' slice for the rest.
groups = list(top10.index)
counts = list(top10)
counts.append(count.sum() - top10.sum())
groups.append('Other')
type_dict = pd.DataFrame({"group": groups, "counts": counts})
clr1 = ('brown', 'darksalmon', 'orange', 'hotpink', 'cadetblue', 'purple', 'red', 'gold', 'forestgreen', 'aqua', 'plum')
qx = type_dict.plot(kind='pie', y='counts', labels=groups, colors=clr1, autopct='%1.1f%%', pctdistance=0.9, radius=1.2, ax=ax[1])
# Address the pie's axes explicitly rather than relying on the "current axes".
ax[1].legend(loc=0, bbox_to_anchor=(1.15, 0.4))
ax[1].set_ylabel('')
plt.subplots_adjust(wspace=0.5, hspace=0)
# NOTE(review): this switches matplotlib's interactive mode off for the rest of
# the session; kept as-is, confirm it is intended.
plt.ioff()
# plt.savefig('Top_10_Acciedent_Prone_States.png', dpi=1200, bbox_inches='tight')
plt.show()

In [None]:
f, ax = plt.subplots(1, 2, figsize=(18, 8))

# Accident count per timezone, most frequent first; reused by both panels.
timezone_counts = US_Accidents_df_clean['Timezone'].value_counts()

# Left panel: pie chart with the third-largest slice pulled out. The explode
# list is built from the data so its length always matches the number of
# slices (a fixed-length list fails when the number of timezones differs).
explode = [0.1 if rank == 2 else 0 for rank in range(len(timezone_counts))]
timezone_counts.plot.pie(explode=explode, autopct='%1.1f%%', ax=ax[0]) #, shadow=True)
ax[0].set_title('Accidents in Different Timezone')
#ax[0].set_ylabel('Count')

# Right panel: the same counts as bars, ordered like the pie slices.
sns.countplot(x='Timezone', data=US_Accidents_df_clean, ax=ax[1], order=timezone_counts.index)
ax[1].set_title('Accident Count Based on Timezone')
# Plain (non-scientific) numbers on the count axis.
ax[1].ticklabel_format(style='plain', axis='y')
# plt.savefig('Accidents_in_Different_Timezone.png', dpi=1200, bbox_inches='tight')
plt.show()

In [None]:
# Duration of every accident as End_Time - Start_Time.
start = pd.to_datetime(US_Accidents_df_clean.Start_Time, format='%Y-%m-%d %H:%M:%S')
end = pd.to_datetime(US_Accidents_df_clean.End_Time, format='%Y-%m-%d %H:%M:%S')
laps = end - start

# Duration in whole minutes. Computed from total_seconds() because
# astype('timedelta64[m]') is rejected by pandas >= 2.0, which only supports
# 's', 'ms', 'us' and 'ns' resolutions.
duration_minutes = laps.dt.total_seconds() // 60

# The 15 duration values that occur most often (value_counts counts how many
# accidents share each duration; nlargest keeps the 15 highest counts).
top_15 = duration_minutes.value_counts().nlargest(15)
print('Top 15 most common accident durations correspond to {:.1f}% of the data'.format(top_15.sum()*100/len(laps)))

# Express each bar as a percentage of ALL accidents so it matches the y label.
(top_15 * 100 / len(laps)).plot.bar(figsize=(10, 8), color='plum')
plt.title('Top Accident Durations', fontsize=24, color='indigo')
plt.xlabel('Duration in minutes')
plt.ylabel('% of Total Data')
plt.grid(linestyle=':', linewidth=0.2, color='salmon')

plt.show()