In [None]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import beta
from scipy import stats
np.random.seed(seed=42)
import math 
import sklearn.metrics
from sklearn.metrics import confusion_matrix, accuracy_score, r2_score
from sklearn.model_selection import train_test_split

In [None]:
import random
from random import randint

In [None]:
import re  

# Importing & Data Cleaning

In [None]:
pd.set_option('display.max_columns', 500)

In [None]:
accidents = pd.read_csv('~/Downloads/US_Accidents_June20.csv', delimiter = ',', header = 0, low_memory=False)

In [None]:
accidents.drop(['Source', 'End_Lat', 'End_Lng', 'Number', 'Weather_Timestamp', 'Airport_Code', 'Civil_Twilight', 'Nautical_Twilight', 'Astronomical_Twilight'], axis=1, inplace = True)

In [None]:
accidents.info()

In [None]:
accidents.isnull().sum()

In [None]:
# Collapse the free-form wind labels onto a small, consistent set of codes.
# Labels that are not listed (e.g. 'SW', 'NE', already-short codes) and missing values are left
# untouched. A single replace() pass stands in for one boolean scan of the column per label.
wind_direction_codes = {
    'Calm': 'CALM',
    'West': 'W', 'WSW': 'W', 'WNW': 'W',
    'South': 'S', 'SSW': 'S', 'SSE': 'S',
    'North': 'N', 'NNW': 'N', 'NNE': 'N',
    # East was the only cardinal direction whose variants were not merged.
    'East': 'E', 'ESE': 'E', 'ENE': 'E',
    'Variable': 'VAR',
}
accidents['Wind_Direction'] = accidents['Wind_Direction'].replace(wind_direction_codes)

In [None]:
# accidents['Clear'] = np.where(accidents['Weather_Condition'].str.contains('Clear', case=False, na = False), 1, 0)
# accidents['Cloud'] = np.where(accidents['Weather_Condition'].str.contains('Cloud|Overcast', case=False, na = False), 1, 0)
# accidents['Rain'] = np.where(accidents['Weather_Condition'].str.contains('Rain|storm', case=False, na = False), 1, 0)
# accidents['Heavy_Rain'] = np.where(accidents['Weather_Condition'].str.contains('Heavy Rain|Rain Shower|Heavy T-Storm|Heavy Thunderstorms', case=False, na = False), 1, 0)
# accidents['Snow'] = np.where(accidents['Weather_Condition'].str.contains('Snow|Sleet|Ice', case=False, na = False), 1, 0)
# accidents['Heavy_Snow'] = np.where(accidents['Weather_Condition'].str.contains('Heavy Snow|Heavy Sleet|Heavy Ice Pellets|Snow Showers|Squalls', case=False, na = False), 1, 0)
# accidents['Fog'] = np.where(accidents['Weather_Condition'].str.contains('Fog', case=False, na = False), 1, 0)

# # Assign NA to created weather features where 'Weather_Condition' is null.
# weather = ['Clear','Cloud','Rain','Heavy_Rain','Snow','Heavy_Snow','Fog']
# for i in weather:
#   accidents.loc[accidents['Weather_Condition'].isnull(),i] = accidents.loc[accidents['Weather_Condition'].isnull(),'Weather_Condition']

# accidents.loc[:,['Weather_Condition'] + weather]

# accidents = accidents.drop(['Weather_Condition'], axis=1)

In [None]:
# Parse both timestamp columns; values that cannot be parsed become NaT instead of raising.
for time_col in ('Start_Time', 'End_Time'):
    accidents[time_col] = pd.to_datetime(accidents[time_col], errors='coerce')

In [None]:
# Remove the literal 'US/' prefix (e.g. 'US/Eastern' -> 'Eastern').
# str.lstrip('US/') strips any leading run of the characters 'U', 'S' and '/', not the prefix
# itself, so a zone whose name starts with one of those letters (e.g. 'US/Samoa') would lose
# part of its name. An anchored pattern removes exactly the prefix and nothing else.
accidents['Timezone'] = accidents['Timezone'].str.replace(r'^US/', '', regex=True)

In [None]:
accidents['Year'] = pd.DatetimeIndex(accidents['Start_Time']).year

In [None]:
accidents['Month'] = pd.DatetimeIndex(accidents['Start_Time']).month

In [None]:
import calendar

In [None]:
# Replace the month number with its abbreviated name as given by calendar.month_abbr.
# The names are looked up from Start_Time through a table, so that
#   * rows whose timestamp failed to parse become NaN instead of raising a TypeError
#     (a single NaT turns the numeric Month column into floats, which cannot index month_abbr);
#   * re-running the cell gives the same result instead of failing on already-converted names.
month_abbreviations = {number: calendar.month_abbr[number] for number in range(1, 13)}
accidents['Month'] = accidents['Start_Time'].dt.month.map(month_abbreviations)

In [None]:
# Weekday label of each accident, derived from Start_Time (0 = Monday ... 6 = Sunday).
# The vectorised .dt accessor replaces the per-row apply -> str -> replace chain, and rows with a
# missing timestamp stay NaN instead of becoming the literal string 'nan'.
weekday_labels = dict(enumerate(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Sat', 'Sun']))
accidents['DayOfWeek'] = accidents['Start_Time'].dt.dayofweek.map(weekday_labels)

In [None]:
# Elapsed time between the recorded start and end of each accident (a timedelta column).
accidents['Duration'] = accidents['End_Time'].sub(accidents['Start_Time'])

# The same span expressed as fractional minutes.
# The column name keeps its existing spelling; renaming it would break any cell that reads it.
one_minute = np.timedelta64(1, 'm')
accidents['Durination_InMinutes'] = accidents['Duration'] / one_minute

In [None]:
# Displays the frame with numeric columns rounded to one decimal place.
# The rounded copy is only shown as cell output: the result is not assigned, so `accidents`
# itself keeps its unrounded values.
accidents.round(1)

In [None]:
# Column dtypes and non-null counts, now including the derived date and duration columns.
accidents.info()

In [None]:
# Print, for every column, the number of distinct values followed by the distinct values.
# DataFrame.iteritems() was removed in pandas 2.0; items() yields the same (name, Series) pairs.
for col, values in accidents.items():
    print('{name}: {num_unique}'.format(name=col, num_unique=values.nunique()))
    print(values.unique())
    print('\n')

In [None]:
accidents.to_csv('Accidents.csv')

In [None]:
accidents2 = pd.read_csv('Accidents.csv')

In [None]:
accidents2 = accidents.drop(columns='Unnamed: 0')

In [None]:
accidents2.info()

# Visualizations - Accidents by Day, Month and Year

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt

In [None]:
sns.set(rc={'figure.figsize': (16, 12)})

In [None]:
#Number of Accidents Occuring by Day
sns.catplot(x = 'DayOfWeek', data=accidents2, kind='count')

In [None]:
# Pairwise correlation between the numeric and boolean columns.
# The frame also holds text and datetime columns, on which DataFrame.corr() raises in
# pandas >= 2.0; selecting the usable columns first behaves the same on every pandas version.
corr = accidents2.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # symmetric colour scale centred on "no correlation"
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Slant the column labels so long feature names do not overlap.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Number of accidents occurring in each month.
sns.set(rc={'figure.figsize': (16, 12)})
sns.set_palette('GnBu_d')
# `order` takes one entry per category. The previous call passed the whole sorted Month column,
# i.e. one entry per row, sorted alphabetically rather than by calendar position.
month_order = list(calendar.month_abbr)[1:]  # index 0 of month_abbr is an empty string
sns.catplot(x='Month', data=accidents2, kind='count', order=month_order)

In [None]:
# sns.set(rc={'figure.figsize': (16, 12)})
# sns.set_context("poster", font_scale = .75, rc={"grid.linewidth": 1.0})
# ax = sns.boxplot(x = 'DayOfWeek', y = 'Duration', data = accidents, color = 'cyan')

In [None]:
# sns.set(rc={'figure.figsize': (10, 6)})
# sns.catplot(x="OperatingSystem", y="TransactionAmt", hue="isFraud",
#             col="Timezone", aspect=.6,
#             kind="swarm", data=merged);

In [None]:
# Accidents occurring by hour of day.
# Start_Time is parsed once, so the .dt accessors work even if the column is held as text.
start_times = pd.to_datetime(accidents2['Start_Time'], errors='coerce')

# The derived columns go onto accidents2, the frame that is grouped below (they were previously
# written only to `accidents`, so the groupby on 'Hour' failed). They are also kept on `accidents`
# as before.
for frame in (accidents, accidents2):
    frame['TimeofAccident'] = start_times
    frame['Hour'] = start_times.dt.hour
    frame['Minute'] = start_times.dt.minute

# One groupby supplies both the x positions and the counts.
accidents_per_hour = accidents2.groupby('Hour')['ID'].count()
hours = list(accidents_per_hour.index)

plt.plot(hours, accidents_per_hour.values)
plt.xticks(hours)
plt.xlabel('Time of Day by Hour')
plt.ylabel('Number of Accidents')
plt.grid(True)
plt.show()

In [None]:
# Accident counts per calendar year.
sns.set_context('talk')
sns.set_palette('magma')
a = sns.catplot(data=accidents2, x='Year', kind='count')
a.set(xlabel='Year', ylabel='Accident Count')
# y slightly above 1 lifts the title clear of the plot area.
a.fig.suptitle('Accidents by Year ', y=1.03)
plt.show()

In [None]:
# Box plot of the accident year for each severity level.
sns.set(rc={'figure.figsize': (16, 12)})
sns.set_context("poster", font_scale=.75, rc={"grid.linewidth": 1.0})
ax = sns.boxplot(data=accidents2, x='Severity', y='Year', color='cyan')

In [None]:
# Horizontal count plot of accidents per state, most frequent state first.
plt.figure(figsize=(20, 12))
state_order = accidents2['State'].value_counts().index
StatesAccidents = sns.countplot(
    data=accidents2,
    y='State',
    order=state_order,
    palette='Blues',
)
plt.title('Accidents by State', fontsize=20, weight='bold')
plt.xlabel('Count of Accidents', labelpad=10, fontsize=15, weight='bold')
plt.ylabel("State", labelpad=10, fontsize=10, weight='bold')

In [None]:
# Accident severity and duration.
sns.set(rc={'figure.figsize': (12, 10)})
sns.set_context("poster", font_scale = .75, rc={"grid.linewidth": 1.0})
# Plot the numeric minutes column, with one horizontal box per severity level.
# 'Duration' is a timedelta (or text, if the frame was reloaded from CSV); seaborn treats such a
# column as categorical, which would give every distinct duration its own box.
# orient='h' is explicit because both plotted columns are numeric.
ax = sns.boxplot(x='Durination_InMinutes', y='Severity', data=accidents2, orient='h')
ax.set(xlabel='Duration (minutes)', ylabel='Severity');

In [None]:
# Horizontal bar chart of accident counts per nearby road feature.
# NOTE(review): `surroundings` is not defined in the visible cells of this notebook — make sure
# the cell that builds it (indexed by feature, with an 'Accidents' column) runs before this one.
plt.figure(figsize=(20,12))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=surroundings['Accidents'], y=surroundings.index, palette = 'Blues_r')
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.xlabel("Number of Traffic Accidents", labelpad = 10, fontsize=15,weight='bold')
plt.ylabel("Nearby Road Features", labelpad = 12,fontsize=15, weight='bold')
plt.title("Common Road Features and Traffic Accidents in California", fontsize=20,weight='bold')

In [None]:
# Accidents per hour of day, restricted to weekends.
# DayOfWeek holds the short labels 'Sat' / 'Sun' (not 'Saturday' / 'Sunday'), and the filter must
# read accidents2 on both sides — the previous version referenced an undefined name `df`.
WeekendDF = accidents2[accidents2['DayOfWeek'].isin(['Sat', 'Sun'])]

# The hour is derived here from Start_Time, so this cell does not depend on an 'Hour' column
# having been added to accidents2 beforehand.
weekend_hours = pd.to_datetime(WeekendDF['Start_Time'], errors='coerce').dt.hour.rename('Hour')
WeekendDF2 = WeekendDF.groupby(weekend_hours)['ID'].count()

weekend_ax = WeekendDF2.plot(kind='bar', figsize=(10, 8))
weekend_ax.set(xlabel='Hour of Day', ylabel='Number of Accidents', title='Weekend Accidents by Hour');

# Visualizations - Severity and RoadRules/Types

In [None]:
accidents.Severity.value_counts()

In [None]:
#change the y-axis labels to the correct measure 
sns.set(rc={'figure.figsize': (6, 6)})
sns.set_context("poster", font_scale = .75, rc={"grid.linewidth": 1.0})
sns.countplot(x="Severity", data=accidents2, palette='autumn');

In [None]:
def plot_severity_by_feature(feature, palette):
    """Print the value counts of `feature` and plot accident counts per severity split by it.

    Stands in for the value_counts / countplot cell pair that was copy-pasted once per feature.

    Parameters
    ----------
    feature : str
        Name of a column present in both `accidents` and `accidents2`.
    palette : str
        Palette name handed to seaborn for the hue levels.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the count plot was drawn on.
    """
    # The counts are read from `accidents` and the plot from `accidents2`, as in the original cells.
    print(accidents[feature].value_counts())

    sns.set_context("poster", font_scale=.75, rc={"grid.linewidth": 1.0})
    # Start a fresh figure so consecutive calls do not draw on top of each other.
    plt.figure()
    axes = sns.countplot(x="Severity", data=accidents2, hue=feature, palette=palette)
    axes.set_title('Accident Severity by {}'.format(feature))
    plt.show()
    return axes


# Feature -> palette, listed in the order the plots were originally produced.
SEVERITY_HUE_PALETTES = {
    'Sunrise_Sunset': 'Accent',
    'Amenity': 'Blues',
    'Bump': 'BrBG_r',
    'Crossing': 'cividis',
    'Give_Way': 'Wistia',
    'Junction': 'copper',
    'No_Exit': 'gnuplot',
    'Railway': 'tab20b',
    'Roundabout': 'twilight',
    'Station': 'ocean_r',
    'Stop': 'pink',
    'Traffic_Signal': 'Spectral',
    'Turning_Loop': 'autumn',
}

for feature_name, palette_name in SEVERITY_HUE_PALETTES.items():
    plot_severity_by_feature(feature_name, palette_name)

In [None]:
# from wordcloud import WordCloud
# plt.style.use('seaborn')
# wrds1 = accidents["Description"].str.split("(").str[0].value_counts().keys()

# wc1 = WordCloud(scale=5,max_words=1000,colormap="rainbow",background_color="black").generate(" ".join(wrds1))
# plt.figure(figsize=(20,14))
# plt.imshow(wc1,interpolation="bilinear")
# plt.axis("off")
# plt.title("Patterned Words in Accident Description",color='b')
# plt.show()

In [None]:
# Pair plot of severity against distance, temperature, wind chill and humidity.
sns.set_context("poster", font_scale=1.0, rc={"grid.linewidth": 1.0})
severity_temperature_cols = ['Severity', 'Distance(mi)', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)']
sns.pairplot(
    accidents2[severity_temperature_cols],
    height=10,
    plot_kws={"color": "green"},
)

In [None]:
# Pair plot of severity against pressure, visibility, wind speed and precipitation.
sns.set_context("poster", font_scale=1.0, rc={"grid.linewidth": 1.0})
severity_pressure_cols = ['Pressure(in)', 'Severity', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']
sns.pairplot(
    accidents2[severity_pressure_cols],
    height=10,
    plot_kws={"color": "purple"},
)

In [None]:
# sns.set(rc={'figure.figsize': (10, 6)})
# sns.catplot(x="OperatingSystem", y="TransactionAmt", hue="isFraud",
#             col="id_23", aspect=.6,
#             kind="swarm", data=merged);

In [None]:
# Overall number of accidents for each value of a road-feature flag.
# The three original cells differed only in the column name, and the third repeated the 'Amenity'
# plot verbatim, so each feature is drawn once here.
# NOTE(review): if the repeated cell was meant to show a different feature, add it to the tuple.
sns.set(rc={'figure.figsize': (6, 6)})
sns.set_context("poster", font_scale = .75, rc={"grid.linewidth": 1.0})
for road_feature in ('Amenity', 'Bump'):
    # A new figure per feature keeps the plots from being drawn on the same axes.
    plt.figure()
    sns.countplot(x=road_feature, data=accidents2, palette='autumn')
    plt.title('Accidents by {}'.format(road_feature))
    plt.show()

# Visualizations - Environmental Impact