In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Exploratory Data Analysis - US Accidents

>  ### In this Project, We are going to Explore the countrywide car accident dataset of the US. The accident data are collected from February 2016 to Dec 2020, there are about **3 million** accident records in this dataset. We are going to analyse the data to explore various questions like Hotspot locations of the Accidents, What time of the day is the frequency higher? and the impact of environmental stimuli on accident occurrence. 

##  1. Import Data and Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# Silence every Python warning for the rest of the session. NOTE(review): this
# also hides pandas/seaborn deprecation notices raised by later cells.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the raw accidents CSV. `data` is left untouched; all later cells work on the copy `df`.
data = pd.read_csv("../input/us-accidents/US_Accidents_Dec20_Updated.csv")
df = data.copy()
# Preview the first three rows.
df.head(3)

# 2. Data Preparation

>    ### Before we start exploring the data, we first have to prepare it for analysis. We'll begin with data cleaning: we'll check for null values and remove the columns that contain a lot of them. We'll also impute appropriate values in the columns required for our analysis, and then we'll do memory optimization since our data is large.

In [None]:
# Print the frame summary (column dtypes, non-null counts, memory usage) before any cleaning.
df.info()

> #### We can see that we only have 46 columns, taking 790 MB of memory, We'll try to reduce this as much we can after we deal with the null values.

## 1. Data Cleaning

> #### We'll first see how many null values are there in the dataset. We'll drop the columns containing large number of null values since they won't be much useful. We'll also get rid of few of the columns which aren't too important.

In [None]:
# Number of missing values per column, largest first.
df.isna().sum().sort_values(ascending = False)

> #### There are a lot of null values. Some columns have far more null values than others; they should be discarded completely. But first let's visualize the null values of the data for better understanding.

In [None]:
# Tabulate, per column, how many values are missing and what share of all rows that is.
null_counts = df.isna().sum()
null_values = pd.DataFrame({
    "Columns": null_counts.index,
    "Null_count": null_counts.values,
})
null_values["% Null_values"] = null_values["Null_count"] * 100 / len(df)
null_values = null_values.sort_values(by = "Null_count", ascending = False)

# Only columns that actually contain missing values are kept for plotting.
Missing_values = null_values.loc[null_values["Null_count"] != 0]
Missing_values

In [None]:
# Apply seaborn's "darkgrid" style to the plots drawn from here on.
sns.set_style("darkgrid")

In [None]:
# Bar chart of the percentage of missing values for every column that has any.
plt.figure(figsize=(15,10))
# Column names are long, so the x tick labels are rotated to vertical.
plt.xticks(rotation = 90)
plt.title("Percentage Of Null Values",fontsize= 20)
sns.barplot(x = "Columns", y = "% Null_values", data = Missing_values )

> #### The top 3 columns with most null values have more than 40% of null values so they're useless. There's also  a big jump from 7th to 6th column, since the top 6 columns containing most null values are not that important for our analysis, We'll drop them completely.

In [None]:
# The six columns with the largest share of missing values are removed outright.
sparse_columns = [
    'Number',
    'Precipitation(in)',
    'Wind_Chill(F)',
    'Wind_Speed(mph)',
    'End_Lat',
    'End_Lng',
]
df.drop(columns = sparse_columns, inplace = True)

> #### We still have null values in our dataset, but we don't require all the columns for our analysis. We'll remove all the unnecessary columns containing null values.

In [None]:
# Columns that are not used anywhere in the analysis below are removed as well.
unneeded_columns = [
    'Wind_Direction',
    'Pressure(in)',
    'Weather_Timestamp',
    'Airport_Code',
    'Timezone',
    'Zipcode',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]
df.drop(columns = unneeded_columns, inplace = True)

> #### Now that we've taken care of all the columns with null values that we don't require, we'll fill the remaining columns with appropriate values.

#### i. Impute the missing values of numerical columns:

In [None]:
# Fill missing numeric readings with the column median.
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that chained in-place form operates on a
# temporary Series and does not update `df` under pandas Copy-on-Write.
for numeric_column in ("Temperature(F)", "Humidity(%)", "Visibility(mi)"):
    df[numeric_column] = df[numeric_column].fillna(df[numeric_column].median())

#### ii. Impute the missing values of categorical columns:

In [None]:
# Fill missing categorical values with the most frequent value (the first mode).
# Assigned back to the column rather than using a chained `inplace=True` fill,
# which does not update `df` under pandas Copy-on-Write.
for categorical_column in ("Weather_Condition", "Sunrise_Sunset"):
    df[categorical_column] = df[categorical_column].fillna(df[categorical_column].mode()[0])

In [None]:
# Rows without a city get the placeholder string "None" (so "None" later shows up
# as a City value). Assigned back to the column instead of a chained
# `inplace=True` fill, which does not update `df` under pandas Copy-on-Write.
df["City"] = df["City"].fillna(value = "None")

In [None]:
# Re-check the per-column null counts after dropping and imputing.
df.isna().sum().sort_values(ascending = False)

> #### As we can see we've dealt with all the null values. We've a clean data for our analysis. Now let's do some memory optimization

## 2. Memory Optimization

> #### There are lots of columns with a string datatype which could be converted into categorical datatype columns for a performance improvement. Let's take a look at which columns might be good candidates for a categorical datatype.

In [None]:
# First three rows, transposed so that every column is shown on its own line.
df.head(3).transpose()

> #### There are many boolean datatypes as well, we'll have to be careful while selecting the columns.

In [None]:
# Convert string columns of moderate cardinality (more than 10 and fewer than
# 2100 distinct values) to the memory-efficient "category" dtype.
for col in df.columns:
    # Check the dtype first: it is cheap, and it avoids counting distinct values
    # on numeric/boolean columns that can never qualify.
    if df[col].dtype != "object":
        continue
    # Count distinct values once instead of once per comparison.
    n_unique = df[col].nunique()
    if 10 < n_unique < 2100:
        df[col] = df[col].astype("category")

In [None]:
# Converted explicitly; presumably Country has too few distinct values for the
# ">10 unique" rule in the loop above — TODO confirm.
df["Country"] = df["Country"].astype("category")

In [None]:
# Converted explicitly; presumably Side has too few distinct values for the
# ">10 unique" rule in the loop above — TODO confirm.
df["Side"] = df["Side"].astype("category")

> #### Now let's convert the datatypes of columns Start_Time and End_Time as these should be in datetime datatypes for our analysis.


In [None]:
# Parse the two timestamp columns into datetime64 so the `.dt` accessor can be used later.
convert_columns1 = ["Start_Time","End_Time"]
for time_column in convert_columns1:
    df[time_column] = df[time_column].astype("datetime64[ns]")

# Print the frame summary again to see the memory footprint after the dtype changes.
df.info()

> #### We've reduced the memory usage by approximately 55% i.e 790 MB to 343.8 MB after removing all the null values and converting few columns to categorical datatype.

> #### We're all done with the data preparation. Now let's explore our data.

## 3. Explore The Data

> ### In this, We'll analyse each column of our dataset excluding some which don't impact or have any meaningful insights whatsoever. There are many columns worth exploring like State, City, Street, County, Start_Time, Temperature(F), Weather_Condition, Visibility(mi). We'll gain many insights and will try to answer a lot of questions about the dataset.



In [None]:
# Summary statistics, transposed so that each described column is a row.
df.describe().T

In [None]:
# Heatmap of pairwise correlations between the numeric and boolean columns.
plt.figure(figsize = (25,15))
# `df` still holds string, category and datetime columns. pandas >= 2.0 no longer
# drops those silently in `DataFrame.corr()` and raises instead, so the
# correlatable columns are selected explicitly (works on every pandas version).
Var_Corr = df.select_dtypes(include = ["number", "bool"]).corr()
sns.heatmap(Var_Corr, cmap = "coolwarm", xticklabels=Var_Corr.columns, yticklabels=Var_Corr.columns, annot=True)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.show()

### a) State

> #### The US has 50 states in total. We have records for 49 of them.

> #### Let's plot a bar plot to understand which state has the most accidents records.

In [None]:
plt.figure(figsize = (25,15))
# Horizontal bars of accident counts per state; ascending order puts the largest bar at the top.
df["State"].value_counts(ascending = True).plot(kind = "barh", color = "b")
plt.title("Accidents by States", fontsize = 20)

> #### We can clearly see that California (CA) has the highest number of accidents by a large margin, followed by Florida (FL), which has the second most accident records. Let's plot a percentage graph for a clearer picture.

In [None]:
plt.figure(figsize = (25,10))
# Each state's share (%) of all rows in `df`, largest first.
(df["State"].value_counts(ascending = False)*100/len(df)).plot(kind = "bar", color = "r")
plt.title("Percentage of Accidents by States", fontsize = 20)

> #### In these past 5 years, **25%** of accidents have happened in the state of California, which is significant. That is followed by **9%** in Florida and **8%** in Texas.

### b) City

In [None]:
# Accident count and share (%) per city, most affected cities first.
city_counts = df["City"].value_counts(ascending = False)
cities = pd.DataFrame({
    "City": city_counts.index,
    "Number_of_Accidents": city_counts.values,
})
cities["% of_Accidents"] = cities["Number_of_Accidents"] * 100 / len(df)
cities = cities.sort_values(by = "Number_of_Accidents", ascending = False)

# The 50 cities with the most accidents feed the bar chart in the next cell.
cities_accidents = cities.iloc[:50]
cities_accidents.head()

In [None]:
# Horizontal bar chart of accident counts for the 50 cities in `cities_accidents`.
plt.figure(figsize=(20,20))
plt.xticks(rotation = 90)
plt.title("Accident by Cities (Top 50)",fontsize= 20)
sns.barplot(y = "City", x = "Number_of_Accidents", data = cities_accidents )

> #### Los Angeles has the highest number of accidents, with Houston second with an almost similar number of accidents. The top 10 cities account for most of the accidents.

In [None]:
# Number of distinct City values (one row per city in `cities`).
len(cities)

In [None]:
# Combined share (%) of accidents in the 1000 cities with the most accidents.
cities["% of_Accidents"].head(1000).sum()

In [None]:
# Combined share (%) of accidents in the 100 cities with the most accidents.
cities["% of_Accidents"].head(100).sum()

In [None]:
# Combined share (%) of accidents in the 10 cities with the most accidents.
cities["% of_Accidents"].head(10).sum()

> #### We have **11790** cities. Of that Top 1000 cities account for **81%** of the Accidents, Top 100 cities account for **45%** of the Accidents and Top 10 cities account for **16%** of the Accidents.

### c) Street

In [None]:
# Accident count and share (%) per street, most affected streets first.
street_counts = df["Street"].value_counts(ascending = False)
streets = pd.DataFrame({
    "Street": street_counts.index,
    "Number_of_Accidents": street_counts.values,
})
streets["% of_Accidents"] = streets["Number_of_Accidents"] * 100 / len(df)
streets = streets.sort_values(by = "Number_of_Accidents", ascending = False)

# The 50 streets with the most accidents feed the bar chart in the next cell.
streets_accidents = streets.iloc[:50]
streets_accidents.head()

In [None]:
# Horizontal bar chart of accident counts for the 50 streets in `streets_accidents`.
plt.figure(figsize=(20,20))
plt.xticks(rotation = 90)
plt.title("Accident by Streets (Top 50)",fontsize= 20)
sns.barplot(y = "Street", x = "Number_of_Accidents", data = streets_accidents )

> #### I-5N had the highest number of accidents, followed by I-95N and I-95S. Streets follow the same trend as cities, with the top streets having the highest number of accidents.

In [None]:
# Number of distinct Street values (one row per street in `streets`).
len(streets)

In [None]:
# Combined share (%) of accidents on the 10000 streets with the most accidents.
streets["% of_Accidents"].head(10000).sum()

In [None]:
# Combined share (%) of accidents on the 1000 streets with the most accidents.
streets["% of_Accidents"].head(1000).sum()

In [None]:
# Combined share (%) of accidents on the 100 streets with the most accidents.
streets["% of_Accidents"].head(100).sum()

> #### We have **175527** Streets. Of that Top 10000 Streets account for **77%** of the Accidents, Top 1000 Streets account for **49%** of the Accidents and Top 100 Streets account for **25%** of the Accidents.

### d) Start_Time

> #### Let's add a year column

In [None]:
# Calendar year of the accident start time; used as `hue` in later count plots.
df["Year"] = df["Start_Time"].dt.year

In [None]:
# Line chart of the number of accidents recorded per year.
plt.figure(figsize = (15,10))
# value_counts() orders by frequency, not by year, and pandas does not re-sort a
# numeric index for line plots. Sorting by year keeps the line chronological
# whatever the counts are.
df["Start_Time"].dt.year.value_counts().sort_index().plot(kind = "line")
plt.title("Yearly Accidents Trend", fontsize = 15)

> #### There's an increasing trend in 2020. The accidents are growing rapidly in numbers.

In [None]:
# Pie chart of each year's share (%) of all accidents.
plt.figure(figsize = (25,10))
# One entry per wedge: the second wedge (in ascending-count order) is pulled out.
# NOTE(review): explode/colors have 5 entries, so this assumes exactly 5 distinct years.
explode = (0, 0.1, 0, 0, 0)
colors = ['#c2c2f0','#ffb3e6', '#99ff99', '#66b3ff', '#ffcc99']
(df["Start_Time"].dt.year.value_counts(ascending = True)*100/len(df)).plot(kind = "pie", autopct = "%1.1f%%", colors = colors, explode = explode, shadow = True)
plt.title("Percentage of yearly Accidents", fontsize = 20)

> #### Out of all the accident records **35.6%** of accidents have happened in 2020. Accidents are increasing at an alarming rate every year.

In [None]:
# Accident counts per state, split by year; states are ordered by total accidents.
plt.figure(figsize = (15,30))
sns.countplot(y = "State", hue="Year", data=df, order = df["State"].value_counts().index)
# countplot draws raw counts, so the title must not claim percentages.
plt.title("Yearly Accidents by States", fontsize = 15)

> #### 2020 have had most of the accidents for all the States. For Florida it seems the number has increased **3 times** than the previous year. California also had **1.6 times** increase over the previous year. PA, VA also had significant increase in 2020. 

> #### Let's add a month column

In [None]:
# Month name of the accident start time; used by the monthly count plot.
df["Month"] = df["Start_Time"].dt.month_name()

In [None]:
plt.figure(figsize = (20,10))
# Share (%) of accidents per month name; bars run from least to most frequent, not in calendar order.
(df["Start_Time"].dt.month_name().value_counts(ascending = True)*100/len(df)).plot(kind = "bar", color = "m")
plt.title("Percentage of Monthly Accidents", fontsize = 20)

> #### Most of the accidents happen during last quarter of the year with December having the most accidents. July registered least number of accidents.

In [None]:
# Accident counts per month, split by year; months are ordered by total accidents.
plt.figure(figsize = (20,10))
sns.countplot(x = "Month", hue="Year", data=df, order = df["Month"].value_counts().index)
# countplot draws raw counts, so the title must not claim percentages.
plt.title("Yearly Accidents by Months", fontsize = 20)
plt.show()

> #### For December and November, 2020 had more accidents approx **3 times** that of the previous year. For July and August, it seems the number has decreased by **half**. It's interesting to note that except 2020, all months had balanced number of accidents throughout all years.

In [None]:
# Weekday name of the accident start time.
df["Day_of_Week"] = df["Start_Time"].dt.day_name()

In [None]:
plt.figure(figsize = (20,10))
# Share (%) of accidents per weekday; bars run from least to most frequent.
(df["Start_Time"].dt.day_name().value_counts(ascending = True)*100/len(df)).plot(kind = "bar", color = "purple")
plt.title("Percentage of Accidents by Days", fontsize = 20)

> #### Saturday and Sundays have least number of Accidents. Weekends are off days for most of the working people, that could be the reason behind less accidents. We can get clear understanding about this when we plot a graph for timeline of every hour.

> #### Let's add an hour column.

In [None]:
# Hour of day (0-23) of the accident start time.
df["Hour"] = df["Start_Time"].dt.hour

In [None]:
plt.figure(figsize = (15,10))
# 24 bins, intended as one per hour of the day.
sns.histplot(data = df, x = "Hour", bins = 24)
plt.title("Frequency of Accidents throughout the Day", fontsize = 15)

> #### Most accidents happen between 6AM-9AM and between 3PM-6PM. People commute to work and from work in those time gaps respectively. This might be the reason that Saturday and Sunday has least number of Accidents.

### e) Severity

In [None]:
# Pie chart of how the accidents are distributed over the Severity levels.
plt.figure(figsize = (25,10))
# NOTE(review): explode/colors have 4 entries, so this assumes exactly 4 distinct Severity values.
explode = (0, 0.1, 0, 0)
df["Severity"].value_counts().plot(kind = "pie", autopct = "%1.1f%%", colors = ('#c2c2f0','#ffb3e6', '#99ff99', '#66b3ff' ), explode = explode, shadow = True)
plt.title("Percentage of Severity of Accidents", fontsize = 20)

 > #### **73%** reported accidents have Severity 2 which could mean that there are a lot of accidents which caused some injuries and had little impact.

In [None]:
# Accident counts per Severity level, split by year.
plt.figure(figsize = (25,10))
sns.countplot(x = "Severity", hue = "Year", data = df)

> #### It seems there are little to no records of Severity 1. Year 2020 had the most number of Severity 2 Accidents though it doesn't seem to be the case in Severity 3 and 4 which is an interesting find.

### f) Weather Condition

In [None]:
# Accident count and share (%) per weather condition, most frequent first.
weather_counts = df["Weather_Condition"].value_counts()
weather = pd.DataFrame({
    "Weather": weather_counts.index,
    "Number_of_Accidents": weather_counts.values,
})
weather["% of_Accidents"] = weather["Number_of_Accidents"] * 100 / len(df)
weather = weather.sort_values(by = "Number_of_Accidents", ascending = False)

# The 30 most frequent conditions feed the bar chart in the next cell.
weather_condition = weather.iloc[:30]
weather_condition.head()

In [None]:
# NOTE(review): this changes the default figure size globally, so it also
# affects every later figure that does not set its own size.
plt.rcParams["figure.figsize"] = (20,15)
# Share (%) of accidents for each of the 30 conditions in `weather_condition`.
weather_condition.plot(x = "Weather", y = "% of_Accidents", kind = "bar")
plt.title("Accidents by Weather Condition (Top 30)", fontsize = 20)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Combined share (%) of accidents for the six most frequent weather conditions.
weather_condition["% of_Accidents"].head(6).sum()

> #### "Fair" weather condition has the large number of accidents i.e 26% of the accidents. Clear and Mostly Cloudy also have 17% and 13% respectively. Also for Partly Cloudy has 9%, Cloudy has 8%, Overcast has 8% accidents. These top 6 Weather conditions amounts to **82.6%** of total accidents.

> #### Let's plot a pie chart for the above six Weather Conditions with Severity.

In [None]:
# Row counts per (Weather_Condition, Severity) pair, pivoted so that each
# weather condition becomes a column and each Severity level a row.
group = df.groupby(["Weather_Condition", "Severity"])["Severity"].count().sort_values(ascending = False).unstack("Weather_Condition")

In [None]:
# Keep the six selected conditions and flatten back to a Series indexed by
# (Weather_Condition, Severity), so `.loc[condition]` yields the counts per Severity.
weather_severity = group[["Fair","Clear", "Mostly Cloudy", "Partly Cloudy", "Cloudy", "Overcast"]].unstack()

In [None]:
# One pie per weather condition (2 x 3 grid) showing how its accidents split
# across Severity levels. The six panels differ only in the condition name, so
# they are drawn in a loop instead of six copy-pasted stanzas.
plt.figure(figsize = (30,30))
colors = ('lightblue', "beige", "cyan", 'lightsteelblue')
# Wedges are drawn in ascending-count order; the third-smallest one is pulled out.
explode = (0, 0, 0.1, 0)
severity_pie_conditions = ["Fair", "Clear", "Mostly Cloudy", "Partly Cloudy", "Cloudy", "Overcast"]
for panel, condition in enumerate(severity_pie_conditions, start = 1):
    plt.subplot(2, 3, panel)
    weather_severity.loc[condition].sort_values().plot(kind = "pie",autopct = "%1.1f%%", textprops={'fontsize': 20}, colors = colors, explode = explode, shadow = True)
    plt.title(condition, fontsize = 30)
    # Drop the automatic y-axis label pandas adds to pie charts.
    plt.ylabel("")
# Render the figure without echoing the repr of the last matplotlib call.
plt.show()

> #### It seems all six Weather Conditions has most accidents happened in Severity 2 i.e **above 65%**. Clear and Overcast Weather had no Severity 1 accidents.

In [None]:
# Row counts per (Weather_Condition, Year) pair, pivoted so that each weather
# condition becomes a column and each year a row.
group1 = df.groupby(["Weather_Condition", "Year"])["Year"].count().sort_values(ascending = False).unstack("Weather_Condition")

In [None]:
# Keep the six selected conditions and flatten back to a Series indexed by
# (Weather_Condition, Year), so `.loc[condition]` yields the counts per year.
weather_year = group1[["Fair","Clear", "Mostly Cloudy", "Partly Cloudy", "Cloudy", "Overcast"]].unstack()

In [None]:
# One pie per weather condition (2 x 3 grid) showing how its accidents split
# across years. The six panels differ only in the condition name, so they are
# drawn in a loop instead of six copy-pasted stanzas.
plt.figure(figsize = (30,30))
# Wedges are drawn in ascending-count order; the second-largest one is pulled out.
explode = (0,0, 0, 0.1, 0)
colors = ['#c2c2f0', '#ffcc99', '#99ff99', '#66b3ff','#ff6666']
year_pie_conditions = ["Fair", "Clear", "Mostly Cloudy", "Partly Cloudy", "Cloudy", "Overcast"]
for panel, condition in enumerate(year_pie_conditions, start = 1):
    plt.subplot(2, 3, panel)
    weather_year.loc[condition].sort_values().plot(kind = "pie",autopct = "%1.1f%%", textprops={'fontsize': 20}, colors = colors, explode = explode, shadow = True)
    plt.title(condition, fontsize = 30)
    # Drop the automatic y-axis label pandas adds to pie charts.
    plt.ylabel("")
# Render the figure without echoing the repr of the last matplotlib call.
plt.show()

> #### Overcast and Clear weather has no accident records for 2020. This could be an error while collecting data since 2020 recorded most accidents overall. Fair and Cloudy weather conditions had more than 65% accidents happen in 2020.

### g) Temperature(F)

In [None]:
# Bucket the temperature into 50-degree-wide ranges (each bin excludes its
# lower edge and includes its upper edge).
temp_bin_edges = [-100, -50, 0, 50, 100, 150, 200, 250]
temp_bin_labels = [
    "-100 - -50",
    "-50 - 0",
    "0 - 50",
    "50 - 100",
    "100 - 150",
    "150 - 200",
    "200 - 250",
]
df["Temp Range"] = pd.cut(df["Temperature(F)"], temp_bin_edges, labels = temp_bin_labels)

In [None]:
# Pie chart of the share of accidents falling into each temperature range.
plt.figure(figsize = (25,10))
# One explode entry per temperature range (7); the second-largest wedge is pulled out.
explode = (0, 0.1, 0, 0, 0, 0, 0)
df["Temp Range"].value_counts().plot(kind = "pie", autopct = "%1.1f%%", textprops={'fontsize': 15}, explode = explode, shadow = True)
plt.title("Percentage of Accidents in Temp Range", fontsize = 20)

> #### 72.5% of accidents have happened in Temperature Range of 50-100 F and 26.9% of accidents have happened in 0 - 50 F Temp Range.

In [None]:
# Row counts per (Temp Range, Visibility) pair, keeping the 30 most frequent pairs.
# NOTE(review): the unstack().stack() round trip presumably normalises missing
# combinations before sorting — confirm it is needed.
group2 = df.groupby(["Temp Range", "Visibility(mi)"])["Temp Range"].count().unstack().stack().sort_values(ascending = False).head(30)

In [None]:
plt.figure(figsize = (20,10))
# Counts are converted to a share (%) of all rows in `df` before plotting.
(group2*100/len(df)).plot(kind = "bar", color = "y")
plt.title("Accidents by Temp Range and Visibility (Top 30)", fontsize = 20)

> #### 60% of the accidents have happened in 50-100F Temp Range with 10.0 mi Visibility and 19% of the accidents have happened in 0-50F with 10.0 mi Visibility

### f) Visibility(mi)

In [None]:
plt.figure(figsize = (20,10))
# Share (%) of all accidents for the 30 most frequent visibility values.
(df["Visibility(mi)"].value_counts().head(30)*100/len(df)).plot(kind = "bar")
plt.title("Accidents by Visibility (Top 30)", fontsize = 20)

> #### 80% of the accident records have 10.0 mi Visibility

### h) Distance(mi)

In [None]:
# Bucket the distance into 1-mile ranges; "-1-0" holds distances of exactly 0
# (bins exclude their lower edge and include their upper edge).
# The last bin is open-ended (np.inf) rather than capped at a hard-coded 350 mi,
# so an unusually long distance still lands in "4+" instead of becoming NaN.
df["Dist Range"] = pd.cut(df["Distance(mi)"], [-1,0,1,2,3,4,np.inf], labels = [ "-1-0","0-1","1-2", "2-3", "3-4", "4+"])

In [None]:
# Pie chart of the share of accidents falling into each distance range.
plt.figure(figsize = (25,10))
# One explode entry per distance range (6); the second-largest wedge is pulled out.
explode = (0, 0.1, 0, 0, 0, 0)
df["Dist Range"].value_counts().plot(kind = "pie", autopct = "%1.1f%%", textprops={'fontsize': 15}, colors = ['#c2c2f0','#ffb3e6', '#99ff99', '#66b3ff','#ff6666', '#ffcc99'], explode = explode, shadow = True)
plt.title("Percentage of Accidents in Dist Range", fontsize = 20)

> #### **54%** of the accidents happened on the spot. **37%** of accidents have happened in 0 -1 mi Dist Range.

In [None]:
# Row counts per (Dist Range, Severity) pair, sorted from most to least frequent.
# NOTE(review): head(30) looks like a no-op if there are only 6 ranges x 4
# severities = 24 pairs — confirm.
group3 = df.groupby(["Dist Range", "Severity"])["Dist Range"].count().unstack().stack().sort_values(ascending = False).head(30)

In [None]:
plt.figure(figsize = (20,10))
# Counts are converted to a share (%) of all rows in `df` before plotting.
(group3*100/len(df)).plot(kind = "bar", color = "y")
plt.title("Accidents by Dist Range and Severity (Top 30)", fontsize = 20)

> #### **39%** of the accidents have happened on the spot with severity 2 accidents. **28%** accidents have happened with 0 - 1 (mi) Dist Range with Severity 2 accidents.

In [None]:
# One pie per distance range (2 x 3 grid) showing how its accidents split across
# Severity levels. The six panels differ only in the range label and title, so
# they are drawn in a loop instead of six copy-pasted stanzas.
plt.figure(figsize = (30,30))
# Wedges are drawn in ascending-count order; the third-smallest one is pulled out.
explode = (0, 0, 0.1, 0)
colors = ['#ff9999','#66b3ff','#99ff99','#ffcc99']
# (label used in `group3`, title shown above the pie)
dist_range_panels = [
    ("-1-0", "0 (mi)"),
    ("0-1", "0 - 1 (mi)"),
    ("1-2", "1 - 2 (mi)"),
    ("2-3", "2 - 3 (mi)"),
    ("3-4", "3 - 4 (mi)"),
    ("4+", "4+ (mi)"),
]
for panel, (dist_label, panel_title) in enumerate(dist_range_panels, start = 1):
    plt.subplot(2, 3, panel)
    group3.loc[dist_label].sort_values().plot(kind = "pie",autopct = "%1.1f%%", textprops={'fontsize': 20}, colors = colors, explode = explode, shadow = True)
    plt.title(panel_title, fontsize = 30)
    # Drop the automatic y-axis label pandas adds to pie charts.
    plt.ylabel("")
# Render the figure without echoing the repr of the last matplotlib call.
plt.show()

> #### As the Distance increases the percentage of Severity 3 & 4 in that Dist Range also increases. Though all Dist Ranges has mostly Severity 2 Accidents among them.

### i) Other Columns

In [None]:
# Boolean road-feature columns whose share of accidents is computed below.
new_columns = [
    'Amenity',
    'Bump',
    'Crossing',
    'Give_Way',
    'Junction',
    'No_Exit',
    'Railway',
    'Roundabout',
    'Station',
    'Stop',
    'Traffic_Calming',
    'Traffic_Signal',
    'Turning_Loop',
]

In [None]:
# Empty result table: one row per road feature will be added by the following cells.
new_df = pd.DataFrame(columns = ["Columns", "% of Accidents"])

In [None]:
# One row per road-feature column name.
new_df["Columns"] = new_columns

In [None]:
# Share (%) of all accidents for which each road feature is flagged True.
# The whole column is assigned in one step: the previous per-row
# `new_df[col].loc[i] = ...` was a chained assignment, which does not update
# `new_df` under pandas Copy-on-Write and left the column with object dtype.
new_df["% of Accidents"] = [
    (df[feature] == True).sum() * 100 / len(df)
    for feature in new_df["Columns"]
]

In [None]:
# Order the features from most to least frequent, then display the table.
new_df.sort_values("% of Accidents", ascending = False, inplace = True)
new_df

In [None]:
# Bar chart of the share (%) of accidents per road feature.
# No figure is created here, so the current default figure size applies.
sns.barplot(x = "Columns", y = "% of Accidents", data = new_df)
plt.title("Accidents in presence of different factors", fontsize = 20)

> #### For **15%** of Accidents Traffic Signal is nearby. **10%** for Junction and **8%** for Crossing.

#### Let's create a barplot for the State of California since it has the highest number of accidents with the same columns.

In [None]:
# Empty result table for the California-only breakdown.
ca_df = pd.DataFrame(columns = ["Columns", "% of Accidents"])

In [None]:
# One row per road-feature column name.
ca_df["Columns"] = new_columns

In [None]:
# Share (%) of California accidents for which each road feature is flagged True.
# The state mask and the California total do not depend on the feature, so they
# are computed once instead of on every iteration.
in_california = df["State"] == "CA"
california_total = in_california.sum()
# The whole column is assigned in one step: the previous per-row
# `ca_df[col].loc[i] = ...` was a chained assignment, which does not update
# `ca_df` under pandas Copy-on-Write and left the column with object dtype.
ca_df["% of Accidents"] = [
    ((df[feature] == True) & in_california).sum() * 100 / california_total
    for feature in ca_df["Columns"]
]

In [None]:
# Order the features from most to least frequent, then display the table.
ca_df.sort_values("% of Accidents", ascending = False, inplace = True)
ca_df

In [None]:
# Bar chart of the share (%) of California accidents per road feature.
# No figure is created here, so the current default figure size applies.
sns.barplot(x = "Columns", y = "% of Accidents", data = ca_df)
plt.title("Accidents in California presence of different factors", fontsize = 20)

> #### Junctions could be one of the causes of accidents in California: they are present for **14%** of the total accidents in the state, while a Traffic Signal is present for 8%.

In [None]:
# Pie chart of accidents split by the Sunrise_Sunset value (day vs. night).
plt.figure(figsize = (25,10))
# NOTE(review): explode/colors have 2 entries, so this assumes exactly two distinct values.
df["Sunrise_Sunset"].value_counts().plot(kind = "pie", autopct = "%1.1f%%", textprops={'fontsize': 15}, explode = (0.05,0), colors = ['lightblue','lightsteelblue'], shadow = True)
plt.title("Accidents in Day/Night", fontsize = 15)


> #### **2/3rd** of accidents happen during the day.