# US Accidents - Exploratory Data Analysis

## Import essential libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium

## Data Preparation and Cleaning

### Loading File Using Pandas

In [None]:
# Load the accidents dataset from a CSV file in the working directory.
df = pd.read_csv('us_accidents.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### Look at some basic information about the data & the columns

In [None]:
# List the column labels of the dataset.
df.columns

In [None]:
# Number of columns in the dataset.
df.shape[1]

In [None]:
# Print dtypes, non-null counts and memory usage for each column.
df.info()

In [None]:
# describe() with default arguments summarises the numeric columns only
# (count, mean, std, min, quartiles, max).
df.describe()

In [None]:
# Count the numeric columns in the dataset and list their names.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numeric_df = df.select_dtypes(include=numerics)

print(f'{len(numeric_df.columns)} numeric columns')
print()
numeric_df.columns

### Fix Any Missing or Incorrect values

In [None]:
# Missing values:
# number of columns in df that contain at least one missing value.
has_missing = df.isna().any()
has_missing.sum()

In [None]:
# Missing-value count per column, largest first.
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

In [None]:
# Top 10 columns with the highest percentage of missing values.
sorted_missing_counts = df.isna().sum().sort_values(ascending=False)
missing_percent = sorted_missing_counts / len(df) * 100
missing_percent.iloc[:10]

In [None]:
# Horizontal bar chart of the missing-value percentage for every column that
# has at least one missing value (columns without gaps are filtered out).
sns.set_style('whitegrid')
missing_percent[missing_percent != 0].plot(kind='barh', figsize=(8, 6))
# The chart is not limited to ten columns, so the title describes what is
# actually plotted.
plt.title("Percentage of missing values per column")

In [None]:
# Remove columns that have more than 50 percent missing values or that are not used in the analysis.

In [None]:
# Drop the end-point coordinate columns in place; `columns=` already
# identifies the axis, so no separate `axis` argument is needed.
unused_columns = ['End_Lng', 'End_Lat']
df.drop(columns=unused_columns, inplace=True)

In [None]:
# Re-list the columns to confirm the result of the drop.
df.columns

### Now impute the missing values

To handle missing data or null values in numerical columns of a dataset, they are filled with appropriate replacement values.
Missing values of numerical columns can be filled by mean or median.

In [None]:
# Fill gaps in the numeric weather columns with each column's median.
for numeric_col in ("Temperature(F)", "Humidity(%)"):
    column_median = df[numeric_col].median()
    df[numeric_col] = df[numeric_col].fillna(column_median)

In [None]:
# Fill gaps in the categorical weather column with its most frequent value.
most_common_weather = df["Weather_Condition"].mode()[0]
df["Weather_Condition"] = df["Weather_Condition"].fillna(most_common_weather)

In [None]:
# The temperature, humidity and weather-condition columns no longer contain missing values.

## Exploratory Analysis And Visualization

#### Columns to be analysed:
#### City
#### Start_Time 
#### Start_Lat and Start_Lng
#### Temperature
#### Weather_Condition and severity

### City

In [None]:
# Number of distinct cities in the dataset.
# nunique() ignores missing values, whereas len(unique()) would count NaN as
# an extra "city" and inflate the denominator of the percentages below
# (value_counts(), used for the per-city counts, also skips NaN).
cities = df['City'].nunique()

In [None]:
# Report the number of cities computed above.
print(f'There are total of {cities} number of cities.')

In [None]:
# Number of accident records per city; value_counts() returns them sorted
# from the most to the least frequent city.
cities_by_accidents = df['City'].value_counts()

In [None]:
# Keep the first 20 entries, i.e. the 20 cities with the most accidents.
cities_by_accidents_20 = cities_by_accidents.iloc[:20]

In [None]:
# Horizontal bar chart of the 20 cities with the most accidents.
top_cities_ax = cities_by_accidents_20.plot(kind='barh', figsize=(8, 6))
top_cities_ax.set_title("TOP 20 CITIES WITH HIGHEST NUMBER OF ACCIDENTS")

In [None]:
# Lets find out the cities with highest and lowest number of accidents

In [None]:
# Split cities around the 1000-accident mark.
# Both comparisons are strict, so a city with exactly 1000 accidents
# falls into neither group.
above_1000 = cities_by_accidents.gt(1000)
below_1000 = cities_by_accidents.lt(1000)
high_accident_cities = cities_by_accidents[above_1000]
low_accident_cities = cities_by_accidents[below_1000]

In [None]:
# Report how many cities exceed 1000 accidents and their share of all cities.
high_count = len(high_accident_cities)
print(f"Number of cities with more than 1000 accidents: {high_count}")
print(f'percentage :{high_count / cities * 100}')

In [None]:
# Distribution of accident counts among the high-accident cities.
high_ax = sns.histplot(data=high_accident_cities, bins=5, kde=True)
high_ax.set_title('Cities with more than 1000 accidents')

In [None]:
# Report how many cities recorded fewer than 1000 accidents.
print(f"Number of cities with less than 1000 accidents: {len(low_accident_cities)}")

In [None]:
# Share of cities that recorded fewer than 1000 accidents.
low_share = len(low_accident_cities) / cities * 100
print(f'Percentage: {low_share}')

In [None]:
# Distribution of accident counts among the low-accident cities.
sns.displot(data=low_accident_cities, bins=3, kde=True)
plt.title("Cities with less than 1000 accidents")

In [None]:
# Number of cities that recorded exactly one accident.
is_single_accident = cities_by_accidents == 1
is_single_accident.sum()

### Summary:

#### The number of accidents per city decreases exponentially.
#### Fewer than five percent of cities have more than 1000 accidents.
#### Fewer than 1000 accidents are recorded for 95% of cities.
#### It seems that over 1500 cities reported only one accident.

### Start time

In [None]:
# Inspect the raw Start_Time column before any conversion.
df['Start_Time']

In [None]:
# Start_Time is read from the CSV as strings.
# Convert the column to pandas datetimes so the .dt accessors
# (hour, dayofweek, month, year) can be used below.
df['Start_Time'] = pd.to_datetime(df['Start_Time'])

In [None]:
# Look up the row labelled 0 to confirm the value is now a timestamp.
df.loc[0, 'Start_Time']

In [None]:
# Distribution of accidents over the hours of the day (one bin per hour).
sns.set_style('whitegrid')
accident_hour = df['Start_Time'].dt.hour
sns.histplot(data=accident_hour, bins=24, kde=False)

In [None]:
# Distribution of accidents over the days of the week (0 = Monday ... 6 = Sunday).
accident_weekday = df['Start_Time'].dt.dayofweek
weekday_ax = sns.histplot(data=accident_weekday, bins=7)
weekday_ax.set_xlabel("Day of week")

In [None]:
# Check whether the 6AM-10AM accident peak also appears on weekends:
# keep only rows whose start day is Saturday (5) or Sunday (6).
is_weekend = df['Start_Time'].dt.dayofweek.isin([5, 6])
weekend_starttime = df[is_weekend]

In [None]:
# Hourly distribution of weekend accidents (one bin per hour).
weekend_hour = weekend_starttime['Start_Time'].dt.hour
sns.histplot(data=weekend_hour, bins=24)

In [None]:
# Distribution of accidents over the months of the year (one bin per month).
accident_month = df['Start_Time'].dt.month
sns.histplot(data=accident_month, bins=12)

In [None]:
# Now we will interpret accident rates in every month for each year:

In [None]:
# Derive month and year columns from the start timestamp, then count the
# accidents for every (month, year) combination.
start_times = df['Start_Time'].dt
df['Month'] = start_times.month
df['Year'] = start_times.year
monthly_accidents = (
    df[["Month", "Year"]]
    .value_counts()
    .reset_index(name="number_of_accidents")
    .rename(columns={"Month": "month", "Year": "year"})
)

In [None]:
# Grouped bar chart: accidents per month, one bar colour per year.
plt.figure(figsize=(15, 8))
sns.barplot(
    data=monthly_accidents,
    x="month",
    y="number_of_accidents",
    hue="year",
)

### Summary

#### Most accidents occurred between 7AM and 10AM.
#### Accidents appear to be less frequent on weekends.
#### Unlike on weekdays, most weekend accidents occur between 11AM and 3PM.
#### There appears to be no particular trend in accidents by month within a year.
#### More accidents were recorded in 2018 and 2019 in most months.

### Start Latitude and Longitude

In [None]:
# Show the start latitude/longitude columns side by side.
df[['Start_Lat' , 'Start_Lng']]

In [None]:
# Scatter the accident start points: longitude on x, latitude on y,
# with small markers because the points are plotted for the whole frame.
sns.scatterplot(
    x='Start_Lng',
    y='Start_Lat',
    data=df,
    s=5,
)

In [None]:
# Plot a single accident (the row labelled 0) on a map.
# folium.Map() with no arguments gives a world map; folium locations are
# (latitude, longitude), so the longitude must be read from Start_Lng.
lat, lng = df['Start_Lat'][0], df['Start_Lng'][0]
# Named so that it does not shadow the built-in `map`.
single_accident_map = folium.Map()
folium.Marker((lat, lng)).add_to(single_accident_map)
# Last expression of the cell, so the notebook renders the map.
single_accident_map

In [None]:
# Take a 0.01% random sample (fraction 0.0001) of the rows to mark on a map.
sample_df1 = df.sample(int(0.0001 * len(df)))
# folium markers expect [latitude, longitude], so latitude comes first.
locations = sample_df1[['Start_Lat', 'Start_Lng']]
location_list = locations.values.tolist()

In [None]:
# Number of sampled locations that will be placed on the map.
len(location_list)

In [None]:
# Place one marker per sampled accident location on a fresh world map.
# The map gets its own name so that the built-in `map` is not shadowed.
markers_map = folium.Map()
for location in location_list:
    folium.Marker(location).add_to(markers_map)
# Last expression of the cell, so the notebook renders the map.
markers_map

### Heatmap of areas where accidents have occurred

In [None]:
# Pair every start latitude with its longitude as (lat, lng) tuples.
lat_lng = list(zip(df['Start_Lat'], df['Start_Lng']))

In [None]:
# Heatmap built from every (lat, lng) pair in lat_lng.
from folium.plugins import HeatMap

# Named so that it does not shadow the built-in `map`.
accident_heatmap = folium.Map()
HeatMap(lat_lng).add_to(accident_heatmap)
# Last expression of the cell, so the notebook renders the map.
accident_heatmap

In [None]:
# Lets create heatmap for sample data

In [None]:
# Heatmap of a 1% random sample of the accidents.
sample_df = df.sample(int(0.01 * len(df)))
samp_lat_lng = list(zip(sample_df['Start_Lat'], sample_df['Start_Lng']))
# Named so that it does not shadow the built-in `map`.
sample_heatmap = folium.Map()
HeatMap(samp_lat_lng).add_to(sample_heatmap)
# Last expression of the cell, so the notebook renders the map.
sample_heatmap

### Severity

In [None]:
# List the available columns before the severity analysis.
df.columns

In [None]:
# Show the two columns used for the per-year severity breakdown.
df[['Severity', 'Year']]

#### Severity of accidents in each year

In [None]:
# Cross-tabulate year against severity and draw one bar group per year.
severity_by_year = pd.crosstab(index=df["Year"], columns=df["Severity"])
severity_by_year.plot(kind="bar")

### Summary:

#### It seems like the trend of severity level 2 is common in all the years

### Temperature

In [None]:
# List the available columns before the temperature analysis.
df.columns

In [None]:
# Frequency of each distinct temperature value, most common first.
df['Temperature(F)'].value_counts()

In [None]:
# Distribution of accident temperatures in five bins, with a density curve.
temperature_f = df['Temperature(F)']
sns.displot(data=temperature_f, bins=5, kde=True)

### Summary:

#### More accidents occurred at temperatures between 30°F and 50°F.

### Humidity

In [None]:
# Distribution of accident humidity readings in ten bins.
humidity_pct = df['Humidity(%)']
sns.histplot(data=humidity_pct, bins=10)

### Summary:

#### The number of accidents tends to increase as the humidity percentage increases.

### Weather condition

In [None]:
# Distinct weather-condition labels present in the data.
df['Weather_Condition'].unique()

In [None]:
# Inspect the weather-condition column itself.
df['Weather_Condition']

In [None]:
# Accidents per weather condition, rarest condition first.
weather_counts = df['Weather_Condition'].value_counts()
weather_counts.sort_values(ascending=True)

In [None]:
# The ten weather conditions with the most accidents.
weather_ranked = df['Weather_Condition'].value_counts().sort_values(ascending=False)
weather_ranked.iloc[:10]

In [None]:
# Horizontal bar chart of the ten most common weather conditions.
weather_ranked_desc = df['Weather_Condition'].value_counts().sort_values(ascending=False)
weather_top = weather_ranked_desc.iloc[:10]
weather_top.plot.barh()

#### Analyse weather conditions along with the severity of accidents:

In [None]:
# Distinct severity levels present in the data.
df['Severity'].unique()

In [None]:
# The ten most frequent (weather condition, severity) combinations.
weather_severity_counts = df[['Weather_Condition', 'Severity']].value_counts()
weather_severity_counts.sort_values(ascending=False).iloc[:10]

In [None]:
# Bar chart of the ten most frequent (weather condition, severity) pairs.
pair_counts = df[['Weather_Condition', 'Severity']].value_counts()
top_cond = pair_counts.sort_values(ascending=False).iloc[:10]
top_cond.plot.barh(figsize=(10, 6))
plt.xlabel('number of accidents')
plt.ylabel('Weather_condition with severity')

### 10 of the main weather conditions for accidents at severity 1, 2, 3, 4

In [None]:
# Draw one figure per severity level (1-4) showing the ten weather
# conditions with the most accidents at that level.
for level in range(1, 5):
    plt.subplots(figsize=(10, 6))
    severity = (
        df.loc[df['Severity'] == level, ['Weather_Condition']]
        .value_counts()
        .sort_values(ascending=False)[:10]
        .reset_index()
    )
    # Rename positionally: the count column's default name differs between
    # pandas versions.
    severity.columns = ['Weather condition', 'Number of accidents']
    sns.barplot(data=severity, y='Weather condition', x='Number of accidents')
    plt.ylabel('Weather Condition', fontsize=16)
    plt.xlabel('Accident Count', fontsize=16)
    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    # A space separates the label from the level ("Severity 1", not "Severity1").
    plt.title(f'10 of The Main Weather Conditions for Accidents of Severity {level}')
    plt.tight_layout()

#### Summary

#### Most accidents occurred in fair weather conditions at all severity levels.
#### The second most common weather condition is clear weather for severity 2 and 3, which is not the case for severity 1 and 4.