In [None]:
# Import the necessary dependencies
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing the warnings module 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the accident records from the CSV file (path is relative to the notebook's working directory)
csv_path = "us-accidents.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows to sanity-check the load and see the column layout
df.head()

In [None]:
df.shape # (number of rows, number of columns) of the loaded dataset

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()

In [None]:
df.describe().T # Summary statistics, transposed so that each described column becomes one row

Observations from us_accident_data

In [None]:
# Count how many columns hold integer or floating-point data
numerics = ['int16','int32','int64','float16','float32','float64']
numeric_df = df.select_dtypes(include=numerics)
numeric_df.shape[1]

In [None]:
# Percentage (not raw count) of missing values per column, largest first, rounded to two decimals
missing_counts = df.isnull().sum().sort_values(ascending=False)
missing_percentages = (missing_counts / len(df) * 100).round(2)

In [None]:
# Horizontal bar chart restricted to the columns that actually have missing values
columns_with_gaps = missing_percentages[missing_percentages > 0]
columns_with_gaps.plot.barh()
plt.show()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the Distance(mi) column
# NOTE(review): this cell only inspects Distance(mi); whether that column has many missing values is not shown here — confirm against the missing-percentage chart
df['Distance(mi)'].describe()

In [None]:
# List every column name, as a reference for the column-by-column analysis in later cells
df.columns

In [None]:
# Bar chart of the five states with the most recorded accidents
# (original author's reading of the chart: California has the highest count)
top_states = df['State'].value_counts().head()
top_states.plot.bar()

In [None]:
# Population per state, as extracted from Wikipedia by the original author
population_data = {
    'CA': 39538223,
    'FL': 21538187,
    'TX': 29145505,
    'SC': 5118425,
    'NY': 20201249
}

# Count accidents per state once, instead of filtering the whole frame again for every state
accidents_by_state = df['State'].value_counts()

for state, population in population_data.items():
    # The ratio is scaled by 100, so the figure is "accidents per 100 residents"
    # (a percentage of the population), not a raw per-capita fraction.
    # .get(..., 0) keeps the loop working for a state with no records.
    accidents_per_capita = accidents_by_state.get(state, 0) / population * 100
    # Two decimals: the value used to be rounded to 2 places and then printed
    # with 6, which only appended meaningless zeros.
    print(f"{state} accidents per 100 residents: {accidents_per_capita:.2f}")


In [None]:
# Accident count per city (most accidents first), followed by the number of distinct cities
# (the original author recorded 13678 distinct cities for their copy of the data)
cities_by_accidents = df['City'].value_counts()
df['City'].nunique()

In [None]:
# Bar chart of the 20 cities with the highest accident counts
top_cities = df['City'].value_counts().head(20)
top_cities.plot.bar()
plt.title('Top 20 US cities by accident')
plt.show()

In [None]:
# Apply seaborn's 'darkgrid' style to the plots drawn from here on
sns.set_style('darkgrid')

In [None]:
# Distribution of per-city accident counts.
# sns.distplot is deprecated; histplot with a density-normalised histogram and a
# KDE overlay is the replacement that draws the same kind of figure.
# (Original author's reading: most of the density sits at very low accident counts.)
sns.histplot(cities_by_accidents, kde=True, stat="density", kde_kws=dict(cut=3))
plt.title("Number of accidents distributed across the cities")
plt.show()

In [None]:
# Based on the distribution above, split the cities into two groups:
#   - high accident cities: more than HIGH_ACCIDENT_THRESHOLD accidents
#   - low accident cities: HIGH_ACCIDENT_THRESHOLD accidents or fewer
# The low group uses <= so that a city with exactly the threshold count is not
# silently dropped from both groups.
HIGH_ACCIDENT_THRESHOLD = 2000
high_accident_cities = cities_by_accidents[cities_by_accidents > HIGH_ACCIDENT_THRESHOLD]
low_accident_cities = cities_by_accidents[cities_by_accidents <= HIGH_ACCIDENT_THRESHOLD]

In [None]:
# Number of cities in the high group, then in the low group (one per line)
print(len(high_accident_cities), len(low_accident_cities), sep='\n')

In [None]:
# Find the accident count of high accident cities and accident count of low accident cities 
print(high_accident_cities.values.sum()/len(df))
print(low_accident_cities.values.sum()/len(df))

In [None]:
# Histogram with KDE of accident counts for the low accident cities, on a log-scaled axis
sns.displot(data=low_accident_cities, kde=True, log_scale=True)

In [None]:
# Convert Start_Time to datetime so the .dt accessors can be used in later cells.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')

In [None]:
# Share of accidents (in percent) by hour of day.
# normalize=True divides by the number of rows with a valid Start_Time, so rows
# whose timestamp was coerced to NaT no longer deflate every bar.
start_hours = df['Start_Time'].dt.hour.dropna().astype(int)
hourly_share = start_hours.value_counts(normalize=True).sort_index() * 100

plt.figure(figsize=(10,5))
sns.barplot(x=hourly_share.index, y=hourly_share.values)
# The bars are percentages, so the title and axis say so
plt.title('Percentage of accidents by hour of day')
plt.xlabel('Hour of day')
plt.ylabel('Accidents (%)')
plt.show()

In [None]:
# Share of accidents (in percent) by day of week.
# normalize=True divides by the number of rows with a valid Start_Time, so rows
# whose timestamp was coerced to NaT no longer deflate every bar.
start_weekdays = df['Start_Time'].dt.day_of_week.dropna().astype(int)
weekday_share = start_weekdays.value_counts(normalize=True).sort_index() * 100

plt.figure(figsize=(10,5))
sns.barplot(x=weekday_share.index, y=weekday_share.values)
# The bars are percentages per weekday, so the title and axis say so
plt.title('Percentage of accidents by day of week')
plt.xlabel('Day of week (0 = Monday, 6 = Sunday)')
plt.ylabel('Accidents (%)')
plt.show()

In [None]:
# Keep only the records whose start day is Saturday (5) or Sunday (6)
start_weekday = df['Start_Time'].dt.day_of_week
weekends_data = df[start_weekday.isin([5, 6])]

In [None]:
# Hourly accident counts for the Saturday/Sunday records.
# The counts are computed once and reused for both axes; the earlier bare
# value_counts() expression was discarded (it was not the cell's last line).
weekend_hourly_counts = weekends_data['Start_Time'].dt.hour.value_counts()

plt.figure(figsize=(10,5))
sns.barplot(x=weekend_hourly_counts.index, y=weekend_hourly_counts.values)
plt.title('Count of Accidents over time on weekends')
plt.xlabel('Hour of day')
plt.ylabel('Number of accidents')
plt.show()

In [None]:
# Year of each accident, stored as a column so rows can be filtered per year
df['Year'] = df['Start_Time'].dt.year

# One panel per year, laid out on a 2 x 4 grid
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
fig.suptitle('Accidents by the month of the year (2016-2023)', fontsize=16)

# Every panel shows all twelve months; months without records get a zero bar,
# so the x-axes line up across years and gaps in coverage stay visible.
all_months = range(1, 13)

for i, year in enumerate(range(2016, 2024)):
    ax = axes.flat[i]

    year_df = df[df['Year'] == year]

    monthly_counts = (
        year_df['Start_Time'].dt.month
        .value_counts()
        .reindex(all_months, fill_value=0)
    )
    monthly_counts.plot(kind='bar', ax=ax)

    ax.set_title(f'Year {year}')
    ax.set_xlabel('Month')
    ax.set_ylabel('Count')

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Accident counts per calendar month, aggregated over all years
# Caveat from the original author: do not rely heavily on this chart, because
# data is missing for the first 3 months of 2016 and for part of 2023.
monthly_totals = df['Start_Time'].dt.month.value_counts().sort_index()
monthly_totals.plot.bar()
plt.title('Accidents by the month of the year')
plt.show()

In [None]:
# Accident counts per year, in chronological order
yearly_totals = df['Start_Time'].dt.year.value_counts().sort_index()
yearly_totals.plot.bar()
plt.xlabel('Year')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pie chart of how many records each value of the Source column contributes
source_counts = df['Source'].value_counts()
source_counts.plot.pie()

In [None]:
# One panel per data source, each showing accident counts per calendar month
sources = ['Source1', 'Source2', 'Source3']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
fig.suptitle('Accidents by the month of the year (Source-wise)', fontsize=16)

for i, source in enumerate(sources):
    ax = axes[i]

    # Rows reported by this source only
    source_df = df[df['Source'] == source]

    monthly_counts = source_df['Start_Time'].dt.month.value_counts().sort_index()
    monthly_counts.plot.bar(ax=ax)

    ax.set(title=f'Source: {source}', xlabel='Month', ylabel='Count')

# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=[0, 0.03, 1, 0.9])
plt.show()

In [None]:
# Visualizing the data for Start Lat and Start Long 
import folium
from folium.plugins import HeatMap

In [None]:
# Base folium map for the heatmap, built from the first 5000 rows only
# and centred on the mean latitude/longitude of that subset
subset_df = df.iloc[:5000]
center_lat = subset_df['Start_Lat'].mean()
center_lng = subset_df['Start_Lng'].mean()
map_center = [center_lat, center_lng]
mymap = folium.Map(location=map_center, zoom_start=10)

In [None]:
# Build the [lat, lng] pairs for the heat layer in one vectorized step instead of
# iterating row by row. Rows with a missing coordinate are dropped first so that
# no NaN value is handed to HeatMap.
heat_data = subset_df[['Start_Lat', 'Start_Lng']].dropna().values.tolist()
HeatMap(heat_data).add_to(mymap)
# Last expression of the cell: renders the interactive map
mymap

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Pie chart of how many accidents fall in each severity level
# (per the original author: 1 is the least severe and 4 the most severe)
severity_counts = df['Severity'].value_counts()
severity_counts.plot.pie()

In [None]:
# Horizontal bar chart of the five most frequent weather conditions
top_weather_conditions = df['Weather_Condition'].value_counts().head()
top_weather_conditions.plot.barh()
plt.show()

In [None]:
# Keep only the two columns needed to relate severity to the affected road distance
sev_tr_data = df.loc[:, ['Severity', 'Distance(mi)']]

In [None]:
# Mean Distance(mi) for each severity level, drawn as a bar chart
average_distance_by_severity = sev_tr_data.groupby('Severity')['Distance(mi)'].mean()
severity_ax = average_distance_by_severity.plot.bar()
severity_ax.set_title('Average Distance by Severity Level')
severity_ax.set_xlabel('Severity')
severity_ax.set_ylabel('Average Distance (mi)')
plt.show()

## Assumptions used while Analyzing Temperature Data
- Temperatures above 32°F might be considered "warm."
- Temperatures below 32°F might be considered "cold."

In [None]:
# Keep only the rows that actually have a temperature reading.
# The previous df[~df.isna()] indexed with a cell-wise boolean frame, which only
# masks individual cells that are already NaN and therefore removed no rows at all.
weather_data = df.dropna(subset=['Temperature(F)'])

In [None]:
# Histogram of the recorded temperatures
# (original author's reading: the shape looks like a normal distribution — not formally tested here)
sns.histplot(data=weather_data, x='Temperature(F)')

In [None]:
# Count the records strictly above and strictly below 32 °F (readings of exactly 32 fall in neither count)
temperatures = weather_data['Temperature(F)']
accidents_above_32 = weather_data[temperatures > 32].shape[0]
accidents_below_32 = weather_data[temperatures < 32].shape[0]
print(f'Total number of accidents reported above 32 degree F : {accidents_above_32}')
print(f'Total number of accidents reported below 32 degree F : {accidents_below_32}')

## Insights 
* The majority of the data comes from source 2 (the names of the data providers are not given)
* Even though California has the highest accident count, South Carolina ranks highest in accidents per capita
* High accident cities account for 72 percent of the accidents occurring in the USA
* 5 percent of cities record around 1000 accidents per year
* Over 1200 cities have reported only 1 accident --> this needs to be investigated in detail
* A high percentage of accidents occurs between 6am and 10am (likely because people are commuting to work)
* The next peak is between 3pm and 6pm, where the number of accidents rises again
* On weekends the number of accidents is lower; on normal weekdays most accidents occur in the morning between 6 and 10, while on Sundays accidents occur mostly during the afternoon hours
* The number of accidents keeps rising from July onwards
* Accidents rose from 2016 until 2021; after COVID they seem to have decreased (according to the cited article, this may be due to reduced traffic volume)
* There has been a sharp increase in accidents during summers (once the COVID restrictions started, accidents seem to have declined)
* Even though all sources have data for every month, the accident distribution is not consistent across them (the major data provider is source 1, followed by source 2 and source 3)
* Weather conditions per accident: the weather was mostly fair, but this needs further investigation because a major proportion of the data is missing
* Severity 2 and severity 4 accidents affected traffic the most (for severity 2 the average distance is on the higher side because its value count is high, but as per the data severity 4 accidents have the greatest impact on traffic)
* An increase in ambient temperature increases the accident rate (but its effect is not known)

