In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Location of the US Accidents (March 2023) CSV inside the Kaggle input directory.
ACCIDENTS_CSV = '/kaggle/input/us-accidents/US_Accidents_March23.csv'

# Read the whole file into memory and preview the first rows.
data = pd.read_csv(ACCIDENTS_CSV)
data.head()

In [None]:
# Print pandas' concise summary of the loaded frame (column names and dtypes, among other details).
data.info()

In [None]:
# Number of missing values in each column.
data.isna().sum()

In [None]:
# Descriptive statistics, transposed so that each described column becomes a row.
data.describe().transpose()

In [None]:
# Count how many columns are stored with one of the listed integer/float dtypes.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numeric_df = data.select_dtypes(include=numerics)
numeric_df.shape[1]

In [None]:
# Percentage of missing values per column (rounded to 2 decimals), ordered
# from the column with the fewest missing values to the one with the most.
null_counts = data.isna().sum().sort_values()
missing_percentages = (null_counts / len(data) * 100).round(2)

In [None]:
# Keep only the columns that actually have missing values.
missing_per = missing_percentages.loc[missing_percentages > 0]

In [None]:
# Bar chart of the missing-value percentage, with one bar per feature on the y-axis.
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=missing_per, y=missing_per.index, ax=ax)
ax.set_title('Missing Data Percentage by Feature')
ax.set_xlabel('Missing Percentage')
ax.set_ylabel('Features')
plt.show()

In [None]:
# State-level analysis: accidents per state, keeping the five most frequent.
# (The notebook author observed California at the top of this list.)
state_counts = data['State'].value_counts()
states = state_counts.head(5)


In [None]:
# Bar chart of accident counts for the five states selected above.
plt.figure(figsize=(10, 7))
ax = sns.barplot(x=states.index, y=states, palette="RdPu")
ax.set(title='Top 5 highest accident States', xlabel='State', ylabel='Count')
plt.show()

In [None]:
# City-level analysis.
city_column = data['City']
cities_by_accidents = city_column.value_counts()  # accidents recorded per city
city_column.nunique()  # number of distinct cities (13678 in the author's run)

In [None]:
# The 20 cities with the largest accident counts, in descending order.
city = cities_by_accidents.sort_values(ascending=False).iloc[:20]

In [None]:
# Bar chart of the 20 selected cities, one bar per city on the y-axis.
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=city.values, y=city.index, color='green', ax=ax)
ax.set_xlabel('Number of Accidents')
ax.set_ylabel('City')
# y=1.05 lifts the title slightly above the axes.
ax.set_title('Top 20 Cities with Highest Number of Accidents', y=1.05)
plt.show()

In [None]:
# Use seaborn's white-grid theme for the plots drawn after this cell.
sns.set_style(style='whitegrid')

In [None]:
# Distribution of per-city accident counts: each observation is one city and
# its value is the number of accidents recorded there. This is NOT the
# probability of an accident occurring; the author's reading of the plot was
# that most cities record comparatively few accidents.
#
# `sns.distplot` is deprecated, so the same histogram + KDE view is drawn with
# `sns.histplot`. `stat='density'` keeps the y-axis on a density scale.
# NOTE(review): bins=50 is meant to mirror the bin cap distplot applied
# internally; confirm the bars match the previous rendering.
sns.histplot(cities_by_accidents, bins=50, kde=True, stat='density')
plt.title("Number of accidents distributed across the cities")
plt.xlabel('Accidents per city')
plt.show()

In [None]:
# Parse the Start_Time column into datetimes.
# errors='coerce' turns every value that cannot be parsed into NaT instead of
# raising, so count how many values were lost that way rather than dropping
# them silently (those rows disappear from every time-based plot below).
start_time_missing_before = data['Start_Time'].isna().sum()
start_time_parsed = pd.to_datetime(data['Start_Time'], errors='coerce')
start_time_unparsed = start_time_parsed.isna().sum() - start_time_missing_before
# NOTE(review): if this number is large, the column probably mixes several
# timestamp formats; consider passing an explicit `format=` to pd.to_datetime.
print(f"Start_Time: {start_time_unparsed} value(s) could not be parsed and were set to NaT")
data['Start_Time'] = start_time_parsed

In [None]:
# Share of accidents by hour of day.
# The bars show each hour's count as a percentage of ALL rows in `data`
# (rows whose Start_Time is NaT are in the denominator but in no bar),
# so the title and y-axis are labelled as percentages, not counts.
hourly_share = data['Start_Time'].dt.hour.value_counts() / len(data) * 100

plt.figure(figsize=(10, 5))
sns.barplot(x=hourly_share.index, y=hourly_share.values, palette='pastel')
plt.title('Percentage of Accidents by Hour of Day')
plt.xlabel('Hour of Day')
plt.ylabel('Percentage of Accidents (%)')
plt.show()

In [None]:
# Share of accidents by day of the week.
# pandas encodes day_of_week as 0 = Monday ... 6 = Sunday. The bars show each
# day's count as a percentage of ALL rows in `data`, so the title and y-axis
# are labelled as percentages, not counts.
weekday_share = data['Start_Time'].dt.day_of_week.value_counts() / len(data) * 100

plt.figure(figsize=(10, 5))
sns.barplot(x=weekday_share.index, y=weekday_share.values, palette='icefire')
plt.title('Percentage of Accidents by Day of Week')
plt.xlabel('Day of Week (0 = Monday, 6 = Sunday)')
plt.ylabel('Percentage of Accidents (%)')
plt.show()

In [None]:
# Keep only the accidents that started on a weekend.
# day_of_week encodes Saturday as 5 and Sunday as 6.
start_day_of_week = data['Start_Time'].dt.day_of_week
weekends_data = data[start_day_of_week.isin([5, 6])]


In [None]:
# Hourly accident counts restricted to weekends.
# The counts are computed once and reused for both axes; the original cell
# also evaluated them in a bare expression whose result was discarded.
weekend_hourly_counts = weekends_data['Start_Time'].dt.hour.value_counts()

plt.figure(figsize=(10, 5))
sns.barplot(x=weekend_hourly_counts.index, y=weekend_hourly_counts.values)
plt.title('Count of Accidents over time on weekends')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Accidents')
plt.show()


In [None]:
# Accidents per calendar month (all years pooled), ordered by month number.
# Caveat from the author: this chart should not be relied on heavily, because
# most data is missing for the first 3 months of 2016 and for the year 2023.
monthly_counts = data['Start_Time'].dt.month.value_counts().sort_index()
monthly_counts.plot.bar()
plt.title('Accidents by the month of the year')
plt.show()

In [None]:
# Year in which each accident started.
year = data['Start_Time'].dt.year

# One bar per year, with height equal to the number of rows in that year.
ax = sns.countplot(x=year, data=data, palette='viridis')
ax.set_xlabel('Year')
ax.set_title('Distribution of Incidents Over the Years')

# Tilt the year labels so they do not overlap, then render the figure.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Number of accidents recorded for each severity level.
severity_count = data.loc[:, 'Severity'].value_counts()

In [None]:
# Pie chart of the severity counts; each wedge is labelled with its severity
# level and its percentage share (one decimal place, bold text).
fig, ax = plt.subplots(figsize=(10, 7))
ax.pie(
    severity_count,
    labels=severity_count.index,
    autopct='%1.1f%%',
    textprops={'weight': 'bold'},
)
ax.legend()
ax.set_title('Distribution of Incident Severity')
plt.show()

# **INSIGHTS:**

1. State-wise Analysis:

    California, Florida, Texas, South Carolina, and New York emerge as the top 5 states with the highest number of accidents.

2. City Breakdown:

    Miami takes the lead with the highest number of accidents among the top 20 cities.
    Columbia holds the distinction of having the lowest number of accidents in this select group.

3. Peak Hour Findings:

    The time window of 7-8 AM witnesses the majority of accidents, highlighting the importance of heightened awareness during morning rush hours. ⏰

4. Severity 2 and Severity 4 accidents affected traffic the most. Because Severity 2 accidents are by far the most numerous, their average affected distance is on the higher side, but according to the data Severity 4 accidents have the greatest impact on traffic.

5. Accident counts are highest in December and lowest in July. From July onward, the monthly count rises steadily.

6. Accident Counts are higher in the year 2021.