# Data Aggregation and EDA

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned crash dataset from the sibling data directory.
# Forward slashes keep the relative path portable: the backslash form only
# resolves on Windows and depends on unrecognised escape sequences, which
# newer Python versions warn about.
df_clean = pd.read_csv('../data/clean_crash_data.csv')
display(df_clean)

# 1) What are the top 5 “Road Conditions” with the most accidents?

In [None]:
# Count crashes for every road-condition category and keep the five largest groups.
road_condition_count = (
    df_clean['road_condition']
    .value_counts()
    .head(5)
)
display(road_condition_count.to_frame())

In [None]:
# Human-readable labels for the five road-condition counts.
# NOTE(review): the labels are attached purely by position, so this assumes
# value_counts() produced exactly five categories in this order — confirm.
road_condition_labels = [
    'No Defects',
    'Holes, Ruts, etc..',
    'Loose Surface Material',
    'Foreign Material',
    'View Obstructed',
]
road_condition_count.index = road_condition_labels

# Horizontal bars: one per road condition, bar length = number of accidents.
ax = sns.barplot(
    x=road_condition_count.values,
    y=road_condition_count.index,
    orient='h',
    palette='viridis',
)
ax.set_title('Road Conditions Causing the Most Accidents')
ax.set_xlabel('Number of Accidents')
ax.set_ylabel('Road Condition')
plt.show()

In [None]:
# Same five road-condition counts, drawn as a marked line on a wider canvas.
fig, ax = plt.subplots(figsize=(10, 6))

sns.lineplot(
    x=road_condition_count.index,
    y=road_condition_count.values,
    marker='o',
    color='skyblue',
    linewidth=2,
    markersize=8,
    ax=ax,
)

# Titles, axis labels and a background grid
ax.set_title('Road Conditions Causing the Most Accidents', fontsize='14')
ax.set_xlabel('Road Condition', fontsize='12')
ax.set_ylabel('Number of Accidents', fontsize='12')
ax.grid(True)
plt.show()

# 2) What are the top 5 most frequent “Collision Types”?

In [None]:
# Count crashes per collision type and keep the five most common ones.
collision_type_count = (
    df_clean['collision_type']
    .value_counts()
    .head(5)
)
display(collision_type_count.to_frame())

In [None]:
# Display names for the five collision-type counts.
# NOTE(review): assigned by position; assumes value_counts() returned exactly
# five categories in this order — confirm.
collision_type_labels = [
    'Same Direction Rear End',
    'Single Vehicle',
    'Straight Movement Angle',
    'Other',
    'Same Direction Sideswipe',
]
collision_type_count.index = collision_type_labels

# Horizontal bars: one per collision type, bar length = number of accidents.
ax = sns.barplot(
    x=collision_type_count.values,
    y=collision_type_count.index,
    orient='h',
    palette='viridis',
)
ax.set_title('Most Frequent Collision Types', fontsize='14')
ax.set_xlabel('Number of Accidents', fontsize='12')
ax.set_ylabel('Collision Type', fontsize='12')
plt.show()

In [None]:
# Pie chart of the five collision-type counts; wedges carry only percentages,
# the category names go into the legend.
fig, ax = plt.subplots(figsize=(8, 8))
colors = sns.color_palette("viridis", n_colors=len(collision_type_count))
blank_labels = [''] * len(collision_type_count)
ax.pie(
    collision_type_count,
    labels=blank_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
ax.set_title('Most Frequent Collision Types', fontsize=14)

# Legend maps each wedge to its collision type
ax.legend(title="Collision Type", loc="upper right", labels=collision_type_count.index)
plt.show()

# 3) What are the top 5 most frequent “Weather” conditions?

In [None]:
# Count crashes per weather category and keep the five most common ones.
weather_count = (
    df_clean['weather']
    .value_counts()
    .head(5)
)
display(weather_count.to_frame())

In [None]:
# Display names for the five weather counts.
# NOTE(review): assigned by position; assumes value_counts() returned exactly
# five categories in this order — confirm.
weather_labels = ['Clear', 'Raining', 'Cloudy', 'Snow', 'Foggy']
weather_count.index = weather_labels

# Horizontal bars: one per weather condition, bar length = number of accidents.
ax = sns.barplot(
    x=weather_count.values,
    y=weather_count.index,
    orient='h',
    palette='viridis',
)
ax.set_title('Weather Conditions Resulting in the Most Accidents', fontsize='14')
ax.set_xlabel('Number of Accidents', fontsize='12')
ax.set_ylabel('Weather Conditions', fontsize='12')
plt.show()

In [None]:
# Pie chart of the five weather counts; wedges carry only percentages,
# the category names go into the legend.
fig, ax = plt.subplots(figsize=(8, 8))
colors = sns.color_palette("viridis", n_colors=len(weather_count))
blank_labels = [''] * len(weather_count)
ax.pie(
    weather_count,
    labels=blank_labels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
)
ax.set_title('Weather Conditions Resulting in the Most Accidents', fontsize=14)

# Legend maps each wedge to its weather condition
ax.legend(title="Weather Conditions", loc="upper right", labels=weather_count.index)
plt.show()

# 4) What are the top 5 most frequent “Light” conditions?

In [None]:
# Count crashes per light-condition category and keep the five most common ones.
light_count = (
    df_clean['light']
    .value_counts()
    .head(5)
)
display(light_count.to_frame())

In [None]:
# Display names for the five light-condition counts.
# NOTE(review): assigned by position; assumes value_counts() returned exactly
# five categories in this order — confirm.
light_labels = [
    'Daylight',
    'Dark With Lights On',
    'Dark With No Lights',
    'Dusk',
    'Dawn',
]
light_count.index = light_labels

# Horizontal bars: one per light condition, bar length = number of accidents.
ax = sns.barplot(
    x=light_count.values,
    y=light_count.index,
    orient='h',
    palette='viridis',
)
ax.set_title('Light Conditions Resulting in the Most Accidents', fontsize='14')
ax.set_xlabel('Number of Accidents', fontsize='12')
ax.set_ylabel('Light Conditions', fontsize='12')
plt.show()

In [None]:
# Pie chart of the five light-condition counts; wedges carry only percentages
# (pushed towards the rim via pctdistance), the category names go into the legend.
plt.figure(figsize=(8, 8))
colors = sns.color_palette("viridis", n_colors=len(light_count))
plt.pie(light_count, labels=['']*len(light_count), autopct='%1.1f%%', startangle=90, colors=colors, pctdistance=0.9)
plt.title('Light Conditions Resulting in the Most Accidents', fontsize=14)

# Add a legend. The labels must come from light_count: the previous version
# used weather_count.index, which labelled the light wedges with weather names.
plt.legend(title="Light Conditions", loc="upper right", labels=light_count.index)
plt.show()

# 5) What are the most frequent “Road Condition” and “Road Grade” combinations?

In [None]:
# Cross-tabulate crash counts: one row per road condition, one column per
# road grade; combinations that never occur are filled with 0.
rc_and_rg = df_clean.pivot_table(index='road_condition', columns='road_grade', aggfunc='size', fill_value=0)
display(rc_and_rg)

# For each road condition (row), pick the road grade (column) with the highest
# crash count, and show it — previously this result was computed but never displayed.
most_frequent_combination = rc_and_rg.idxmax(axis=1)
display(most_frequent_combination.to_frame(name='most_frequent_road_grade'))

In [None]:
# Heatmap of crash counts for every (road condition, road grade) pair,
# with the integer count annotated in each cell.
fig, ax = plt.subplots(figsize=(12, 6))  # Adjust the figure size as needed
sns.heatmap(rc_and_rg, annot=True, fmt='d', cmap='viridis', ax=ax)

# Labels and title; the title names Road Grade so it matches the plotted
# columns (it previously said "Road Type", which is not what is shown).
ax.set_xlabel('Road Grade')
ax.set_ylabel('Road Condition')
ax.set_title('Heatmap of Road Grade vs. Road Condition')

# Show the plot
plt.show()

# 8) Get the evolution of the number of crashes per hour

In [None]:
# Parse the crash timestamp column into datetimes so the .dt accessor can be
# used here and in the cells that follow.
df_clean['crash_date/time'] = pd.to_datetime(df_clean['crash_date/time'])

# Hour of day (0-23) at which each crash occurred
df_clean['hour'] = df_clean['crash_date/time'].dt.hour

# Group the data by hour and calculate the count of crashes
crash_count_by_hour = df_clean.groupby('hour').size()
# Total number of crashes counted across all hours
display(crash_count_by_hour.sum())

sns.set_palette('viridis')
# Create a line plot to visualize the trend
plt.plot(crash_count_by_hour.index, crash_count_by_hour.values, marker='o')
plt.xlabel('Hour')
plt.ylabel('Number of Crashes')
plt.title('Evolution of the Number of Crashes per Hour')
# Render the figure explicitly so the cell does not echo the Text(...) repr
plt.show()

# 6) How does the number of crashes change over the course of the day?

In [None]:
# Number of crashes in each hour-of-day bucket (uses the 'hour' column on df_clean).
crash_count_by_hour = df_clean.groupby('hour').size()

sns.set_palette('viridis')

# One bar per hour, bar height = number of crashes
ax = plt.gca()
ax.bar(crash_count_by_hour.index, crash_count_by_hour.values)
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Crashes')
ax.set_title('Number of Crashes Over Daytime')
plt.show()


# 7) Which day of the week has more crashes?

In [None]:
# Weekday name of each crash, derived from the 'crash_date/time' column
# (which must already hold datetimes for the .dt accessor to work).
df_clean['DayOfWeek'] = df_clean['crash_date/time'].dt.day_name()

# Tally crashes per weekday, arranged Monday -> Sunday instead of by frequency.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
crash_count_by_day = (
    df_clean['DayOfWeek']
    .value_counts()
    .reindex(days_order)
)

# One bar per weekday, bar height = number of crashes
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(crash_count_by_day.index, crash_count_by_day.values)
ax.set_xlabel('Day of the Week', fontsize='12')
ax.set_ylabel('Number of Crashes', fontsize='12')
ax.set_title('Number of Crashes by Day of the Week', fontsize='14')

plt.show()

In [None]:
# Show the per-weekday crash counts as a table to accompany the charts.
display(crash_count_by_day)

In [None]:
# Per-weekday crash counts again, this time as a marked line.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(
    crash_count_by_day.index,
    crash_count_by_day.values,
    marker='o',
    linestyle='-',
)
ax.set_xlabel('Day of the Week', fontsize='12')
ax.set_ylabel('Number of Crashes', fontsize='12')
ax.set_title('Number of Crashes by Day of the Week', fontsize='14')

plt.show()

# 9) Get the evolution of the number of crashes per year

In [None]:
# Calendar year of each crash, taken from the 'crash_date/time' column
df_clean['Year'] = df_clean['crash_date/time'].dt.year

# Crashes per year, ordered chronologically rather than by frequency
crash_count_by_year = (
    df_clean['Year']
    .value_counts()
    .sort_index()
)

# Marked line showing how the yearly crash count changes over time
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    crash_count_by_year.index,
    crash_count_by_year.values,
    marker='o',
    linestyle='-',
)
ax.set_xlabel('Year', fontsize='12')
ax.set_ylabel('Number of Crashes', fontsize='12')
ax.set_title('Evolution of Crashes per Year', fontsize='14')
ax.grid(True)
plt.show()

# 10) Which agency has reported more crashes?

In [None]:
# Crashes reported per agency, most frequent first (value_counts order)
crash_count_by_agency = df_clean['agency_name'].value_counts()

# One bar per agency, bar height = number of crashes
ax = plt.gca()
ax.bar(crash_count_by_agency.index, crash_count_by_agency.values)
ax.set_xlabel('Agency')
ax.set_ylabel('Number of Crashes')
ax.set_title('Number of Crashes Reported by Agency')

# Vertical tick labels and a tightened layout
plt.xticks(rotation=90)
plt.tight_layout()

plt.show()

In [None]:
# Crashes reported per agency, re-ordered alphabetically by agency name
crash_count_by_agency = (
    df_clean['agency_name']
    .value_counts()
    .sort_index()
)

# Marked line across agencies, y value = number of crashes
ax = plt.gca()
ax.plot(crash_count_by_agency.index, crash_count_by_agency.values, marker='o')
ax.set_xlabel('Agency')
ax.set_ylabel('Number of Crashes')
ax.set_title('Number of Crashes Reported by Agency')

# Vertical tick labels and a tightened layout
plt.xticks(rotation=90)
plt.tight_layout()

plt.show()
