In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# Load the dataset.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
url = "/home/kirito/Data Science with prodigy/PRODIGY_DS_05/archive (1)/US_Accidents_March23.csv"
df = pd.read_csv(url)

# Display the first few rows of the raw dataset
print(df.head())

# Count missing values per column
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

# Drop columns that are not needed for this analysis.
# errors='ignore' skips any listed column that is not present in this file
# instead of raising KeyError and aborting the whole cell.
df_cleaned = df.drop(
    columns=['Unnamed: 0', 'ID', 'Description', 'Number', 'End_Lat', 'End_Lng'],
    errors='ignore',
)

# Drop every row that still has a missing value in ANY remaining column.
# NOTE(review): this can discard a large share of rows if sparsely populated
# columns remain; consider dropna(subset=[...]) on the columns actually used.
df_cleaned = df_cleaned.dropna()

# Convert the timestamp columns to datetime and derive calendar features.
# NOTE(review): if the timestamps mix formats (e.g. some with fractional
# seconds), newer pandas versions may need an explicit `format=` -- confirm.
df_cleaned['Start_Time'] = pd.to_datetime(df_cleaned['Start_Time'])
df_cleaned['End_Time'] = pd.to_datetime(df_cleaned['End_Time'])
df_cleaned['Year'] = df_cleaned['Start_Time'].dt.year
df_cleaned['Month'] = df_cleaned['Start_Time'].dt.month
df_cleaned['Hour'] = df_cleaned['Start_Time'].dt.hour

# Display cleaned data
print(df_cleaned.head())


In [None]:
# Analyze the distribution of accidents by weather condition
# (restricted to the 10 most frequent conditions so the chart stays readable)
top_weather_conditions = df_cleaned['Weather_Condition'].value_counts().iloc[:10].index
sns.countplot(y='Weather_Condition', data=df_cleaned, order=top_weather_conditions)
plt.title('Distribution of Accidents by Weather Condition')
plt.show()

# Analyze the distribution of accidents by time of day.
# 'Hour' holds integer hours, so discrete=True gives one bar per hour,
# centred on its tick, rather than 24 bins stretched over the min..max range.
sns.histplot(df_cleaned['Hour'], discrete=True, kde=False)
plt.title('Distribution of Accidents by Hour of the Day')
plt.xlabel('Hour')
plt.ylabel('Number of Accidents')
plt.show()

# Analyze the distribution of accidents by road condition.
# Guarded: the plot is skipped (instead of raising) when the dataset has no
# 'Road_Condition' column.
if 'Road_Condition' in df_cleaned.columns:
    road_condition_order = df_cleaned['Road_Condition'].value_counts().index
    sns.countplot(y='Road_Condition', data=df_cleaned, order=road_condition_order)
    plt.title('Distribution of Accidents by Road Condition')
    plt.show()
else:
    print("Column 'Road_Condition' not found; skipping road condition distribution plot.")


In [None]:
import folium
from folium.plugins import HeatMap

# Create a base map centred on the mean accident start coordinates
base_map = folium.Map(location=[df_cleaned['Start_Lat'].mean(), df_cleaned['Start_Lng'].mean()], zoom_start=5)

# Create a heatmap of accident locations.
# Build the [lat, lng] pairs with a vectorised column selection rather than
# iterating row by row with iterrows(), which is very slow on large frames.
# NOTE(review): every row is plotted; for very large frames consider sampling
# before rendering to keep the embedded map size manageable.
heat_data = df_cleaned[['Start_Lat', 'Start_Lng']].to_numpy().tolist()
HeatMap(heat_data).add_to(base_map)

# Display the map (last expression of the cell renders it)
base_map


In [None]:
# Investigate the relationship between weather condition and accident severity
# (mean Severity per weather condition, highest first)
severity_by_weather = df_cleaned.groupby('Weather_Condition')['Severity'].mean().sort_values(ascending=False)
print(severity_by_weather)

# Investigate the relationship between road condition and accident severity.
# Guarded: when the dataset has no 'Road_Condition' column, fall back to an
# empty Series so the name is still defined instead of raising KeyError.
if 'Road_Condition' in df_cleaned.columns:
    severity_by_road = df_cleaned.groupby('Road_Condition')['Severity'].mean().sort_values(ascending=False)
else:
    print("Column 'Road_Condition' not found; road condition severity is empty.")
    severity_by_road = pd.Series(dtype='float64', name='Severity')
print(severity_by_road)

# Investigate the relationship between time of day and accident severity
# (mean Severity per hour, kept in hour order)
severity_by_hour = df_cleaned.groupby('Hour')['Severity'].mean()
print(severity_by_hour)


In [None]:
# Bar chart for severity by weather condition.
# Empty inputs are skipped because a bar plot of an empty Series raises.
if severity_by_weather.empty:
    print("No weather condition data available; skipping weather severity chart.")
else:
    severity_by_weather.plot(kind='bar')
    plt.title('Average Severity by Weather Condition')
    plt.ylabel('Average Severity')
    plt.show()

# Bar chart for severity by road condition
if severity_by_road.empty:
    print("No road condition data available; skipping road condition severity chart.")
else:
    severity_by_road.plot(kind='bar')
    plt.title('Average Severity by Road Condition')
    plt.ylabel('Average Severity')
    plt.show()

# Line plot for severity by hour of day
if severity_by_hour.empty:
    print("No hourly data available; skipping hourly severity chart.")
else:
    severity_by_hour.plot(kind='line')
    plt.title('Average Severity by Hour of the Day')
    plt.xlabel('Hour')
    plt.ylabel('Average Severity')
    plt.show()
