In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium.plugins import HeatMap
from scipy import stats

# Set visual style
sns.set_style("whitegrid")

# ==========================================
# 1. LOAD & PREPARE DATA
# ==========================================
print("Loading data...")
df = pd.read_csv('US_Accidents_March23.csv', nrows=200000)

# Cleaning (Same as Milestone 2)
cols_to_drop = ['End_Lat', 'End_Lng']
df_clean = df.drop(columns=[c for c in cols_to_drop if c in df.columns])
df_clean = df_clean.dropna(subset=['Street', 'City', 'Zipcode', 'Start_Lat', 'Start_Lng'])
df_clean['Start_Time'] = pd.to_datetime(df_clean['Start_Time'], errors='coerce')
df_clean['Hour'] = df_clean['Start_Time'].dt.hour
df_clean['Weekday'] = df_clean['Start_Time'].dt.day_name()

print(f"Data Ready: {df_clean.shape}")

# ==========================================
# WEEK 5: GEOSPATIAL ANALYSIS (MAPS)
# ==========================================
print("\n--- Generatng Maps ---")

# 1. Create a Base Map centered on the US
m = folium.Map(location=[37.0902, -95.7129], zoom_start=4)

# 2. Create a Heatmap of Accidents
# We use a sample of 10,000 points to keep the map fast
sample_data = df_clean.sample(n=10000, random_state=42)
heat_data = list(zip(sample_data['Start_Lat'], sample_data['Start_Lng']))

HeatMap(heat_data).add_to(m)

# 3. Save the map
m.save("accident_heatmap.html")
print("Map saved as 'accident_heatmap.html'. Open this file in your browser to see it.")

# ==========================================
# WEEK 6: HYPOTHESIS TESTING
# ==========================================
print("\n--- Running Hypothesis Tests ---")

# HYPOTHESIS 1: Are accidents more severe during the Night vs. Day?
# Null Hypothesis (H0): Severity is the same for Day and Night.
# Alternate Hypothesis (H1): Severity is different.

day_severity = df_clean[df_clean['Sunrise_Sunset'] == 'Day']['Severity']
night_severity = df_clean[df_clean['Sunrise_Sunset'] == 'Night']['Severity']

t_stat, p_val = stats.ttest_ind(day_severity, night_severity, equal_var=False)

print(f"Test 1 (Day vs Night Severity): P-value = {p_val}")
if p_val < 0.05:
    print(">> Result: REJECT Null Hypothesis. There is a significant difference in severity between Day and Night.")
else:
    print(">> Result: ACCEPT Null Hypothesis. No significant difference found.")


# HYPOTHESIS 2: Do accidents happen more on Weekdays vs. Weekends?
# We compare the average number of accidents per day type.

weekend_counts = df_clean[df_clean['Weekday'].isin(['Saturday', 'Sunday'])]['Weekday'].value_counts()
weekday_counts = df_clean[~df_clean['Weekday'].isin(['Saturday', 'Sunday'])]['Weekday'].value_counts()

t_stat_2, p_val_2 = stats.ttest_ind(weekday_counts, weekend_counts, equal_var=False)

print(f"\nTest 2 (Weekday vs Weekend Frequency): P-value = {p_val_2}")
if p_val_2 < 0.05:
    print(">> Result: REJECT Null Hypothesis. Accident frequency differs significantly between Weekdays and Weekends.")
else:
    print(">> Result: ACCEPT Null Hypothesis. No significant difference found.")

Loading data...
Data Ready: (199956, 46)

--- Generatng Maps ---
Map saved as 'accident_heatmap.html'. Open this file in your browser to see it.

--- Running Hypothesis Tests ---
Test 1 (Day vs Night Severity): P-value = 1.528560131438592e-60
>> Result: REJECT Null Hypothesis. There is a significant difference in severity between Day and Night.

Test 2 (Weekday vs Weekend Frequency): P-value = 0.00011805354513334485
>> Result: REJECT Null Hypothesis. Accident frequency differs significantly between Weekdays and Weekends.
