In [10]:
# Absolute path to the raw accidents CSV that the loading cell passes to pd.read_csv.
# NOTE(review): the /content/drive/MyDrive prefix looks like a mounted Google Drive in Colab —
# confirm, and point this elsewhere when running outside that environment.
DATA_PATH = '/content/drive/MyDrive/RoadSafety_Nov25/data/raw/US_Accidents_March23.csv'

In [None]:
import pandas as pd
# Read the whole CSV at DATA_PATH into `df`; the hypothesis-test cells below each
# select their own columns from this frame and never modify it in place.
df = pd.read_csv(DATA_PATH)

#### Q1. Mean distance differs between day and night

- Is the mean affected road distance ```Distance(mi)``` different for accidents that occur during day vs night ```Sunrise_Sunset```?

In [None]:
## H0: The mean distance affected by accidents during the day equals the mean distance affected during night.
## H1: The mean distance affected by accidents during the day differs from the mean distance affected during night.


import pandas as pd
from scipy import stats

# Q1: compare mean Distance(mi) between records labelled 'Day' and 'Night'.
# Rows missing either column are discarded before the two groups are split.
df1 = df.loc[:, ['Distance(mi)', 'Sunrise_Sunset']].dropna()

is_day = df1['Sunrise_Sunset'] == 'Day'
is_night = df1['Sunrise_Sunset'] == 'Night'
day = df1.loc[is_day, 'Distance(mi)']
night = df1.loc[is_night, 'Distance(mi)']

# No formal normality check: the analysis leans on large samples (CLT).
# equal_var=False selects Welch's t-test, which does not assume equal group variances.
t_stat, p_val = stats.ttest_ind(day, night, equal_var=False)

alpha = 0.05
print(f"T-statistic: {t_stat:.3f}, p-value: {p_val:.4g}")
verdict = (
    "Reject H0: Mean distance differs between day and night accidents."
    if p_val < alpha
    else "Fail to reject H0: No evidence of different mean distance."
)
print(verdict)


T-statistic: -35.172, p-value: 5.907e-271
Reject H0: Mean distance differs between day and night accidents.


#### Q2. Proportion of severe accidents differs by weather condition
- Does the proportion of severe accidents (e.g., Severity 4) differ between clear weather vs rainy weather (Weather_Condition)?

In [18]:
## H0: The proportion of severe accidents in clear weather equals the proportion in rainy weather.
## H1: The proportion of severe accidents in clear weather differs from the proportion in rainy weather.

import numpy as np
# !pip install statsmodels
from statsmodels.stats.proportion import proportions_ztest

# Q2: two-proportion z-test on the share of Severity == 4 records under
# 'Clear' vs 'Rain' weather labels.
# Groups come from a case-insensitive substring match on Weather_Condition;
# rows with a missing label match neither pattern (na=False).
weather = df['Weather_Condition']
clear_mask = weather.str.contains('Clear', case=False, na=False)
rain_mask = weather.str.contains('Rain', case=False, na=False)

# Keep only rows in either group. A label matching both patterns is filed
# under 'Clear' because the clear mask decides the np.where branch.
sub = df.loc[clear_mask | rain_mask].copy()
sub['weather_group'] = np.where(clear_mask.loc[sub.index], 'Clear', 'Rain')

# 1 marks a severe record (Severity == 4), 0 anything else.
sub['severe'] = sub['Severity'].eq(4).astype(int)

# Successes (severe records) and trials (all records) per group.
in_clear = sub['weather_group'] == 'Clear'
in_rain = sub['weather_group'] == 'Rain'

clear_severe, clear_total = sub.loc[in_clear, 'severe'].sum(), in_clear.sum()
rain_severe, rain_total = sub.loc[in_rain, 'severe'].sum(), in_rain.sum()

count = np.array([clear_severe, rain_severe])
nobs = np.array([clear_total, rain_total])

stat, p_val = proportions_ztest(count, nobs)
alpha = 0.05
print(f"Z-statistic: {stat:.3f}, p-value: {p_val:.4g}")
verdict = (
    "Reject H0: Severe accident proportion differs between clear and rainy weather."
    if p_val < alpha
    else "Fail to reject H0: No evidence of difference in proportions."
)
print(verdict)


Z-statistic: 22.569, p-value: 8.653e-113
Reject H0: Severe accident proportion differs between clear and rainy weather.


#### Q3. Average temperature differs between two states

- Is the mean temperature at accident time ```Temperature(F)``` different between two states, e.g., California (CA) and New York (NY)?

In [19]:
## H0: The mean temperature for accidents in California equals the mean temperature for accidents in New York.
## H1: The mean temperature for accidents in California differs from the mean temperature for accidents in New York.

from scipy import stats

# Q3: compare mean Temperature(F) between records with State 'CA' and State 'NY'.
# Rows missing either column are dropped first.
df2 = df.loc[:, ['State', 'Temperature(F)']].dropna()

in_ca = df2['State'] == 'CA'
in_ny = df2['State'] == 'NY'
ca = df2.loc[in_ca, 'Temperature(F)']
ny = df2.loc[in_ny, 'Temperature(F)']

# equal_var=False selects Welch's t-test (no equal-variance assumption).
t_stat, p_val = stats.ttest_ind(ca, ny, equal_var=False)
alpha = 0.05
print(f"T-statistic: {t_stat:.3f}, p-value: {p_val:.4g}")
verdict = (
    "Reject H0: Mean temperature differs between CA and NY accident records."
    if p_val < alpha
    else "Fail to reject H0: No evidence of different mean temperature."
)
print(verdict)

T-statistic: 262.633, p-value: 0
Reject H0: Mean temperature differs between CA and NY accident records.


#### Q4. Accident severity independent of day vs night

- Is high accident severity (a binary indicator: ```Severity``` 3–4 vs 1–2) independent of time of day (day vs night, from ```Sunrise_Sunset```)?

In [20]:
## H0: Accident severity level is independent of sunrise/sunset period (no association between high severity and day/night).
## H1: Accident severity level is associated with sunrise/sunset period (high severity depends on day/night).

import pandas as pd
from scipy.stats import chi2_contingency

# Q4: chi-square test of independence between day/night and a binary
# high-severity indicator.
# Keep only rows with both fields present and an explicit 'Day' / 'Night' label.
df3 = df[['Severity', 'Sunrise_Sunset']].dropna()
df3 = df3[df3['Sunrise_Sunset'].isin(['Day', 'Night'])].copy()

# High severity = 3 or 4, encoded as 1; lower severities as 0.
# Built with a pandas comparison instead of np.where so this cell does not rely
# on numpy having been imported by a different cell.
df3['high_sev'] = (df3['Severity'] >= 3).astype(int)

# Contingency table: rows = Sunrise_Sunset, cols = high/low severity
cont_table = pd.crosstab(df3['Sunrise_Sunset'], df3['high_sev'])
print(cont_table)

# With SciPy's default correction=True, Yates' continuity correction is applied
# when the table has one degree of freedom (a 2x2 table).
chi2, p_val, dof, expected = chi2_contingency(cont_table)
alpha = 0.05
print(f"Chi2: {chi2:.3f}, p-value: {p_val:.4g}, dof: {dof}")
if p_val < alpha:
    print("Reject H0: Severity distribution depends on day vs night.")
else:
    print("Fail to reject H0: No evidence of dependence.")


high_sev              0        1
Sunrise_Sunset                  
Day             4281995  1052558
Night           1922069   448526
Chi2: 687.361, p-value: 1.676e-151, dof: 1
Reject H0: Severity distribution depends on day vs night.


#### Q5. Mean visibility differs by presence of fog-like conditions.

- Is the mean visibility ```Visibility(mi)``` during accidents lower under fog-related weather vs non-fog weather? (one-sided test).

In [16]:
## H0: The mean visibility in fog-related weather equals the mean visibility in non-fog weather.
## H1: The mean visibility in fog-related weather is lower than in non-fog weather.

from scipy import stats
import numpy as np

# Q5: one-sided Welch t-test — is mean Visibility(mi) lower for fog-labelled
# records than for all other records?
# Rows missing either column are dropped first.
df4 = df[['Visibility(mi)', 'Weather_Condition']].dropna()

# Case-insensitive substring match on 'Fog'; every non-matching row forms the
# comparison group.
fog_mask = df4['Weather_Condition'].str.contains('Fog', case=False, na=False)
fog_vis = df4.loc[fog_mask, 'Visibility(mi)']
nofog_vis = df4.loc[~fog_mask, 'Visibility(mi)']

# alternative='less' tests H1: mean(fog_vis) < mean(nofog_vis) directly, so the
# returned p-value is already one-sided and needs no manual halving or sign check.
# Requires SciPy >= 1.6, where the `alternative` parameter was added.
t_stat, p_one_sided = stats.ttest_ind(
    fog_vis, nofog_vis, equal_var=False, alternative='less'
)

alpha = 0.05
print(f"T-statistic: {t_stat:.3f}, one-sided p-value: {p_one_sided:.4g}")
# A small one-sided p-value can only arise from a negative t-statistic, so the
# p-value comparison alone decides the outcome.
if p_one_sided < alpha:
    print("Reject H0: Mean visibility is significantly lower in fog conditions.")
else:
    print("Fail to reject H0: No evidence that fog accidents have lower visibility.")


T-statistic: -1203.789, one-sided p-value: 0
Reject H0: Mean visibility is significantly lower in fog conditions.
