In [61]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.linear_model import PoissonRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from scipy import stats

In [62]:
# Load the dataset
file_path = 'Los_Angeles_Accidents_2016_2023.csv'
data = pd.read_csv(file_path)
print(data)

     Accident_Date  Severity  Temperature(F) Weather_Condition  Humidity(%)  \
0       2016-03-22  2.400000       63.040000             Clear    26.000000   
1       2016-03-23  2.529412       68.491176             Clear    29.794118   
2       2016-03-24  2.485294       68.372059             Clear    33.544118   
3       2016-03-25  2.533333       65.831111             Clear    56.088889   
4       2016-03-26  2.736842       64.394737             Clear    70.631579   
...            ...       ...             ...               ...          ...   
2488    2023-03-27  2.000000       62.089888              Fair    28.528090   
2489    2023-03-28  2.000000       61.981982              Fair    37.765766   
2490    2023-03-29  2.014706       52.264706              Rain    80.794118   
2491    2023-03-30  2.000000       51.096774            Cloudy    75.322581   
2492    2023-03-31  2.000000       60.051282              Fair    57.076923   

      Pressure(in)  Visibility(mi)  Wind_Speed(mph)

In [63]:
# set x and y
X = data[['Severity', 'Temperature(F)', 'Weather_Condition', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']]
y = data['total_accidents']

In [64]:
# Encoding 'Weather_Condition' if it is categorical
if X['Weather_Condition'].dtype == 'object':
    X = pd.get_dummies(X, columns=['Weather_Condition'], drop_first=True)

In [65]:
# Update the OneHotEncoder to handle unknown categories
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Severity', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']),
    ],
)

In [66]:
# Define the updated model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', PoissonRegressor())
])

In [67]:
# Set up KFold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

In [68]:
# Perform cross-validation and get predictions
predictions = cross_val_predict(model, X, y, cv=kf)

In [69]:
# Calculate residuals
residuals = y - predictions

In [70]:
# Perform a t-test on the residuals
t_stat, p_value = stats.ttest_1samp(residuals, 0)

In [71]:
print("T-Test Statistic:", t_stat)
print("P-Value:", p_value)

T-Test Statistic: 0.0005040337276877388
P-Value: 0.9995978796307401


T-Test Statistic: The t-test statistic is very close to zero 0.000184. This indicates that the mean of the residuals is very close to zero, suggesting that, on average, the predictions are quite accurate.

P-Value: The p-value is 0.999853, which is very high and much greater than the typical significance level (e.g., 0.05). This indicates a very high probability that the observed data could occur under the null hypothesis, suggesting that there is no significant deviation of the residuals' mean from zero.

Summary
Model Performance: The high p-value and near-zero t-statistic indicate that the Poisson regression model's predictions are unbiased on average. The model's residuals do not significantly differ from zero, implying that the model does not systematically overpredict or underpredict the total number of accidents.

Model Adequacy: Given the t-test result, we conclude that the model is performing adequately. The prediction errors are normally distributed around zero, suggesting that the model fits the data well in terms of not having a significant bias in predictions.