In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the accidents CSV into a DataFrame and echo it for a quick visual check
file_path = 'Los_Angeles_Accidents_2016_2023.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
print(data)

     Accident_Date  Severity  Temperature(F) Weather_Condition  Humidity(%)  \
0       2016-03-22  2.400000       63.040000             Clear    26.000000   
1       2016-03-23  2.529412       68.491176             Clear    29.794118   
2       2016-03-24  2.485294       68.372059             Clear    33.544118   
3       2016-03-25  2.533333       65.831111             Clear    56.088889   
4       2016-03-26  2.736842       64.394737             Clear    70.631579   
...            ...       ...             ...               ...          ...   
2488    2023-03-27  2.000000       62.089888              Fair    28.528090   
2489    2023-03-28  2.000000       61.981982              Fair    37.765766   
2490    2023-03-29  2.014706       52.264706              Rain    80.794118   
2491    2023-03-30  2.000000       51.096774            Cloudy    75.322581   
2492    2023-03-31  2.000000       60.051282              Fair    57.076923   

      Pressure(in)  Visibility(mi)  Wind_Speed(mph)

In [3]:
# Features (X): severity plus the weather columns; target (y): the accident date
X = data[[
    'Severity',
    'Temperature(F)',
    'Weather_Condition',
    'Humidity(%)',
    'Pressure(in)',
    'Visibility(mi)',
    'Wind_Speed(mph)',
    'Precipitation(in)',
]]
y = data['Accident_Date']

In [4]:
# One-hot encode 'Weather_Condition' when it is still present and non-numeric.
# - The membership check makes the cell safe to re-run: after the first run the
#   column has been replaced by its dummy columns and no longer exists.
# - The non-numeric check covers object, string and categorical dtypes alike,
#   so text labels never reach the scaler un-encoded.
# drop_first=True drops one dummy level to avoid a redundant (collinear) column.
if 'Weather_Condition' in X.columns and not pd.api.types.is_numeric_dtype(X['Weather_Condition']):
    X = pd.get_dummies(X, columns=['Weather_Condition'], drop_first=True)

In [5]:
# Convert 'Accident_Date' to a numeric target: float seconds since the Unix epoch.
# The value is derived from data['Accident_Date'] (not from the current y) so that
# re-running this cell always yields the same result instead of re-converting an
# already-converted y.
# Dividing the offset from the epoch by a one-second Timedelta gives seconds
# independently of the datetime resolution and of the platform's integer width.
accident_dates = pd.to_datetime(data['Accident_Date'])
y = (accident_dates - pd.Timestamp('1970-01-01')) / pd.Timedelta(seconds=1)

In [6]:
# Standardize the feature columns: learn the scaling statistics from X,
# then apply them to X to obtain the scaled feature matrix
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Ordinary least-squares linear regression (intercept fitted, the library default)
model = LinearRegression(fit_intercept=True)

In [8]:
# Perform 10-fold cross-validation (shuffled, fixed seed for reproducibility).
# The scaler and the model are chained in a pipeline and evaluated on the
# unscaled features X, so within every fold the scaling statistics are learned
# from that fold's training split only and the held-out split never leaks into
# preprocessing.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_pipeline = make_pipeline(StandardScaler(), model)
cv_scores = cross_val_score(cv_pipeline, X, y, cv=kf, scoring='r2')
print(f"R² Scores: {cv_scores}")
print(f"Mean R² Score: {cv_scores.mean()}")
# This value is the spread of the per-fold scores, so label it as a standard deviation
print(f"R² Score Standard Deviation: {cv_scores.std()}")

R² Scores: [0.72844574 0.77585511 0.74732849 0.7625664  0.76109573 0.75057881
 0.76712748 0.78327673 0.73139319 0.73492519]
Mean R² Score: 0.7542592861341765
Mean R² Score Standard: 0.01792850731622789


Summary: The mean R² score of approximately 0.75 suggests that the linear model explains about 75% of the variance in `Accident_Date` (encoded as seconds since the Unix epoch) based on the given features. The relatively low standard deviation of the fold scores (about 0.018) indicates consistent performance across the folds.