In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

df = pd.read_csv('collisions.csv', low_memory=False)

# initialize the scaler
scaler = MinMaxScaler(feature_range=(1, 100))

In [2]:
# Model for Time
le = LabelEncoder()
filtered_for_time = df[['CRASH DATE', 'CRASH TIME']].copy()

# Convert 'CRASH TIME' to datetime format and extract the hour
filtered_for_time['CRASH TIME'] = pd.to_datetime(filtered_for_time['CRASH TIME'])
filtered_for_time['Hour'] = filtered_for_time['CRASH TIME'].dt.hour

bins_time = [0, 4, 8, 12, 16, 20, 24]
labels_time = ['00:00-03:59', '04:00-07:59', '08:00-11:59', '12:00-15:59', '16:00-19:59', '20:00-23:59']
filtered_for_time['Timeslot'] = pd.cut(filtered_for_time['Hour'], bins=bins_time, labels=labels_time, right=False)
filtered_for_time['CRASH DATE'] = pd.to_datetime(filtered_for_time['CRASH DATE'])
filtered_for_time['Month'] = filtered_for_time['CRASH DATE'].dt.month
filtered_for_time['Day'] = filtered_for_time['CRASH DATE'].dt.day
df_grouped_time = filtered_for_time.groupby(['Day', 'Month', 'Timeslot']).size().reset_index(name='Count')

# compute danger score
df_grouped_time['Danger Score'] = scaler.fit_transform(df_grouped_time[['Count']])

df_grouped_time['Timeslot'] = le.fit_transform(df_grouped_time['Timeslot'])

features_time = df_grouped_time[['Day', 'Month', 'Timeslot']]
target_time = df_grouped_time['Danger Score']

X_train_time, X_test_time, y_train_time, y_test_time = train_test_split(features_time, target_time, test_size=0.2, random_state=42)

model_for_time = RandomForestRegressor(n_estimators=100, random_state=42)
model_for_time.fit(X_train_time, y_train_time)

In [21]:
le2 = LabelEncoder()
# Model for Street
filtered_for_street = df[['ON STREET NAME']].copy()
filtered_for_street['ON STREET NAME'] = filtered_for_street['ON STREET NAME'].str.upper().str.strip()

df_grouped_street = filtered_for_street.groupby(['ON STREET NAME']).size().reset_index(name='Count')

rows_with_broadway = df_grouped_street[df_grouped_street['ON STREET NAME'].str.contains('BROADWAY')]

df_grouped_street['Danger Score'] = scaler.fit_transform(df_grouped_street[['Count']])

df_grouped_street['ON STREET NAME'] = le2.fit_transform(df_grouped_street['ON STREET NAME'])

features_street = df_grouped_street[['ON STREET NAME']]
target_street = df_grouped_street['Danger Score']
                                                        
X_train_street, X_test_street, y_train_street, y_test_street = train_test_split(features_street, target_street, test_size=0.2, random_state=42)

model_for_street = RandomForestRegressor(n_estimators=100, random_state=42)
model_for_street.fit(X_train_street, y_train_street)

      ON STREET NAME  Count  Danger Score
2444            2444  19149         100.0


In [23]:
# Prepare the input data

street = le2.transform(['BROADWAY'])
t

input_data = pd.DataFrame([
    [le.transform(['12:00-15:59']), 11, 7, street]
], columns=['Timeslot', 'Day', 'Month', 'ON STREET NAME'])

print(test)

# Make predictions using the time and street models
time_prediction = model_for_time.predict(input_data[['Day', 'Month', 'Timeslot']])
street_prediction = model_for_street.predict(input_data[['ON STREET NAME']])

# Calculate the average prediction
average_prediction = (time_prediction + street_prediction) / 2

print(f"Prediction: {average_prediction[0]}")
print(f"Street Prediction: {street_prediction[0]}")
print(f"Time Prediction: {time_prediction[0]}")

[2444]
Prediction: 69.97307826491436
Street Prediction: 60.81480937956965
Time Prediction: 79.13134715025907
