In [223]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

In [209]:
df = pd.read_csv('collisions.csv', low_memory=False)
df['CRASH DATE'] = pd.to_datetime(df['CRASH DATE'])
df['ON STREET NAME'] = df['ON STREET NAME'].str.upper().str.strip()

In [210]:
# calculate danger score per date

grouped_by_date = df[['CRASH DATE']].copy().dropna()

grouped_by_date['Month'] = grouped_by_date['CRASH DATE'].dt.month
grouped_by_date['Day'] = grouped_by_date['CRASH DATE'].dt.day
grouped_by_date = grouped_by_date[['Day', 'Month']]

grouped_by_date = grouped_by_date.groupby(['Day', 'Month']).size().reset_index(name='Count')
grouped_by_date['Danger Score Date'] = (grouped_by_date['Count'] / grouped_by_date['Count'].max()) * 100
grouped_by_date = grouped_by_date[['Day', 'Month', 'Danger Score Date']]

print(grouped_by_date)

     Day  Month  Danger Score Date
0      1      1          64.295808
1      1      2          80.938923
2      1      3          87.282148
3      1      4          77.704506
4      1      5          79.839849
..   ...    ...                ...
361   31      5          88.035798
362   31      7          93.468362
363   31      8          87.784582
364   31     10          90.783482
365   31     12          73.528026

[366 rows x 3 columns]


In [211]:
# calculate danger score per hour

grouped_by_time = df[['CRASH TIME']].copy().dropna()

grouped_by_time['CRASH TIME'] = pd.to_datetime(grouped_by_time['CRASH TIME']).dt.hour;

grouped_by_time = grouped_by_time[['CRASH TIME']].groupby(['CRASH TIME']).size().reset_index(name='Count')

grouped_by_time['Danger Score Time'] = (grouped_by_time['Count'] / grouped_by_time['Count'].max()) * 100

grouped_by_time = grouped_by_time[['CRASH TIME', 'Danger Score Time']]

print(grouped_by_time)

    CRASH TIME  Danger Score Time
0            0          43.717088
1            1          23.459492
2            2          17.992164
3            3          15.775360
4            4          17.853917
5            5          19.489948
6            6          30.466981
7            7          41.555167
8            8          76.016006
9            9          73.573423
10          10          68.681311
11          11          71.692857
12          12          76.166062
13          13          80.073083
14          14          92.600697
15          15          86.540786
16          16         100.000000
17          17          97.724841
18          18          85.951676
19          19          70.254123
20          20          58.590027
21          21          49.524127
22          22          45.123171
23          23          37.779445


In [212]:
# calculate danger score per Street

grouped_by_street = df[['ON STREET NAME']].copy().dropna()

grouped_by_street = grouped_by_street[['ON STREET NAME']].groupby(['ON STREET NAME']).size().reset_index(name='Count')

grouped_by_street['Danger Score Street'] = (grouped_by_street['Count'] / grouped_by_street['Count'].max()) * 100
grouped_by_street = grouped_by_street[['ON STREET NAME', 'Danger Score Street']]

print(grouped_by_street)

                              ON STREET NAME  Danger Score Street
0                                                        0.078333
1      1 278 ROBERT F KENNEDY BRIDGE N//2020             0.005222
2            1 900 G ROBERT F KENNEDY BRIDGE             0.005222
3                                      1 AVE             0.005222
4                                   1 AVENUE            32.367225
...                                      ...                  ...
10644                            ZOLLER ROAD             0.062666
10645                               ZOO PATH             0.005222
10646                         ZULETTE AVENUE             0.344666
10647                          ZWICKY AVENUE             0.020889
10648                         ESTFARMS ROAD             0.005222

[10649 rows x 2 columns]


In [213]:
df_filtered = df[['CRASH DATE', 'CRASH TIME', 'ON STREET NAME']].copy()

df_filtered['Day'] = df_filtered['CRASH DATE'].dt.day
df_filtered['Month'] = df_filtered['CRASH DATE'].dt.month

df_filtered = df_filtered[['Day', 'Month', 'CRASH TIME', 'ON STREET NAME']]


df_filtered['CRASH TIME'] = pd.to_datetime(df_filtered['CRASH TIME']).dt.hour;

df_filtered = pd.merge(df_filtered, grouped_by_date, on=['Day', 'Month'], how='left')
df_filtered = pd.merge(df_filtered, grouped_by_time, on='CRASH TIME', how='left')
df_filtered = pd.merge(df_filtered, grouped_by_street, on=['ON STREET NAME'], how='left')
df_filtered = df_filtered.rename(columns={'CRASH TIME' : 'Hour', 'ON STREET NAME' : 'Street'})

for column in ['Danger Score Time', 'Danger Score Date', 'Danger Score Street']:
    df_filtered[column] = df_filtered[column].fillna(50)
    

df_filtered['Street'] = df_filtered['Street'].fillna('EMPTY')

df_filtered['Danger Score Total'] = (df_filtered['Danger Score Date'] + df_filtered['Danger Score Street'] + df_filtered['Danger Score Time']) / 3

print(df_filtered)

         Day  Month  Hour                   Street  Danger Score Date  \
0         11      9     2    WHITESTONE EXPRESSWAY          92.102371   
1         26      3    11  QUEENSBORO BRIDGE UPPER          77.107866   
2         29      6     6       THROGS NECK BRIDGE          89.558800   
3         11      9     9                    EMPTY          92.102371   
4         14     12     8          SARATOGA AVENUE          97.911760   
...      ...    ...   ...                      ...                ...   
1987316   25      4    12          RICHMOND AVENUE          78.615167   
1987317   18      4    21             EDSON AVENUE          76.919454   
1987318   19      4     7           BEDFORD AVENUE          79.572931   
1987319    3      4    15          NOSTRAND AVENUE          78.363950   
1987320   25      4    19         ROOSEVELT AVENUE          78.615167   

         Danger Score Time  Danger Score Street  Danger Score Total  
0                17.992164            12.867513      

In [216]:
# Random Forest Regressor
le = LabelEncoder()
#df_sampled = df_filtered[:100000].copy()
df_sampled = df_filtered

df_sampled['Street'] = le.fit_transform(df_sampled['Street'])

features = df_sampled[['Day', 'Month', 'Hour', 'Street']]
target = df_sampled[['Danger Score Time', 'Danger Score Date', 'Danger Score Street', 'Danger Score Total']]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [218]:
# Make predictions on the test set
predictions = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, predictions)

# Calculate the mean absolute error
mae = mean_absolute_error(y_test, predictions)

# Calculate the r-squared score
r2 = r2_score(y_test, predictions)

print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)
print('R-squared Score:', r2)

Mean Squared Error: 3.385931948998256
Mean Absolute Error: 0.7553776528824623
R-squared Score: 0.9453928623186059


In [247]:
def predict_danger_score(day, month, time, street):
    # Convert street to uppercase
    street = street.upper().strip()

    # Encode street using the LabelEncoder if it exists in the classes -> otherwise EMPTY
    if street in le.classes_:
        street_encoded = le.transform([street])[0]
    else:
        street_encoded = le.transform(['EMPTY'])[0]

    # Create input data for prediction
    input_data = pd.DataFrame([[day, month, time, street_encoded]], columns=['Day', 'Month', 'Hour', 'Street'])

    # Make predictions
    predictions = model.predict(input_data)

    # Return the predicted danger score
    return predictions[0]

In [248]:
test_day = 9
test_month = 7
test_timeslot = 6
test_street = 'Broadway'

predicted_score = predict_danger_score(test_day, test_month, test_timeslot, test_street)
print('Predicted Danger Score Total:', predicted_score)

Predicted Danger Score Total: [30.46698067 86.55989951 50.         55.67562673]
