In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load dataset
ds = pd.read_csv("collisions.csv", low_memory=False)

# rename columns
ds = ds.rename(columns={'CRASH DATE' : 'Date', 'CRASH TIME' : 'Time', 'ON STREET NAME' : 'Street', 
                        'VEHICLE TYPE CODE 1' : 'Vehicle 1', 'VEHICLE TYPE CODE 2' : 'Vehicle 2',
                        'VEHICLE TYPE CODE 3' : 'Vehicle 3', 'VEHICLE TYPE CODE 4' : 'Vehicle 4',
                        'VEHICLE TYPE CODE 5' : 'Vehicle 5'})

# List of columns you want to keep
columns_to_keep = ['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5']
ds = ds[columns_to_keep]


# Remove rows where Date, Time, Street, or Vehicle 1 is empty
ds = ds.dropna(subset=['Date', 'Time', 'Street', 'Vehicle 1'])

# Preprocess the date column into timestamps
ds['Date'] = pd.to_datetime(ds['Date'])
ds['Date'] = (ds['Date'] - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
ds['Time'] = pd.to_datetime(ds['Time'], format='%H:%M').dt.time
ds['Time'] = ds['Time'].apply(lambda t: t.hour * 60 + t.minute)

print('Done')

Done


In [11]:
le = LabelEncoder()

def count_vehicles_not_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_lower = str(vehicle).lower()
        if vehicle_lower not in exclude_set:
            count += 1
    return count

def count_vehicles_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_lower = str(vehicle).lower()
        if vehicle_lower in exclude_set:
            count += 1
    return count

vehicle_columns = ['Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']
ds['danger_level_car'] = ds[vehicle_columns].apply(count_vehicles_not_in, args= ({'bike', 'e-bike', 'ebike', 'e-scooter', 'escooter', 'bicycle', 'motorcycle', 'motorscooter', 'moped'},), axis=1)
ds['danger_level_bike'] = ds[vehicle_columns].apply(count_vehicles_in, args= ({'bike', 'e-bike', 'ebike', 'e-scooter', 'escooter', 'bicycle'},), axis=1)
ds['danger_level_motorcycle'] = ds[vehicle_columns].apply(count_vehicles_in, args= ({'motorcycle', 'motorscooter', 'moped'},), axis=1)

print('Done')

Done


In [12]:
# Fit the LabelEncoder on the entire dataset before splitting
le = LabelEncoder()
all_streets = pd.concat([ds['Street'], ds['Vehicle 1'], ds['Vehicle 2'], ds['Vehicle 3'], ds['Vehicle 4'], ds['Vehicle 5']])
le.fit(all_streets)

# Split data
X = ds[['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']]
Y = ds[['danger_level_car', 'danger_level_bike', 'danger_level_motorcycle']]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Label encoding and scaling for training set
for column in ['Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']:
    X_train[column] = le.transform(X_train[column])

# Initialize a new scaler for danger levels
scaler = MinMaxScaler(feature_range=(0, 100))
for level in ['danger_level_car', 'danger_level_bike', 'danger_level_motorcycle']:
    Y_train[level] = scaler.fit_transform(Y_train[level].values.reshape(-1, 1))
    
    # Label encoding and scaling for test set
# NOTE: Use the same encoder object as used for the training set.
for column in ['Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']:
    X_test[column] = le.transform(X_test[column])

for level in ['danger_level_car', 'danger_level_bike', 'danger_level_motorcycle']:
    Y_test[level] = scaler.transform(Y_test[level].values.reshape(-1, 1))

# Train model
model = RandomForestRegressor()
model.fit(X_train, Y_train)

# Evaluate model
predictions = model.predict(X_test)

# Calculate the performance metrics
mae = mean_absolute_error(Y_test, predictions)
mse = mean_squared_error(Y_test, predictions)
r2 = r2_score(Y_test, predictions)

# Output the metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared: {r2}')

Mean Absolute Error (MAE): 3.1928862458671663
Mean Squared Error (MSE): 32.51760908599156
R-squared: 0.827139243759575
