In [4]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

# Load the cleaned collision records. CRASH DATE is parsed to datetime at read
# time so the `.dt` accessors and date comparisons in this notebook work on it.
df = pd.read_csv(
    '../data/collisions_clean.csv',
    parse_dates=['CRASH DATE'],
)

In [8]:
# Temporal features derived from CRASH DATE (the `hour` column is already in the data)
df['day_of_week'] = df['CRASH DATE'].dt.weekday  # 0 = Monday ... 6 = Sunday
df['month'] = df['CRASH DATE'].dt.month  # calendar month, 1-12

# Season
def get_season(month):
    """Return the season label for a calendar month number.

    Args:
        month: Month number 1-12 (an int, or an integer-valued float).

    Returns:
        'Winter' for 12, 1, 2; 'Spring' for 3-5; 'Summer' for 6-8;
        'Fall' for 9-11. Any other value (NaN from a missing date, None,
        0, 13, ...) returns 'Unknown' instead of being silently counted
        as 'Fall'.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN never compares equal to anything, so missing months end up here.
    return 'Unknown'

# Label every row with the season of its crash month
df['season'] = df['month'].map(get_season)

def time_interval(hour):
    """Bucket an hour of the day into a named traffic period.

    Half-open ranges are used so that fractional hours (e.g. 9.5) fall into
    the period that contains them instead of slipping between two integer
    ranges.

    Args:
        hour: Hour of day in [0, 24); int or float.

    Returns:
        "Morning Rush" for [6, 10), "Midday" for [10, 16),
        "Evening Rush" for [16, 20), "Night" for [20, 24),
        "Late Night" for [0, 6). Missing (None / NaN) or out-of-range
        hours return "Unknown" rather than being counted as "Late Night".
    """
    if hour is None:
        return "Unknown"
    if 6 <= hour < 10:
        return "Morning Rush"
    if 10 <= hour < 16:
        return "Midday"
    if 16 <= hour < 20:
        return "Evening Rush"
    if 20 <= hour < 24:
        return "Night"
    if 0 <= hour < 6:
        return "Late Night"
    # Every comparison with NaN is False, so missing hours end up here,
    # as do negative values and values >= 24.
    return "Unknown"

# Label every row with the traffic period of its crash hour
df['time_interval'] = df['hour'].map(time_interval)


In [9]:
# Borough -> BORO_* indicator columns; a missing borough becomes its own
# 'Unknown' level. Skipped when the BOROUGH column is not present.
if 'BOROUGH' in df:
    df = pd.get_dummies(
        df.fillna({'BOROUGH': 'Unknown'}),
        columns=['BOROUGH'],
        prefix='BORO',
    )

# ZIP code -> ZIP_* indicator columns keyed on the first three characters of
# the integer-cast code. The ZIP CODE column itself is kept (as int).
# NOTE(review): a missing ZIP is filled with 0, so those rows land in 'ZIP_0'.
if 'ZIP CODE' in df:
    df['ZIP CODE'] = df['ZIP CODE'].fillna(0).astype(int)
    df['zip_group'] = df['ZIP CODE'].astype(str).str.slice(stop=3)
    df = pd.get_dummies(df, columns=['zip_group'], prefix='ZIP')

In [10]:
# Vehicle type -> VEH_TYPE_* indicator columns; missing values form an
# explicit 'Unknown' level.
df = pd.get_dummies(
    df.fillna({'vehicle_type_clean': 'Unknown'}),
    columns=['vehicle_type_clean'],
    prefix='VEH_TYPE',
)

# Contributing factors for vehicles 1 and 2 -> FACTOR1_* / FACTOR2_* indicator
# columns, again with missing values as an explicit 'Unknown' level.
df = pd.get_dummies(
    df.fillna({
        'CONTRIBUTING FACTOR VEHICLE 1': 'Unknown',
        'CONTRIBUTING FACTOR VEHICLE 2': 'Unknown',
    }),
    columns=['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2'],
    prefix={
        'CONTRIBUTING FACTOR VEHICLE 1': 'FACTOR1',
        'CONTRIBUTING FACTOR VEHICLE 2': 'FACTOR2',
    },
)


In [11]:
# Traffic planners care about serious collisions (Injury or Fatal):
# 1 = anything other than 'Property Damage Only', 0 = property damage only.
# NOTE(review): a missing `severity` also compares unequal and is labelled 1;
# confirm that is intended.
df['target_severity'] = df['severity'].apply(lambda x: 1 if x != 'Property Damage Only' else 0)

# Optional target: number of people injured or killed in the collision.
# In the NYC Motor Vehicle Collisions schema the PERSONS columns are already
# the totals across all road users; the PEDESTRIANS / CYCLIST / MOTORIST
# columns break those same people down by type. Adding the breakdown on top
# of the totals counted every casualty twice, so only the totals are summed.
df['total_casualties'] = df[['NUMBER OF PERSONS INJURED',
                             'NUMBER OF PERSONS KILLED']].sum(axis=1)

In [16]:
# Chronological train/test split on CRASH DATE
# (original note: train on 2022-2023, test on 2024+).
# NOTE(review): rows with a missing CRASH DATE satisfy neither comparison and
# so end up in neither split; confirm none are present.
temporal_cutoff = pd.Timestamp('2024-01-01')
train_df = df.loc[df['CRASH DATE'].lt(temporal_cutoff)]
test_df = df.loc[df['CRASH DATE'].ge(temporal_cutoff)]

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (192037, 177)
Test shape: (182988, 177)
