In [54]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load dataset
ds = pd.read_csv("collisions.csv", low_memory=False)

# rename columns

ds = ds.rename(columns={'CRASH DATE' : 'Date', 'CRASH TIME' : 'Time', 'ON STREET NAME' : 'Street', 
                        'VEHICLE TYPE CODE 1' : 'Vehicle 1', 'VEHICLE TYPE CODE 2' : 'Vehicle 2',
                        'VEHICLE TYPE CODE 3' : 'Vehicle 3', 'VEHICLE TYPE CODE 4' : 'Vehicle 4',
                        'VEHICLE TYPE CODE 5' : 'Vehicle 5'})

# List of columns you want to keep
columns_to_keep = ['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5']
ds = ds[columns_to_keep]


# Remove rows where Date, Time, Street, or Vehicle 1 is empty
ds.dropna(subset=['Date', 'Time', 'Street', 'Vehicle 1'])

# Preprocess the date column into timestamps
ds['Date'] = pd.to_datetime(ds['Date'])
ds['Date'] = (ds['Date'] - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
ds['Time'] = pd.to_datetime(ds['Time'], format='%H:%M').dt.time
ds['Time'] = ds['Time'].apply(lambda t: t.hour * 60 + t.minute)

In [55]:
le = LabelEncoder()

def count_vehicles_not_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_lower = str(vehicle).lower()
        if vehicle_lower not in exclude_set:
            count += 1
    return count

def count_vehicles_in(row, include_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_lower = str(vehicle).lower()
        if vehicle_lower in include_set:
            count += 1
    return count

vehicle_columns = ['Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']
ds['danger_level_car'] = ds[vehicle_columns].apply(count_vehicles_not_in, args= ({'Bike', 'Motorcycle', ''},), axis=1)
ds['danger_level_bike'] = ds[vehicle_columns].apply(count_vehicles_in, args= ({'Bike'},), axis=1)
ds['danger_level_motorcycle'] = ds[vehicle_columns].apply(count_vehicles_in, args= ({'Motorcycle'},), axis=1)

In [56]:
print(ds)
#ds['Street']    = le.fit_transform(ds['Street'])
#ds['Vehicle 1'] = le.fit_transform(ds['Vehicle 1'])
#ds['Vehicle 2'] = le.fit_transform(ds['Vehicle 2'])
#ds['Vehicle 3'] = le.fit_transform(ds['Vehicle 3'])
#ds['Vehicle 4'] = le.fit_transform(ds['Vehicle 4'])
#ds['Vehicle 5'] = le.fit_transform(ds['Vehicle 5'])

#scaler = MinMaxScaler(feature_range=(0, 100))
#ds['danger_level'] = scaler.fit_transform(np.array(ds['danger_level']).reshape(-1, 1))

# Split data
#X = ds[['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4', 'Vehicle 5']]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train model
#model = RandomForestRegressor()
#model.fit(X_train, y_train)

# Evaluate model
#predictions = model.predict(X_test)
#error = mean_squared_error(y_test, predictions)
#print(f"Mean Squared Error: {error}")

# Check if target variable has only one unique value
#if len(np.unique(y_train)) == 1:
#    print("Target variable has only one unique value.")

# Check if any feature column has only one unique value
#for i, column in enumerate(X_train.columns):
#    if len(X_train[column].unique()) == 1:
#        print(f"Feature column {i} ({column}) has only one unique value.")

              Date  Time                   Street  \
0        337478400   159    WHITESTONE EXPRESSWAY   
1        322876800   705  QUEENSBORO BRIDGE UPPER   
2        331084800   415       THROGS NECK BRIDGE   
3        337478400   575                      NaN   
4        345600000   493          SARATOGA AVENUE   
...            ...   ...                      ...   
1987316  325468800   735          RICHMOND AVENUE   
1987317  324864000  1276             EDSON AVENUE   
1987318  324950400   460           BEDFORD AVENUE   
1987319  323568000   950          NOSTRAND AVENUE   
1987320  325468800  1145         ROOSEVELT AVENUE   

                                        Vehicle 1  \
0                                           Sedan   
1                                           Sedan   
2                                           Sedan   
3                                           Sedan   
4                                             NaN   
...                                          