In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
ds = pd.read_csv("collisions.csv", low_memory=False)

# Select relevant features
initial_features = ['CRASH DATE', 'BOROUGH', 'VEHICLE TYPE CODE 1']
target = 'NUMBER OF PERSONS INJURED'  # Make sure that 'RANKING' is the correct target column

# Preprocess the data
ds['CRASH DATE'] = pd.to_datetime(ds['CRASH DATE'])
ds['MONTH'] = ds['CRASH DATE'].dt.month
ds['YEAR'] = ds['CRASH DATE'].dt.year
ds = ds.drop(columns=['CRASH DATE'])

# Create dummy variables for categorical features
ds = pd.get_dummies(ds, columns=['BOROUGH', 'VEHICLE TYPE CODE 1'])

# Update features list to include dummy variables, MONTH, and YEAR
features = ['MONTH', 'YEAR'] + [col for col in ds.columns if 'BOROUGH_' in col or 'VEHICLE TYPE CODE 1_' in col]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(ds[features], ds[target], test_size=0.2, random_state=42)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Example prediction for a specific driver
# Note that you must include all the dummy variables, even if they are 0, for the model to make a prediction
driver_data = {
    'MONTH': 6,
    'YEAR': 2023,
    'BOROUGH_Manhattan': 1,
    # Include dummy variables for other boroughs with 0
    # Include dummy variables for 'VEHICLE TYPE CODE 1' with relevant values
}

# Make sure that driver_data contains all the features the model expects
for feature in features:
    if feature not in driver_data:
        driver_data[feature] = 0

driver_ds = pd.DataFrame([driver_data])

# Make a prediction for the driver
prediction = model.predict(driver_ds[features])[0]
print("Danger Ranking Prediction:", prediction)

ValueError: Input y contains NaN.