In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
#Read dataset
df = pd.read_csv("collisions.csv", low_memory=False)

#Contained Columns:
#CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,
#NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,
#NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,
#CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,
#CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,
#COLLISION_ID,
#VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5

In [3]:
#Preprocess the data

#rename columns
df = df.rename(columns={'CRASH DATE' : 'Date', 'CRASH TIME' : 'Time', 'ON STREET NAME' : 'Street', 
                        'VEHICLE TYPE CODE 1' : 'Vehicle 1', 'VEHICLE TYPE CODE 2' : 'Vehicle 2',
                        'VEHICLE TYPE CODE 3' : 'Vehicle 3', 'VEHICLE TYPE CODE 4' : 'Vehicle 4',
                        'VEHICLE TYPE CODE 5' : 'Vehicle 5', 'NUMBER OF PERSONS INJURED':'Injuries', 
                        'NUMBER OF PERSONS KILLED': 'Deaths'})

# Keep rows fith defined 'street', 'crash time' and Vehicle Type 1
df = df.dropna(subset=['Date','Street', 'Time', 'Vehicle 1'])

# Preprocess the date column into timestamps
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = (df['Date'] - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.time
df['Time'] = df['Time'].apply(lambda t: t.hour * 60 + t.minute)

# Format dataset for uniform strings
df['Street'] = df['Street'].str.upper()
df['Vehicle 1'] = df['Vehicle 1'].str.upper()
df['Vehicle 2'] = df['Vehicle 2'].str.upper()
df['Vehicle 3'] = df['Vehicle 3'].str.upper()
df['Vehicle 4'] = df['Vehicle 4'].str.upper()
df['Vehicle 5'] = df['Vehicle 5'].str.upper()

##List of columns we keep
columns_to_keep = ['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5','Injuries','Deaths']
df = df[columns_to_keep]

In [14]:
#Limit dataframe size
new_df = pd.DataFrame.from_records(df[:100000])  


# Create a list of the columns that represent vehicle types
vehicle_columns = ['Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5']

def count_vehicles_not_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper not in exclude_set:
            count += 1
    return count

def count_vehicles_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper in exclude_set:
            count += 1
    return count

new_df['CARS_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_not_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE', 'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)
new_df['BIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE'},), axis=1)
new_df['MOTORBIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)

In [15]:
# Now the 'new_df' DataFrame contains the transformed data
needed_columns = ['Date', 'Time', 'Street','CARS_INVOLVED','BIKES_INVOLVED', 'MOTORBIKES_INVOLVED','Injuries', 'Deaths']

new_df = new_df[needed_columns]

print(new_df)

            Date  Time                        Street  CARS_INVOLVED  \
0      337478400   159         WHITESTONE EXPRESSWAY              2   
1      322876800   705       QUEENSBORO BRIDGE UPPER              1   
2      331084800   415            THROGS NECK BRIDGE              2   
3      324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP              2   
4      345600000  1025    BROOKLYN QUEENS EXPRESSWAY              2   
...          ...   ...                           ...            ...   
99995  329270400  1285                 WATSON AVENUE              2   
99996  329270400  1005                SPRAGUE AVENUE              1   
99997  329356800     0                     94 AVENUE              1   
99998  329356800   797                      8 AVENUE              1   
99999  329443200  1275                     90 STREET              1   

       BIKES_INVOLVED  MOTORBIKES_INVOLVED  Injuries  Deaths  
0                   0                    0       2.0     0.0  
1                   0

In [17]:
sum_of_injuries = new_df['Injuries'].sum()
sum_of_deaths = new_df['Deaths'].sum()
vehicle_sums = {
    'CARS': new_df['CARS_INVOLVED'].sum(),
    'BIKES': new_df['BIKES_INVOLVED'].sum(),
    'MOTORBIKES': new_df['MOTORBIKES_INVOLVED'].sum()
}

#TODO: Fix score function
# Define the scoring mechanism for each vehicle type
def calculate_vehicle_score(row, vehicle_type):
    involved_vehicles = row[f'{vehicle_type}_INVOLVED']
    injuries = row['Injuries']
    deaths = row['Deaths']
    
    # Define the weights for the scoring mechanism
    vehicle_weight = {'CARS': 0.5, 'BIKES': 0.3, 'MOTORBIKES': 0.2}
    injury_weight = 0.2
    death_weight = 0.3
    
    # Calculate the vehicle score
    vehicle_score = (involved_vehicles / vehicle_sums[vehicle_type]) * 100 * vehicle_weight[vehicle_type]
    
    # Calculate the injury score
    injury_score = (injuries / sum_of_injuries) * 100 * injury_weight
    
    # Calculate the death score
    death_score = (deaths / sum_of_deaths) * 100 * death_weight
    
    # Calculate the total score
    total_score = vehicle_score + injury_score + death_score
    
    return total_score

# Create a list of vehicle types
vehicle_types = ['CARS', 'BIKES', 'MOTORBIKES']

# Calculate the score for each vehicle type and create new columns
for vehicle_type in vehicle_types:
    new_df[f'{vehicle_type}_DANGERSCORE'] = new_df.apply(calculate_vehicle_score, args=(vehicle_type,), axis=1)

# Now the DataFrame contains new columns with the scores for each vehicle type (e.g., 'cars_score', 'bikes_score', 'motorbikes_score')

In [18]:
print(new_df[new_df['CARS_DANGERSCORE'] > 3])

Empty DataFrame
Columns: [Date, Time, Street, CARS_INVOLVED, BIKES_INVOLVED, MOTORBIKES_INVOLVED, Injuries, Deaths, CARS_DANGERSCORE, BIKES_DANGERSCORE, MOTORBIKES_DANGERSCORE]
Index: []


In [19]:
needed_columns= ['Date', 'Time', 'Street','CARS_DANGERSCORE','BIKES_DANGERSCORE','MOTORBIKES_DANGERSCORE']

df_for_model = pd.DataFrame.from_records(new_df[needed_columns])

print(df_for_model)

            Date  Time                        Street  CARS_DANGERSCORE  \
0      337478400   159         WHITESTONE EXPRESSWAY          0.001302   
1      322876800   705       QUEENSBORO BRIDGE UPPER          0.000651   
2      331084800   415            THROGS NECK BRIDGE          0.000585   
3      324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP          0.000585   
4      345600000  1025    BROOKLYN QUEENS EXPRESSWAY          0.000585   
...          ...   ...                           ...               ...   
99995  329270400  1285                 WATSON AVENUE          0.000585   
99996  329270400  1005                SPRAGUE AVENUE          0.000651   
99997  329356800     0                     94 AVENUE          0.000292   
99998  329356800   797                      8 AVENUE          0.000651   
99999  329443200  1275                     90 STREET          0.000292   

       BIKES_DANGERSCORE  MOTORBIKES_DANGERSCORE  
0               0.000718                0.000718  
1        

In [20]:
# Preprocess the data for model training

encoder = LabelEncoder()
df_for_model['Street'] = encoder.fit_transform(df_for_model['Street'])  # Convert 'Street' column to numerical label

In [21]:
# Train separate linear regression models for each vehicle type
car_data = df_for_model[['Time', 'Date', 'Street', 'CARS_DANGERSCORE']].dropna()
car_X = car_data[['Time', 'Date', 'Street']]
car_y = car_data['CARS_DANGERSCORE']

car_X_train, car_X_test, car_Y_train, car_Y_test = train_test_split(car_X, car_y, test_size=0.2)

car_model = LinearRegression()
car_model.fit(car_X_train, car_Y_train)

car_predictions = car_model.predict(car_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(car_Y_test, car_predictions)
mse = mean_squared_error(car_Y_test, car_predictions)
r2 = r2_score(car_Y_test, car_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 0.0007407560421754718
Mean Squared Error (MSE) for car model: 2.9452253564416797e-05
R-squared for car model: -0.00018247865637421867


In [22]:
# Train separate linear regression models for each vehicle type
bike_data = df_for_model[['Time', 'Date', 'Street', 'BIKES_DANGERSCORE']].dropna()
bike_X = bike_data[['Time', 'Date', 'Street']]
bike_y = bike_data['BIKES_DANGERSCORE']

bike_X_train, bike_X_test, bike_Y_train, bike_Y_test = train_test_split(bike_X, bike_y, test_size=0.2)

bike_model = LinearRegression()
bike_model.fit(bike_X_train, bike_Y_train)

bike_predictions = bike_model.predict(bike_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(bike_Y_test, bike_predictions)
mse = mean_squared_error(bike_Y_test, bike_predictions)
r2 = r2_score(bike_Y_test, bike_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 0.0011352345761117336
Mean Squared Error (MSE) for car model: 2.6401830420303184e-05
R-squared for car model: 1.0399371799918455e-05


In [23]:
# Train separate linear regression models for each vehicle type
motorbike_data = df_for_model[['Time', 'Date', 'Street', 'MOTORBIKES_DANGERSCORE']].dropna()
motorbike_X = motorbike_data[['Time', 'Date', 'Street']]
motorbike_y = motorbike_data['MOTORBIKES_DANGERSCORE']

motorbike_X_train, motorbike_X_test, motorbike_Y_train, motorbike_Y_test = train_test_split(motorbike_X, motorbike_y, test_size=0.2)

motorbike_model = LinearRegression()
motorbike_model.fit(motorbike_X_train, motorbike_Y_train)

motorbike_predictions = motorbike_model.predict(motorbike_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(motorbike_Y_test, motorbike_predictions)
mse = mean_squared_error(motorbike_Y_test, motorbike_predictions)
r2 = r2_score(motorbike_Y_test, motorbike_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 0.0010137791738986685
Mean Squared Error (MSE) for car model: 2.8884851539961865e-05
R-squared for car model: 9.681392249005771e-05


In [24]:
# Function to predict the danger score for a given combination of 'Time', 'Date', 'Street' and vehicle type
def predict_danger_score(time, date, street, vehicle_type):   
    street_label = encoder.transform([street])
    data = np.asarray([[time, date, street_label]],order='K',dtype=object)
    
    if vehicle_type == 'cars':
        model = car_model
    elif vehicle_type == 'bikes':
        model = bike_model
    elif vehicle_type == 'motorbikes':
        model = motorbike_model
    else:
        return None
    
    return model.predict(data)[0]

def parseTime(time):
    return time.hour * 60 + time.minute

# Example usage
time = '14:02'  # Example value for 'Time'
date = '07/07/2023'  # Example value for 'Date'
street = 'BROADWAY'  # Example value for 'Street'
vehicle_type = 'motorbikes'  # Example vehicle type

# Preprocess the date column into timestamps
date = pd.to_datetime(date)
date = (date - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
time = pd.to_datetime(time, format='%H:%M').time()
time = parseTime(time)

predicted_score = predict_danger_score(time, date, street, vehicle_type)
print(f"The predicted danger score for {vehicle_type} at {time}:00 on {date} on {street} is {predicted_score}")

The predicted danger score for motorbikes at 842:00 on 426384000 on BROADWAY is 0.0011234176928275573


