In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
#Read dataset
df = pd.read_csv("collisions.csv", low_memory=False)

#Contained Columns:
#CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,
#NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,
#NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,
#CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,
#CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,
#COLLISION_ID,
#VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5

In [3]:
#Preprocess the data

#rename columns
df = df.rename(columns={'CRASH DATE' : 'Date', 'CRASH TIME' : 'Time', 'ON STREET NAME' : 'Street', 
                        'VEHICLE TYPE CODE 1' : 'Vehicle 1', 'VEHICLE TYPE CODE 2' : 'Vehicle 2',
                        'VEHICLE TYPE CODE 3' : 'Vehicle 3', 'VEHICLE TYPE CODE 4' : 'Vehicle 4',
                        'VEHICLE TYPE CODE 5' : 'Vehicle 5', 'NUMBER OF PERSONS INJURED':'Injuries', 
                        'NUMBER OF PERSONS KILLED': 'Deaths'})

# Keep rows fith defined 'street', 'crash time' and Vehicle Type 1
df = df.dropna(subset=['Date','Street', 'Time', 'Vehicle 1'])

# Preprocess the date column into timestamps
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = (df['Date'] - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.time
df['Time'] = df['Time'].apply(lambda t: t.hour * 60 + t.minute)

# Format dataset for uniform strings
df['Street'] = df['Street'].str.upper()
df['Vehicle 1'] = df['Vehicle 1'].str.upper()
df['Vehicle 2'] = df['Vehicle 2'].str.upper()
df['Vehicle 3'] = df['Vehicle 3'].str.upper()
df['Vehicle 4'] = df['Vehicle 4'].str.upper()
df['Vehicle 5'] = df['Vehicle 5'].str.upper()

##List of columns we keep
columns_to_keep = ['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5','Injuries','Deaths']
df = df[columns_to_keep]

In [4]:
#Limit dataframe size
new_df = pd.DataFrame.from_records(df[:100000])  


# Create a list of the columns that represent vehicle types
vehicle_columns = ['Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5']

def count_vehicles_not_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper not in exclude_set:
            count += 1
    return count

def count_vehicles_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper in exclude_set:
            count += 1
    return count

new_df['CARS_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_not_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE', 'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)
new_df['BIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE'},), axis=1)
new_df['MOTORBIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)

In [5]:
# Now the 'new_df' DataFrame contains the transformed data
needed_columns = ['Date', 'Time', 'Street','CARS_INVOLVED','BIKES_INVOLVED', 'MOTORBIKES_INVOLVED','Injuries', 'Deaths']

new_df = new_df[needed_columns]

print(new_df)

            Date  Time                        Street  CARS_INVOLVED  \
0      337478400   159         WHITESTONE EXPRESSWAY              2   
1      322876800   705       QUEENSBORO BRIDGE UPPER              1   
2      331084800   415            THROGS NECK BRIDGE              2   
3      324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP              2   
4      345600000  1025    BROOKLYN QUEENS EXPRESSWAY              2   
...          ...   ...                           ...            ...   
99995  329270400  1285                 WATSON AVENUE              2   
99996  329270400  1005                SPRAGUE AVENUE              1   
99997  329356800     0                     94 AVENUE              1   
99998  329356800   797                      8 AVENUE              1   
99999  329443200  1275                     90 STREET              1   

       BIKES_INVOLVED  MOTORBIKES_INVOLVED  Injuries  Deaths  
0                   0                    0       2.0     0.0  
1                   0

In [6]:
car_rows = new_df[new_df['CARS_INVOLVED'] > 0]
bike_rows = new_df[new_df['BIKES_INVOLVED'] > 0]
motorbike_rows = new_df[new_df['MOTORBIKES_INVOLVED'] > 0]

maxVehicle = {
    'CARS': new_df['CARS_INVOLVED'].max(),
    'BIKES': new_df['BIKES_INVOLVED'].max(),
    'MOTORBIKES': new_df['MOTORBIKES_INVOLVED'].max()
}
maxDeaths = {
    'CARS': car_rows['Deaths'].max(),
    'BIKES': bike_rows['Deaths'].max(),
    'MOTORBIKES': motorbike_rows['Deaths'].max()
}
maxInjuries = {
    'CARS': car_rows['Injuries'].max(),
    'BIKES': bike_rows['Injuries'].max(),
    'MOTORBIKES': motorbike_rows['Injuries'].max()
}

# Function for calculating the severity of death count
def calculate_death_weight(number):
    if number == 0:
        return 0.0
    else:
        return 1 - (0.5 ** number)

#Function for calculating the severity of injury count
def calculate_injury_weight(number):
    if number == 0:
        return 0.0
    else:
        return 1 - ((1 - 0.1) ** number)
    
# Assuming your dataset is stored in a pandas DataFrame called 'df'
# and the columns are named 'Injuries', 'Deaths', 'Car_count', 'Bike_count', 'Motorbike_count', 'Streetname'

# The scoring mechanism
def calculate_danger_score(row,vehicle_type):
    involved_vehicles = row[f'{vehicle_type}_INVOLVED']
    
    # Calculate the danger score
    danger_score = ((calculate_injury_weight(row['Injuries']) +
                    calculate_death_weight(row['Deaths'])) * involved_vehicles)
    
    # Normalize the danger score to a 0-100 range
    max_score = ((calculate_injury_weight(maxInjuries[vehicle_type]) +
                 calculate_death_weight(maxDeaths[vehicle_type])) * involved_vehicles)
    
    danger_score_normalized = (danger_score / max_score) * 100 if max_score > 0 else 0
    
    return danger_score_normalized

# Create a list of vehicle types
vehicle_types = ['CARS', 'BIKES', 'MOTORBIKES']

# Calculate the score for each vehicle type and create new columns
for vehicle_type in vehicle_types:
    new_df[f'{vehicle_type}_DANGERSCORE'] = new_df.apply(calculate_danger_score, args={vehicle_type}, axis=1)

# Now the DataFrame contains new columns with the scores for each vehicle type (e.g., 'cars_score', 'bikes_score', 'motorbikes_score')

In [7]:
print(new_df[new_df['CARS_DANGERSCORE'] > 60])

            Date  Time           Street  CARS_INVOLVED  BIKES_INVOLVED  \
85066  332035200  1247  HYLAN BOULEVARD              2               0   

       MOTORBIKES_INVOLVED  Injuries  Deaths  CARS_DANGERSCORE  \
85066                    0       6.0     3.0         72.225846   

       BIKES_DANGERSCORE  MOTORBIKES_DANGERSCORE  
85066                0.0                     0.0  


In [8]:
#print(new_df['CARS_DANGERSCORE'].max())
top_10_rows = new_df.nlargest(10, 'CARS_DANGERSCORE')
print(top_10_rows)

            Date  Time                         Street  CARS_INVOLVED  \
85066  332035200  1247                HYLAN BOULEVARD              2   
48075  333244800  1238             ROCKAWAY BOULEVARD              2   
98663  332985600   356  HUTCHINSON RIVER PARKWAY RAMP              1   
80148  320284800  1429               CROSS BRONX EXPY              3   
32822  316137600   251              GUY R BREWER BLVD              2   
64940  317606400     5             GRAND CENTRAL PKWY              1   
80320  331516800  1333                     121 STREET              5   
16798  327801600   286                  BORDEN AVENUE              1   
82203  320544000   104           HENRY HUDSON PARKWAY              1   
82249  320976000   276                      FDR DRIVE              1   

       BIKES_INVOLVED  MOTORBIKES_INVOLVED  Injuries  Deaths  \
85066               0                    0       6.0     3.0   
48075               0                    0       4.0     2.0   
98663          

In [9]:
needed_columns= ['Date', 'Time', 'Street','CARS_DANGERSCORE','BIKES_DANGERSCORE','MOTORBIKES_DANGERSCORE']

df_for_model = pd.DataFrame.from_records(new_df[needed_columns])

print(df_for_model)

            Date  Time                        Street  CARS_DANGERSCORE  \
0      337478400   159         WHITESTONE EXPRESSWAY         10.213851   
1      322876800   705       QUEENSBORO BRIDGE UPPER          5.375711   
2      331084800   415            THROGS NECK BRIDGE          0.000000   
3      324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP          0.000000   
4      345600000  1025    BROOKLYN QUEENS EXPRESSWAY          0.000000   
...          ...   ...                           ...               ...   
99995  329270400  1285                 WATSON AVENUE          0.000000   
99996  329270400  1005                SPRAGUE AVENUE          5.375711   
99997  329356800     0                     94 AVENUE          0.000000   
99998  329356800   797                      8 AVENUE          5.375711   
99999  329443200  1275                     90 STREET          0.000000   

       BIKES_DANGERSCORE  MOTORBIKES_DANGERSCORE  
0               0.000000                     0.0  
1        

In [10]:
# Preprocess the data for model training

encoder = LabelEncoder()
df_for_model['Street'] = encoder.fit_transform(df_for_model['Street'])  # Convert 'Street' column to numerical label

In [11]:
# Train separate linear regression models for each vehicle type
car_data = df_for_model[['Time', 'Date', 'Street', 'CARS_DANGERSCORE']].dropna()
car_X = car_data[['Time', 'Date', 'Street']]
car_y = car_data['CARS_DANGERSCORE']

car_X_train, car_X_test, car_Y_train, car_Y_test = train_test_split(car_X, car_y, test_size=0.3)

car_model = LinearRegression()
car_model.fit(car_X_train, car_Y_train)

car_predictions = car_model.predict(car_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(car_Y_test, car_predictions)
mse = mean_squared_error(car_Y_test, car_predictions)
r2 = r2_score(car_Y_test, car_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 3.4438070125246654
Mean Squared Error (MSE) for car model: 18.887856435792994
R-squared for car model: 0.0010470202838854803


In [12]:
# Train separate linear regression models for each vehicle type
bike_data = df_for_model[['Time', 'Date', 'Street', 'BIKES_DANGERSCORE']].dropna()
bike_X = bike_data[['Time', 'Date', 'Street']]
bike_y = bike_data['BIKES_DANGERSCORE']

bike_X_train, bike_X_test, bike_Y_train, bike_Y_test = train_test_split(bike_X, bike_y, test_size=0.2)

bike_model = LinearRegression()
bike_model.fit(bike_X_train, bike_Y_train)

bike_predictions = bike_model.predict(bike_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(bike_Y_test, bike_predictions)
mse = mean_squared_error(bike_Y_test, bike_predictions)
r2 = r2_score(bike_Y_test, bike_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 1.2229478764499566
Mean Squared Error (MSE) for car model: 6.109568771390397
R-squared for car model: 0.004255490747592128


In [13]:
# Train separate linear regression models for each vehicle type
motorbike_data = df_for_model[['Time', 'Date', 'Street', 'MOTORBIKES_DANGERSCORE']].dropna()
motorbike_X = motorbike_data[['Time', 'Date', 'Street']]
motorbike_y = motorbike_data['MOTORBIKES_DANGERSCORE']

motorbike_X_train, motorbike_X_test, motorbike_Y_train, motorbike_Y_test = train_test_split(motorbike_X, motorbike_y, test_size=0.2)

motorbike_model = LinearRegression()
motorbike_model.fit(motorbike_X_train, motorbike_Y_train)

motorbike_predictions = motorbike_model.predict(motorbike_X_test)

# Calculate the performance metrics
mae = mean_absolute_error(motorbike_Y_test, motorbike_predictions)
mse = mean_squared_error(motorbike_Y_test, motorbike_predictions)
r2 = r2_score(motorbike_Y_test, motorbike_predictions)

# Output the metrics
print(f'Mean Absolute Error for car model (MAE): {mae}')
print(f'Mean Squared Error (MSE) for car model: {mse}')
print(f'R-squared for car model: {r2}')

Mean Absolute Error for car model (MAE): 0.4342392230556272
Mean Squared Error (MSE) for car model: 2.7285385238127136
R-squared for car model: 0.0014663982757765481


In [27]:
# Function to predict the danger score for a given combination of 'Time', 'Date', 'Street' and vehicle type
def predict_danger_score(time, date, street, vehicle_type):   
    street_label = encoder.transform([street])
    data = np.asarray([[time, date, street_label]],order='K',dtype=object)
    
    if vehicle_type == 'cars':
        model = car_model
    elif vehicle_type == 'bikes':
        model = bike_model
    elif vehicle_type == 'motorbikes':
        model = motorbike_model
    else:
        return None
    
    return model.predict(data)[0]

def parseTime(time):
    return time.hour * 60 + time.minute

# Example usage
time = '09:02'  # Example value for 'Time'
date = '19/06/2020'  # Example value for 'Date'
street = 'BROADWAY'  # Example value for 'Street'
vehicle_type = 'cars'  # Example vehicle type

# Preprocess the date column into timestamps
date = pd.to_datetime(date)
date = (date - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
time = pd.to_datetime(time, format='%H:%M').time()
time = parseTime(time)

predicted_score = predict_danger_score(time, date, street, vehicle_type)
print(f"The predicted danger score for {vehicle_type} at {time}:00 on {date} on {street} is {predicted_score}")

The predicted danger score for cars at 542:00 on 330220800 on BROADWAY is 2.7620873660840743


  date = pd.to_datetime(date)
