In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
#Read dataset
df = pd.read_csv("collisions.csv", low_memory=False)

#Contained Columns:
#CRASH DATE,CRASH TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,
#NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,
#NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,
#CONTRIBUTING FACTOR VEHICLE 1,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,
#CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,
#COLLISION_ID,
#VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5

In [3]:
#Preprocess the data

#rename columns
df = df.rename(columns={'CRASH DATE' : 'Date', 'CRASH TIME' : 'Time', 'ON STREET NAME' : 'Street', 
                        'VEHICLE TYPE CODE 1' : 'Vehicle 1', 'VEHICLE TYPE CODE 2' : 'Vehicle 2',
                        'VEHICLE TYPE CODE 3' : 'Vehicle 3', 'VEHICLE TYPE CODE 4' : 'Vehicle 4',
                        'VEHICLE TYPE CODE 5' : 'Vehicle 5', 'NUMBER OF PERSONS INJURED':'Injuries', 
                        'NUMBER OF PERSONS KILLED': 'Deaths'})

# Keep rows fith defined 'street', 'crash time' and Vehicle Type 1
df = df.dropna(subset=['Street', 'Time', 'Vehicle 1'])

# Preprocess the date column into timestamps
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = (df['Date'] - pd.Timestamp("2010-01-01")) // pd.Timedelta('1s')

# Preprocess the time column into minutes from midnight
df['Time'] = pd.to_datetime(df['Time'], format='%H:%M').dt.time
df['Time'] = df['Time'].apply(lambda t: t.hour * 60 + t.minute)

# Format dataset for uniform strings
df['Street'] = df['Street'].str.upper()
df['Vehicle 1'] = df['Vehicle 1'].str.upper()
df['Vehicle 2'] = df['Vehicle 2'].str.upper()
df['Vehicle 3'] = df['Vehicle 3'].str.upper()
df['Vehicle 4'] = df['Vehicle 4'].str.upper()
df['Vehicle 5'] = df['Vehicle 5'].str.upper()

##List of columns we keep
columns_to_keep = ['Date', 'Time', 'Street', 'Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5','Injuries','Deaths']
df = df[columns_to_keep]

In [4]:
#Limit dataframe size
new_df = pd.DataFrame.from_records(df[:4000])  


# Create a list of the columns that represent vehicle types
vehicle_columns = ['Vehicle 1', 'Vehicle 2', 'Vehicle 3', 'Vehicle 4','Vehicle 5']

def count_vehicles_not_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper not in exclude_set:
            count += 1
    return count

def count_vehicles_in(row, exclude_set):
    count = 0
    for vehicle in row:
        if pd.isna(vehicle):
            continue
        vehicle_upper = str(vehicle).upper()
        if vehicle_upper in exclude_set:
            count += 1
    return count

new_df['CARS_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_not_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE', 'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)
new_df['BIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'BIKE', 'E-BIKE', 'EBIKE', 'E-SCOOTER', 'ESCOOTER', 'BICYCLE'},), axis=1)
new_df['MOTORBIKES_INVOLVED'] = new_df[vehicle_columns].apply(count_vehicles_in, args= ({'MOTORCYCLE', 'MOTORSCOOTER', 'MOPED'},), axis=1)

In [5]:
# Now the 'new_df' DataFrame contains the transformed data
needed_columns = ['Date', 'Time', 'Street','CARS_INVOLVED','BIKES_INVOLVED', 'MOTORBIKES_INVOLVED','Injuries', 'Deaths']

new_df = new_df[needed_columns]

print(new_df)

           Date  Time                        Street  CARS_INVOLVED  \
0     337478400   159         WHITESTONE EXPRESSWAY              2   
1     322876800   705       QUEENSBORO BRIDGE UPPER              1   
2     331084800   415            THROGS NECK BRIDGE              2   
3     324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP              2   
4     345600000  1025    BROOKLYN QUEENS EXPRESSWAY              2   
...         ...   ...                           ...            ...   
3995  325382400  1110               WEST 137 STREET              1   
3996  325382400   860                      65 PLACE              2   
3997  325296000  1035               WEST 124 STREET              1   
3998  325382400     1                     79 STREET              1   
3999  325382400  1297                EAST 34 STREET              2   

      BIKES_INVOLVED  MOTORBIKES_INVOLVED  Injuries  Deaths  
0                  0                    0       2.0     0.0  
1                  0               

In [6]:
# Define the scoring mechanism for each vehicle type
def calculate_vehicle_score(row, vehicle_type):
    involved_vehicles = row[f'{vehicle_type}_INVOLVED']
    injuries = row['Injuries']
    deaths = row['Deaths']
    
    # Define the weights for the scoring mechanism
    vehicle_weight = {'CARS': 0.5, 'BIKES': 0.3, 'MOTORBIKES': 0.2}
    injury_weight = 0.2
    death_weight = 0.3
    
    # Calculate the vehicle score
    vehicle_score = (involved_vehicles / new_df[f'{vehicle_type}_INVOLVED'].sum()) * 100 * vehicle_weight[vehicle_type]
    
    # Calculate the injury score
    injury_score = (injuries / new_df['Injuries'].sum()) * 100 * injury_weight
    
    # Calculate the death score
    death_score = (deaths / new_df['Deaths'].sum()) * 100 * death_weight
    
    # Calculate the total score
    total_score = vehicle_score + injury_score + death_score
    
    return total_score

# Create a list of vehicle types
vehicle_types = ['CARS', 'BIKES', 'MOTORBIKES']

# Calculate the score for each vehicle type and create new columns
for vehicle_type in vehicle_types:
    new_df[f'{vehicle_type}_DANGERSCORE'] = new_df.apply(calculate_vehicle_score, args=(vehicle_type,), axis=1)

# Now the DataFrame contains new columns with the scores for each vehicle type (e.g., 'cars_score', 'bikes_score', 'motorbikes_score')

In [7]:
print(new_df[new_df['CARS_DANGERSCORE'] > 3])

           Date  Time              Street  CARS_INVOLVED  BIKES_INVOLVED  \
2998  345772800    15  MORRIS PARK AVENUE              1               0   

      MOTORBIKES_INVOLVED  Injuries  Deaths  CARS_DANGERSCORE  \
2998                    0       0.0     2.0          3.165124   

      BIKES_DANGERSCORE  MOTORBIKES_DANGERSCORE  
2998           3.157895                3.157895  


In [8]:
needed_columns= ['Date', 'Time', 'Street','CARS_DANGERSCORE','BIKES_DANGERSCORE','MOTORBIKES_DANGERSCORE']

df_for_model = pd.DataFrame.from_records(new_df[needed_columns])

print(df_for_model)

           Date  Time                        Street  CARS_DANGERSCORE  \
0     337478400   159         WHITESTONE EXPRESSWAY          0.032825   
1     322876800   705       QUEENSBORO BRIDGE UPPER          0.016412   
2     331084800   415            THROGS NECK BRIDGE          0.014459   
3     324518400   767  MAJOR DEEGAN EXPRESSWAY RAMP          0.014459   
4     345600000  1025    BROOKLYN QUEENS EXPRESSWAY          0.014459   
...         ...   ...                           ...               ...   
3995  325382400  1110               WEST 137 STREET          0.007230   
3996  325382400   860                      65 PLACE          0.032825   
3997  325296000  1035               WEST 124 STREET          0.016412   
3998  325382400     1                     79 STREET          0.007230   
3999  325382400  1297                EAST 34 STREET          0.014459   

      BIKES_DANGERSCORE  MOTORBIKES_DANGERSCORE  
0              0.018365                0.018365  
1              0.009183

In [9]:
# Preprocess the data

# Perform one-hot encoding for 'Street' column
# TODO: FIX ENCODING!!!!
street_encoder = OneHotEncoder(sparse=False, drop='first')
street_encoded = street_encoder.fit_transform(df[['Street']])
street_columns = street_encoder.get_feature_names(['Street'])

df_encoded = pd.concat([df.drop('Street', axis=1), pd.DataFrame(street_encoded, columns=street_columns)], axis=1)

# Train separate linear regression models for each vehicle type
car_data = df_for_model[['Time', 'Date', 'Street', 'CARS_DANGERSCORE']].dropna()
car_X = car_data[['Time', 'Date', 'Street']]
car_y = car_data['CARS_DANGERSCORE']
car_model = LinearRegression()
car_model.fit(car_X, car_y)

bike_data = df_for_model[['Time', 'Date', 'Street', 'BIKES_DANGERSCORE']].dropna()
bike_X = bike_data[['Time', 'Date', 'Street']]
bike_y = bike_data['BIKES_DANGERSCORE']
bike_model = LinearRegression()
bike_model.fit(bike_X, bike_y)

motorbike_data = df_for_model[['Time', 'Date', 'Street', 'MOTORBIKES_DANGERSCORE']].dropna()
motorbike_X = motorbike_data[['Time', 'Date', 'Street']]
motorbike_y = motorbike_data['MOTORBIKES_DANGERSCORE']
motorbike_model = LinearRegression()
motorbike_model.fit(motorbike_X, motorbike_y)



MemoryError: Unable to allocate 179. GiB for an array with shape (1562181, 15359) and data type float64

In [None]:
# Function to predict the danger score for a given combination of 'Time', 'Date', 'Street' and vehicle type
def predict_danger_score(time, date, street, vehicle_type):
    # TODO: FIX ENCODING!!!!
    street_encoded = street_encoder.transform([[street]])
    street_columns = street_encoder.get_feature_names(['Street'])
    street_df = pd.DataFrame(street_encoded, columns=street_columns)
    
    data = [[time, date, *street_df.values[0]]]
    
    if vehicle_type == 'cars':
        model = car_model
    elif vehicle_type == 'bikes':
        model = bike_model
    elif vehicle_type == 'motorbikes':
        model = motorbike_model
    else:
        return None
    
    return model.predict(data)[0]

# Example usage
time = 14  # Example value for 'Time'
date = pd.to_datetime('2023-07-07')  # Example value for 'Date'
street = 'BROADWAY'  # Example value for 'Street'
vehicle_type = 'cars'  # Example vehicle type

predicted_score = predict_danger_score(time, date, street, vehicle_type)
print(f"The predicted danger score for {vehicle_type} at {time}:00 on {date.date()} on {street} is {predicted_score}")