In [303]:
#Imports
import pandas as pd
import numpy as np
import math
from collections import Counter

In [304]:
# Load the motor-vehicle collisions CSV into `crashes`.
#
# `column_types` maps column names to the dtype requested from pandas. The
# names are grouped by dtype below and flattened into one dictionary.
# NOTE(review): later cells access the frame with upper-case names
# ('LATITUDE', 'LONGITUDE') while the keys here are mixed case. Presumably
# they do not match the CSV header, in which case these dtype hints have no
# effect -- verify against the file before relying on them.
column_types = {
    column: dtype
    for dtype, columns in (
        (str, (
            "Crash Date",
            "Crash Time",
            "Borough",
            "Zip Code",
            "Location",
            "On Street Name",
            "Cross Street Name",
            "Off Street Name",
            "Contributing Factor Vehicle 1",
            "Contributing Factor Vehicle 2",
            "Contributing Factor Vehicle 3",
            "Contributing Factor Vehicle 4",
            "Contributing Factor Vehicle 5",
            "Vehicle Type Code 1",
            "Vehicle Type Code 2",
            "Vehicle Type Code 3",
            "Vehicle Type Code 4",
            "Vehicle Type Code 5",
        )),
        (float, (
            "Latitude",
            "Longitude",
        )),
        (int, (
            "Numbers of Persons injured",
            "Numbers of Persons killed",
            "Number of Pedestrians injured",
            "Number of Pedestrians killed",
            "Number of cyclists injured",
            "Number of cyclists killed",
            "Number of motorists injured",
            "Number of motorists killed",
            "Collision ID",
        )),
    )
    for column in columns
}

# Kaggle dataset location; for a local run, point `path` at a local copy
# of the file (e.g. "collisions.csv") instead.
path = '/kaggle/input/motor-vehicle-collisions/Motor_Vehicle_Collisions_-_Crashes.csv'
crashes = pd.read_csv(
    path,
    dtype=column_types,
    low_memory=False,
)

In [305]:
#ADJUST VALUES TO IMPACT MODEL
# Bounding box of the grid laid over New York City, as (latitude, longitude).
# The two corners are about 71.43km apart, i.e. roughly 50km along each axis.
south_west_corner = (40.485347, -74.276931)
north_east_corner = (40.948379, -73.689515)

# Grid resolution.
# `lat_density` is the number of cells along the latitude axis
# (~50km / 1000 = ~50m per cell). The resulting cell size in degrees,
# `grid_dist`, is reused for the longitude axis, so the number of longitude
# cells is however many of those steps are needed to span the box.
lat_density = 1000
lat_extent = north_east_corner[0] - south_west_corner[0]
lon_extent = north_east_corner[1] - south_west_corner[1]
grid_dist = lat_extent / lat_density
lon_density = math.ceil(lon_extent / grid_dist)

# Show the grid dimensions, one per line.
print(lat_density, lon_density, sep="\n")

500
635


In [322]:
#For testing, remove when training
#print(len(crashes))
#crashes = crashes.head(100000)

In [337]:
# Filter out rows whose latitude/longitude is missing or equal to 0.
# Rows with a missing coordinate are dropped first; of the remaining rows,
# only those where both coordinates are non-zero are kept.
crashes = (
    crashes
    .dropna(subset=['LATITUDE', 'LONGITUDE'])
    .loc[lambda frame: frame['LATITUDE'].ne(0) & frame['LONGITUDE'].ne(0)]
)

In [343]:
# Progress-report settings for the grid-mapping step:
# `progress_density` is the step between reports in percent,
# `progress` counts how many reports have been emitted so far.
progress_density, progress = 10, 0

In [344]:
#Map lat and long in the dataset to the closest point on the grid
#
# Result: `crash_counts[lat_i, lon_i]` is the number of crashes whose
# coordinates round to grid point (lat_i, lon_i); crashes that fall outside
# the grid are ignored.
#
# The mapping is done on whole columns at once instead of iterating over the
# frame row by row, which is what made this step slow.

# Fractional grid position of every crash, measured from the south-west
# corner in units of one grid step, then rounded to the nearest grid point.
# np.rint rounds halves to the nearest even integer, like Python's round().
lat_cells = np.rint(
    (crashes['LATITUDE'].to_numpy(dtype=float) - south_west_corner[0]) / grid_dist
)
lon_cells = np.rint(
    (crashes['LONGITUDE'].to_numpy(dtype=float) - south_west_corner[1]) / grid_dist
)

# Valid indices run from 0 to density - 1 inclusive. Index 0 has to be
# accepted, otherwise the first row and column of the grid can never receive
# a crash. Comparisons against NaN are False, so rows with a missing
# coordinate are skipped here as well.
in_grid = (
    (lat_cells >= 0) & (lat_cells < lat_density)
    & (lon_cells >= 0) & (lon_cells < lon_density)
)

# Cast to integer indices only after the bounds check, so that only finite,
# in-range values are converted.
lat_idx = lat_cells[in_grid].astype(np.intp)
lon_idx = lon_cells[in_grid].astype(np.intp)

# np.add.at accumulates correctly when the same cell occurs several times;
# plain fancy-index `+=` would count each distinct cell only once.
crash_counts = np.zeros((lat_density, lon_density), dtype=int)
np.add.at(crash_counts, (lat_idx, lon_idx), 1)

Progress:  0 %
Progress:  10 %
Progress:  20 %
Progress:  30 %
Progress:  40 %
Progress:  50 %
Progress:  60 %
Progress:  70 %
Progress:  80 %
Progress:  90 %
Progress:  100 %
Finsihed


In [277]:
#ADJUST VALUES TO IMPACT MODEL
# Validation grid: a regular sub-lattice of the crash grid.
# `val_dist` is the spacing (in grid cells) between validation points; the
# two offsets give the index of the first validation point along the
# latitude and longitude axes respectively.
val_dist = 3
val_offset_lat, val_offset_lon = 0, 0

In [278]:
# Debugging aid: expected number of validation points per axis and the
# fraction of all grid points that they represent.
#
# Along each axis the count is (density - offset) divided by `val_dist`,
# rounded up; the rounding up is done in integer arithmetic by adding
# `val_dist - 1` before the floor division.
val_points_lat = (lat_density - val_offset_lat + val_dist - 1) // val_dist
val_points_lon = (lon_density - val_offset_lon + val_dist - 1) // val_dist

val_point_total = val_points_lat * val_points_lon
grid_point_total = lat_density * lon_density
val_percentage = val_point_total / grid_point_total

print("Number of validation points latitude: ", val_points_lat)
print("Number of validation points longitude: ", val_points_lon)
print("Percentage of points used for validation", val_percentage)

Number of validation points latitude:  167
Number of validation points longitude:  212
Percentage of points used for validation 0.11150866141732284


In [279]:
# Turn the crash-count grid into (latitude, longitude) -> count samples and
# split them into a training set and a validation set.
#
# A grid point belongs to the validation set when BOTH of its indices lie on
# the validation lattice (index congruent to the offset modulo `val_dist`);
# every other grid point is used for training.
features, labels = [], []
features_val, labels_val = [], []

for lat_i, row in enumerate(crash_counts):
    # Everything that depends only on the latitude index is computed once per row.
    lat = south_west_corner[0] + lat_i * grid_dist
    lat_on_lattice = (lat_i + val_dist - val_offset_lat) % val_dist == 0

    for lon_i, count in enumerate(row):
        lon = south_west_corner[1] + lon_i * grid_dist
        lon_on_lattice = (lon_i + val_dist - val_offset_lon) % val_dist == 0

        # Pick the destination lists for this grid point, then append once.
        if lat_on_lattice and lon_on_lattice:
            feature_sink, label_sink = features_val, labels_val
        else:
            feature_sink, label_sink = features, labels

        feature_sink.append((lat, lon))
        label_sink.append(count)

In [346]:
# Debugging: report the size of each split, both as an absolute number of
# samples and as a fraction of all grid points.
grid_point_count = lat_density * lon_density
val_sample_count = len(features_val)
train_sample_count = len(features)

print("Lenght of validation feature array: ", val_sample_count)
print("Lenght of training feature array: ", train_sample_count)
print("Percentage of validation featrue grid point: ", val_sample_count / grid_point_count)
print("Percentage of training featrue grid point: ", train_sample_count / grid_point_count)

#print("Validation features and labels:")
#for i, _ in enumerate(features_val):
#    print(features_val[i][0], features_val[i][1], labels_val[i])

#print("Training features and labels:")
#for i, _ in enumerate(features):
#    print(features[i][0], features[i][1], labels[i])

Lenght of validation feature array:  35404
Lenght of training feature array:  282096
Percentage of validation featrue grid point:  0.11150866141732284
Percentage of training featrue grid point:  0.8884913385826771


In [281]:
#Import tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Fully connected regression network: (latitude, longitude) -> crash count.
# Three hidden ReLU layers of 64 units each, followed by a single output unit.
model = keras.models.Sequential([
    keras.layers.Dense(64, activation='relu', input_shape=(2,)),
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dense(64, activation='relu'),
    # Linear activation on the output because the labels are raw crash counts.
    keras.layers.Dense(1, activation='linear'),
])

In [None]:
# Configure training: mean squared error loss, minimised with the Adam optimizer.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [None]:
# Train the model on the training grid points: 5 passes over the data,
# in mini-batches of 64 samples.
model.fit(
    x=np.array(features),
    y=np.array(labels),
    batch_size=64,
    epochs=5,
)

In [None]:
# Evaluate the trained model on the held-out validation grid points.
model.evaluate(
    x=np.array(features_val),
    y=np.array(labels_val),
)