In [261]:
#Imports
import pandas as pd
import numpy as np
import math
from collections import Counter

In [262]:
#Import dataset from CSV file
column_types = {
    "Crash Date": str,
    "Crash Time": str,     
    "Borough": str,      
    "Zip Code": str,  
    "Latitude": float,
    "Longitude": float, 
    "Location": str,
    "On Street Name": str, 
    "Cross Street Name": str, 
    "Off Street Name": str,         
    "Numbers of Persons injured": int,      
    "Numbers of Persons killed": int,         
    "Number of Pedestrians injured": int,   
    "Number of Pedestrians killed": int,         
    "Number of cyclists injured": int,    
    "Number of cyclists killed": int,         
    "Number of motorists injured": int,   
    "Number of motorists killed": int,       
    "Contributing Factor Vehicle 1": str,       
    "Contributing Factor Vehicle 2": str,  
    "Contributing Factor Vehicle 3": str,      
    "Contributing Factor Vehicle 4": str,    
    "Contributing Factor Vehicle 5": str,
    "Collision ID": int,
    "Vehicle Type Code 1": str,
    "Vehicle Type Code 2": str,
    "Vehicle Type Code 3": str,
    "Vehicle Type Code 4": str,
    "Vehicle Type Code 5": str
}
crashes = pd.read_csv("collisions.csv", dtype=column_types, low_memory=False)

In [263]:
#ADJUST VALUES TO IMPACT MODEL
#Define Grid over New York City
#Distance between these two points is about 71.43km,
#so about 50km lateral and 50km longitudinal
south_west_corner = (40.485347, -74.276931)
north_east_corner = (40.948379, -73.689515)

#Define grid properties
lat_density = 500 #Represents the number of grid cells for latitude, ~50km / 500 = ~100m between each grid point
grid_dist = (north_east_corner[0] - south_west_corner[0]) / lat_density
lon_density = math.ceil((north_east_corner[1] - south_west_corner[1]) / grid_dist)
print(lat_density)
print(lon_density)

500
635


In [264]:
#For testing, remove when training
#print(len(crashes))
crashes = crashes.head(100000)

In [265]:
#Fitlter rows without latitude and longitude values or 0 values
crashes = crashes.dropna(subset=['LATITUDE', 'LONGITUDE'])
crashes = crashes[(crashes['LATITUDE'] != 0) & (crashes['LONGITUDE'] != 0)]

In [266]:
#Map lat and long in the dataset to the closest point on the grid
crash_counts = np.zeros((lat_density, lon_density), dtype=int)
for _, row in crashes.iterrows():
    lat_i = round((row['LATITUDE'] - south_west_corner[0]) / grid_dist)
    lon_i = round((row['LONGITUDE'] - south_west_corner[1]) / grid_dist)
    if (lat_i > 0 and lat_i < lat_density and lon_i > 0 and lon_i < lon_density):
        crash_counts[lat_i][lon_i]+=1
    
    #lat = south_west_corner[0] + np.round((row['LATITUDE'] - south_west_corner[0]) / grid_dist) * grid_dist
    #lon = south_west_corner[1] + np.round((row['LONGITUDE'] - south_west_corner[1]) / grid_dist) * grid_dist

In [267]:
#ADJUST VALUES TO IMPACT MODEL
#Define validation grid
val_dist = 3        #Distance between validation points
val_offset_lat = 0  #Offset of first validation point latitude wise
val_offset_lon = 0  #Offset of first validation point longitude wise

In [268]:
#For debugging, this value represenets the percentage of grid points that will be used for validation
#Add 2 to density
val_points_lat = ((lat_density + val_dist - val_offset_lat - 1) // val_dist)
val_points_lon = ((lon_density + val_dist - val_offset_lon - 1) // val_dist)
val_percentage = (val_points_lat * val_points_lon) / (lat_density * lon_density)
print("Number of validation points latitude: ", val_points_lat)
print("Number of validation points longitude: ", val_points_lon)
print("Percentage of points used for validation", val_percentage)

Number of validation points latitude:  167
Number of validation points longitude:  212
Percentage of points used for validation 0.11150866141732284


In [269]:
features     = []
labels       = []
features_val = []
labels_val   = []


for lat_i, row in enumerate(crash_counts):
    for lon_i, count in enumerate(row):
        lat = south_west_corner[0] + lat_i * grid_dist
        lon = south_west_corner[1] + lon_i * grid_dist
        
        if ((lat_i + val_dist - val_offset_lat) % val_dist == 0) and ((lon_i + val_dist - val_offset_lon) % val_dist == 0):
            features_val.append((lat, lon))
            labels_val.append(count)
        else:
            features.append((lat, lon))
            labels.append(count)

In [270]:
#Debugging
print("Lenght of validation feature array: ", len(features_val))
print("Lenght of training feature array: ", len(features))
print("Percentage of validation featrue grid point: ", len(features_val) / (lat_density * lon_density))
print("Percentage of training featrue grid point: ", len(features) / (lat_density * lon_density))
#print("Validation features and labels:")
#for i, _ in enumerate(features_val):
#    print(features_val[i][0], features_val[i][1], labels_val[i])

print("Training features and labels:")
for i, _ in enumerate(features):
    print(features[i][0], features[i][1], labels[i])

Lenght of validation feature array:  35404
Lenght of training feature array:  282096
Percentage of validation featrue grid point:  0.11150866141732284
Percentage of training featrue grid point:  0.8884913385826771
Training features and labels:
40.485347 -74.276004936 0
40.485347 -74.27507887200001 0
40.485347 -74.273226744 0
40.485347 -74.27230068 0
40.485347 -74.270448552 0
40.485347 -74.269522488 0
40.485347 -74.26767036000001 0
40.485347 -74.266744296 0
40.485347 -74.264892168 0
40.485347 -74.263966104 0
40.485347 -74.26211397600001 0
40.485347 -74.26118791200001 0
40.485347 -74.259335784 0
40.485347 -74.25840972 0
40.485347 -74.25655759200001 0
40.485347 -74.25563152800001 0
40.485347 -74.2537794 0
40.485347 -74.252853336 0
40.485347 -74.251001208 0
40.485347 -74.25007514400001 0
40.485347 -74.24822301600001 0
40.485347 -74.247296952 0
40.485347 -74.245444824 0
40.485347 -74.24451876 0
40.485347 -74.24266663200001 0
40.485347 -74.241740568 0
40.485347 -74.23988844 0
40.485347 -74.2

40.5548018 -74.102830968 0
40.5548018 -74.10097884 0
40.5548018 -74.100052776 0
40.5548018 -74.098200648 0
40.5548018 -74.097274584 0
40.5548018 -74.09542245600001 0
40.5548018 -74.094496392 0
40.5548018 -74.092644264 0
40.5548018 -74.0917182 0
40.5548018 -74.089866072 0
40.5548018 -74.08894000800001 0
40.5548018 -74.08708788 0
40.5548018 -74.086161816 0
40.5548018 -74.084309688 0
40.5548018 -74.083383624 0
40.5548018 -74.081531496 0
40.5548018 -74.080605432 0
40.5548018 -74.078753304 0
40.5548018 -74.07782724 0
40.5548018 -74.07597511200001 0
40.5548018 -74.075049048 0
40.5548018 -74.07319692 0
40.5548018 -74.072270856 0
40.5548018 -74.070418728 0
40.5548018 -74.06949266400001 0
40.5548018 -74.067640536 0
40.5548018 -74.066714472 0
40.5548018 -74.064862344 0
40.5548018 -74.06393628000001 0
40.5548018 -74.062084152 0
40.5548018 -74.061158088 0
40.5548018 -74.05930596 0
40.5548018 -74.058379896 0
40.5548018 -74.056527768 0
40.5548018 -74.055601704 0
40.5548018 -74.053749576 0
40.5548018

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
#Import tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(64, activation='relu', input_shape=(2,)))
model.add(keras.layers.Dense(1, activation='linear'))

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
model.fit(features, labels, epochs=10, batch_size=32)