In [1]:
import requests
import pandas as pd

def fetch_data(limit, offset):
    collision_response = requests.get("https://data.cityofnewyork.us/resource/h9gi-nx95.json", params = {"$limit": limit, "$offset": offset})
    collision_data = collision_response.json()
    return collision_data

In [2]:
total_records = 2500000
collision_df = pd.DataFrame()

for offset in range (0, total_records, 50000):
    collision_data = fetch_data(limit=50000, offset=offset)
    collision_df = pd.concat([collision_df, pd.DataFrame(collision_data)], ignore_index=True)
collision_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2044341 entries, 0 to 2044340
Data columns (total 29 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [3]:
collision_df.head()

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,cross_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [4]:
# Correct column names
collision_df = collision_df.rename(columns={"off_street_name": "cross_street_name", "cross_street_name": "off_street_name"})

In [5]:
collision_clean_df = collision_df.copy() 
collision_clean_df.sort_values(by='crash_date', ascending=True)

Unnamed: 0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
1924028,2012-07-01T00:00:00.000,8:27,,,2,0,0,0,0,0,...,40.7087971,-73.7276200,"{'latitude': '40.7087971', 'longitude': '-73.7...",,,,,,,
1919950,2012-07-01T00:00:00.000,18:10,AVENUE R,EAST 13 STREET,0,0,0,0,0,0,...,40.6055989,-73.9595748,"{'latitude': '40.6055989', 'longitude': '-73.9...",,,,,,,
1923522,2012-07-01T00:00:00.000,19:55,PUTNAM AVENUE,BUSHWICK AVENUE,1,0,1,0,0,0,...,40.6892183,-73.9176492,"{'latitude': '40.6892183', 'longitude': '-73.9...",,,,,,,
1920691,2012-07-01T00:00:00.000,16:55,CITY ISLAND ROAD,SHORE ROAD,0,0,0,0,0,0,...,,,,,,,,,,
1923523,2012-07-01T00:00:00.000,22:40,50 STREET,5 AVENUE,0,0,0,0,0,0,...,40.6451058,-74.0103204,"{'latitude': '40.6451058', 'longitude': '-74.0...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2029925,2023-11-19T00:00:00.000,19:10,MAJOR DEEGAN EXPRESSWAY,,0,0,0,0,0,0,...,40.855385,-73.9181,"{'latitude': '40.855385', 'longitude': '-73.91...",,,,,,,
2030050,2023-11-19T00:00:00.000,10:00,,,0,0,0,0,0,0,...,40.831547,-73.86343,"{'latitude': '40.831547', 'longitude': '-73.86...",1253 LELAND AVENUE,,,,,,
2030051,2023-11-19T00:00:00.000,15:29,JAMAICA AVENUE,239 STREET,1,0,0,0,0,0,...,40.723083,-73.72817,"{'latitude': '40.723083', 'longitude': '-73.72...",,,,,,,
2030054,2023-11-19T00:00:00.000,14:21,EAST MOUNT EDEN AVENUE,,1,0,1,0,0,0,...,40.842964,-73.91045,"{'latitude': '40.842964', 'longitude': '-73.91...",,,,,,,


In [6]:
collision_clean_df.set_index('collision_id', inplace=True)

In [7]:
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [8]:
crash_datetime = pd.to_datetime(collision_clean_df['crash_date'] + ' ' + collision_clean_df['crash_time'])
collision_clean_df.insert(1, 'crash_datetime', crash_datetime)
collision_clean_df.head(5)

  crash_datetime = pd.to_datetime(collision_clean_df['crash_date'] + ' ' + collision_clean_df['crash_time'])


Unnamed: 0_level_0,crash_date,crash_datetime,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2021-09-11 02:39:00,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,2022-03-26 11:45:00,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,2022-06-29 06:55:00,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,2021-09-11 09:35:00,9:35,,,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,2021-12-14 08:13:00,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [9]:
collision_clean_df.drop(columns=['crash_date', 'crash_time'], inplace=True)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,,,0,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [10]:
def get_address(row):
    if pd.notna(row['off_street_name']):
        return row['off_street_name']
    else:
        # Join only non-null values
        return ' & '.join(filter(pd.notna, [row['on_street_name'], row['cross_street_name']]))

In [11]:
collision_clean_df['street_address'] = collision_clean_df.apply(get_address, axis=1)

In [12]:
columns = collision_clean_df.columns.tolist()
columns.insert(1, columns.pop(columns.index('street_address')))
collision_clean_df = collision_clean_df[columns]
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY & 20 AVENUE,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 LORING AVENUE,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE & DECATUR STREET,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [13]:
collision_clean_df.drop(columns=['on_street_name', 'cross_street_name', 'off_street_name'], inplace=True, errors='ignore')

In [14]:
collision_clean_df['street_address'] = collision_clean_df['street_address'].str.title()
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [15]:
collision_clean_df['borough'] = collision_clean_df['borough'].str.title()
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [16]:
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2044341 entries, 4455765 to 4647913
Data columns (total 25 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   street_address                 object        
 2   number_of_persons_injured      object        
 3   number_of_persons_killed       object        
 4   number_of_pedestrians_injured  object        
 5   number_of_pedestrians_killed   object        
 6   number_of_cyclist_injured      object        
 7   number_of_cyclist_killed       object        
 8   number_of_motorist_injured     object        
 9   number_of_motorist_killed      object        
 10  contributing_factor_vehicle_1  object        
 11  contributing_factor_vehicle_2  object        
 12  vehicle_type_code1             object        
 13  vehicle_type_code2             object        
 14  borough                        object        
 15  zip_code      

In [17]:
collision_clean_df.dropna(subset=['street_address'], inplace=True)

In [18]:
import datetime as dt
# convert the datetime to days of week 
position = collision_clean_df.columns.get_loc('crash_datetime') + 1
crash_day_of_week = collision_clean_df['crash_datetime'].dt.dayofweek
collision_clean_df.insert(position, 'crash_day_of_week', crash_day_of_week)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,5,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,5,Queensboro Bridge Upper,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,2,Throgs Neck Bridge,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,5,1211 Loring Avenue,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,1,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [19]:
collision_clean_df['number_of_persons_injured'] = collision_clean_df['number_of_persons_injured'].fillna(0).astype(int)
collision_clean_df['number_of_persons_killed'] = collision_clean_df['number_of_persons_killed'].fillna(0).astype(int)
collision_clean_df['number_of_pedestrians_injured'] = collision_clean_df['number_of_pedestrians_injured'].fillna(0).astype(int)
collision_clean_df['number_of_pedestrians_killed'] = collision_clean_df['number_of_pedestrians_killed'].fillna(0).astype(int)
collision_clean_df['number_of_cyclist_injured'] = collision_clean_df['number_of_cyclist_injured'].fillna(0).astype(int)
collision_clean_df['number_of_cyclist_killed'] = collision_clean_df['number_of_cyclist_killed'].fillna(0).astype(int)
collision_clean_df['number_of_motorist_injured'] = collision_clean_df['number_of_motorist_injured'].fillna(0).astype(int)
collision_clean_df['number_of_motorist_killed'] = collision_clean_df['number_of_motorist_killed'].fillna(0).astype(int)
collision_clean_df['zip_code'] = collision_clean_df['zip_code'].astype(str).str[:5]
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2044341 entries, 4455765 to 4647913
Data columns (total 26 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   crash_day_of_week              int32         
 2   street_address                 object        
 3   number_of_persons_injured      int32         
 4   number_of_persons_killed       int32         
 5   number_of_pedestrians_injured  int32         
 6   number_of_pedestrians_killed   int32         
 7   number_of_cyclist_injured      int32         
 8   number_of_cyclist_killed       int32         
 9   number_of_motorist_injured     int32         
 10  number_of_motorist_killed      int32         
 11  contributing_factor_vehicle_1  object        
 12  contributing_factor_vehicle_2  object        
 13  vehicle_type_code1             object        
 14  vehicle_type_code2             object        
 15  borough       

In [20]:
import numpy as np
collision_clean_df['zip_code'] = collision_clean_df['zip_code'].replace('nan', np.nan)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,5,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,5,Queensboro Bridge Upper,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,2,Throgs Neck Bridge,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,5,1211 Loring Avenue,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,1,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [21]:
collision_2022_df = collision_clean_df[collision_clean_df['crash_datetime'].dt.year == 2022].copy()
collision_2022_missing_geo = collision_2022_df[collision_2022_df['latitude'].isna() | collision_2022_df['longitude'].isna() | collision_2022_df['zip_code'].isna()]
collision_2022_missing_geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37236 entries, 4513547 to 4647650
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   crash_datetime                 37236 non-null  datetime64[ns]
 1   crash_day_of_week              37236 non-null  int32         
 2   street_address                 37236 non-null  object        
 3   number_of_persons_injured      37236 non-null  int32         
 4   number_of_persons_killed       37236 non-null  int32         
 5   number_of_pedestrians_injured  37236 non-null  int32         
 6   number_of_pedestrians_killed   37236 non-null  int32         
 7   number_of_cyclist_injured      37236 non-null  int32         
 8   number_of_cyclist_killed       37236 non-null  int32         
 9   number_of_motorist_injured     37236 non-null  int32         
 10  number_of_motorist_killed      37236 non-null  int32         
 11  contributing

In [22]:
# remove multiple space within the address
collision_2022_df['street_address'] = collision_2022_df['street_address'].str.replace(r'\s+', ' ', regex=True)

In [23]:
collision_2022_missing_geo = collision_2022_df[collision_2022_df['latitude'].isna() | collision_2022_df['longitude'].isna() | collision_2022_df['zip_code'].isna()]
collision_2022_missing_geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37236 entries, 4513547 to 4647650
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   crash_datetime                 37236 non-null  datetime64[ns]
 1   crash_day_of_week              37236 non-null  int32         
 2   street_address                 37236 non-null  object        
 3   number_of_persons_injured      37236 non-null  int32         
 4   number_of_persons_killed       37236 non-null  int32         
 5   number_of_pedestrians_injured  37236 non-null  int32         
 6   number_of_pedestrians_killed   37236 non-null  int32         
 7   number_of_cyclist_injured      37236 non-null  int32         
 8   number_of_cyclist_killed       37236 non-null  int32         
 9   number_of_motorist_injured     37236 non-null  int32         
 10  number_of_motorist_killed      37236 non-null  int32         
 11  contributing

In [24]:
with open('../api_keys/googlemap.txt') as f:
    google_apikey = f.readline().strip()

In [25]:
def get_latlon_from_google(address):
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'address': address + ", New York City, NY, USA",
        'key': google_apikey
    }
    response = requests.get(base_url, params=params).json()

    lat, lon, zip_code = None, None,None
    
    if 'results' in response:
        for result in response['results']:
            lat = result['geometry']['location']['lat']
            lon = result['geometry']['location']['lng']
            for component in result['address_components']:
                    if 'postal_code' in component['types']:
                        zip_code = component['long_name']
            
            # Check if we've found all the values we're looking for
            if zip_code:
                break
                
    return lat, lon, zip_code

In [26]:
for i in range(0, len(collision_2022_missing_geo)):
    latitude, longitude, zipcode = get_latlon_from_google(collision_2022_missing_geo.iloc[i]['street_address'])
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'latitude'] = latitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'longitude'] = longitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'zip_code'] = zipcode

In [32]:
collision_2022_missing_geo = collision_2022_df[collision_2022_df['zip_code'].isna()]

In [33]:
def get_zipcode_from_latlon(lat, lon):
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'latlng': f'{lat},{lon}',
        'key': google_apikey
    }
    response = requests.get(base_url, params=params).json()

    zip_code = None

    if 'results' in response:
        for result in response['results']:
            for component in result['address_components']:
                if 'postal_code' in component['types']:
                    zip_code = component['short_name']
                    break

            # Check if we've found a zip code
            if zip_code:
                break

    return zip_code


In [34]:
for i in range(0, len(collision_2022_missing_geo)):
    zipcode = get_zipcode_from_latlon(collision_2022_missing_geo.iloc[i]['latitude'], collision_2022_missing_geo.iloc[i]['longitude'])
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'zip_code'] = zipcode

In [36]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Verrazano Bridge Upper', 'zip_code'] = '10305'

In [39]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Queens Midtown Tunnel', 'zip_code'] = '11109'

In [42]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Verrazano Bridge', 'zip_code'] = '10305'

In [44]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Marine Parkway Bridge', 'zip_code'] = '11234'

In [46]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Cross Bay Bridge', 'zip_code'] = '11414'

In [49]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Rockaway Boulevard & Nassau Expressway', 'zip_code'] = '11430'
collision_2022_df.loc[collision_2022_df['street_address'] == 'Rockaway Boulevard & Nassau Expressway', 'latitude'] = 40.6606498
collision_2022_df.loc[collision_2022_df['street_address'] == 'Rockaway Boulevard & Nassau Expressway', 'longitude'] = -73.7755599

In [52]:
collision_2022_df = collision_2022_df.dropna(subset=['zip_code'])

In [53]:
collision_2022_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 103856 entries, 4513547 to 4648224
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   crash_datetime                 103856 non-null  datetime64[ns]
 1   crash_day_of_week              103856 non-null  int32         
 2   street_address                 103856 non-null  object        
 3   number_of_persons_injured      103856 non-null  int32         
 4   number_of_persons_killed       103856 non-null  int32         
 5   number_of_pedestrians_injured  103856 non-null  int32         
 6   number_of_pedestrians_killed   103856 non-null  int32         
 7   number_of_cyclist_injured      103856 non-null  int32         
 8   number_of_cyclist_killed       103856 non-null  int32         
 9   number_of_motorist_injured     103856 non-null  int32         
 10  number_of_motorist_killed      103856 non-null  int32         
 11

In [54]:
collision_2022_df.to_csv('../Resources/collision_2022.csv', index=True, header=True)