In [1]:
import requests
import pandas as pd

def fetch_data(limit, offset):
    collision_response = requests.get("https://data.cityofnewyork.us/resource/h9gi-nx95.json", params = {"$limit": limit, "$offset": offset})
    collision_data = collision_response.json()
    return collision_data

In [2]:
total_records = 2500000
collision_df = pd.DataFrame()

for offset in range (0, total_records, 50000):
    collision_data = fetch_data(limit=50000, offset=offset)
    collision_df = pd.concat([collision_df, pd.DataFrame(collision_data)], ignore_index=True)
collision_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042904 entries, 0 to 2042903
Data columns (total 29 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [3]:
collision_df.head()

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,cross_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [4]:
# Correct column names
collision_df = collision_df.rename(columns={"off_street_name": "cross_street_name", "cross_street_name": "off_street_name"})

In [5]:
collision_clean_df = collision_df.copy() 
collision_clean_df.sort_values(by='crash_date', ascending=True)

Unnamed: 0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
1922572,2012-07-01T00:00:00.000,8:08,RALPH AVENUE,PROSPECT PLACE,0,0,0,0,0,0,...,40.6727913,-73.9222153,"{'latitude': '40.6727913', 'longitude': '-73.9...",,,,,,,
1922116,2012-07-01T00:00:00.000,23:00,FURMAN STREET,MONTAGUE STREET,0,0,0,0,0,0,...,40.6959999,-73.9982402,"{'latitude': '40.6959999', 'longitude': '-73.9...",,,,,,,
1920675,2012-07-01T00:00:00.000,16:10,2 AVENUE,EAST 67 STREET,0,0,0,0,0,0,...,40.7656139,-73.9606907,"{'latitude': '40.7656139', 'longitude': '-73.9...",,,,,,,
1921117,2012-07-01T00:00:00.000,9:59,EAST 167 STREET,GERARD AVENUE,0,0,0,0,0,0,...,40.8353970,-73.9203050,"{'latitude': '40.835397', 'longitude': '-73.92...",,,,,,,
1923549,2012-07-01T00:00:00.000,21:05,WEST 72 STREET,AMSTERDAM AVENUE,0,0,0,0,0,0,...,40.7786073,-73.9816215,"{'latitude': '40.7786073', 'longitude': '-73.9...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042751,2023-11-14T00:00:00.000,10:04,,,0,0,0,0,0,0,...,40.68378,-73.86734,"{'latitude': '40.68378', 'longitude': '-73.867...",293 GRANT AVENUE,,,,,,
2042752,2023-11-14T00:00:00.000,15:00,,,1,0,0,0,0,0,...,40.82029,-73.89214,"{'latitude': '40.82029', 'longitude': '-73.892...",925 SOUTHERN BOULEVARD,,,,,,
2042753,2023-11-14T00:00:00.000,17:50,HARLEM RIVER DRIVE,,0,0,0,0,0,0,...,40.802757,-73.93026,"{'latitude': '40.802757', 'longitude': '-73.93...",,,,,,,
2042742,2023-11-14T00:00:00.000,8:20,147 AVENUE,226 STREET,2,0,0,0,0,0,...,40.658688,-73.755745,"{'latitude': '40.658688', 'longitude': '-73.75...",,Unspecified,Sedan,,,,


In [6]:
collision_clean_df.set_index('collision_id', inplace=True)

In [7]:
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [8]:
crash_datetime = pd.to_datetime(collision_clean_df['crash_date'] + ' ' + collision_clean_df['crash_time'])
collision_clean_df.insert(1, 'crash_datetime', crash_datetime)
collision_clean_df.head(5)

  crash_datetime = pd.to_datetime(collision_clean_df['crash_date'] + ' ' + collision_clean_df['crash_time'])


Unnamed: 0_level_0,crash_date,crash_datetime,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2021-09-11 02:39:00,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,2022-03-26 11:45:00,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,2022-06-29 06:55:00,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,2021-09-11 09:35:00,9:35,,,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,2021-12-14 08:13:00,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [9]:
collision_clean_df.drop(columns=['crash_date', 'crash_time'], inplace=True)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,,,0,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [10]:
def get_address(row):
    if pd.notna(row['off_street_name']):
        return row['off_street_name']
    else:
        # Join only non-null values
        return ' & '.join(filter(pd.notna, [row['on_street_name'], row['cross_street_name']]))

In [11]:
collision_clean_df['street_address'] = collision_clean_df.apply(get_address, axis=1)

In [12]:
columns = collision_clean_df.columns.tolist()
columns.insert(1, columns.pop(columns.index('street_address')))
collision_clean_df = collision_clean_df[columns]
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY & 20 AVENUE,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 LORING AVENUE,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE & DECATUR STREET,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [13]:
collision_clean_df.drop(columns=['on_street_name', 'cross_street_name', 'off_street_name'], inplace=True, errors='ignore')

In [14]:
collision_clean_df['street_address'] = collision_clean_df['street_address'].str.title()
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [15]:
collision_clean_df['borough'] = collision_clean_df['borough'].str.title()
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [16]:
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2042904 entries, 4455765 to 4679180
Data columns (total 25 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   street_address                 object        
 2   number_of_persons_injured      object        
 3   number_of_persons_killed       object        
 4   number_of_pedestrians_injured  object        
 5   number_of_pedestrians_killed   object        
 6   number_of_cyclist_injured      object        
 7   number_of_cyclist_killed       object        
 8   number_of_motorist_injured     object        
 9   number_of_motorist_killed      object        
 10  contributing_factor_vehicle_1  object        
 11  contributing_factor_vehicle_2  object        
 12  vehicle_type_code1             object        
 13  vehicle_type_code2             object        
 14  borough                        object        
 15  zip_code      

In [17]:
collision_clean_df.dropna(subset=['street_address'], inplace=True)

In [18]:
import datetime as dt
# convert the datetime to days of week 
position = collision_clean_df.columns.get_loc('crash_datetime') + 1
crash_day_of_week = collision_clean_df['crash_datetime'].dt.dayofweek
collision_clean_df.insert(position, 'crash_day_of_week', crash_day_of_week)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,5,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,5,Queensboro Bridge Upper,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,2,Throgs Neck Bridge,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,5,1211 Loring Avenue,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,1,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [22]:
collision_clean_df['number_of_persons_injured'] = collision_clean_df['number_of_persons_injured'].fillna(0).astype(int)
collision_clean_df['number_of_persons_killed'] = collision_clean_df['number_of_persons_killed'].fillna(0).astype(int)
collision_clean_df['number_of_pedestrians_injured'] = collision_clean_df['number_of_pedestrians_injured'].fillna(0).astype(int)
collision_clean_df['number_of_pedestrians_killed'] = collision_clean_df['number_of_pedestrians_killed'].fillna(0).astype(int)
collision_clean_df['number_of_cyclist_injured'] = collision_clean_df['number_of_cyclist_injured'].fillna(0).astype(int)
collision_clean_df['number_of_cyclist_killed'] = collision_clean_df['number_of_cyclist_killed'].fillna(0).astype(int)
collision_clean_df['number_of_motorist_injured'] = collision_clean_df['number_of_motorist_injured'].fillna(0).astype(int)
collision_clean_df['number_of_motorist_killed'] = collision_clean_df['number_of_motorist_killed'].fillna(0).astype(int)
collision_clean_df['zip_code'] = collision_clean_df['zip_code'].astype(str).str[:5]
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2042904 entries, 4455765 to 4679180
Data columns (total 26 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   crash_day_of_week              int32         
 2   street_address                 object        
 3   number_of_persons_injured      int32         
 4   number_of_persons_killed       int32         
 5   number_of_pedestrians_injured  int32         
 6   number_of_pedestrians_killed   int32         
 7   number_of_cyclist_injured      int32         
 8   number_of_cyclist_killed       int32         
 9   number_of_motorist_injured     int32         
 10  number_of_motorist_killed      int32         
 11  contributing_factor_vehicle_1  object        
 12  contributing_factor_vehicle_2  object        
 13  vehicle_type_code1             object        
 14  vehicle_type_code2             object        
 15  borough       

In [24]:
import numpy as np
collision_clean_df['zip_code'] = collision_clean_df['zip_code'].replace('nan', np.nan)
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,5,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,5,Queensboro Bridge Upper,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,2,Throgs Neck Bridge,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,5,1211 Loring Avenue,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,1,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [25]:
collision_2022_df = collision_clean_df[collision_clean_df['crash_datetime'].dt.year == 2022].copy()
collision_2022_missing_geo = collision_2022_df[collision_2022_df['latitude'].isna() | collision_2022_df['longitude'].isna() | collision_2022_df['zip_code'].isna()]
collision_2022_missing_geo.info()

<class 'pandas.core.frame.DataFrame'>
Index: 36522 entries, 4513547 to 4631496
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   crash_datetime                 36522 non-null  datetime64[ns]
 1   crash_day_of_week              36522 non-null  int32         
 2   street_address                 36522 non-null  object        
 3   number_of_persons_injured      36522 non-null  int32         
 4   number_of_persons_killed       36522 non-null  int32         
 5   number_of_pedestrians_injured  36522 non-null  int32         
 6   number_of_pedestrians_killed   36522 non-null  int32         
 7   number_of_cyclist_injured      36522 non-null  int32         
 8   number_of_cyclist_killed       36522 non-null  int32         
 9   number_of_motorist_injured     36522 non-null  int32         
 10  number_of_motorist_killed      36522 non-null  int32         
 11  contributing

In [32]:
import requests
import urllib.parse

def geocode_address_with_zipcode(address):
    # URL encode the address
    address = address + ", New York, NY"

    # Construct the request URL
    url = f"https://nominatim.openstreetmap.org/search?"

    # Send the request
    response = requests.get(url, params = {"format": "json", "q": address})

    # Check if the response is successful
    if response.status_code == 200:
        # Parse the JSON response
        data = response.json()

        # Check if data is returned
        if data:
            # Extract latitude and longitude
            latitude = data[0]["lat"]
            longitude = data[0]["lon"]

            # Try to extract the zipcode
            zipcode = data[0].get("display_name").split(',')[-2].strip() if len(data[0].get("display_name").split(',')) > 1 else "Zipcode not found"

            return latitude, longitude, zipcode
        else:
            return np.nan, np.nan, np.nan
    else:
        return np.nan, np.nan, np.nan

In [None]:
for i in range(0, len(collision_2022_missing_geo)):
    latitude, longitude, zipcode = geocode_address_with_zipcode(collision_2022_missing_geo.iloc[i]['street_address'])
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'latitude'] = latitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'longitude'] = longitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'zip_code'] = zipcode

In [57]:
collision_2022_missing_geo = collision_2022_df[collision_2022_df['latitude'].isna() | collision_2022_df['longitude'].isna() | collision_2022_df['zip_code'].isna()]

In [55]:
# remove multiple space within the address
collision_2022_df['street_address'] = collision_2022_df['street_address'].str.replace(r'\s+', ' ', regex=True)

In [56]:
collision_2022_df.to_csv('../Resources/collision_2022.csv', index=True, header=True)

In [58]:
with open('../api_keys/googlemap.txt') as f:
    google_apikey = f.readline().strip()

In [74]:
def get_latlon_from_google(address):
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'address': address + ", New York City, NY, USA",
        'key': google_apikey
    }
    response = requests.get(base_url, params=params).json()

    lat, lon, zip_code = None, None,None
    
    if 'results' in response:
        for result in response['results']:
            lat = result['geometry']['location']['lat']
            lon = result['geometry']['location']['lng']
            for component in result['address_components']:
                    if 'postal_code' in component['types']:
                        zip_code = component['long_name']
            
            # Check if we've found all the values we're looking for
            if zip_code:
                break
                
    return lat, lon, zip_code

In [65]:
for i in range(0, len(collision_2022_missing_geo)):
    latitude, longitude, zipcode = get_latlon_from_google(collision_2022_missing_geo.iloc[i]['street_address'])
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'latitude'] = latitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'longitude'] = longitude
    collision_2022_df.loc[collision_2022_missing_geo.index[i], 'zip_code'] = zipcode

In [None]:
get_latlon_from_google(collision_2022_missing_geo.iloc[i]['street_address'])

In [67]:
collision_2022_df.tail()

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4631473,2022-07-22 10:05:00,4,72 Howard Avenue,0,0,0,0,0,0,0,...,11221,40.68577,-73.920616,"{'latitude': '40.68577', 'longitude': '-73.920...",,,,,,
4631472,2022-07-08 13:00:00,4,Atlantic Avenue,2,0,2,0,0,0,0,...,United States,40.676127,-73.904765,"{'latitude': '40.677288', 'longitude': '-73.92...",,,,,,
4631496,2022-06-11 15:00:00,5,Marcus Garvey Boulevard,1,0,0,0,0,0,1,...,United States,40.689951,-73.939426,"{'latitude': '40.690483', 'longitude': '-73.93...",,,,,,
4631468,2022-03-23 14:25:00,2,Lexington Avenue & Patchen Avenue,0,0,0,0,0,0,0,...,11221,40.69037,-73.92755,"{'latitude': '40.69037', 'longitude': '-73.927...",,,,,,
4631469,2022-04-25 07:05:00,0,Marcus Garvey Boulevard & Vernon Avenue,0,0,0,0,0,0,0,...,11206,0.0,0.0,"{'latitude': '0.0', 'longitude': '0.0', 'human...",,,,,,


In [71]:
invalid_zipcode_pattern = r'^\d{5}$'
filtered_df = collision_2022_df[~collision_2022_df['zip_code'].str.match(invalid_zipcode_pattern, na=False)]

In [77]:
collision_2022_df.to_csv('../Resources/collision_2022.csv', index=True, header=True)

In [83]:
for i in range(0, len(filtered_df)):
    latitude, longitude, zipcode = get_latlon_from_google(filtered_df.iloc[i]['street_address'])
    collision_2022_df.loc[filtered_df.index[i], 'latitude'] = latitude
    collision_2022_df.loc[filtered_df.index[i], 'longitude'] = longitude
    collision_2022_df.loc[filtered_df.index[i], 'zip_code'] = zipcode

In [99]:
def get_zipcode_from_latlon(lat, lon):
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'latlng': f'{lat},{lon}',
        'key': google_apikey
    }
    response = requests.get(base_url, params=params).json()

    zip_code = None

    if 'results' in response:
        for result in response['results']:
            for component in result['address_components']:
                if 'postal_code' in component['types']:
                    zip_code = component['short_name']
                    break

            # Check if we've found a zip code
            if zip_code:
                break

    return zip_code


In [102]:
filtered_df = collision_2022_df[~collision_2022_df['zip_code'].str.match(invalid_zipcode_pattern, na=False)]
for i in range(0, len(filtered_df)):
    zipcode = get_zipcode_from_latlon(filtered_df.iloc[i]['latitude'], filtered_df.iloc[i]['longitude'])
    collision_2022_df.loc[filtered_df.index[i], 'zip_code'] = zipcode


In [105]:
collision_2022_df[~collision_2022_df['zip_code'].str.match(invalid_zipcode_pattern, na=False)]

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4514420,2022-03-25 14:35:00,4,Port Richmond Avenue & Hatfield Place,1,0,1,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.632786', 'longitude': '-74.13...",,,,,,
4514215,2022-03-26 01:30:00,5,Calamus Avenue & 72 Street,0,0,0,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.734486', 'longitude': '-73.89...",Unspecified,Sedan,,,,
4514261,2022-02-27 18:30:00,6,Verrazano Bridge Upper,0,0,0,0,0,0,0,...,,40.606616,-74.04466,,,,,,,
4514422,2022-03-25 15:06:00,4,Fahy Avenue & Felton Street,1,0,0,0,0,0,1,...,,40.606616,-74.04466,"{'latitude': '40.62026', 'longitude': '-74.167...",,,,,,
4514238,2022-03-23 14:30:00,2,New Dorp Plaza North,0,0,0,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.57256', 'longitude': '-74.118...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4593156,2022-12-21 14:10:00,2,Verrazano Bridge,0,0,0,0,0,0,0,...,,40.606616,-74.04466,,,,,,,
4593336,2022-12-25 11:34:00,6,Verrazano Bridge Upper,0,0,0,0,0,0,0,...,,40.606616,-74.04466,,,,,,,
4595160,2022-12-27 13:05:00,1,Verrazano Bridge Upper,0,0,0,0,0,0,0,...,,40.606616,-74.04466,,Unspecified,Sedan,Unspecified,Sedan,,
4595711,2022-12-29 18:02:00,3,Verrazano Bridge Upper,1,0,0,0,0,0,1,...,,40.606616,-74.04466,,,,,,,


In [119]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Verrazano Bridge Upper', 'zip_code'] = '10305'

In [129]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Verrazano Bridge', 'zip_code'] = '10305'

In [117]:
collision_2022_df.drop(columns=['zipcode'], inplace=True)

In [123]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Queens Midtown Tunnel', 'zip_code'] = '11101'
collision_2022_df.loc[collision_2022_df['street_address'] == 'Queens Midtown Tunnel', 'longitude'] = -75.7265786
collision_2022_df.loc[collision_2022_df['street_address'] == 'Queens Midtown Tunnel', 'latitude'] = 42.2161881

In [127]:
collision_2022_df.loc[collision_2022_df['street_address'] == 'Port Richmond Avenue & Hatfield Place', 'zip_code'] 

collision_id
4514420    None
Name: zip_code, dtype: object

In [141]:
filtered_df = collision_2022_df[~collision_2022_df['zip_code'].str.match(invalid_zipcode_pattern, na=False)]

In [135]:
with open('../api_keys/geoapify_1.txt') as f:
    geo_apikey = f.readline().strip()

In [140]:
def get_missing_geo_data(address):
    base_url = "https://api.geoapify.com/v1/geocode/search?"
    params = {
        "text": address + ", New York City, NY, USA",
        "apiKey": geo_apikey,
        "limit": 1,
        "format": "json",
        "lang": "en"
    }

    response = requests.get(base_url, params=params).json()

    if 'results' in response and len(response['results']) > 0:
        lat = response['results'][0].get('lat', None)
        lon = response['results'][0].get('lon', None)
        zip_code = response['results'][0].get('postcode', None)
        return lat, lon, zip_code
    return None, None, None

In [144]:
for idx, row in filtered_df.iterrows():
    lat, lon, zip_code = get_missing_geo_data(row['street_address'])
    
    collision_2022_df.at[idx, 'latitude'] = lat
    collision_2022_df.at[idx, 'longitude'] = lon
    collision_2022_df.at[idx, 'zip_code'] = zip_code

In [146]:
mask = collision_2022_df['zip_code'].str.match(invalid_zipcode_pattern, na=False)
rows_to_drop = collision_2022_df[~mask]
collision_2022_df = collision_2022_df.drop(rows_to_drop.index)

In [156]:
collision_2022_df['latitude'] = collision_2022_df['latitude'].astype(float)
collision_2022_df['longitude'] = collision_2022_df['longitude'].astype(float)

In [158]:
invalid_lat_lng = collision_2022_df[(collision_2022_df['latitude'] == 0.0000)]

In [161]:
collision_2022_df.to_csv('../Resources/collision_2022.csv', index=True, header=True)

In [159]:
for idx, row in invalid_lat_lng.iterrows():
    lat, lon, zip_code = get_missing_geo_data(row['street_address'])
    
    collision_2022_df.at[idx, 'latitude'] = lat
    collision_2022_df.at[idx, 'longitude'] = lon
    collision_2022_df.at[idx, 'zip_code'] = zip_code

KeyboardInterrupt: 

In [143]:
def extract_lat_lon(location_str):
    try:
        location_dict = eval(location_str)
        lat = float(location_dict.get('latitude'))
        lon = float(location_dict.get('longitude'))
        return lat, lon
    except Exception:
        return None, None

# Apply the function to each row to extract latitudes and longitudes
df['extracted_lat_lon'] = df['locations'].apply(extract_lat_lon)

# Check for mismatches
mismatches_mask = df['extracted_lat_lon'].apply(lambda lat_lon: lat_lon != (given_lat, given_lon))

# Filter the DataFrame to get rows with mismatches
mismatched_rows = df[mismatches_mask]

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4514420,2022-03-25 14:35:00,4,Port Richmond Avenue & Hatfield Place,1,0,1,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.632786', 'longitude': '-74.13...",,,,,,
4514215,2022-03-26 01:30:00,5,Calamus Avenue & 72 Street,0,0,0,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.734486', 'longitude': '-73.89...",Unspecified,Sedan,,,,
4514422,2022-03-25 15:06:00,4,Fahy Avenue & Felton Street,1,0,0,0,0,0,1,...,,40.606616,-74.04466,"{'latitude': '40.62026', 'longitude': '-74.167...",,,,,,
4514238,2022-03-23 14:30:00,2,New Dorp Plaza North,0,0,0,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.57256', 'longitude': '-74.118...",,,,,,
4514314,2022-03-27 13:10:00,6,Borden Avenue & 30 Place,2,0,0,0,0,0,2,...,,40.606616,-74.04466,"{'latitude': '40.738197', 'longitude': '-73.93...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4567513,2022-09-25 15:30:00,6,Brooklyn Bridge,0,0,0,0,0,0,0,...,,40.606616,-74.04466,,,,,,,
4568345,2022-09-29 00:08:00,3,Conduit Boulevard & Mckinley Avenue,1,0,0,0,0,0,1,...,,40.606616,-74.04466,,,,,,,
4569243,2022-09-29 17:20:00,3,Pelham Parkway & Hutchinson River Parkway,0,0,0,0,0,0,0,...,,40.606616,-74.04466,"{'latitude': '40.856037', 'longitude': '-73.83...",,,,,,
4585686,2022-11-25 01:26:00,4,Cross Bay Bridge,0,0,0,0,0,0,0,...,,40.592474,-73.819906,,,,,,,
