In [1]:
import requests
import pandas as pd

def fetch_data(limit, offset, timeout=120):
    """Fetch one page of collision records from the NYC Open Data (Socrata) API.

    Args:
        limit: Maximum number of rows to return for this page (sent as ``$limit``).
        offset: Number of rows to skip before the page starts (sent as ``$offset``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    collision_response = requests.get(
        "https://data.cityofnewyork.us/resource/h9gi-nx95.json",
        # An explicit sort order keeps pages stable between requests, so rows
        # are neither duplicated nor skipped while paging with $offset.
        params={"$limit": limit, "$offset": offset, "$order": ":id"},
        timeout=timeout,
    )
    # Fail loudly instead of handing an error payload to the caller.
    collision_response.raise_for_status()
    return collision_response.json()

In [2]:
# Upper bound on the number of rows to request; the loop stops earlier as soon
# as the API returns an empty page.
total_records = 2500000
page_size = 50000

# Collect the pages in a list and concatenate once at the end: calling
# pd.concat inside the loop re-copies all previously downloaded rows each time.
collision_pages = []
for offset in range(0, total_records, page_size):
    collision_data = fetch_data(limit=page_size, offset=offset)
    if not collision_data:
        # Past the end of the dataset; further requests would be wasted.
        break
    collision_pages.append(pd.DataFrame(collision_data))

collision_df = (
    pd.concat(collision_pages, ignore_index=True) if collision_pages else pd.DataFrame()
)
collision_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2042904 entries, 0 to 2042903
Data columns (total 29 columns):
 #   Column                         Dtype 
---  ------                         ----- 
 0   crash_date                     object
 1   crash_time                     object
 2   on_street_name                 object
 3   off_street_name                object
 4   number_of_persons_injured      object
 5   number_of_persons_killed       object
 6   number_of_pedestrians_injured  object
 7   number_of_pedestrians_killed   object
 8   number_of_cyclist_injured      object
 9   number_of_cyclist_killed       object
 10  number_of_motorist_injured     object
 11  number_of_motorist_killed      object
 12  contributing_factor_vehicle_1  object
 13  contributing_factor_vehicle_2  object
 14  collision_id                   object
 15  vehicle_type_code1             object
 16  vehicle_type_code2             object
 17  borough                        object
 18  zip_code              

In [3]:
# Preview the first five rows of the downloaded data.
collision_df.head(5)

Unnamed: 0,crash_date,crash_time,on_street_name,off_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,cross_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [4]:
# Swap the "off_street_name" and "cross_street_name" column labels.
# Note: the mapping is its own inverse, so running this cell a second time
# swaps the labels back again.
collision_df = collision_df.rename(
    columns=dict(
        off_street_name="cross_street_name",
        cross_street_name="off_street_name",
    )
)
collision_df.head()

Unnamed: 0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
0,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
1,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
2,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
3,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [5]:
# Work on a copy so the downloaded frame stays available unchanged.
collision_clean_df = collision_df.copy(deep=True)
# Display only: the sorted frame is shown but not assigned, so
# collision_clean_df keeps its original row order.
collision_clean_df.sort_values('crash_date')

Unnamed: 0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
1923677,2012-07-01T00:00:00.000,16:40,WEST 25 STREET,11 AVENUE,0,0,0,0,0,0,...,40.7501786,-74.0060706,"{'latitude': '40.7501786', 'longitude': '-74.0...",,,,,,,
1920645,2012-07-01T00:00:00.000,2:00,HORACE HARDING EXPRESSWAY,KISSENA BOULEVARD,0,0,0,0,0,0,...,,,,,,,,,,
1921341,2012-07-01T00:00:00.000,3:10,75 STREET,3 AVENUE,1,0,1,0,0,0,...,40.6314641,-74.0277916,"{'latitude': '40.6314641', 'longitude': '-74.0...",,,,,,,
1921175,2012-07-01T00:00:00.000,3:30,ATLANTIC AVENUE,WAVERLY AVENUE,0,0,0,0,0,0,...,40.6813432,-73.9655103,"{'latitude': '40.6813432', 'longitude': '-73.9...",,Unspecified,SPORT UTILITY / STATION WAGON,,,,
1921723,2012-07-01T00:00:00.000,12:25,ZULETTE AVENUE,EDISON AVENUE,0,0,0,0,0,0,...,40.8421186,-73.8329030,"{'latitude': '40.8421186', 'longitude': '-73.8...",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2042751,2023-11-14T00:00:00.000,10:04,,,0,0,0,0,0,0,...,40.68378,-73.86734,"{'latitude': '40.68378', 'longitude': '-73.867...",293 GRANT AVENUE,,,,,,
2042752,2023-11-14T00:00:00.000,15:00,,,1,0,0,0,0,0,...,40.82029,-73.89214,"{'latitude': '40.82029', 'longitude': '-73.892...",925 SOUTHERN BOULEVARD,,,,,,
2042753,2023-11-14T00:00:00.000,17:50,HARLEM RIVER DRIVE,,0,0,0,0,0,0,...,40.802757,-73.93026,"{'latitude': '40.802757', 'longitude': '-73.93...",,,,,,,
2042755,2023-11-14T00:00:00.000,9:30,GREENPOINT AVENUE,BORDEN AVENUE,1,0,0,0,0,0,...,40.73715,-73.930824,"{'latitude': '40.73715', 'longitude': '-73.930...",,,,,,,


In [6]:
# Use the collision id as the row index.
collision_clean_df = collision_clean_df.set_index('collision_id')

In [7]:
# Check the frame after re-indexing.
collision_clean_df.head()

Unnamed: 0_level_0,crash_date,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,9:35,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [8]:
# Build one timestamp per collision from the separate date and time columns.
# The date strings look like "2021-09-11T00:00:00.000" and the times like
# "2:39"; gluing them together and letting pandas guess the format falls back
# to slow per-element parsing and emits a format-inference warning. Parse each
# part explicitly instead: date (normalized to midnight) + time-of-day offset.
crash_datetime = (
    pd.to_datetime(collision_clean_df['crash_date'], format='%Y-%m-%dT%H:%M:%S.%f').dt.normalize()
    + pd.to_timedelta(collision_clean_df['crash_time'] + ':00')  # "H:MM" -> "H:MM:00"
)
collision_clean_df.insert(1, 'crash_datetime', crash_datetime)
collision_clean_df.head(5)

  crash_datetime = pd.to_datetime(collision_clean_df['crash_date'] + ' ' + collision_clean_df['crash_time'])


Unnamed: 0_level_0,crash_date,crash_datetime,crash_time,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11T00:00:00.000,2021-09-11 02:39:00,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26T00:00:00.000,2022-03-26 11:45:00,11:45,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29T00:00:00.000,2022-06-29 06:55:00,6:55,THROGS NECK BRIDGE,,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11T00:00:00.000,2021-09-11 09:35:00,9:35,,,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14T00:00:00.000,2021-12-14 08:13:00,8:13,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [9]:
# The separate date and time columns are redundant now that crash_datetime exists.
collision_clean_df = collision_clean_df.drop(columns=['crash_date', 'crash_time'])
collision_clean_df.head()

Unnamed: 0_level_0,crash_datetime,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,,,0,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [10]:
def get_address(row):
    """Build a single street-address string for one collision row.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            'off_street_name', 'on_street_name' and 'cross_street_name'.

    Returns:
        The 'off_street_name' value when it is present; otherwise the
        non-missing 'on_street_name' / 'cross_street_name' values joined with
        ' & '. Returns None when all three fields are missing, so that the
        row counts as missing for ``dropna`` instead of carrying an empty
        string.
    """
    if pd.notna(row['off_street_name']):
        return row['off_street_name']

    # Join only the street names that are actually present.
    street_parts = [
        name
        for name in (row['on_street_name'], row['cross_street_name'])
        if pd.notna(name)
    ]
    if not street_parts:
        # Joining an empty list would yield '', which dropna does not treat as missing.
        return None
    return ' & '.join(street_parts)

In [11]:
# Compute one address string per row (row-wise apply).
collision_clean_df['street_address'] = collision_clean_df.apply(
    get_address, axis='columns'
)

In [12]:
# Inspect the last five rows, including the new street_address column.
collision_clean_df.tail()

Unnamed: 0_level_0,crash_datetime,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5,street_address
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4679147,2023-11-14 14:12:00,MORRIS AVENUE,,0,0,0,0,0,0,0,...,-73.9114,"{'latitude': '40.837864', 'longitude': '-73.91...",,,,,,,,MORRIS AVENUE
4679531,2023-06-13 19:15:00,UTOPIA PARKWAY,PECK AVENUE,0,0,0,0,0,0,0,...,-73.792496,"{'latitude': '40.745667', 'longitude': '-73.79...",,,,,,,,UTOPIA PARKWAY & PECK AVENUE
4679541,2023-11-14 09:00:00,39 STREET,SKILLMAN AVENUE,0,0,0,0,0,0,0,...,-73.925476,"{'latitude': '40.74743', 'longitude': '-73.925...",,,,,,,,39 STREET & SKILLMAN AVENUE
4679584,2023-11-14 15:50:00,,,0,0,0,0,0,0,0,...,-73.93606,"{'latitude': '40.6681', 'longitude': '-73.9360...",1633 UNION STREET,,,,,,,1633 UNION STREET
4679180,2023-11-14 16:00:00,,,0,0,0,0,0,0,0,...,-73.86619,"{'latitude': '40.682575', 'longitude': '-73.86...",336 ELDERTS LANE,,,,,,,336 ELDERTS LANE


In [13]:
# Move street_address so it is the second column.
columns = list(collision_clean_df.columns)
columns.remove('street_address')
columns.insert(1, 'street_address')
collision_clean_df = collision_clean_df[columns]
collision_clean_df.head()

Unnamed: 0_level_0,crash_datetime,street_address,on_street_name,cross_street_name,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,...,latitude,longitude,location,off_street_name,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,WHITESTONE EXPRESSWAY & 20 AVENUE,WHITESTONE EXPRESSWAY,20 AVENUE,2,0,0,0,0,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,QUEENSBORO BRIDGE UPPER,QUEENSBORO BRIDGE UPPER,,1,0,0,0,0,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,THROGS NECK BRIDGE,THROGS NECK BRIDGE,,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 LORING AVENUE,,,0,0,0,0,0,0,...,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",1211 LORING AVENUE,,,,,,
4486609,2021-12-14 08:13:00,SARATOGA AVENUE & DECATUR STREET,SARATOGA AVENUE,DECATUR STREET,0,0,0,0,0,0,...,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,,


In [14]:
# The individual street columns are superseded by street_address;
# errors='ignore' skips any of them that are already absent.
collision_clean_df = collision_clean_df.drop(
    columns=['on_street_name', 'cross_street_name', 'off_street_name'],
    errors='ignore',
)

In [16]:
# Convert the address strings to title case.
collision_clean_df = collision_clean_df.assign(
    street_address=lambda frame: frame['street_address'].str.title()
)
collision_clean_df.head()

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [19]:
# Convert the borough names to title case.
collision_clean_df = collision_clean_df.assign(
    borough=lambda frame: frame['borough'].str.title()
)
collision_clean_df.head()

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [18]:
# Preview the cleaned frame.
collision_clean_df.head(5)

Unnamed: 0_level_0,crash_datetime,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,0,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,Queensboro Bridge Upper,1,0,0,0,0,0,1,0,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,Throgs Neck Bridge,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,1211 Loring Avenue,0,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [20]:
# Summarize the columns and dtypes of the cleaned frame.
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2042904 entries, 4455765 to 4679180
Data columns (total 25 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   street_address                 object        
 2   number_of_persons_injured      object        
 3   number_of_persons_killed       object        
 4   number_of_pedestrians_injured  object        
 5   number_of_pedestrians_killed   object        
 6   number_of_cyclist_injured      object        
 7   number_of_cyclist_killed       object        
 8   number_of_motorist_injured     object        
 9   number_of_motorist_killed      object        
 10  contributing_factor_vehicle_1  object        
 11  contributing_factor_vehicle_2  object        
 12  vehicle_type_code1             object        
 13  vehicle_type_code2             object        
 14  borough                        object        
 15  zip_code      

In [21]:
# Remove rows without a usable address. Missing addresses can show up either
# as NaN/None or as an empty string (joining zero street names yields ''), and
# dropna alone only catches the former, so blank strings are filtered too.
has_street_address = collision_clean_df['street_address'].fillna('').str.strip().ne('')
collision_clean_df = collision_clean_df.loc[has_street_address].copy()
collision_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2042904 entries, 4455765 to 4679180
Data columns (total 25 columns):
 #   Column                         Dtype         
---  ------                         -----         
 0   crash_datetime                 datetime64[ns]
 1   street_address                 object        
 2   number_of_persons_injured      object        
 3   number_of_persons_killed       object        
 4   number_of_pedestrians_injured  object        
 5   number_of_pedestrians_killed   object        
 6   number_of_cyclist_injured      object        
 7   number_of_cyclist_killed       object        
 8   number_of_motorist_injured     object        
 9   number_of_motorist_killed      object        
 10  contributing_factor_vehicle_1  object        
 11  contributing_factor_vehicle_2  object        
 12  vehicle_type_code1             object        
 13  vehicle_type_code2             object        
 14  borough                        object        
 15  zip_code      

In [25]:
import datetime as dt
# Derive the weekday number from the crash timestamp (Monday=0 ... Sunday=6)
# and place it directly after the crash_datetime column.
crash_day_of_week = collision_clean_df['crash_datetime'].dt.weekday
position = collision_clean_df.columns.get_loc('crash_datetime') + 1
collision_clean_df.insert(
    loc=position, column='crash_day_of_week', value=crash_day_of_week
)
collision_clean_df.head()

Unnamed: 0_level_0,crash_datetime,crash_day_of_week,street_address,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,...,zip_code,latitude,longitude,location,contributing_factor_vehicle_3,vehicle_type_code_3,contributing_factor_vehicle_4,vehicle_type_code_4,contributing_factor_vehicle_5,vehicle_type_code_5
collision_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4455765,2021-09-11 02:39:00,5,Whitestone Expressway & 20 Avenue,2,0,0,0,0,0,2,...,,,,,,,,,,
4513547,2022-03-26 11:45:00,5,Queensboro Bridge Upper,1,0,0,0,0,0,1,...,,,,,,,,,,
4541903,2022-06-29 06:55:00,2,Throgs Neck Bridge,0,0,0,0,0,0,0,...,,,,,,,,,,
4456314,2021-09-11 09:35:00,5,1211 Loring Avenue,0,0,0,0,0,0,0,...,11208.0,40.667202,-73.8665,"{'latitude': '40.667202', 'longitude': '-73.86...",,,,,,
4486609,2021-12-14 08:13:00,1,Saratoga Avenue & Decatur Street,0,0,0,0,0,0,0,...,11233.0,40.683304,-73.917274,"{'latitude': '40.683304', 'longitude': '-73.91...",,,,,,


In [48]:
import requests
import urllib.parse

def geocode_address_with_zipcode(address):
    """Geocode a New York City address with the Nominatim search API.

    Args:
        address: Street address or intersection; ", New York, NY" is appended
            before the lookup.

    Returns:
        A ``(latitude, longitude, zipcode)`` tuple taken from the first match.
        ``(None, None, None)`` is returned when ``address`` is not a non-blank
        string, the request fails, the response status is not 200, the body
        is not valid JSON, or nothing matches. ``zipcode`` is the string
        "Zipcode not found" when neither the structured address nor the
        display name yields one.
    """
    # Nothing sensible can be looked up for a missing or blank address.
    if not isinstance(address, str) or not address.strip():
        return None, None, None

    try:
        response = requests.get(
            "https://nominatim.openstreetmap.org/search",
            # requests URL-encodes the query; addressdetails=1 asks for a
            # structured address so the postcode need not be guessed.
            params={
                "q": address + ", New York, NY",
                "format": "json",
                "addressdetails": 1,
            },
            # Nominatim's usage policy asks clients to identify themselves.
            headers={"User-Agent": "nyc-collision-analysis-notebook"},
            timeout=10,
        )
    except requests.RequestException:
        # Treat network failures like any other unsuccessful lookup.
        return None, None, None

    if response.status_code != 200:
        return None, None, None

    try:
        data = response.json()
    except ValueError:
        return None, None, None

    if not data:
        return None, None, None

    best_match = data[0]
    latitude = best_match["lat"]
    longitude = best_match["lon"]

    # Prefer the structured postcode; fall back to the second-to-last
    # component of the display name when it is absent.
    zipcode = (best_match.get("address") or {}).get("postcode")
    if not zipcode:
        name_parts = (best_match.get("display_name") or "").split(',')
        zipcode = name_parts[-2].strip() if len(name_parts) > 1 else "Zipcode not found"

    return latitude, longitude, zipcode

In [49]:
import time

# Fill in missing coordinates / zip codes by geocoding the street address.
# Each distinct address is looked up at most once, requests are spaced one
# second apart to respect Nominatim's rate limit, and existing values are
# never replaced by a failed (None) lookup.
geocode_cache = {}
for row in collision_clean_df.itertuples():
    coords_missing = pd.isna(row.latitude) or pd.isna(row.longitude)
    zip_missing = pd.isna(row.zip_code)
    if not (coords_missing or zip_missing):
        continue

    address = row.street_address
    if not isinstance(address, str) or not address.strip():
        # No address to look up for this row.
        continue

    if address not in geocode_cache:
        geocode_cache[address] = geocode_address_with_zipcode(address)
        time.sleep(1)  # stay within one request per second
    latitude, longitude, zipcode = geocode_cache[address]

    # Write both coordinates together so they always describe the same point.
    if coords_missing and latitude is not None and longitude is not None:
        collision_clean_df.at[row.Index, 'latitude'] = latitude
        collision_clean_df.at[row.Index, 'longitude'] = longitude
    # Skip the geocoder's "not found" sentinel rather than storing it as a zip code.
    if zip_missing and zipcode is not None and zipcode != "Zipcode not found":
        collision_clean_df.at[row.Index, 'zip_code'] = zipcode