In [1]:
import pandas as pd
import requests
import numpy as np

In [2]:
# Read the Geoapify API key from the first line of a local text file.
with open('../api_keys/geoapify.txt', 'r') as key_file:
    geo_apikey = key_file.readline().strip()

# Fail here with a clear message instead of on the first geocoding request.
if not geo_apikey:
    raise ValueError("No API key found on the first line of ../api_keys/geoapify.txt")

In [3]:
# Load the cleaned collision records. zip_code is read as a string column so the
# values are not parsed as numbers.
collision_df = (
    pd.read_csv("../Resources/collision_data_clean.csv", dtype={"zip_code": "str"})
    .rename(columns={"contributing_factor_vehicle_1": "contributing_factor_vehicle"})
)

# County is filled in later with text values, so create it as an object column of
# missing values. A float64 NaN column would need a dtype upcast on the first
# string assignment, which pandas has deprecated.
collision_df["county"] = pd.Series(np.nan, index=collision_df.index, dtype="object")

collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,cross_street_name,off_street_name,borough,zip_code,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle,longitude,latitude,county
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,20 AVENUE,,,,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage,,,
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,,1,0,0,0,0,0,1,0,Pavement Slippery,,,
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,0,0,0,0,0,0,0,0,Following Too Closely,,,
3,4456314,2021-09-11,9:35,,,1211 LORING AVENUE,BROOKLYN,11208.0,0,0,0,0,0,0,0,0,Unspecified,-73.8665,40.667202,
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,DECATUR STREET,,BROOKLYN,11233.0,0,0,0,0,0,0,0,0,,-73.917274,40.683304,


In [8]:
# Rows that record an off-street address but are still missing at least one of
# the location fields (latitude, longitude, zip code, or county).
has_off_street = collision_df['off_street_name'].notna()
lacks_location = collision_df[['latitude', 'longitude', 'zip_code', 'county']].isna().any(axis=1)
off_street_missing_data = collision_df[has_off_street & lacks_location]
off_street_missing_data

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,cross_street_name,off_street_name,borough,zip_code,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle,longitude,latitude,county
3,4456314,2021-09-11,9:35,,,1211 LORING AVENUE,BROOKLYN,11208,0,0,0,0,0,0,0,0,Unspecified,-73.866500,40.667202,
7,4486660,2021-12-14,8:17,,,344 BAYCHESTER AVENUE,BRONX,10475,2,0,0,0,0,0,2,0,Unspecified,-73.831480,40.868160,
8,4487074,2021-12-14,21:10,,,2047 PITKIN AVENUE,BROOKLYN,11207,0,0,0,0,0,0,0,0,Driver Inexperience,-73.897100,40.671720,
15,4486604,2021-12-14,17:58,,,480 DEAN STREET,BROOKLYN,11217,0,0,0,0,0,0,0,0,Passing Too Closely,-73.974630,40.681580,
16,4486991,2021-12-14,20:03,,,878 FLATBUSH AVENUE,BROOKLYN,11226,4,0,0,0,0,0,4,0,Steering Failure,-73.958810,40.650680,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,4407358,2021-04-14,19:01,,,2820 SNYDER AVENUE,BROOKLYN,11226,0,0,0,0,0,0,0,0,Driver Inattention/Distraction,-73.951020,40.648840,
986,4407695,2021-04-15,10:13,,,525 EAST 72 STREET,MANHATTAN,10021,0,0,0,0,0,0,0,0,Passing Too Closely,-73.953100,40.766586,
988,4408300,2021-04-10,22:06,,,1055 UNIVERSITY AVENUE,BRONX,10452,4,0,0,0,0,0,4,0,Following Too Closely,-73.930275,40.834675,
991,4407494,2021-04-14,12:00,,,33 FERNDALE AVENUE,,,0,0,0,0,0,0,0,0,Passing Too Closely,-74.159000,40.593002,


In [9]:
def get_missing_geo_data(address, timeout=10):
    """Look up a New York City street address with the Geoapify geocoding search API.

    Parameters
    ----------
    address : str
        Street address to search for. ", New York City, NY, USA" is appended
        before the query is sent.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout, `requests` can block indefinitely on a stalled connection.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, the connection fails, or the server
        answers with an HTTP error status (e.g. an invalid API key).
    """
    base_url = "https://api.geoapify.com/v1/geocode/search?"
    params = {
        "text": address + ", New York City, NY, USA",
        "apiKey": geo_apikey,
        "limit": 1,
        "format": "json",
        "lang": "en"
    }

    response = requests.get(base_url, params=params, timeout=timeout)
    # Surface HTTP errors here rather than handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

In [14]:
def extract_data_from_response(response):
    """Pull the location fields out of a geocoding response.

    Parameters
    ----------
    response : dict
        Decoded JSON response; the first entry of its 'results' list is used.

    Returns
    -------
    tuple
        (lat, lon, county, zip_code), in that order. Any field missing from
        the first result is None. If the response has no results, all four
        values are None, so callers can always unpack four items.
    """
    results = response.get('results') if isinstance(response, dict) else None
    if not results:
        # Same arity as the success path: the caller unpacks four values.
        return None, None, None, None

    first = results[0]
    lat = first.get('lat', None)
    lon = first.get('lon', None)
    county = first.get('county', None)
    zip_code = first.get('postcode', None)
    return lat, lon, county, zip_code

In [15]:
# The geocoder supplies county names as text, so make sure the column can hold
# strings before filling it (idempotent if it is already an object column).
collision_df['county'] = collision_df['county'].astype('object')

# Geocode each off-street address and fill in only the fields that are missing.
for idx, row in off_street_missing_data.iterrows():
    response = get_missing_geo_data(row['off_street_name'])

    # No match for this address: nothing to fill in, move on to the next row.
    if not (isinstance(response, dict) and response.get('results')):
        continue

    # Unpack in the order extract_data_from_response returns its values:
    # (lat, lon, county, zip_code). Swapping the last two would write
    # postcodes into 'county' and county names into 'zip_code'.
    lat, lon, county, zip_code = extract_data_from_response(response)
    geocoded = {
        'latitude': lat,
        'longitude': lon,
        'zip_code': zip_code,
        'county': county,
    }

    for column, value in geocoded.items():
        # Keep values already present in the data; only fill gaps, and only
        # when the geocoder actually returned something for that field.
        if pd.isna(row[column]) and value is not None:
            collision_df.at[idx, column] = value