In [1]:
import pandas as pd
import requests
import numpy as np

In [20]:
with open('../api_keys/geoapify.txt', 'r') as file:
    geo_apikey = file.readline().strip()
with open('../api_keys/googlemap.txt', 'r') as file:
    google_apikey = file.readline().strip()

In [3]:
collision_df = pd.read_csv("../Resources/collision_data_clean.csv", dtype={"zip_code": "str"})
collision_df['county'] = np.nan
collision_df = collision_df.rename(columns={"contributing_factor_vehicle_1": "contributing_factor_vehicle"}, inplace=False)
collision_df = collision_df[['collision_id', 'crash_date', 'crash_time', 
                             'on_street_name', 'off_street_name', 'cross_street_name', 'borough', 'county', 'zip_code', 'latitude', 'longitude', 
                             'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured', 'number_of_pedestrians_killed', 
                             'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured', 'number_of_motorist_killed', 'contributing_factor_vehicle']]
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,,,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,,,,,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,,,,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208.0,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233.0,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [4]:
off_street_missing_data = collision_df[(collision_df['off_street_name'].notna())]
off_street_missing_data

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208,40.667202,-73.866500,0,0,0,0,0,0,0,0,Unspecified
7,4486660,2021-12-14,8:17,,344 BAYCHESTER AVENUE,,BRONX,,10475,40.868160,-73.831480,2,0,0,0,0,0,2,0,Unspecified
8,4487074,2021-12-14,21:10,,2047 PITKIN AVENUE,,BROOKLYN,,11207,40.671720,-73.897100,0,0,0,0,0,0,0,0,Driver Inexperience
15,4486604,2021-12-14,17:58,,480 DEAN STREET,,BROOKLYN,,11217,40.681580,-73.974630,0,0,0,0,0,0,0,0,Passing Too Closely
16,4486991,2021-12-14,20:03,,878 FLATBUSH AVENUE,,BROOKLYN,,11226,40.650680,-73.958810,4,0,0,0,0,0,4,0,Steering Failure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
984,4407358,2021-04-14,19:01,,2820 SNYDER AVENUE,,BROOKLYN,,11226,40.648840,-73.951020,0,0,0,0,0,0,0,0,Driver Inattention/Distraction
986,4407695,2021-04-15,10:13,,525 EAST 72 STREET,,MANHATTAN,,10021,40.766586,-73.953100,0,0,0,0,0,0,0,0,Passing Too Closely
988,4408300,2021-04-10,22:06,,1055 UNIVERSITY AVENUE,,BRONX,,10452,40.834675,-73.930275,4,0,0,0,0,0,4,0,Following Too Closely
991,4407494,2021-04-14,12:00,,33 FERNDALE AVENUE,,,,,40.593002,-74.159000,0,0,0,0,0,0,0,0,Passing Too Closely


In [5]:
def get_missing_geo_data(address):
    base_url = "https://api.geoapify.com/v1/geocode/search?"
    params = {
        "text": address + ", New York City, NY, USA",
        "apiKey": geo_apikey,
        "limit": 1,
        "format": "json",
        "lang": "en"
    }

    response = requests.get(base_url, params=params).json()
    return response 

In [6]:
def extract_data_from_response(response):
    if 'results' in response and len(response['results']) > 0:
        lat = response['results'][0].get('lat', None)
        lon = response['results'][0].get('lon', None)
        county = response['results'][0].get('county', None)
        zip_code = response['results'][0].get('postcode', None)
        return lat, lon, county, zip_code
    return None, None, None, None

In [7]:
for idx, row in off_street_missing_data.iterrows():
    response = get_missing_geo_data(row['off_street_name'])
    lat, lon, county, zip_code = extract_data_from_response(response)
    
    if pd.isna(row['latitude']):
        collision_df.at[idx, 'latitude'] = lat
    if pd.isna(row['longitude']):
        collision_df.at[idx, 'longitude'] = lon
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county

In [8]:
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,,,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,,,,,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,,,,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208.0,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233.0,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [9]:
def get_intersection_geo_data(on_street, cross_street):
    intersection_address = f"{on_street} & {cross_street}, New York City, NY, USA"
    
    base_url = "https://api.geoapify.com/v1/geocode/search"
    params = {
        "text": intersection_address,
        "apiKey": geo_apikey,
        "limit": 1,
        "format": "json",
        "lang": "en"
    }

    response = requests.get(base_url, params=params).json()
    return response

In [10]:
intersection_missing_df = collision_df[(collision_df['on_street_name'].notna() & collision_df['cross_street_name'].notna())]
intersection_missing_df

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,,,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,
9,4486519,2021-12-14,14:58,3 AVENUE,,EAST 43 STREET,MANHATTAN,,10017,40.751440,-73.973970,0,0,0,0,0,0,0,0,Passing Too Closely
11,4487127,2021-12-14,16:50,SPRINGFIELD BOULEVARD,,EAST GATE PLAZA,QUEENS,,11413,40.675884,-73.755770,0,0,0,0,0,0,0,0,Turning Improperly
12,4486634,2021-12-14,8:30,broadway,,west 80 street -west 81 street,,,,,,0,0,0,0,0,0,0,0,Unsafe Lane Changing
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
989,4407768,2021-04-13,8:04,EAST 13 STREET,,AVENUE T,BROOKLYN,,11229,40.600883,-73.958694,1,0,0,0,0,0,0,0,Driver Inattention/Distraction
990,4407680,2021-04-15,23:06,MARION AVENUE,,EAST 193 STREET,BRONX,,10458,40.863647,-73.891800,1,0,0,0,0,0,1,0,Traffic Control Disregarded
992,4408304,2021-04-10,4:45,THROOP AVENUE,,GREENE AVENUE,BROOKLYN,,11221,40.689426,-73.942230,0,0,0,0,0,0,0,0,Traffic Control Disregarded
995,4407740,2021-04-14,12:47,HENDRIX STREET,,ATLANTIC AVENUE,BROOKLYN,,11207,40.676594,-73.890380,2,0,0,0,0,0,2,0,Turning Improperly


In [11]:
for idx, row in intersection_missing_df.iterrows():
    response = get_intersection_geo_data(row['on_street_name'], row['cross_street_name'])
    lat, lon, county, zip_code = extract_data_from_response(response)
    
    if pd.isna(row['latitude']):
        collision_df.at[idx, 'latitude'] = lat
    if pd.isna(row['longitude']):
        collision_df.at[idx, 'longitude'] = lon
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county

In [12]:
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,,,,,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,,,,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208.0,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233.0,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [13]:
on_street_missing_df = collision_df[(collision_df['on_street_name'].notna() & collision_df['cross_street_name'].isna())]
on_street_missing_df

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,,,,,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,,,,0,0,0,0,0,0,0,0,Following Too Closely
5,4407458,2021-04-14,12:47,MAJOR DEEGAN EXPRESSWAY RAMP,,,,,,,,0,0,0,0,0,0,0,0,Unspecified
6,4486555,2021-12-14,17:05,BROOKLYN QUEENS EXPRESSWAY,,,,,,40.709183,-73.956825,0,0,0,0,0,0,0,0,Passing Too Closely
10,4486934,2021-12-13,0:34,MYRTLE AVENUE,,,,,,40.701275,-73.888870,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
979,4407974,2021-04-10,0:00,WEST 64 STREET,,,,,,,,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper
993,4407827,2021-04-14,17:30,WEST 34 STREET,,,,,,40.754580,-73.999150,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper
994,4407399,2021-04-14,22:40,LONG ISLAND EXPRESSWAY,,,,,,40.741394,-73.823030,2,0,0,0,0,0,2,0,Unspecified
996,4408392,2021-04-16,14:30,EAST 64 STREET,,,,,,40.764680,-73.964300,0,0,0,0,0,0,0,0,Backing Unsafely


In [14]:
for idx, row in on_street_missing_df.iterrows():
    response = get_missing_geo_data(row['on_street_name'])
    lat, lon, county, zip_code = extract_data_from_response(response)
    
    if pd.isna(row['latitude']):
        collision_df.at[idx, 'latitude'] = lat
    if pd.isna(row['longitude']):
        collision_df.at[idx, 'longitude'] = lon
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county

In [15]:
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,Queens County,,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,10465.0,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208.0,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233.0,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [16]:
collision_df[collision_df['county'].isna() & collision_df['zip_code'].isna()]

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
12,4486634,2021-12-14,8:30,broadway,,west 80 street -west 81 street,,,,40.712728,-74.006015,0,0,0,0,0,0,0,0,Unsafe Lane Changing
17,4486284,2021-12-14,1:28,MEEKER AVENUE,,LORIMER STREET,,,,40.712728,-74.006015,3,0,0,0,0,0,3,0,Traffic Control Disregarded
61,4521902,2022-04-24,10:27,VANWYCK EXPRESSWAY,,109 AVENUE,,,,40.712728,-74.006015,1,0,0,0,0,0,1,0,Traffic Control Disregarded
107,4514263,2022-03-26,9:30,BRUCKNER BOULEVARD,,EAST 137 STREET,,,,40.804153,-73.91304,0,0,0,0,0,0,0,0,Unspecified
167,4514419,2022-03-25,14:00,LEONARD AVENUE,,FISKE AVENUE,,,,40.61763,-74.133575,0,0,0,0,0,0,0,0,Driver Inattention/Distraction
247,4456550,2021-09-06,10:34,21 STREET,,33 ROAD,,,,40.764103,-73.93277,1,0,0,0,0,0,1,0,Reaction to Uninvolved Vehicle
292,4455860,2021-09-11,14:00,72 STREET,,41 AVENUE,,,,40.74498,-73.89325,0,0,0,0,0,0,0,0,Unspecified
293,4456782,2021-08-29,0:12,VICTORY BOULEVARD,,HARVEY AVENUE,,,,40.6117,-74.13918,1,0,0,0,0,0,1,0,Driver Inattention/Distraction
338,4456582,2021-07-08,11:50,BRADLEY AVENUE,,NORTH GANNON AVENUE,,,,40.60855,-74.13216,2,0,0,0,0,0,2,0,Driver Inattention/Distraction


In [34]:
county_missing_given_latlon_df = collision_df[(collision_df['borough'].isna() | collision_df['county'].isna()) & collision_df['latitude'].notna() & collision_df['longitude'].notna()]
county_missing_given_latlon_df

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,,,,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,,Queens County,,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,10465,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208,40.667202,-73.866500,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,4407827,2021-04-14,17:30,WEST 34 STREET,,,,New York County,10011,40.754580,-73.999150,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper
994,4407399,2021-04-14,22:40,LONG ISLAND EXPRESSWAY,,,,Queens County,,40.741394,-73.823030,2,0,0,0,0,0,2,0,Unspecified
996,4408392,2021-04-16,14:30,EAST 64 STREET,,,,New York County,,40.764680,-73.964300,0,0,0,0,0,0,0,0,Backing Unsafely
998,4407655,2021-04-14,6:55,BROOKLYN QUEENS EXPRESSWAY,,,,,11104,40.698544,-73.962360,0,0,0,0,0,0,0,0,Following Too Closely


In [21]:
def get_borough_county(lat, lon):
    base_url = f"https://maps.googleapis.com/maps/api/geocode/json?latlng={lat},{lon}&key={google_apikey}"
    response = requests.get(base_url).json()
    return response

In [47]:
def extract_borough_county_from_response(response):
    borough, county, zip_code = None, None, None
    
    if 'results' in response:
        for result in response['results']:
            for component in result['address_components']:
                if not county and 'administrative_area_level_2' in component['types']:
                    county = component['long_name']
                if not borough and 'sublocality_level_1' in component['types']:
                    borough = component['short_name']
                if not zip_code and 'postal_code' in component['types']:
                    zip_code = component['long_name']
            
            # Check if we've found all the values we're looking for
            if borough and county and zip_code:
                break
                
    return borough, county, zip_code


In [48]:
for idx, row in county_missing_given_latlon_df.iterrows():
    response = get_borough_county(row['latitude'], row['longitude'])
    borough, county, zip_code = extract_borough_county_from_response(response)
    
    if pd.isna(row['borough']):
        collision_df.at[idx, 'borough'] = borough
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code

In [49]:
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,Brooklyn,New York County,10007,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,Queens,Queens County,11101,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,Bronx,Bronx County,10465,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,Kings County,11208,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,Kings County,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [54]:
def get_latlon_borough_county(address):
    base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'
    params = {
        'address': address + ", New York City, NY, USA",
        'key': google_apikey
    }
    response = requests.get(base_url, params=params).json()
    return response

In [53]:
def extract_latlon_borough_from_response(response):
    lat, lon, borough, county, zip_code = None, None, None, None, None
    
    if 'results' in response:
        for result in response['results']:
            lat = result['geometry']['location']['lat']
            lon = result['geometry']['location']['lng']
            for component in result['address_components']:
                if not county and 'administrative_area_level_2' in component['types']:
                    county = component['long_name']
                if not borough and 'sublocality_level_1' in component['types']:
                    borough = component['short_name']
                if not zip_code and 'postal_code' in component['types']:
                    zip_code = component['long_name']
            
            # Check if we've found all the values we're looking for
            if borough and county and zip_code:
                break
                
    return lat, lon, borough, county, zip_code

In [59]:
collision_df.replace({None: np.nan, '': np.nan}, inplace=True)
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,WHITESTONE EXPRESSWAY,,20 AVENUE,Queens,Queens County,11356,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,QUEENSBORO BRIDGE UPPER,,,Manhattan,Queens County,10044,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,10465,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,SARATOGA AVENUE,,DECATUR STREET,BROOKLYN,Kings County,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [63]:
county_missing_given_latlon_df = collision_df[(collision_df['borough'].isna() | collision_df['county'].isna()) & collision_df['latitude'].notna() & collision_df['longitude'].notna()]
county_missing_given_latlon_df

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
2,4541903,2022-06-29,6:55,THROGS NECK BRIDGE,,,,,10465.0,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 LORING AVENUE,,BROOKLYN,,11208.0,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
5,4407458,2021-04-14,12:47,MAJOR DEEGAN EXPRESSWAY RAMP,,,,Bronx County,,40.81166,-73.931423,0,0,0,0,0,0,0,0,Unspecified
6,4486555,2021-12-14,17:05,BROOKLYN QUEENS EXPRESSWAY,,,,,11104.0,40.709183,-73.956825,0,0,0,0,0,0,0,0,Passing Too Closely
7,4486660,2021-12-14,8:17,,344 BAYCHESTER AVENUE,,BRONX,,10475.0,40.86816,-73.83148,2,0,0,0,0,0,2,0,Unspecified
8,4487074,2021-12-14,21:10,,2047 PITKIN AVENUE,,BROOKLYN,,11207.0,40.67172,-73.8971,0,0,0,0,0,0,0,0,Driver Inexperience
10,4486934,2021-12-13,0:34,MYRTLE AVENUE,,,,,11237.0,40.701275,-73.88887,0,0,0,0,0,0,0,0,Passing or Lane Usage Improper
13,4486564,2021-12-14,0:59,BELT PARKWAY,,,,,11229.0,40.59662,-74.00231,0,0,0,0,0,0,0,0,Unsafe Speed
15,4486604,2021-12-14,17:58,,480 DEAN STREET,,BROOKLYN,,11217.0,40.68158,-73.97463,0,0,0,0,0,0,0,0,Passing Too Closely
16,4486991,2021-12-14,20:03,,878 FLATBUSH AVENUE,,BROOKLYN,,11226.0,40.65068,-73.95881,4,0,0,0,0,0,4,0,Steering Failure


In [64]:
for idx, row in county_missing_given_latlon_df.iterrows():
    if row['on_street_name'] and row['cross_street_name']:
        address = f"{row['on_street_name']} & {row['cross_street_name']}"
    elif row['on_street_name']:
        address = row['on_street_name']
    elif row['off_street_name']:
        address = row['off_street_name']
    else:
        address = row['cross_street_name']
    
    response = get_latlon_borough_county(address)
    lat, lon, borough, county, zip_code = extract_latlon_borough_from_response(response)

    if pd.isna(row['latitude']) or row['latitude'] == 0:
        collision_df.at[idx, 'latitude'] = lat
    if pd.isna(row['longitude']) or row['longitude'] == 0:
        collision_df.at[idx, 'longitude'] = lon
    if pd.isna(row['borough']):
        collision_df.at[idx, 'borough'] = borough
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code

In [69]:
collision_df.replace({None: np.nan, '': np.nan}, inplace=True)
missing_data_df = collision_df[collision_df['county'].isna() & collision_df['borough'].isna()]

In [70]:
for idx, row in missing_data_df.iterrows():
    response = get_borough_county(row['latitude'], row['longitude'])
    borough, county, zip_code = extract_borough_county_from_response(response)
    
    if pd.isna(row['borough']):
        collision_df.at[idx, 'borough'] = borough
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code

In [75]:
collision_df.replace({None: np.nan, '': np.nan}, inplace=True)
missing_data_df = collision_df[collision_df['county'].isna() | collision_df['borough'].isna()]

In [76]:
for idx, row in missing_data_df.iterrows():
    response = get_borough_county(row['latitude'], row['longitude'])
    borough, county, zip_code = extract_borough_county_from_response(response)
    
    if pd.isna(row['borough']):
        collision_df.at[idx, 'borough'] = borough
    if pd.isna(row['county']):
        collision_df.at[idx, 'county'] = county
    if pd.isna(row['zip_code']):
        collision_df.at[idx, 'zip_code'] = zip_code

In [82]:
collision_df['borough'] = collision_df['borough'].str.title()
collision_df['county'] = collision_df['county'].str.title()
collision_df['zip_code'] = collision_df['zip_code'].str.strip()
collision_df['on_street_name'] = collision_df['on_street_name'].str.title()
collision_df['off_street_name'] = collision_df['off_street_name'].str.title()
collision_df['cross_street_name'] = collision_df['cross_street_name'].str.title()
collision_df['contributing_factor_vehicle'] = collision_df['contributing_factor_vehicle'].str.title()
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,on_street_name,off_street_name,cross_street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,Whitestone Expressway,,20 Avenue,Queens,Queens County,11356,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,Queensboro Bridge Upper,,,Manhattan,Queens County,10044,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,Throgs Neck Bridge,,,Bronx,Bronx County,10465,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,,1211 Loring Avenue,,Brooklyn,Kings County,11208,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,Saratoga Avenue,,Decatur Street,Brooklyn,Kings County,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [90]:
def get_street_name(row):
    if pd.notna(row['on_street_name']) and pd.notna(row['cross_street_name']):
        return f"{row['on_street_name']} & {row['cross_street_name']}"
    elif pd.notna(row['on_street_name']):
        return row['on_street_name']
    elif pd.notna(row['off_street_name']):
        return row['off_street_name']
    else:
        return row['cross_street_name']

In [91]:
collision_df['street_name'] = collision_df.apply(get_street_name, axis=1)

In [94]:
collision_df = collision_df[['collision_id', 'crash_date', 'crash_time', 
                             'street_name', 'borough', 'county', 'zip_code', 'latitude', 'longitude', 
                             'number_of_persons_injured', 'number_of_persons_killed', 'number_of_pedestrians_injured', 'number_of_pedestrians_killed', 
                             'number_of_cyclist_injured', 'number_of_cyclist_killed', 'number_of_motorist_injured', 'number_of_motorist_killed', 'contributing_factor_vehicle']]
collision_df.head()

Unnamed: 0,collision_id,crash_date,crash_time,street_name,borough,county,zip_code,latitude,longitude,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,contributing_factor_vehicle
0,4455765,2021-09-11,2:39,Whitestone Expressway & 20 Avenue,Queens,Queens County,11356,40.712728,-74.006015,2,0,0,0,0,0,2,0,Aggressive Driving/Road Rage
1,4513547,2022-03-26,11:45,Queensboro Bridge Upper,Manhattan,Queens County,10044,40.746402,-73.940191,1,0,0,0,0,0,1,0,Pavement Slippery
2,4541903,2022-06-29,6:55,Throgs Neck Bridge,Bronx,Bronx County,10465,40.816416,-73.798616,0,0,0,0,0,0,0,0,Following Too Closely
3,4456314,2021-09-11,9:35,1211 Loring Avenue,Brooklyn,Kings County,11208,40.667202,-73.8665,0,0,0,0,0,0,0,0,Unspecified
4,4486609,2021-12-14,8:13,Saratoga Avenue & Decatur Street,Brooklyn,Kings County,11233,40.683304,-73.917274,0,0,0,0,0,0,0,0,


In [97]:
collision_df.to_csv("../Resources/collision_geo_data_clean.csv", index=False)