Can you integrate additional data (hierarchical GeoHash/Uber H3/Google S2 encoding for spatial data, properties of the road, additional weather conditions, etc.) from external sources with a reasonable effort?

In [1]:
#Imports
import pandas as pd

In [2]:
# Load the crashes CSV into a DataFrame (path is relative to the notebook).
CRASHES_CSV_PATH = '../data/Crashes.csv'
crashes_df = pd.read_csv(CRASHES_CSV_PATH)

### External Integration

In [None]:
# Keep only the location-related columns of the crashes table.
geo_columns = [
    "LOCATION",
    "LONGITUDE",
    "LATITUDE",
    "STREET_NAME",
    "STREET_NO",
    "BEAT_OF_OCCURRENCE",
]
only_geo_data_df = crashes_df[geo_columns]
only_geo_data_df

In [None]:
#Checking for missing values
# Count NaNs per geographic column once, then keep only the columns that
# actually have gaps. The mask is derived from the same Series it filters,
# so no full-dataset rescan and no cross-frame index alignment is needed.
missing_geo_counts = only_geo_data_df.isna().sum()
missing_geo_counts[missing_geo_counts > 0]


In [5]:
#Function to get the API key
def getApiKey(path="api key"):
    """Return the API key stored on the first line of a text file.

    Args:
        path: Location of the key file. Defaults to ``"api key"``, resolved
            against the current working directory, which is the file this
            function read before the parameter existed.

    Returns:
        The first line of the file with leading/trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    # "utf-8-sig" reads plain UTF-8/ASCII unchanged and drops a leading BOM,
    # which some editors add and which would otherwise end up in the key.
    with open(path, encoding="utf-8-sig") as f:
        return f.readline().strip()

In [None]:
#USING https://opencagedata.com/pricing (2500/calls a day for free)
#this is a demo
from opencage.geocoder import OpenCageGeocode

key = getApiKey()
geocoder = OpenCageGeocode(key)

query = 'Bosutska ulica 10, Zagreb, Croatia'

# no need to URI encode query, module does that for you
results = geocoder.geocode(query)

# Guard against an empty result before indexing: results[0] would raise
# IndexError if the geocoder found no match for the query.
if results:
    best_match = results[0]
    # Output: latitude;longitude;country code;timezone name
    print('%f;%f;%s;%s' % (best_match['geometry']['lat'],
                           best_match['geometry']['lng'],
                           best_match['components']['country_code'],
                           best_match['annotations']['timezone']['name']))
else:
    print("No geocoding result for: " + query)

In [None]:
# Select the rows where at least one of the coordinate/location fields is NaN.
location_columns = ['LONGITUDE', 'LATITUDE', 'LOCATION']
has_missing_location = only_geo_data_df[location_columns].isna().any(axis=1)
filtered_geo_data_df = only_geo_data_df[has_missing_location]
filtered_geo_data_df

In [None]:
from opencage.geocoder import OpenCageGeocode

import json
import time
from tqdm import tqdm

# Geocode every row that lacks coordinates, caching results on disk so the
# cell can be interrupted and re-run without repeating API calls.

# Build the client in this cell so it does not depend on the demo cell
# having been executed first.
key = getApiKey()
geocoder = OpenCageGeocode(key)

cachePath = "missing lat lng.json"

try:
    with open(cachePath) as f:
        resultDict = json.load(f)
except FileNotFoundError:
    # First run: nothing cached yet. Any other failure (e.g. a corrupt
    # cache file) is allowed to propagate instead of silently starting
    # from an empty dict and overwriting previously fetched results.
    resultDict = {}
print (resultDict)

# Keys for which the geocoder returned no match during this run. They are
# not written to the cache file, but are remembered so that duplicate rows
# do not trigger the same fruitless API call again.
unresolvedKeys = set()

for index, row in tqdm(filtered_geo_data_df.iterrows(), total=filtered_geo_data_df.shape[0]):
    # The cache key is "<street name> <street number>"; the format is kept
    # as-is so an existing cache file stays valid.
    resultKey = str(row["STREET_NAME"]) + " " + str(row["STREET_NO"])
    query = resultKey + ", Chicago, Illinois"
    print ("Query: " + query)
    if resultKey in resultDict or resultKey in unresolvedKeys:
        continue

    results = geocoder.geocode(query)
    # Pause after every API call (presumably to respect the provider's
    # rate limit -- confirm against the plan in use). Done here, not while
    # the cache file is open.
    time.sleep(1)

    if not results:
        # No match: skip instead of crashing on results[0].
        unresolvedKeys.add(resultKey)
        print ("No result for: " + query)
        continue

    resultDict[resultKey] = {
        "lat": results[0]['geometry']['lat'],
        "lng": results[0]['geometry']['lng'],
    }
    # Persist after every successful lookup so progress survives interruption.
    with open(cachePath, "w") as f:
        json.dump(resultDict, f, indent=4)

In [None]:
# Number of distinct address keys that have a cached geocoding result.
len(resultDict)

We collect the distinct query keys (street name followed by street number) using the following code:

In [None]:
# Build the set of distinct "<street name> <street number>" keys found in
# the rows of filtered_geo_data_df; duplicates collapse automatically.
allKeys = {
    str(record["STREET_NAME"]) + " " + str(record["STREET_NO"])
    for _, record in tqdm(filtered_geo_data_df.iterrows(), total=len(filtered_geo_data_df))
}

len (allKeys)

In conclusion, we can see that only 548 values are missing from the dataset, not 1022 as we previously thought.
