In [1]:
import numpy as np
import pandas as pd
from shapely.geometry import shape, Point
import json

In [2]:
# Load the two GeoJSON boundary files. GeoJSON is UTF-8 encoded JSON, so the
# encoding is pinned explicitly instead of relying on the platform's locale
# default (which differs between operating systems).
with open('resources/chicago-community.geojson', encoding='utf-8') as f1:
    communities = json.load(f1)

with open('resources/chicago-neighborhoods.geojson', encoding='utf-8') as f2:
    neighborhoods = json.load(f2)

In [44]:
# Read the food-inspection records and preview the first two rows.
dataset_path = 'resources/food-inspections.csv'
dataset = pd.read_csv(dataset_path)  # comma is already read_csv's default separator
dataset.head(2)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Results,Violations,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822.0,Restaurant,Risk 1 (High),4634-4636 N KEDZIE AVE,CHICAGO,IL,60625.0,...,Pass,,41.965719,-87.708538,"{'longitude': '41.965719017423005', 'latitude'...",,,,,
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960.0,Restaurant,Risk 1 (High),2114 W LAWRENCE AVE,CHICAGO,IL,60625.0,...,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,"{'longitude': '41.968819723568394', 'latitude'...",,,,,


### Cleaning the State column

In [57]:
# Split the rows by their State value: explicitly 'IL' versus missing.
# Rows with any other (non-null) State end up in neither subset.
state_is_il = dataset['State'].eq('IL')
state_is_missing = dataset['State'].isna()

sure_illinois = dataset[state_is_il]
nan_state = dataset[state_is_missing]

print("Confirmed illinois shops: {}".format(len(sure_illinois)))
print("Unsure shops: {}".format(len(nan_state)))

Confirmed illinois shops: 194859
Unsure shops: 42


In [58]:
# Find all the unsure shops that have coordinates in Chicago and change their State

# Build the community-area polygons once; constructing them inside the
# function would repeat the work for every row that is tested.
community_polygons = [shape(feature['geometry']) for feature in communities['features']]


def isInIL(longitude, latitude):
    """Return True when the point lies inside one of the community polygons.

    Despite the name, the test is "inside a polygon of the Chicago community
    GeoJSON", which is used here as evidence that the shop is in Illinois.

    Parameters
    ----------
    longitude, latitude : float
        Coordinates of the shop, passed to ``Point`` as (x, y).

    Returns
    -------
    bool
        True if any community polygon contains the point, False otherwise
        (including when either coordinate is missing).
    """
    # A shop without coordinates cannot be located; skip the geometry test.
    if pd.isna(longitude) or pd.isna(latitude):
        return False
    point = Point(longitude, latitude)
    return any(polygon.contains(point) for polygon in community_polygons)


# Build the boolean mask from the columns by label. Iterating with zip avoids
# positional integer lookups on a label-indexed row Series (deprecated in
# recent pandas) and still yields a valid, empty mask for an empty frame.
state_in_chicago = pd.Series(
    [isInIL(lon, lat) for lon, lat in zip(nan_state['Longitude'], nan_state['Latitude'])],
    index=nan_state.index,
    dtype=bool,
)

# .assign returns a new frame, so State is set without writing into a slice
# of `dataset` (which would raise SettingWithCopyWarning).
new_state = nan_state[state_in_chicago].assign(State='IL')

In [158]:
# Stack the confirmed-IL rows with the rows recovered from their coordinates,
# then report how many rows of the original dataset were dropped.
dataset_state_cleaned = pd.concat([sure_illinois, new_state])

kept_rows = len(dataset_state_cleaned)
dropped_rows = len(dataset) - kept_rows

print("Shops in IL: {}".format(kept_rows))
print("Lost outliers: {}".format(dropped_rows))

Shops in IL: 194901
Lost outliers: 3


### Cleaning the City column

In [148]:
# City is free text: there are many distinct values and Chicago is often misspelled.
# Any lower-cased name containing the literal substring 'cago' is treated as
# a reference to Chicago; missing names are treated as non-matches.
cities = dataset_state_cleaned['City'].str.lower()
mentions_chicago = cities.str.contains('cago', na=False, regex=False)
chicago_misspells = cities[mentions_chicago]

print("Unique cities: {}".format(len(cities.unique())))
print("References to Chicago among these cities: {}".format(len(chicago_misspells.unique())))

Unique cities: 64
References to Chicago among these cities: 10


In [149]:
# Same split as for State: rows whose lower-cased City is one of the
# Chicago spellings versus everything else (including missing City).
city_is_chicago = dataset_state_cleaned['City'].str.lower().isin(chicago_misspells)

sure_chicago = dataset_state_cleaned[city_is_chicago]
unsure_city = dataset_state_cleaned[~city_is_chicago]

print("Shops in Chicago: {}".format(len(sure_chicago)))
print("Unsure shops: {}".format(len(unsure_city)))

Shops in Chicago: 194567
Unsure shops: 334


In [153]:
def isInChicago(city_name, longitude, latitude):
    """Return True when the coordinates fall inside a Chicago community polygon.

    The geometric test is exactly the one performed by ``isInIL`` (defined in
    the State-cleaning section), so this function delegates to it instead of
    duplicating the polygon loop.

    Parameters
    ----------
    city_name : object
        Recorded city name. Accepted for interface compatibility but not used:
        the decision is based on the coordinates only.
    longitude, latitude : float
        Coordinates of the shop.

    Returns
    -------
    bool
        True if one of the community polygons contains the point.
    """
    return isInIL(longitude, latitude)


# Build the mask from the columns by label. Iterating with zip avoids
# positional integer lookups on a label-indexed row Series (deprecated in
# recent pandas) and still yields a valid, empty mask for an empty frame.
city_in_chicago = pd.Series(
    [
        isInChicago(city, lon, lat)
        for city, lon, lat in zip(
            unsure_city['City'], unsure_city['Longitude'], unsure_city['Latitude']
        )
    ],
    index=unsure_city.index,
    dtype=bool,
)
new_city = unsure_city[city_in_chicago]

In [157]:
# Stack the rows already labelled as Chicago with those located inside the
# city by their coordinates, and overwrite City with one canonical spelling.
dataset_city_cleaned = pd.concat([sure_chicago, new_city]).assign(City='Chicago')

chicago_rows = len(dataset_city_cleaned)
dropped_rows = len(dataset_state_cleaned) - chicago_rows

print("Shops in Chicago: {}".format(chicago_rows))
print("Lost outliers: {}".format(dropped_rows))

Shops in Chicago: 194714
Lost outliers: 187
