In [2]:
import numpy as np
import pandas as pd
import geopy as geo
from shapely.geometry import shape, Point
import json

In [3]:
dataset_path = 'resources/food-inspections.csv'
dataset = pd.read_csv(dataset_path, sep=',').drop(['Zip Codes', 'Community Areas', 'Census Tracts', 'Wards', 'Historical Wards 2003-2015'], axis=1)

# 1) Cleaning the city and keeping only Chicago's shops

### 1. Separating the data

In [4]:
with_location = dataset[dataset['Location'].astype('str') != "nan"]
missing_location = dataset[dataset['Location'].astype('str') == "nan"]
print('Shops with coordinates: {}'.format(with_location.shape[0]))
print('Shops without coordinates: {}'.format(missing_location.shape[0]))

Shops with coordinates: 194221
Shops without coordinates: 683


### 2. Cleaning with coordinates

In [5]:
# Only keeping shops with address in Chicago
# Warning, takes about 10mins!

with open('resources/chicago-city.geojson') as f0:
    chicago_border = json.load(f0)

chicago_shape = shape(chicago_border['features'][0]['geometry'])

def is_in_chicago(lng_lat):
    point = Point(lng_lat[0], lng_lat[1])    
    return chicago_shape.contains(point)

with_chicago_location = with_location[with_location[['Longitude', 'Latitude']].apply(is_in_chicago, axis=1)]
print('Shops with coordinates in Chicago: {}'.format(with_chicago_location.shape[0]))

Shops with coordinates in Chicago: 191329


There is no missing address, city or state in missing_location_dataset.  
Thus we can try to find their coordinates with the help of [GeoPy](https://geopy.readthedocs.io/en/latest/#). 


### 3. Cleaning with address

In [6]:
# First we will keep only the shops in IL
illinois_shops = missing_location[missing_location.State == 'IL']
print("Shops with missing location in IL: {}".format(illinois_shops.shape[0]))

illinois_shops_with_city = illinois_shops[illinois_shops['City'].astype('str') != "nan"]
print("Shops with missing location in IL with a city name: {}".format(illinois_shops_with_city.shape[0]))

# Then we rename the cities misspelling Chicago and add chicago to others
def rename_if_chicago(city_name):
    n_city = city_name.lower()
    if 'cago' in n_city:
        return 'chicago'
    return 'chicago ' + n_city

illinois_shops_with_city['City'] = illinois_shops_with_city['City'].map(rename_if_chicago)

Shops with missing location in IL: 680
Shops with missing location in IL with a city name: 677


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [7]:
# Getting the coordinates from some maps provider (here Bing)
geo.geocoders.options.default_user_agent = 'my_app/1'
geo.geocoders.options.default_timeout = 20
geolocator = geo.geocoders.Bing(api_key='AqZbhCYki5TkYlGbyLBfTJR5wfwpLJ22S-D3p_DRJxfDIyYY4wTRsq6gxPCdAXSV')

def get_location(full_address):
    address = ' '.join(full_address)
    loc = geolocator.geocode(address)
    return loc.longitude, loc.latitude

new_locations = illinois_shops_with_city[['Address','City','State']].apply(get_location, axis=1)

Found 8750 W BRYN WAWR AVE  chicago IL in -87.844333 41.983042.7931166454745273380483

In [53]:
pos = pd.DataFrame(new_locations.copy(), columns=['position'])
pos.position = pos.position.astype(str)
pos.position = pos["position"].str.strip('()')
coords = pos["position"].str.split(", ", n = 1, expand = True)

illinois_shops_with_city['Longitude'] = coords[0].astype(float)
illinois_shops_with_city['Latitude'] = coords[1].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [55]:
# We only keep those in Chicago borders
chicago_shops_without_location = illinois_shops_with_city[illinois_shops_with_city[['Longitude', 'Latitude']].apply(is_in_chicago, axis=1)]
print("Shops with missing location in Chicago: {}".format(chicago_shops_without_location.shape[0]))

Shops with missing location in Chicago: 498


In [59]:
# Merging the result
chicago_shops = pd.concat([with_chicago_location, chicago_shops_without_location])
print("We have {} shops within Chicago's borders!".format(chicago_shops.shape[0]))

We have 191827 shops within Chicago's borders!


### Checkpoint, saving the data

In [67]:
# Dropping useless columns and saving the data
chicago_shops.drop(columns=['Location','State','City']).to_parquet('resources/food-inspections-part1.parquet')

# 2) Assign matching types to columns

In [119]:
dataset = pd.read_parquet('resources/food-inspections-part1.parquet')
dataset['Inspection Date'] = pd.to_datetime(dataset['Inspection Date'])
dataset['Zip'] = dataset['Zip'].fillna(0).astype(int)
dataset['License #'] = dataset['License #'].fillna(0).astype(int)
print("New type:\n")
dataset.dtypes

New type:



Inspection ID               int64
DBA Name                   object
AKA Name                   object
License #                   int64
Facility Type              object
Risk                       object
Address                    object
Zip                         int64
Inspection Date    datetime64[ns]
Inspection Type            object
Results                    object
Violations                 object
Latitude                  float64
Longitude                 float64
dtype: object

### Checkpoint, saving the data

In [122]:
# Dropping useless columns and saving the data
dataset.to_parquet('resources/food-inspections-part2.parquet')

In [123]:
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,Risk 1 (High),4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint Re-Inspection,Pass,,41.965719,-87.708538
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,Risk 1 (High),2114 W LAWRENCE AVE,60625,2019-10-25,License Re-Inspection,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,Risk 1 (High),3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,Risk 1 (High),2020 W CHICAGO,60622,2019-10-24,License Re-Inspection,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938
4,2320384,ARBOR,ARBOR,2363029,Restaurant,Risk 1 (High),2545 W DIVERSEY AVE,60647,2019-10-24,Recent Inspection,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169


# 3) Rewriting the Risk column

In [39]:
# We load the last checkpoint
dataset = pd.read_parquet('resources/food-inspections-part2.parquet')

In [40]:
# First, let's see what are the possible risks
np.array(dataset.Risk.unique())

array(['Risk 1 (High)', 'Risk 2 (Medium)', 'Risk 3 (Low)', 'All', None],
      dtype=object)

In [41]:
dictionnary = {'Risk 1 (High)': 'High', 'Risk 2 (Medium)': 'Medium', 'Risk 3 (Low)': 'Low', 'All': 'All', None: None}
dataset.Risk = dataset.Risk.map(dictionnary)

In [55]:
# Now it's a bit clearer and easier to search
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False


### Checkpoint, saving the data

In [44]:
dataset.to_parquet('resources/food-inspections-part3.parquet')

# 4) Exploding the inspection

In [204]:
# We load the last checkpoint
dataset = pd.read_parquet('resources/food-inspections-part3.parquet')

In [205]:
# There is a lot to unfold here.
# First we will put all text to lowercase
dataset['Inspection Type'] = dataset['Inspection Type'].str.lower()

In [206]:
# Then we will filter all the inpection labels to find which type they correspond to

def substring_in_word(word, substrings):
    for s in substrings:
        if s.lower() in word.lower():
            return True
    return False

def find_inspection_type(inspection):
    if inspection == None:
        return 'Unknown', False
    
    lc_name = inspection.lower()
    i_type = 'Others'
    i_repeat = False
    
    if substring_in_word(lc_name, ['changed', 'closed', 'out of', 'cancelled', 'suspended', 'not ready', 'not located', 'wrong', 'non-inspection']):
        i_type = 'Cancelled'
    elif substring_in_word(lc_name, ['task', 'tavern', 'liquor']):
        i_type = 'Task-force'
    elif substring_in_word(lc_name, ['consultation']):
        i_type = 'Consultation'
    elif substring_in_word(lc_name, ['license']):
        i_type = 'License'
    elif substring_in_word(lc_name, ['complain']):
        i_type = 'Complaint'
    elif substring_in_word(lc_name, ['food poisoning', 'got sick']):
        i_type = 'Suspect food poisoning'
    elif substring_in_word(lc_name, ['canvas']):
        i_type = 'Canvass'
    
    if substring_in_word(lc_name, ['reinspection', 're-inspection', 're inspection']):
        i_repeat = True
    
    return i_type, i_repeat

In [207]:
dataset['Inspection Type'] = dataset['Inspection Type'].map(find_inspection_type)
dataset['Inspection Type'].value_counts()

(Canvass, False)                   100957
(License, False)                    25428
(Complaint, False)                  24901
(Canvass, True)                     20133
(License, True)                      8881
(Complaint, True)                    7499
(Others, False)                      1073
(Task-force, False)                   921
(Suspect food poisoning, False)       828
(Consultation, False)                 681
(Cancelled, False)                    334
(Suspect food poisoning, True)        185
(Others, True)                          5
(Unknown, False)                        1
Name: Inspection Type, dtype: int64

In [208]:
# Re-assign the inspection type values and add a re-inspection column
dataset['Re-inspection'] = dataset['Inspection Type'].map(lambda x: x[1])
dataset['Inspection Type'] = dataset['Inspection Type'].map(lambda x: x[0])

In [209]:
# Now the inspection data is easier to use!
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False


### Checkpoint, saving the data

In [210]:
dataset.to_parquet('resources/food-inspections-part4.parquet')

# 5) Facility types, here we are!

In [221]:
# We load the last checkpoint
dataset = pd.read_parquet('resources/food-inspections-part4.parquet')
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False


In [222]:
print("Number of facility types: {}".format(dataset['Facility Type'].str.lower().unique().size))

Number of facility types: 432


They are more than 400 hundreds facility types which will seriously complicate our data analysis.  
Thus, we will only label facility types explicitely described in the [Chicago Food Inspection description](https://data.cityofchicago.org/api/assets/BAD5301B-681A-4202-9D25-51B2CAE672FF):  
* bakery  
* banquet hall  
* candy store  
* caterer  
* coffee shop  
* day care center (< 2 yo)  
* day care center (2-6 yo)  
* day care center (0-6 yo)  
* gas station  
* Golden Dinner  
* grocery store  
* hospital 
* long term care center(nursing home)  
* liquor store  
* mobile food dispenser  
* restaurant  
* paleteria  
* school  
* shelter  
* tavern  
* social club  
* wholesaler  
* Wrigley Field Rooftop

In [223]:
def filter_none(x):
    return x != None and '/' in x

print("Shops with 2 types: {}".format(dataset[dataset['Facility Type'].apply(filter_none)].size))
print("Shops with 1 type: {}".format(dataset.size))

Shops with 2 types: 9360
Shops with 1 type: 2877405


Some shops are hybrids types where the types are separated by a slash.  
As they represent a very small part of a dataset, we will only consider the first category they fit, or the second if the first is uncategorizable.

In [224]:
def find_facility_type(facility):
    if facility == None:
        return 'Unknown'
    
    split = facility.lower().split('/')
    lc_name = split[0]
    f_types = []
    
    if 'hospital' in lc_name:
        return 'Hospital'
        
    elif substring_in_word(lc_name, ['banquet', 'hall']):
        return 'Banquet hall'
        
    elif substring_in_word(lc_name, ['ice cream', 'paleteria', 'gelato', 'candy']):
        return 'Sweetshop'
        
    elif substring_in_word(lc_name, ['cater', 'butcher', 'deli']):
        return 'Caterer'
        
    elif substring_in_word(lc_name, ['cafe', 'coffee', 'kiosk']):
        return 'Coffee shop'
                           
    elif substring_in_word(lc_name, ['dispenser', 'mobile', 'truck', 'cart']):
        return 'Mobile food dispenser'
                                                     
    elif 'gas' in lc_name:
        return 'Gas station'
                           
    elif 'golden' in lc_name:
        return 'Golden Dinner'
        
    elif substring_in_word(lc_name, ['liquor', 'liqour', 'wine']):
        return 'Liquor store'
        
    elif substring_in_word(lc_name, ['tavern', 'bar', 'brewery', 'brewpub']):
        return 'Tavern'
                           
    elif substring_in_word(lc_name, ['grocery', 'market', 'convenience', 'store']):
        return 'Grocery store'
                           
    elif substring_in_word(lc_name, ['pantry', 'bakery']):
        return 'Bakery'
        
    elif substring_in_word(lc_name, ['wholesale', 'slaughter', 'live', 'warehouse']):
        return 'Wholesaler'
        
    elif substring_in_word(lc_name, ['school', 'shcool', 'college']):
        return 'School'
        
    elif substring_in_word(lc_name, ['services facility', 'service facility']):
        return 'Childrens services facility'
        
    elif substring_in_word(lc_name, ['roof']):
        return 'Rooftop'
        
    elif substring_in_word(lc_name, ['shared']):
        return 'Shared kitchen'
        
    elif substring_in_word(lc_name, ['nursing', 'care center', 'senior', 'long term care', 'assisted', 'supportive']):
        return 'Long term care center'

    elif 'restaurant' in lc_name:
        return 'Restaurant'
    
    elif 'shelter' in lc_name:
        return 'Shelter'
        
    elif substring_in_word(lc_name, ['(Under 2 Years)', 'Daycare (2 Years)']):
        return 'Daycare (0-2 yo)'

    elif substring_in_word(lc_name, ['combo', 'Above and Under 2 Years', '6 WKS-5YRS']):
        return 'Daycare (0-6 yo)'
            
    elif substring_in_word(lc_name, ['2-6', '(2 - 6 Years)', 'DAY CARE ABOVE 2 YEARS']):
        return 'Daycare (2-6 yo)'
    
    elif substring_in_word(lc_name, ['daycare', 'day care']):
        return 'Daycare (others)'
    
    elif len(split) > 1:
        return find_facility_type(split[1])
    
    else:
        return 'Others'
    
    return f_type

In [225]:
dataset['Facility Type'] = dataset['Facility Type'].map(find_facility_type)
print('We have now {} distinct categories!'.format(dataset['Facility Type'].unique().size))

We have now 26 distinct categories!


### Checkpoint, saving the data

In [226]:
dataset.to_parquet('resources/food-inspections-part5.parquet')

In [227]:
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False


# 6) Adding the neighborhoods and community areas

In [3]:
# We load the last checkpoint
dataset = pd.read_parquet('resources/food-inspections-part5.parquet')
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False


In [49]:
import json
from shapely.geometry import shape, Point

# load GeoJSON file containing sectors
with open('resources/chicago-community.geojson') as f:
    communities = json.load(f)
    
with open('resources/chicago-neighborhoods.geojson') as f:
    neighborhoods = json.load(f)
    
with open('resources/chicago-wards.geojson') as f:
    wards = json.load(f)
    
with open('resources/chicago-precincts.geojson') as f:
    precincts = json.load(f)
    
def find_community(lng_lat):
    point = Point(lng_lat[0], lng_lat[1])
    for feature in communities['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['community']
    return 'Not found'

def find_neighborhood(lng_lat):
    point = Point(lng_lat[0], lng_lat[1])
    for feature in neighborhoods['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['sec_neigh']
    return 'Not found'

def find_ward(lng_lat):
    point = Point(lng_lat[0], lng_lat[1])
    for feature in wards['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['ward']
    return 'Not found'

def find_precinct(lng_lat):
    point = Point(lng_lat[0], lng_lat[1])
    for feature in precincts['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['precinct']
    return 'Not found'

In [29]:
dataset['Community'] = dataset[['Longitude', 'Latitude']].apply(find_community, axis=1)

In [31]:
dataset['Neighborhood'] = dataset[['Longitude', 'Latitude']].apply(find_neighborhood, axis=1)

In [47]:
dataset['Ward'] = dataset[['Longitude', 'Latitude']].apply(find_ward, axis=1)

In [50]:
dataset['Precinct'] = dataset[['Longitude', 'Latitude']].apply(find_precinct, axis=1)

### Checkpoint, saving the data

In [52]:
dataset.to_parquet('resources/food-inspections-part6.parquet')

In [53]:
dataset.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Re-inspection,Community,Neighborhood,Ward,Precinct
0,2320519,SALAM RESTAURANT,SALAM RESTAURANT,2002822,Restaurant,High,4634-4636 N KEDZIE AVE,60625,2019-10-25,Complaint,Pass,,41.965719,-87.708538,True,ALBANY PARK,"NORTH PARK,ALBANY PARK",33,27
1,2320509,TAQUERIA EL DORADO,TAQUERIA EL DORADO,2694960,Restaurant,High,2114 W LAWRENCE AVE,60625,2019-10-25,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.96882,-87.682292,True,LINCOLN SQUARE,LINCOLN SQUARE,47,38
2,2320412,"DANTE'S PIZZA,INC.",DANTE'S PIZZA,2092884,Restaurant,High,3028 W ARMITAGE AVE,60647,2019-10-24,Canvass,Fail,"38. INSECTS, RODENTS, & ANIMALS NOT PRESENT - ...",41.917539,-87.703728,False,LOGAN SQUARE,LOGAN SQUARE,32,22
3,2320430,LAO PENG YOU LLC,LAO PENG YOU,2694477,Restaurant,High,2020 W CHICAGO,60622,2019-10-24,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",41.896005,-87.677938,True,WEST TOWN,UKRAINIAN VILLAGE AND EAST VILLAGE,2,21
4,2320384,ARBOR,ARBOR,2363029,Restaurant,High,2545 W DIVERSEY AVE,60647,2019-10-24,Others,Pass w/ Conditions,14. REQUIRED RECORDS AVAILABLE: SHELLSTOCK TAG...,41.932025,-87.692169,False,LOGAN SQUARE,LOGAN SQUARE,32,1
