In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import fiona
import geopandas as gpd
import matplotlib.pyplot as plt
from geopy import distance
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
import censusgeocode as cg
from random import uniform
from concurrent.futures import ThreadPoolExecutor
from tqdm.notebook import tqdm
from stargazer.stargazer import Stargazer
from IPython.core.display import HTML

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Reference point as (latitude, longitude); distances to it are what later cells threshold for `is_cbd`.
center = (39.952, -75.164)

In [4]:
# Spot-check the geocoder on the reference point. The call takes longitude first, then latitude.
sample_blocks = cg.coordinates(-75.164, 39.952)['2020 Census Blocks']
sample_blocks[0]

{'SUFFIX': '',
 'GEOID': '421010005002000',
 'CENTLAT': '+39.9525419',
 'BLOCK': '2000',
 'AREAWATER': 0,
 'STATE': '42',
 'BASENAME': '2000',
 'OID': '210701003164220',
 'LSADC': 'BK',
 'FUNCSTAT': 'S',
 'INTPTLAT': '+39.9525419',
 'NAME': 'Block 2000',
 'OBJECTID': 3255873,
 'TRACT': '000500',
 'CENTLON': '-075.1638815',
 'BLKGRP': '2',
 'AREALAND': 44998,
 'INTPTLON': '-075.1638815',
 'MTFCC': 'G5040',
 'LWBLKTYP': 'L',
 'COUNTY': '101',
 'CENT': (-75.1638815, 39.9525419),
 'INTPT': (-75.1638815, 39.9525419)}

In [5]:
# Load the Yelp business and check-in dumps (one JSON record per line).
# The encoding is pinned to UTF-8, the interchange encoding for JSON: without it, open() falls
# back to the platform locale and can mis-decode non-ASCII text on non-UTF-8 systems.
with open("../data/yelp_dataset/yelp_academic_dataset_business21.json", "r", encoding="utf-8") as f:
    yelp = pd.read_json(f, orient="records", lines=True)
with open("../data/yelp_dataset/yelp_academic_dataset_checkin.json", "r", encoding="utf-8") as c:
    checkin = pd.read_json(c, orient="records", lines=True)

In [4]:
# Years for which check-in flags are built: 2011 through 2021 inclusive.
years = range(2011, 2022)


def checkin_year(row, y):
    """Return 1 when the text of ``y`` occurs inside ``row['date']``, otherwise 0.

    ``row`` is anything indexable by ``'date'`` (a DataFrame row in this notebook);
    ``y`` is converted with ``str`` before the substring test.
    """
    return int(str(y) in row['date'])
# One 0/1 column per year: does the check-in text of the record mention that year?
# A vectorised literal substring search replaces the per-row Python callback and yields the
# same values; int64 is requested explicitly so the dtype does not depend on the platform.
for y in years:
    checkin['checkin_' + str(y)] = (
        checkin['date'].str.contains(str(y), regex=False).astype('int64')
    )

# Rename the raw text column so it stays recognisable after the merge.
# NOTE(review): the in-place rename makes this cell non re-runnable without reloading `checkin`.
checkin.rename({'date': 'checkin_date'}, inplace=True, axis=1)
result = pd.merge(yelp, checkin, how='left', on=["business_id"])

# Keep Philadelphia businesses with usable coordinates. The explicit .copy() makes `philly`
# an independent frame, so the column assignments below never target a slice of `result`.
philly = result[result.city == 'Philadelphia'].reset_index(drop=True)
philly = philly[(philly.latitude.notna()) & (philly.longitude.notna())].copy()

center = (39.952, -75.164)
# Distance in km from each business to `center` (both given as latitude, longitude).
philly['distance'] = philly.apply(
    lambda x: distance.distance((x['latitude'], x['longitude']), center).km, axis=1)
# Businesses within 2 km of `center` are flagged as CBD.
philly['is_cbd'] = (philly['distance'] <= 2).astype('int64')
philly = philly.drop(['address', 'state', 'city', 'hours', 'attributes'], axis=1)

# For every year after the first, derive two 0/1 event flags from consecutive check-in flags:
#   close_<y>: check-ins in y-1 but none in y
#   open_<y> : check-ins in y but none in y-1
# Column-wise boolean logic replaces the row-wise apply. A missing flag compares False on both
# sides, so such rows still get 0, exactly as the row-wise version produced.
for y in years[1:]:
    prev_flag = philly['checkin_%s' % (y - 1)]
    curr_flag = philly['checkin_%s' % (y)]
    philly['close_%s' % (y)] = ((prev_flag == 1) & (curr_flag == 0)).astype('int64')
    philly['open_%s' % (y)] = ((curr_flag == 1) & (prev_flag == 0)).astype('int64')

# Keep only businesses with at least one open or close event in any year.
status = philly[['close_%s' % (y) for y in years[1:]] + ['open_%s' % (y) for y in years[1:]]]
philly = philly[(status == 1).any(axis=1)].reset_index(drop=True)

def count(status, year, df):
    """Count businesses flagged ``status`` in ``year`` and the share of them inside the CBD.

    Parameters
    ----------
    status : str
        Column prefix such as ``'close'`` or ``'open'``; the column read is
        ``status + '_' + str(year)``.
    year : int or str
        Year suffix of the flag column.
    df : pandas.DataFrame
        Must contain the flag column and an ``is_cbd`` column.

    Returns
    -------
    tuple
        ``(count, count_cbd, ratio)``: the number of rows whose flag equals 1, how many of
        those have ``is_cbd == 1``, and ``count_cbd / count * 100``. When no row is flagged
        the counts are 0 and the ratio is NaN, instead of the ``KeyError`` that
        ``value_counts()[1]`` raised when the value 1 was absent.

    Raises
    ------
    KeyError
        If the flag column or ``is_cbd`` is missing from ``df``.
    """
    flagged = df[status + '_' + str(year)] == 1
    total = flagged.sum()
    total_cbd = (df.loc[flagged, 'is_cbd'] == 1).sum()
    # Guard the division: with nothing flagged the percentage is undefined.
    ratio = total_cbd / total * 100 if total else float('nan')
    return total, total_cbd, ratio
    
# NOTE(review): this repeats the event filter applied a few lines earlier in this cell. Every
# remaining row already has an event, so no rows are dropped; `status` is rebuilt from the
# filtered frame.
status = philly[
    ['close_%s' % (y) for y in years[1:]] + ['open_%s' % (y) for y in years[1:]]
]
philly = philly[status.eq(1).any(axis=1)].reset_index(drop=True)

def count(status, year, df):
    """Count businesses flagged ``status`` in ``year`` and the share of them inside the CBD.

    Parameters
    ----------
    status : str
        Column prefix such as ``'close'`` or ``'open'``; the column read is
        ``status + '_' + str(year)``.
    year : int or str
        Year suffix of the flag column.
    df : pandas.DataFrame
        Must contain the flag column and an ``is_cbd`` column.

    Returns
    -------
    tuple
        ``(count, count_cbd, ratio)``: the number of rows whose flag equals 1, how many of
        those have ``is_cbd == 1``, and ``count_cbd / count * 100``. When no row is flagged
        the counts are 0 and the ratio is NaN, instead of the ``KeyError`` that
        ``value_counts()[1]`` raised when the value 1 was absent.

    Raises
    ------
    KeyError
        If the flag column or ``is_cbd`` is missing from ``df``.
    """
    flagged = df[status + '_' + str(year)] == 1
    total = flagged.sum()
    total_cbd = (df.loc[flagged, 'is_cbd'] == 1).sum()
    # Guard the division: with nothing flagged the percentage is undefined.
    ratio = total_cbd / total * 100 if total else float('nan')
    return total, total_cbd, ratio

### Match Yelp business coordinates to census tracts

In [None]:
# Coordinates to geocode; the column order (latitude, longitude) is what `geocode` unpacks.
locations = philly.loc[:, ['latitude', 'longitude']]
def geocode(row):
    """Look up the 2020 Census block for one ``(index, latitude, longitude)`` tuple.

    Returns a dict that always carries ``lat`` and ``lng``. When the lookup succeeds it also
    carries ``geoid``, ``state``, ``county``, ``tract`` and ``block`` taken from the first
    entry under ``'2020 Census Blocks'``. Any exception raised during the lookup or while
    reading those fields is swallowed and only the coordinates are returned.
    """
    _, lat, lng = row
    try:
        # The geocoder is called with longitude first, then latitude.
        block_info = cg.coordinates(lng, lat)['2020 Census Blocks'][0]
        return {
            'geoid': block_info['GEOID'],
            'state': block_info['STATE'],
            'county': block_info['COUNTY'],
            'tract': block_info['TRACT'],
            'block': block_info['BLOCK'],
            'lat': lat,
            'lng': lng,
        }
    except Exception:
        return {'lat': lat, 'lng': lng}

# Geocode every location on a thread pool; tqdm shows progress as results come back in order.
with ThreadPoolExecutor() as tpe:
    data = list(
        tqdm(
            tpe.map(geocode, locations.itertuples()),
            total=len(locations),
        )
    )
# One row per location; lookups that failed only carry lat/lng.
df = pd.DataFrame.from_records(data)

In [None]:
# Persist the geocoding result (without the positional index) for later use.
df.to_csv(path_or_buf='../output/yelp_census_tract.csv', index=False)

In [3]:
# Load the business-licence shapefile and reproject it to WGS84 so x/y are longitude/latitude.
shape_map_path = r"../data/business_licenses/business_licenses.shp"
shape_map = gpd.read_file(shape_map_path)
# NOTE(review): `license` shadows the interactive built-in of the same name; kept for compatibility.
license = shape_map.to_crs(epsg=4326)  # EPSG 4326 = WGS84 = https://epsg.io/4326

# Licence types mentioning "food" (case-insensitive). na=False makes a missing licence type
# count as "no match"; without it the mask would contain NaN and could not be used to index.
food_l = license.licensetyp.str.contains('Food', case=False, na=False)
# reset_index returns a new frame, so the assignments below do not write into a slice of `license`.
food_license = license[food_l].reset_index(drop=True)
food_license['longitude'] = food_license.geometry.x
food_license['latitude'] = food_license.geometry.y
# First four characters of the issue / inactive dates.
# NOTE(review): presumably the year — confirm the date format of these fields.
food_license['open'] = food_license.initialiss.str[:4]
food_license['close'] = food_license.inactiveda.str[:4]
food_license = food_license[(food_license.latitude.notna()) & (food_license.longitude.notna())].copy()
# Distance in km from each licence location to `center` (latitude, longitude).
food_license['distance'] = food_license.apply(
    lambda x: distance.distance((x['latitude'], x['longitude']), center).km, axis=1)
# Locations within 2 km of `center` are flagged as CBD.
food_license['is_cbd'] = (food_license['distance'] <= 2).astype('int64')

### Match CBD coordinates to census tracts

In [6]:
# Coordinates of CBD-flagged licences only; the column order (latitude, longitude) is what `geocode` unpacks.
locations = food_license.loc[food_license['is_cbd'] == 1, ['latitude', 'longitude']]
def geocode(row):
    """Look up the 2020 Census block for one ``(index, latitude, longitude)`` tuple.

    Returns a dict that always carries ``lat`` and ``lng``. When the lookup succeeds it also
    carries ``geoid``, ``state``, ``county``, ``tract`` and ``block`` taken from the first
    entry under ``'2020 Census Blocks'``. Any exception raised during the lookup or while
    reading those fields is swallowed and only the coordinates are returned.
    """
    _, lat, lng = row
    try:
        # The geocoder is called with longitude first, then latitude.
        block_info = cg.coordinates(lng, lat)['2020 Census Blocks'][0]
        return {
            'geoid': block_info['GEOID'],
            'state': block_info['STATE'],
            'county': block_info['COUNTY'],
            'tract': block_info['TRACT'],
            'block': block_info['BLOCK'],
            'lat': lat,
            'lng': lng,
        }
    except Exception:
        return {'lat': lat, 'lng': lng}

# Geocode every location on a thread pool; tqdm shows progress as results come back in order.
with ThreadPoolExecutor() as tpe:
    data = list(
        tqdm(
            tpe.map(geocode, locations.itertuples()),
            total=len(locations),
        )
    )
# One row per location; lookups that failed only carry lat/lng.
df = pd.DataFrame.from_records(data)

  0%|          | 0/7212 [00:00<?, ?it/s]

In [8]:
# Save the CBD lookup table. index=False matches the other exports in this notebook and stops
# the positional index from being written as an unnamed first column.
# NOTE(review): any reader of this file that relied on that extra column (e.g. index_col=0)
# must be adjusted.
df.to_csv('../output/cbd_census.csv', index=False)

In [4]:
# Coordinates of every food licence; the column order (latitude, longitude) is what `geocode` unpacks.
locations = food_license.loc[:, ['latitude', 'longitude']]
def geocode(row):
    """Look up the 2020 Census block for one ``(index, latitude, longitude)`` tuple.

    Returns a dict that always carries ``lat`` and ``lng``. When the lookup succeeds it also
    carries ``geoid``, ``state``, ``county``, ``tract`` and ``block`` taken from the first
    entry under ``'2020 Census Blocks'``. Any exception raised during the lookup or while
    reading those fields is swallowed and only the coordinates are returned.
    """
    _, lat, lng = row
    try:
        # The geocoder is called with longitude first, then latitude.
        block_info = cg.coordinates(lng, lat)['2020 Census Blocks'][0]
        return {
            'geoid': block_info['GEOID'],
            'state': block_info['STATE'],
            'county': block_info['COUNTY'],
            'tract': block_info['TRACT'],
            'block': block_info['BLOCK'],
            'lat': lat,
            'lng': lng,
        }
    except Exception:
        return {'lat': lat, 'lng': lng}

# Geocode every location on a thread pool; tqdm shows progress as results come back in order.
with ThreadPoolExecutor() as tpe:
    data = list(
        tqdm(
            tpe.map(geocode, locations.itertuples()),
            total=len(locations),
        )
    )
# One row per location; lookups that failed only carry lat/lng.
df = pd.DataFrame.from_records(data)

  0%|          | 0/35606 [00:00<?, ?it/s]

In [None]:
# Persist the geocoding result (without the positional index) for later use.
df.to_csv(path_or_buf='../output/bl_census_tract.csv', index=False)

In [22]:
# Read back the table of locations that are geocoded in the next cell.
with open("../output/df_missing.csv", "r") as missing:
    df_missing = pd.read_csv(filepath_or_buffer=missing)

In [23]:
# Coordinates read from df_missing; the column order (latitude, longitude) is what `geocode` unpacks.
locations = df_missing.loc[:, ['latitude', 'longitude']]
def geocode(row):
    """Look up the 2020 Census block for one ``(index, latitude, longitude)`` tuple.

    Returns a dict that always carries ``lat`` and ``lng``. When the lookup succeeds it also
    carries ``geoid``, ``state``, ``county``, ``tract`` and ``block`` taken from the first
    entry under ``'2020 Census Blocks'``. Any exception raised during the lookup or while
    reading those fields is swallowed and only the coordinates are returned.
    """
    _, lat, lng = row
    try:
        # The geocoder is called with longitude first, then latitude.
        block_info = cg.coordinates(lng, lat)['2020 Census Blocks'][0]
        return {
            'geoid': block_info['GEOID'],
            'state': block_info['STATE'],
            'county': block_info['COUNTY'],
            'tract': block_info['TRACT'],
            'block': block_info['BLOCK'],
            'lat': lat,
            'lng': lng,
        }
    except Exception:
        return {'lat': lat, 'lng': lng}

# Geocode every location on a thread pool; tqdm shows progress as results come back in order.
with ThreadPoolExecutor() as tpe:
    data = list(
        tqdm(
            tpe.map(geocode, locations.itertuples()),
            total=len(locations),
        )
    )
# One row per location; lookups that failed only carry lat/lng.
df = pd.DataFrame.from_records(data)

  0%|          | 0/1484 [00:00<?, ?it/s]

In [24]:
# One row per distinct coordinate pair, keeping only rows where a tract was found.
df = df.drop_duplicates(subset=['lng', 'lat']).dropna(subset=['tract'])

In [25]:
# Renumber rows 0..n-1 after the filtering above.
df = df.reset_index(drop=True)

In [26]:
# Keep only the coordinate pair and its tract code.
df = df.loc[:, ['lat', 'lng', 'tract']]

In [29]:
# Display the de-duplicated lat/lng -> tract lookup table.
df

Unnamed: 0,lat,lng,tract
0,40.029113,-75.099840,039002
1,39.941119,-75.145943,001600
2,39.962607,-75.135648,014202
3,39.937315,-75.158016,002400
4,39.934656,-75.154656,002400
...,...,...,...
1353,40.010672,-75.151622,020102
1354,40.061092,-75.084019,030700
1355,39.928940,-75.178738,003701
1356,39.919372,-75.187334,003800


In [28]:
# Persist the lat/lng -> tract lookup (without the positional index) for later use.
df.to_csv(path_or_buf='../output/missing_census_tract.csv', index=False)