In [1]:
import requests

In [8]:
import pandas as pd
import numpy as np 
import geopandas as gpd 

### Scrape website for rebuild directory data 

In [2]:
from urllib.request import Request, urlopen

# Public directory page listing the Rebuild project sites.
url = "https://www.phila.gov/programs/rebuild/project-sites/"
# An explicit User-Agent header is sent with the request
# (presumably the site rejects urllib's default agent; confirm).
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# The context manager closes the HTTP response even if read() fails, and the
# timeout keeps Restart & Run All from hanging forever on a stalled connection.
with urlopen(req, timeout=30) as response:
    web_byte = response.read()
webpage = web_byte.decode('utf-8')


In [3]:
from bs4 import BeautifulSoup

In [4]:
# Parse the downloaded HTML and collect every <tr> of the first <table>.
soup = BeautifulSoup(webpage, 'html.parser')
table = soup.find('table')
# soup.find returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on the next line.
if table is None:
    raise ValueError("No <table> element found in the downloaded page")
rsite = table.find_all('tr')

In [7]:
# Text content of each table row, in page order.
sites = [row.text for row in rsite]

In [9]:
# One raw text string per table row, held in column 0.
raw_rows = pd.DataFrame(sites)
# Split each row's text on newlines into at most five pieces (columns 0-4).
cells = raw_rows[0].str.split("\n", n=4, expand=True)
# Keep only the three middle pieces; columns 0 and 4 are discarded.
cells = cells.drop(columns=[0, 4])
# The first row holds the header labels: promote it to column names, then drop it.
header = cells.iloc[0]
df = cells.rename(columns=header).drop(index=cells.index[0])

In [14]:
# Split each address at its first comma and keep only the part before it.
address = df['Address'].str.split(pat=',', n=1, expand=True)
df = df.assign(Address=address[0])

In [66]:
# Display the scraped table (columns: Project site, Address, Status).
df

Unnamed: 0,Project site,Address,Status
1,Capitolo Playground,900 Federal St.,Under construction
2,Cecil B. Moore Recreation Center,2551 N. 22nd St.,Community engagement and design
3,Disston Recreation Center,4423 Longshore Ave.,Under construction
4,East Poplar,820 N. 8th St.,Community engagement and design
5,Fishtown Recreation Center,1202 E. Montgomery Ave.,Under construction
...,...,...,...
58,Vogt Recreation Center,4131 Unruh Ave.,"Time-sensitive repairs, more to come"
59,Waterloo Playground,2502-12 N. Howard St.,Community engagement and design
60,West Mill Creek Playground,5100 Parrish St.,"Time-sensitive repairs, more to come"
61,Wynnefield Library,5325 Overbrook Ave.,"Time-sensitive repairs, more to come"


### Load rebuild data 

In [17]:
# Base directory that the shapefile path below is built from.
# NOTE(review): hard-coded absolute local path; this only resolves on a machine
# with this drive layout. Consider making it configurable.
path = 'E:/August Data'

In [84]:
# Read the rebuild-sites shapefile and give the 'Copy_of__7' column the
# readable name 'status'.
shapefile = path + '/facility/rebuild_sites2.shp'
rebuild = gpd.GeoDataFrame(gpd.read_file(shapefile)).rename(
    columns={'Copy_of__7': 'status'}
)

### Match using fuzzywuzzy

In [85]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [86]:
def match_name(name, list_names, min_score=0, scorer=None):
    """Return the candidate in ``list_names`` that best fuzzy-matches ``name``.

    Parameters
    ----------
    name : str
        The string to look up.
    list_names : iterable
        Candidate strings to compare ``name`` against.
    min_score : int, optional
        Exclusive threshold: a candidate is accepted only when its score is
        strictly greater than this value.
    scorer : callable, optional
        ``scorer(name, candidate) -> number``. Defaults to
        ``fuzz.token_sort_ratio``.

    Returns
    -------
    tuple
        ``(best_name, best_score)``. When no candidate scores above
        ``min_score`` the result is ``("", -1)``. On equal scores the earliest
        candidate is kept.
    """
    if scorer is None:
        # Resolved at call time so the default stays the notebook's scorer.
        scorer = fuzz.token_sort_ratio

    # Sentinels returned when nothing clears the threshold.
    max_name = ""
    max_score = -1
    for name2 in list_names:
        score = scorer(name, name2)
        # Logical `and` (not bitwise `&`): accept only scores above the
        # threshold that also beat the best score seen so far.
        if score > min_score and score > max_score:
            max_name = name2
            max_score = score
    return (max_name, max_score)

In [216]:
# Pass 1: match each scraped project site against rebuild.SITE_NAME.
# Only candidates scoring above 90 count as a match.
dict_list = []
for name in df['Project site']:
    best_name, best_score = match_name(name, rebuild.SITE_NAME, 90)
    dict_list.append({"match_name": name, "SITE_NAME": best_name, "score": best_score})

# Explicit columns keep the frame usable even when there is nothing to match.
merge_table = pd.DataFrame(dict_list, columns=["match_name", "SITE_NAME", "score"])


# Sites that found no match (match_name reports a score of -1) ...
site_n = merge_table.loc[merge_table['score'] == -1]
# ... are carried into the next pass as df2.
df2 = df.loc[df['Project site'].isin(site_n['match_name'])]


# Matches from this pass, ordered by the rebuild name they matched.
matched = merge_table.loc[merge_table['score'] != -1].sort_values('SITE_NAME')
# a is the subset of rebuild rows whose SITE_NAME was matched.
a = rebuild.loc[rebuild['SITE_NAME'].isin(matched['SITE_NAME'])].sort_values('SITE_NAME').reset_index()
# Attach geometry by key rather than by row position: positional alignment of
# two independently sorted frames misassigns geometries as soon as a SITE_NAME
# occurs more than once on either side. The first geometry per name is used.
site_geometry = a[['SITE_NAME', 'geometry']].drop_duplicates('SITE_NAME')
# Drop SITE_NAME so this frame can be concatenated with the later passes.
site_m1 = matched.merge(site_geometry, on='SITE_NAME', how='left').drop(columns='SITE_NAME')

In [220]:
# Pass 2: for sites still unmatched (df2), match the project site name against
# rebuild.ASSET_NAME. Only candidates scoring above 85 count as a match.
dict_list = []
for name in df2['Project site']:
    best_name, best_score = match_name(name, rebuild.ASSET_NAME, 85)
    dict_list.append({"match_name": name, "ASSET_NAME": best_name, "score": best_score})

# Explicit columns keep the frame usable even when df2 is empty.
merge_table = pd.DataFrame(dict_list, columns=["match_name", "ASSET_NAME", "score"])


# Sites that still have no match are carried into the next pass as df3.
asset_n = merge_table.loc[merge_table['score'] == -1]
df3 = df2.loc[df2['Project site'].isin(asset_n['match_name'])]

# Matches from this pass, ordered by the asset name they matched.
matched = merge_table.loc[merge_table['score'] != -1].sort_values('ASSET_NAME')
# a is the subset of rebuild rows whose ASSET_NAME was matched.
a = rebuild.loc[rebuild['ASSET_NAME'].isin(matched['ASSET_NAME'])].sort_values('ASSET_NAME').reset_index()
# Attach geometry by key rather than by row position: positional alignment of
# two independently sorted frames misassigns geometries as soon as an
# ASSET_NAME occurs more than once on either side. The first geometry per name
# is used.
asset_geometry = a[['ASSET_NAME', 'geometry']].drop_duplicates('ASSET_NAME')
# Drop ASSET_NAME so this frame can be concatenated with the other passes.
site_m2 = matched.merge(asset_geometry, on='ASSET_NAME', how='left').drop(columns='ASSET_NAME')

In [268]:
# Pass 3: for sites still unmatched (df3), match the street address against
# rebuild.ASSET_ADDR. Only candidates scoring above 90 count as a match.
# NOTE(review): in this pass "match_name" holds the address, not the project
# site name, so the combined table mixes the two; confirm that is intended.
dict_list = []
for name in df3['Address']:
    best_addr, best_score = match_name(name, rebuild.ASSET_ADDR, 90)
    dict_list.append({"match_name": name, "ASSET_ADDR": best_addr, "score": best_score})

# Explicit columns keep the frame usable even when df3 is empty.
merge_table = pd.DataFrame(dict_list, columns=["match_name", "ASSET_ADDR", "score"])

# Rows of df3 whose address still has no match after all three passes.
adrs_n = merge_table.loc[merge_table['score'] == -1]
adrs_n = df3.loc[df3['Address'].isin(adrs_n['match_name'])]


# Matches from this pass: one row per matched rebuild address, ordered by it.
matched = (
    merge_table.loc[merge_table['score'] != -1]
    .sort_values('ASSET_ADDR')
    .drop_duplicates('ASSET_ADDR')
)
# a is the subset of rebuild rows whose ASSET_ADDR was matched.
a = rebuild.loc[rebuild['ASSET_ADDR'].isin(matched['ASSET_ADDR'])].sort_values('ASSET_ADDR').reset_index()
# Attach geometry by key rather than by row position: rebuild may list several
# rows at one address, and positional alignment would then shift every
# geometry after the first duplicate. The first geometry per address is used.
addr_geometry = a[['ASSET_ADDR', 'geometry']].drop_duplicates('ASSET_ADDR')
# Drop ASSET_ADDR so this frame can be concatenated with the other passes.
site_m3 = matched.merge(addr_geometry, on='ASSET_ADDR', how='left').drop(columns='ASSET_ADDR')

In [271]:
# Stack the results of the three matching passes row-wise into one table.
matched_frames = [site_m1, site_m2, site_m3]
match = pd.concat(matched_frames)
match

Unnamed: 0,match_name,score,geometry
0,8th & Diamond Playground,100,POINT (-75.14745424859925 39.98337655477285)
1,Athletic Recreation Center,100,POINT (-75.17825478966257 39.97771982788517)
2,Barrett Playground,100,POINT (-75.13567280093206 40.03127365506787)
3,Belfield Recreation Center,100,POINT (-75.15633887665476 40.04160580121268)
4,Capitolo Playground,100,POINT (-75.1593818382852 39.93387898706955)
5,Carroll Park,100,POINT (-75.23672770704781 39.97102106031431)
6,Cecil B. Moore Recreation Center,100,POINT (-75.16773310928932 39.99367232535786)
7,Cherashore Playground,100,POINT (-75.13693943601245 40.03866370145978)
8,Chew Playground,100,POINT (-75.17402029466038 39.93822118220981)
9,Cobbs Creek Environmental Center,100,POINT (-75.2506190898427 39.95201743911613)


In [230]:
# Display the scraped table again for comparison with the matches above.
# NOTE(review): the execution count of this cell (230) is lower than that of
# the cell before it (271), so this output may be stale. Re-run in order.
df

Unnamed: 0,Project site,Address,Status
1,Capitolo Playground,900 Federal St.,Under construction
2,Cecil B. Moore Recreation Center,2551 N. 22nd St.,Community engagement and design
3,Disston Recreation Center,4423 Longshore Ave.,Under construction
4,East Poplar,820 N. 8th St.,Community engagement and design
5,Fishtown Recreation Center,1202 E. Montgomery Ave.,Under construction
...,...,...,...
58,Vogt Recreation Center,4131 Unruh Ave.,"Time-sensitive repairs, more to come"
59,Waterloo Playground,2502-12 N. Howard St.,Community engagement and design
60,West Mill Creek Playground,5100 Parrish St.,"Time-sensitive repairs, more to come"
61,Wynnefield Library,5325 Overbrook Ave.,"Time-sensitive repairs, more to come"
