In [89]:
import pylogit as pl
import pandas as pd
import numpy as np
import json

from transport_network import approx_shape_centroid, get_haversine_distance, Polygon_Location

In [72]:
# City whose data is processed; it selects the data folder and the state
# lookups below. Change it here rather than editing every path.
CITY = 'Detroit'

# Lower-case state abbreviation (used in the PUMS file names) and state FIPS
# code for each supported city.
state_codes={'Detroit': 'mi', 'Boston': 'ma'}
state_fips={'Detroit': '26', 'Boston': '25'}
# NOTE(review): NUM_ALTS and sample_size are not used in the cells visible here;
# presumably the choice-set size and household sample size for the MNL -- confirm.
NUM_ALTS=8
sample_size=5000

_CITY_RAW_DIR = './cities/' + CITY + '/raw'
_STATE_CODE = state_codes[CITY]

PUMA_POP_PATH = _CITY_RAW_DIR + '/ACS/population.csv'
# https://www2.census.gov/programs-surveys/acs/data/pums/2016/1-Year/
PUMS_HH_PATH = _CITY_RAW_DIR + '/PUMS/csv_h' + _STATE_CODE + '/ss16h' + _STATE_CODE + '.csv'
PUMS_POP_PATH = _CITY_RAW_DIR + '/PUMS/csv_p' + _STATE_CODE + '/ss16p' + _STATE_CODE + '.csv'
#        POI_PATH = './cities/'+self.city_folder+'/raw/OSM/poi.geojson'
PUMA_TO_POW_PUMA_PATH = './puma_to_pow_puma.csv'
PUMA_SHAPE_PATH = _CITY_RAW_DIR + '/pumas.geojson'
PUMAS_INCLUDED_PATH = _CITY_RAW_DIR + '/pumas_included.json'

In [112]:
# Load the PUMS household (hh) and person (pop) microdata.
hh = pd.read_csv(PUMS_HH_PATH)
pop = pd.read_csv(PUMS_POP_PATH)

# Normalise PUMA identifiers to zero-padded 5-character strings. Mapping over the
# single column avoids building a full row object for every record.
hh['PUMA'] = hh['PUMA'].map(lambda code: str(int(code)).zfill(5))
pop['PUMA'] = pop['PUMA'].map(lambda code: str(int(code)).zfill(5))
# Persons without a place-of-work PUMA get the literal string 'NaN'.
pop['POWPUMA'] = pop['POWPUMA'].map(
    lambda code: 'NaN' if np.isnan(code) else str(int(code)).zfill(5))

#        all_PUMAs=list(set(hh['PUMA']))
# pumas_included['puma'] lists the PUMA codes kept for the analysis
# (original note: "For the whole MI").
with open(PUMAS_INCLUDED_PATH) as pumas_included_file:
    pumas_included = json.load(pumas_included_file)
with open(PUMA_SHAPE_PATH) as puma_shape_file:
    pumas_shape = json.load(puma_shape_file)
# PUMA code of each geojson feature, in feature order, so that
# pumas_order.index(code) gives the position of that PUMA's feature.
pumas_order = [f['properties']['PUMACE10'] for f in pumas_shape['features']]

# Population / land-area table, restricted to the state of interest.
puma_pop = pd.read_csv(PUMA_POP_PATH)
# .copy() so that the column assignment below does not write into a slice of the
# frame returned by read_csv.
puma_pop = puma_pop.loc[puma_pop['STATE'] == int(state_fips['Detroit'])].copy()
puma_pop['PUMA'] = puma_pop['PUMA'].map(lambda code: str(code).zfill(5))
puma_pop = puma_pop.set_index('PUMA')


# identify recent movers and vacant houses
in_study_area = hh['PUMA'].isin(pumas_included['puma'])
hh_vacant_for_rent = hh[(hh['VACS'] == 1) & in_study_area].copy()
hh_rented = hh[(hh['TEN'] == 3) & in_study_area].copy()
renters_recent_move = hh_rented[hh_rented['MV'] == 1].copy()

# get the area of each PUMA (AREALAND cast to integer, keyed by PUMA code)
puma_land_sqm = dict(zip(puma_pop.index, puma_pop['AREALAND'].astype('int64')))


In [95]:
# Build the crosswalk from place-of-work PUMAs (POW-PUMAs) to the residence
# PUMAs they contain, for the state of interest.
pow_puma_df = pd.read_csv(PUMA_TO_POW_PUMA_PATH, skiprows=1, header=1)
# NOTE(review): state_fips values are strings; this comparison only matches if
# the 'State of Residence (ST)' column is read as strings too -- confirm.
pow_puma_df_state = pow_puma_df.loc[
    pow_puma_df['State of Residence (ST)'] == state_fips['Detroit']].copy()

# Zero-padded 5-character codes, formatted column-wise so that an empty
# selection still yields (empty) columns instead of failing on assignment.
pow_puma_df_state['POW_PUMA'] = pow_puma_df_state['PWPUMA00 or MIGPUMA1'].map(
    lambda code: str(int(code)).zfill(5))
pow_puma_df_state['PUMA'] = pow_puma_df_state['PUMA'].map(
    lambda code: str(int(code)).zfill(5))

all_pow_pumas = set(pow_puma_df_state['POW_PUMA'])
# One pass over the crosswalk: POW-PUMA code -> list of its PUMA codes (file order).
pow_puma_to_puma = {
    pow_code: list(member_pumas)
    for pow_code, member_pumas in pow_puma_df_state.groupby('POW_PUMA')['PUMA']
}

In [90]:
# Approximate centroid of every residence PUMA that appears in the crosswalk.
# pow_puma_centroids is created empty here and filled in by a later cell.
pow_puma_centroids = {}
puma_centroids = {
    code: approx_shape_centroid(
        pumas_shape['features'][pumas_order.index(code)]['geometry'])
    for code in set(pow_puma_df_state['PUMA'])
}

In [117]:
# Centroid of each POW-PUMA: the unweighted mean of its member PUMA centroids.
# TODO: should be weighted by area -- acceptable while member PUMAs are of similar size.
all_pow_pumas = set(pow_puma_df_state['POW_PUMA'])

for pow_code in all_pow_pumas:
    member_centroids = [puma_centroids[member] for member in pow_puma_to_puma[pow_code]]
    first_coords = [centroid[0] for centroid in member_centroids]
    second_coords = [centroid[1] for centroid in member_centroids]
    pow_puma_centroids[pow_code] = [np.mean(first_coords), np.mean(second_coords)]

In [118]:
# Distance from every PUMA centroid to every POW-PUMA centroid:
# dist_mat[puma][pow_puma].
dist_mat = {}
for origin, origin_centroid in puma_centroids.items():
    distances_from_origin = {}
    for destination, destination_centroid in pow_puma_centroids.items():
        separation = get_haversine_distance(origin_centroid, destination_centroid)
        # A non-positive separation means the two centroids coincide (inner trip);
        # fall back to the radius of a circle whose area equals the PUMA's land area.
        distances_from_origin[destination] = (
            separation if separation > 0
            else np.sqrt(puma_land_sqm[origin] / np.pi))
    dist_mat[origin] = distances_from_origin

In [135]:
# Zone-level (PUMA) attribute table: median household income and population
# per unit of land area, indexed by PUMA code.
# TODO: get more zonal attributes such as access to employment, amenities etc.
median_income_by_puma = hh.groupby('PUMA')['HINCP'].median()

puma_obj = []
for puma in pumas_included['puma']:
    puma_obj.append({
        'PUMA': puma,
        'med_income': median_income_by_puma.loc[puma],
        'puma_pop_per_sqm': float(puma_pop.loc[puma]['POP100']) / puma_land_sqm[puma],
    })

puma_attr_df = pd.DataFrame(puma_obj).set_index('PUMA')

In [128]:
# Property-level features.
# Collapse the bedroom count into three classes (values above 2 become 3,
# values below 1 become 1) in both frames, then compute the rent mean and
# standard deviation per class from the recent movers for later normalisation.
for frame in (renters_recent_move, hh_vacant_for_rent):
    frame.loc[frame['BDSP'] > 2, 'BDSP'] = 3
    frame.loc[frame['BDSP'] < 1, 'BDSP'] = 1

rent_by_beds = {
    beds: renters_recent_move.loc[renters_recent_move['BDSP'] == beds, 'RNTP']
    for beds in range(1, 4)
}
rent_mean = {beds: rents.mean() for beds, rents in rent_by_beds.items()}
rent_std = {beds: rents.std() for beds, rents in rent_by_beds.items()}

In [137]:
# Add the engineered feature columns to both frames in place. Everything is
# computed column-wise, which also works when a frame has no rows.
for df in [renters_recent_move, hh_vacant_for_rent]:
    # Rent standardised within the unit's bedroom class; a BDSP value with no
    # entry in rent_mean / rent_std yields NaN.
    bedroom_class = df['BDSP']
    df['norm_rent'] = (df['RNTP'] - bedroom_class.map(rent_mean)) / bedroom_class.map(rent_std)
    # Age of building.
    # NOTE(review): presumably YBL codes >= 14 mean "built 2010 or later" in the
    # 2016 PUMS data dictionary -- confirm.
    df['built_since_jan2010'] = df['YBL'] >= 14
    # Zone attributes of the unit's PUMA; .loc raises KeyError for a PUMA that is
    # missing from puma_attr_df rather than silently producing NaN.
    home_pumas = df['PUMA'].values
    df['puma_pop_per_sqmeter'] = puma_attr_df.loc[home_pumas, 'puma_pop_per_sqm'].values
    df['med_income'] = puma_attr_df.loc[home_pumas, 'med_income'].values

# Median normalised rent per PUMA over all available units (vacant + recently rented).
all_rooms_available = pd.concat([hh_vacant_for_rent, renters_recent_move], axis=0)
median_norm_rent = all_rooms_available.groupby('PUMA')['norm_rent'].median()
# NOTE(review): the column name 'media_norm_rent' looks like a typo for
# 'median_norm_rent'; kept unchanged because later code may read it.
puma_attr_df['media_norm_rent'] = median_norm_rent.loc[puma_attr_df.index].values

In [142]:
# Print the (rows, columns) shape of the recent-mover frame, then the vacant-unit frame.
for frame_shape in (renters_recent_move.shape, hh_vacant_for_rent.shape):
    print(frame_shape)

(1076, 234)
(159, 234)


In [143]:
# Display the recent-mover renter households, now carrying the engineered feature columns.
renters_recent_move

Unnamed: 0,RT,SERIALNO,ST,DIVISION,PUMA,REGION,ADJHSG,ADJINC,WGTP,NP,...,wgtp75,wgtp76,wgtp77,wgtp78,wgtp79,wgtp80,norm_rent,built_since_jan2010,puma_pop_per_sqmeter,med_income
86,H,2753,26,3,02908,2,1000000,1007588,130,3,...,133,40,238,119,131,35,0.013562,False,0.001907,62000.0
145,H,4341,26,3,03210,2,1000000,1007588,265,1,...,422,495,76,106,441,407,-0.971084,False,0.001579,29300.0
154,H,4593,26,3,02702,2,1000000,1007588,76,2,...,25,74,25,84,77,96,-0.177841,False,0.001636,67000.0
156,H,4601,26,3,03212,2,1000000,1007588,226,1,...,215,335,243,228,383,70,-0.872619,False,0.001639,29250.0
159,H,4618,26,3,03004,2,1000000,1007588,276,1,...,281,471,447,79,86,261,0.187355,False,0.001187,56000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49471,H,1515895,26,3,03204,2,1000000,1007588,127,1,...,149,242,183,120,212,159,-0.490867,False,0.001568,45800.0
49519,H,1517464,26,3,02907,2,1000000,1007588,279,1,...,416,235,467,88,85,318,0.161270,False,0.001069,68000.0
49526,H,1517811,26,3,03100,2,1000000,1007588,295,1,...,283,96,272,315,250,308,-0.829978,False,0.000086,52600.0
49536,H,1518026,26,3,02800,2,1000000,1007588,69,1,...,110,65,27,63,133,69,0.122370,False,0.000132,81800.0


In [None]:
# Number of available (vacant-for-rent) housing units in each PUMA.
num_available_houses_in_puma = hh_vacant_for_rent.groupby('PUMA')['SERIALNO'].count()
# Reindex onto the attribute table so that a PUMA with no vacant unit gets a
# count of 0 instead of raising KeyError on lookup.
puma_attr_df['num_houses'] = num_available_houses_in_puma.reindex(
    puma_attr_df.index, fill_value=0)

# Keep only the columns needed from here on.
renters_recent_move = renters_recent_move[[
    'SERIALNO', 'PUMA', 'HINCP', 'norm_rent', 'RNTP', 'built_since_jan2010',
    'puma_pop_per_sqmeter', 'med_income', 'BDSP', 'NP']]
hh_vacant_for_rent = hh_vacant_for_rent[[
    'PUMA', 'HINCP', 'norm_rent', 'RNTP', 'built_since_jan2010',
    'puma_pop_per_sqmeter', 'med_income', 'BDSP']]

# Per-bedroom-class rent mean and standard deviation used for norm_rent.
rent_normalisation = {"mean": rent_mean, "std": rent_std}

# Empty containers for the home-location MNL results (PUMA level and household
# level); presumably filled in by later cells -- confirm.
home_loc_mnl = {'home_loc_mnl_PUMAs': {}, 'home_loc_mnl_hh': {}}