In [None]:
# refinement test

#input development
#1. geo cross walk table:
    #specify refine target geo: city, taz, blkgrp, parcels
    #specify sample unit: city, taz, blkgrp, parcels
    #settings: sample geography = "SAMPLEGEO"

#2. control table
    #convert from control totals: 
        # read categorical info and fill control table
        # summarize inputs by categories to use as baseline
    #new controls: 
        #a. manual modify specific controls 
        #b. read target database and make controls by category

#3. seed tables
    #base inputs with sample and refine geo units



In [2]:
import pandas as pd
from collections import defaultdict

In [3]:
# first test using city as target and sample units
# use Oakland (COUNTY == 125) as the test region

In [4]:
def building_geos(st, year, weight = None):
    """
    function: attach a set of geography ids to every building record
    st: HDF store (or any mapping) of urbansim model data holding the
        '<year>/buildings' and '<year>/parcels' tables
    year: year string, used as the table-key prefix
    weight: optional name of one extra buildings column to carry along;
        no extra column is added when it is None/empty
    returns: buildings left-joined to their parcel, with columns renamed to
        CITY, TAZ, COUNTY, BLKGRPID plus derived TRACTID and a constant REGION
    """
    building_cols = ['parcel_id', 'b_city_id', 'b_zone_id']
    if weight:
        building_cols.append(weight)

    buildings = st[year + '/buildings'][building_cols]
    parcels = st[year + '/parcels'][['census_bg_id', 'county_id']]

    # buildings carry the parcel id as a column; it is matched to the parcels index
    geo = buildings.merge(parcels, how='left', left_on='parcel_id', right_index=True)
    geo = geo.rename(columns={
        'b_city_id': 'CITY',
        'b_zone_id': 'TAZ',
        'county_id': 'COUNTY',
        'census_bg_id': 'BLKGRPID',
    })

    # combine county and block group ids into one number: county * 10^7 + block group
    geo['BLKGRPID'] = geo['COUNTY'] * (10**7) + geo['BLKGRPID']
    # tract id = combined block group id with its last digit removed
    geo['TRACTID'] = geo['BLKGRPID'] // 10
    geo['REGION'] = 2

    return geo

In [5]:
# --- run configuration ---------------------------------------------------
prj_name = 'refine'      # prefix of every CSV file written by this notebook
year = '2045'            # table-key prefix inside the HDF stores

# input HDF stores
hdf_in = 'run4032_45.h5'                    # source of the geo cross walk and the seed tables
hdf_target = 'run4032_taz_draft_ypsi.h5'    # source of the control totals and target households

# geographies
sample_geo = 'CITY'      # geography copied into the SAMPLEGEO column
target_geo = 'TAZ'       # geography the controls are defined on
geo_units = ['COUNTY', 'CITY','TAZ', 'TRACTID', 'BLKGRPID', 'REGION']

# buildings column summed to pick the dominant sample geography of each target geography
weight = 'residential_units'

## step 1, make geo cross walk using existing model database

In [6]:
# open the source model store read-only; it is left open because later cells read from it
st = pd.HDFStore(hdf_in, 'r')

# buildings with geography ids, restricted to COUNTY 125 (select Oakland for now)
dfgeo = building_geos(st, year, weight).loc[lambda d: d.COUNTY == 125]

# total weight of every (target geo, sample geo) combination
dfgeo_grpby = (
    dfgeo.groupby([target_geo, sample_geo])[weight]
    .sum()
    .reset_index(name=weight)
)

# one row per target geo: after the descending sort the first row of each
# target geo is the sample geo holding the largest weight
geocross = (
    dfgeo_grpby
    .sort_values(by=[target_geo, weight], ascending=False)
    .drop_duplicates(target_geo)
    .assign(SAMPLEGEO=lambda d: d[sample_geo], REGION=2)
    .astype(int)
)

crosswalk_csv = prj_name + '_geo_cross_walk.csv'
geocross.to_csv(crosswalk_csv)
print('geo_cross_walk: ', crosswalk_csv)


geo_cross_walk:  refine_geo_cross_walk.csv


In [7]:
# st= pd.HDFStore(hdf_in, 'r')

# dfgeo = building_geos(st, year)
# dfgeo = dfgeo.loc[dfgeo.COUNTY == 125] #sele t Oakland for now

# geocross = dfgeo.drop_duplicates(target_geo)[geo_units]
# geocross['SAMPLEGEO'] = geocross[sample_geo]
# geocross = geocross.astype(int)

# geocross.to_csv(prj_name + '_geo_cross_walk.csv')
# print('geo_cross_walk: ', prj_name + '_geo_cross_walk.csv')

## step 2: make popsim config control table from household control totals

In [8]:
def cats_to_ctrl(dict_cats, target_geo, seed_tbl, dfctrl = None):
    """
    function: build (or extend) a config control table from category bounds
    dict_cats: dict mapping a seed-table column name to an iterable of
        (vmin, vmax) pairs:
            vmin == vmax -> equality test
            vmax == -1   -> open-ended, "vmin and above"
            otherwise    -> inclusive range vmin..vmax
    target_geo: value written to the 'geography' column of every new row
    seed_tbl: seed table name used inside the expressions; control names get
        an 'hh' prefix when it is 'households'
    dfctrl: optional existing control table to extend. It is NOT modified;
        a new DataFrame (its rows followed by the new ones) is returned.
    returns: DataFrame with columns target, geography, seed_table, importance,
        control_field, expression. New rows are labelled with consecutive
        integers continuing after the existing rows (starting at 1 when there
        are none).
    """
    have_existing = dfctrl is not None and len(dfctrl) > 0

    if dfctrl is None:
        columns = ['target', 'geography', 'seed_table',
                   'importance', 'control_field', 'expression']
    else:
        # new rows are written positionally into the caller's column layout
        columns = list(dfctrl.columns)

    # First label for the new rows. Continue after the largest existing integer
    # label so that no label is skipped and no existing row can be overwritten.
    if not have_existing:
        first_label = 1
    elif pd.api.types.is_integer_dtype(dfctrl.index):
        first_label = int(dfctrl.index.max()) + 1
    else:
        first_label = len(dfctrl) + 1

    name_prefix = 'hh' if seed_tbl == 'households' else ''

    # collect all rows first and build the frame once instead of enlarging it row by row
    rows = []
    for c, bounds in dict_cats.items():
        vname = name_prefix + c
        field = '{}.{}'.format(seed_tbl, c)
        for ccount, (vmin, vmax) in enumerate(bounds, start=1):
            if vmin == vmax:
                expression = '({}=={})'.format(field, str(vmin))
            elif vmax == -1:
                expression = '({}>={})'.format(field, str(vmin))
            else:
                expression = '({}>={}) & ({}<={})'.format(field, str(vmin), field, str(vmax))
            rows.append([vname + str(vmin), target_geo, seed_tbl, 500,
                         vname.upper() + str(ccount), expression])

    new_rows = pd.DataFrame(rows, columns=columns,
                            index=range(first_label, first_label + len(rows)))

    if not have_existing:
        return new_rows
    if not rows:
        return dfctrl.copy()
    return pd.concat([dfctrl, new_rows])

In [9]:
# open the target model store read-only; it is left open because later cells read from it
stc = pd.HDFStore(hdf_target, 'r')
ctotals = stc['/base/annual_household_control_totals']

# 'race_id' is a single-valued category: expose it as a degenerate min/max pair
# so it is handled like the ranged categories below
ctotals['race_id_min'] = ctotals['race_id']
ctotals['race_id_max'] = ctotals['race_id']

# Category names are the columns that END with '_min'. Matching the suffix exactly
# (rather than any column merely containing 'min') keeps unrelated columns out and
# guarantees that stripping the suffix yields the real category name.
min_suffix = '_min'
cats = [col[:-len(min_suffix)] for col in ctotals.columns
        if isinstance(col, str) and col.endswith(min_suffix)]

# extract all categories and their distinct boundary values from the hh control totals,
# ordered by lower bound
dict_cats = {
    c: ctotals[[c + '_min', c + '_max']].drop_duplicates().sort_values(by=c + '_min').values
    for c in cats
}

In [10]:
# household-level config controls, derived from annual_household_control_totals
seed_tbl = 'households'
simctrl = cats_to_ctrl(dict_cats, target_geo, seed_tbl)

# append the total-households control; its importance is far above the 500
# given to the category controls
hh_total_row = ['num_hh', target_geo, seed_tbl, 10000000, 'HHBASE', '(households.persons > 0)']
simctrl.loc[len(simctrl) + 1] = hh_total_row

controls_csv = prj_name + '_controls.csv'
simctrl.to_csv(controls_csv)
print('config control table: ', controls_csv)

config control table:  refine_controls.csv


In [11]:
# (optional, only when person controls are desired)
# extend the config controls with person-level categories
pseed_tbl = 'persons'
ptarget_geo = 'TAZ'

# [min, max] bounds per person attribute; a max of -1 leaves the range open-ended
dict_pcats = dict(
    age=[[0, 17], [18, 24], [25, 64], [65, -1]],
    sex=[[1, 1], [2, 2]],
    race_id=[[1, 1], [2, 2], [3, 3], [4,4]],
)

simctrl = cats_to_ctrl(dict_pcats, ptarget_geo, pseed_tbl, dfctrl=simctrl)

controls_csv = prj_name + '_controls.csv'
simctrl.to_csv(controls_csv)
print('config control table: ', controls_csv)

config control table:  refine_controls.csv


## step 3. generate geo controls 

In [12]:
# special case: a target database already exists
# prepare households and persons tables for summarization (refined/official results)

bldg_geos = building_geos(stc, year)

# households joined to the geography of their building, limited to COUNTY 125;
# rows with any missing value are dropped before the integer cast
households = (
    stc[year + '/households']
    .merge(bldg_geos[geo_units], how='left', left_on='building_id', right_index=True)
    .loc[lambda hh: hh.COUNTY == 125]
    .dropna(axis=0)
    .astype(int)
)
# households['PUMA'] = households['CITY']

# persons inherit the geography of their household; persons whose household was
# not kept above get no geography and are removed by the dropna
persons = (
    stc[year + '/persons']
    .merge(households[geo_units], how='left', left_on='household_id', right_index=True)
    .dropna(axis=0)
    .astype(int)
)
# persons['PUMA'] = persons['CITY']


In [13]:
# summarize the target distribution for every control in the config control table
# (single geography level for now)

# Collect one count Series per control and combine them once at the end.
# The list is rebuilt on every run, so re-executing this cell cannot pile
# duplicate columns onto a result left over from a previous execution.
control_counts = []
for _, ctrl in simctrl.iterrows():
    # evaluate the control's expression as a row filter on its seed table,
    # then count the selected rows per geography
    selected = pd.eval('{}.loc[{}]'.format(ctrl.seed_table, ctrl.expression))
    counts = selected.groupby([ctrl.geography]).size()
    counts.name = ctrl.control_field
    control_counts.append(counts)

# one column per control, rows aligned on the geography id
sumt = pd.concat(control_counts, axis=1)

geo_ctrl = sumt.copy()
geo_ctrl.index.name = target_geo
geo_ctrl['REGION'] = 2
# a geography with no matching rows for a control has a count of 0
geo_ctrl = geo_ctrl.fillna(0).astype(int)

geo_ctrl_csv = '{}_control_totals_{}.csv'.format(prj_name, target_geo.lower())
geo_ctrl.to_csv(geo_ctrl_csv)
print('geography control table: ', geo_ctrl_csv)

geography control table:  refine_control_totals_taz.csv


## step 4. make seed HHs and Persons

In [15]:
# household seed sample: households located in any sample geography present in the cross walk
bldg_geos = building_geos(st, year)

hhs = (
    st[year + '/households']
    .merge(bldg_geos[geo_units], how='left', left_on='building_id', right_index=True)
    .loc[lambda hh: hh[sample_geo].isin(geocross[sample_geo].unique())]
    .assign(SAMPLEGEO=lambda hh: hh[sample_geo], WGTP=1)   # every seed household gets weight 1
    .astype(int)
)

seed_hh_csv = prj_name + '_seed_households.csv'
hhs.to_csv(seed_hh_csv)
print('seed households table: ', seed_hh_csv)

seed households table:  refine_seed_households.csv


In [17]:
# person seed sample: persons of the seed households, carrying their household's geography
pps = (
    st[year + '/persons']
    .merge(hhs[geo_units], how='left', left_on='household_id', right_index=True)
    .loc[lambda pp: pp[sample_geo].isin(geocross[sample_geo].unique())]
    .astype(int)
    .assign(PWGTP=1, SAMPLEGEO=lambda pp: pp[sample_geo])   # every seed person gets weight 1
)
#pps.rename(columns={'household_id':'hhnum'}, inplace=True)

seed_pp_csv = prj_name + '_seed_persons.csv'
pps.to_csv(seed_pp_csv)
print('seed persons table: ', seed_pp_csv)

seed persons table:  refine_seed_persons.csv
