In [None]:
# refinement test

#input development
#1. geo cross walk table:
    #specify refine target geo: city, taz, blkgrp, parcels
    #specify sample unit: city, taz, blkgrp, parcels
    #settings: sample geography = "SAMPLEGEO"

#2. control table
    #convert from control totals: 
        # read categorical info and fill control table
        # summarize inputs by categories to use as baseline
    #new controls: 
        #a. manually modify specific controls
        #b. read target database and make controls by category

#3. seed tables
    #base inputs with sample and refine geo units



In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
def building_geos(st, year, weight = None):
    """
    Attach a set of geography ids to every building of an UrbanSim dataset.

    st: HDF store (any mapping from table path to DataFrame works) holding
        the '<year>/buildings' and '<year>/parcels' tables
    year: year string, used as the table path prefix
    weight: optional buildings column to carry along; callers use it to
        evaluate overlaps between target area and sampling area

    Returns a DataFrame with parcel_id, CITY, TAZ, the optional weight column,
    BLKGRPID, COUNTY, TRACTID and REGION.
    """
    bldg_cols = ['parcel_id', 'b_city_id', 'b_zone_id']
    if weight:
        bldg_cols = bldg_cols + [weight]

    buildings = st[year + '/buildings'][bldg_cols]
    parcels = st[year + '/parcels'][['census_bg_id', 'county_id']]

    # parcels are looked up by their index through buildings.parcel_id
    dfgeo = buildings.merge(parcels, left_on='parcel_id', right_index=True, how='left')
    dfgeo = dfgeo.rename(columns={
        'b_city_id': 'CITY',
        'county_id': 'COUNTY',
        'census_bg_id': 'BLKGRPID',
        'b_zone_id': 'TAZ',
    })

    # prefix the block group id with the county id, then drop the last digit for the tract
    dfgeo['BLKGRPID'] = dfgeo['COUNTY'] * (10**7) + dfgeo['BLKGRPID']
    dfgeo['TRACTID'] = dfgeo['BLKGRPID'] // 10
    dfgeo['REGION'] = 2

    return dfgeo

In [3]:

# three sampling strategies:
# 1. "LARGEST": sampling from largest single area (default)
# 2. "OVERLAP": sampling from largest area and other overlapped area proportions (determined by HUs and HHs)
# 3. "ALL": sampling from all areas fully or partially overlapping target area
sampling_methods = ('LARGEST', 'OVERLAP', 'ALL')
sampling_method = 'ALL'
# fail here rather than deep inside the sampling cell when the name is mistyped
if sampling_method not in sampling_methods:
    raise ValueError('sampling_method must be one of {}, got {!r}'.format(sampling_methods, sampling_method))

target_geo = 'TAZ'   # geography being refined
sample_geo = 'CITY'  # geography the seed households/persons are drawn from
if target_geo == sample_geo:
    sampling_method = 'LARGEST' # overwrite sampling method

year = '2045'
geo_units = ['COUNTY', 'CITY','TAZ', 'TRACTID', 'BLKGRPID', 'REGION']
# both geographies are read from tables that only carry the geo_units columns
if not {target_geo, sample_geo} <= set(geo_units):
    raise ValueError('target_geo and sample_geo must both be listed in geo_units')
weight = 'residential_units'  # buildings column used to rank target/sample overlaps
prj_name = 'refine'           # prefix of every output file
new_id_base = 2 * 10 ** 10 # to add to original id and make sure no potential duplicates in geo, hh or person ids.

hdf_in = 'run4032_45.h5'                    # model run used for the cross walk and the seed sample
hdf_target = 'run4032_taz_draft_ypsi.h5'    # database holding the control totals and target results


## step 1, make geo cross walk using existing model database

In [None]:
# first test using city as target and sample units
# use oakland as test region

In [21]:
# the store stays open on purpose: later cells read households and persons from it
st = pd.HDFStore(hdf_in, 'r')

dfgeo = building_geos(st, year, weight)
dfgeo = dfgeo.loc[dfgeo.COUNTY == 125] #select Oakland for now
# total weight shared by every (target, sample) pair
dfgeo_grpby = dfgeo.groupby([target_geo, sample_geo])[weight].sum().to_frame(weight).reset_index()

geocross_all = dfgeo_grpby.sort_values(by=[target_geo, weight], ascending = False) #sort by the proportion of overlaps
# keep the heaviest sample area per target; copy so the column assignments below
# operate on an independent frame instead of a slice of geocross_all
geocross = geocross_all.drop_duplicates(target_geo).copy()
# int64 up front so adding new_id_base below cannot overflow a narrower id dtype
geocross['SAMPLEGEO'] = geocross[sample_geo].astype('int64')
#if sampling method is not on single area, then set 'SAMPLEGEO' to target area id plus new_id_base
if sampling_method != 'LARGEST':
    # targets that overlap more than one sample area
    geocross_dup = geocross_all.loc[geocross_all.duplicated(target_geo,keep = False)]
    multi_area = geocross[target_geo].isin(geocross_dup[target_geo].unique())
    geocross.loc[multi_area, 'SAMPLEGEO'] = geocross.loc[multi_area, target_geo].astype('int64') + new_id_base
geocross['REGION'] = 2
geocross = geocross.astype('int64')

geocross.to_csv(prj_name + '_geo_cross_walk.csv')

print('sampling_method:', sampling_method)
print('geo_cross_walk: ', prj_name + '_geo_cross_walk.csv')
print('geocross:', len(geocross), '   geocross_all:', len(geocross_all))

sampling_method: ALL
geo_cross_walk:  refine_geo_cross_walk.csv
geocross: 723    geocross_all: 754


## step 2: make popsim config control table from household control totals

In [22]:
def cats_to_ctrl(dict_cats, target_geo, seed_tbl, dfctrl = None):
    """ Convert UrbanSim HH control total categories to Popsim config control table

        dict_cats: category dict, {column name: iterable of (min, max) pairs};
                   min == max means a single value, max == -1 means no upper bound
        target_geo: refinement geo unit, written to the 'geography' column
        seed_tbl: value for seed_table column; also the table name used in the expressions
        dfctrl: existing control table to extend (rows are added to it in place);
                a new table is created when omitted

        Returns the control table with one row added per (category, range).
        New row labels continue right after the last existing label.
    """

    if dfctrl is None:
        dfctrl = pd.DataFrame(columns = ['target', 'geography', 'seed_table',
                                            'importance', 'control_field', 'expression'])

    # Continue from the last existing label: starting at len + 1 and then
    # pre-incrementing skipped a label, and could overwrite a row when the
    # labels of the passed-in table are not exactly 1..n.
    if len(dfctrl):
        try:
            indv = int(dfctrl.index.max())
        except (TypeError, ValueError):
            indv = len(dfctrl)  # non-numeric labels: fall back to the row count
    else:
        indv = 0

    for c, bounds in dict_cats.items():
        # household controls get an 'hh' prefix; other tables use the column name as is
        vname = ('hh' + c) if seed_tbl == 'households' else c
        for ccount, (vmin, vmax) in enumerate(bounds, start=1):
            indv += 1
            if vmin == vmax:
                expression = '({}.{}=={})'.format(seed_tbl, c, str(vmin))
            elif vmax == -1:
                expression = '({}.{}>={})'.format(seed_tbl, c, str(vmin))
            else:
                expression = '({}.{}>={}) & ({}.{}<={})'.format(seed_tbl, c, str(vmin), seed_tbl, c, str(vmax))
            dfctrl.loc[indv]= [vname + str(vmin), target_geo, seed_tbl, 500,
                                    vname.upper() + str(ccount), expression]

    return dfctrl

In [23]:
stc = pd.HDFStore(hdf_target, 'r')
ctotals = stc['/base/annual_household_control_totals']

#convert single value column 'race_id' to 'race_id_min' and 'race_id_max' columns
ctotals['race_id_min'] = ctotals['race_id']
ctotals['race_id_max'] = ctotals['race_id']

# a category is any column pair '<name>_min' / '<name>_max'; match the suffix only,
# so that a column which merely contains "min" is not mistaken for a category
cats = [x[:-len('_min')] for x in ctotals.columns if x.endswith('_min')]

#extract all categories and boundary values from hh control totals
# each value is an array of distinct (min, max) rows ordered by min
dict_cats = {
    c: ctotals[[c+'_min', c+'_max']].drop_duplicates().sort_values(by=c+'_min').values
    for c in cats
}

In [24]:
# build config control table using annual_household_control_totals
seed_tbl = 'households'
simctrl = cats_to_ctrl(dict_cats, target_geo, seed_tbl)
simctrl.loc[len(simctrl) + 1] = ['num_hh', target_geo, seed_tbl, 10000000, 'HHBASE', '(households.persons > 0)']

simctrl.to_csv(prj_name + '_controls.csv')
print('config control table: ', prj_name + '_controls.csv')

config control table:  refine_controls.csv


In [25]:
# (optional, only when person controls are desired)
# add person controls to config controls
ptarget_geo = 'TAZ'
pseed_tbl = 'persons'
# {persons column: [[min, max], ...]}; max == -1 leaves the range open-ended
dict_pcats = {
    'age':[[0, 17], [18, 24], [25, 64], [65, -1]],
    'sex':[[1, 1], [2, 2]],
    'race_id':[[1, 1], [2, 2], [3, 3], [4,4]]
}

# Start from the non-person rows only, so running this cell again replaces the
# person controls instead of appending a second copy of them.
hh_only_ctrl = simctrl.loc[simctrl['seed_table'] != pseed_tbl].copy()
simctrl = cats_to_ctrl(dict_pcats, ptarget_geo, pseed_tbl, dfctrl= hh_only_ctrl)
simctrl.to_csv(prj_name + '_controls.csv')
print('config control table: ', prj_name + '_controls.csv')

config control table:  refine_controls.csv


## step 3. generate geo controls 

In [26]:
#special case: the target database already exists
#build the households and persons tables used to summarize the refined/official results
bldg_geos = building_geos(stc, year)

# households: attach building geographies, keep county 125, drop incomplete rows
households = (
    stc[year + '/households']
    .merge(bldg_geos[geo_units], left_on='building_id', right_index=True, how='left')
    .loc[lambda hh: hh.COUNTY == 125]
    .dropna(axis=0)
    .astype('int64')
)

# persons: inherit the geographies of their household; persons without a kept
# household end up with missing geographies and are dropped
persons = (
    stc[year + '/persons']
    .merge(households[geo_units], left_on='household_id', right_index=True, how='left')
    .dropna(axis=0)
    .astype('int64')
)


In [33]:
#summarize target HH distribution based on config_controls
#single geo level for now

# One count series per control row. The series are collected and joined once,
# so the result does not depend on the control table's first row label being 1.
ctrl_counts = []
for r in simctrl.itertuples(index=False):
    # the expression is evaluated against the `households` / `persons` frames of
    # this notebook; it is generated locally by cats_to_ctrl, not read from outside
    sr = pd.eval('{}.loc[{}]'.format(r.seed_table, r.expression)).groupby([r.geography]).size()
    sr.name = r.control_field
    ctrl_counts.append(sr)

# outer-join on the geography id; ids missing from a control become 0 below
geo_ctrl = pd.DataFrame(pd.concat(ctrl_counts, axis=1)).sort_index()
geo_ctrl.index.name = target_geo
geo_ctrl['REGION'] = 2
geo_ctrl = geo_ctrl.fillna(0).astype('int64')

geo_ctrl.to_csv('{}_control_totals_{}.csv'.format(prj_name, target_geo.lower()))
print('geography control table: ', '{}_control_totals_{}.csv'.format(prj_name, target_geo.lower()))

geography control table:  refine_control_totals_taz.csv


## step 4. make seed HHs and Persons

In [34]:
# sample universe: every household and person of the input run, tagged with
# the geographies of the building they live in
bldg_geos = building_geos(st, year)
sample_bldg_geos = bldg_geos[geo_units]

hhs_total = st[year + '/households'].merge(
    sample_bldg_geos, left_on='building_id', right_index=True, how='left')

pps_total = st[year + '/persons'].merge(
    hhs_total[geo_units], left_on='household_id', right_index=True, how='left')

In [35]:
# sample data for sample_geos; show the active sampling strategy first
print('sampling_method', sampling_method)

def sample_largest(df_total, geocross, sample_geo):
    """ Select the rows of df_total located in any sample area listed in geocross.

        df_total: households or persons table carrying the sample_geo column
        geocross: cross walk table; its sample_geo column lists the areas to keep
        sample_geo: name of the sampling geography column

        Returns a new all-int64 DataFrame with an added 'SAMPLEGEO' column equal
        to sample_geo. df_total itself is left untouched.
    """
    keep_ids = geocross[sample_geo].unique()
    # explicit copy: the new column is added to an independent frame rather
    # than to a slice of the caller's table
    dfs = df_total.loc[df_total[sample_geo].isin(keep_ids)].copy()
    dfs['SAMPLEGEO'] = dfs[sample_geo]
    return dfs.astype('int64')

# Keep only cross walk rows whose SAMPLEGEO is a real sample area id. Synthetic
# ids are target id + new_id_base, so they start AT new_id_base (target id 0);
# the comparison is therefore strict.
geocross = geocross.loc[geocross['SAMPLEGEO'] < new_id_base]

# seed records for the single-area targets; every record gets unit weight
hhs = sample_largest(hhs_total, geocross, sample_geo)
hhs['WGTP'] = 1
pps = sample_largest(pps_total, geocross, sample_geo)
pps['PWGTP'] = 1

# with any other method the seed files are written after the multi-area
# samples have been appended
if sampling_method == 'LARGEST':
    hhs.to_csv(prj_name + '_seed_households.csv')
    print('seed households table: ', prj_name + '_seed_households.csv')
    pps.to_csv(prj_name + '_seed_persons.csv')
    print('seed persons table: ', prj_name + '_seed_persons.csv')



sampling_method ALL


In [None]:
# if sampling from all overlapping area
if sampling_method != 'LARGEST':
    # Two independent accumulators. `hhlst = pplst = []` bound both names to ONE
    # list, so household and person frames were mixed before concatenation.
    hhlst, pplst = [], []
    # new household ids and new person ids both start at new_id_base
    hhind = ppind = new_id_base

    # one pass per target area that overlaps more than one sample area
    for geo, dfg in geocross_dup.groupby(target_geo):
        sample_ids = dfg[sample_geo].values
        if sampling_method == 'OVERLAP':
            # every household of the first sample area (geocross_dup is ordered by
            # descending weight), plus the households of the other sample areas
            # that are located inside this target area
            hh_lc = pd.concat( [hhs_total.loc[hhs_total[sample_geo] == sample_ids[0]],
                                hhs_total.loc[(hhs_total[sample_geo].isin(sample_ids[1:])) &
                                                (hhs_total[target_geo] == geo)]
                                ] )
        elif sampling_method == 'ALL':
            # copy: columns are added below and must not touch hhs_total
            hh_lc = hhs_total.loc[hhs_total[sample_geo].isin(sample_ids)].copy()
        else:
            raise ValueError('unknown sampling_method: {!r}'.format(sampling_method))

        # synthetic sample area id for this target, and fresh household ids
        hh_lc['SAMPLEGEO'] = geo + new_id_base
        hh_lc['new_hhid'] = range(hhind, hhind + len(hh_lc))
        hhlst.append(hh_lc)
        hhind += len(hh_lc)

        # persons of the sampled households, tagged with the new household id
        pp_lc = pps_total.loc[pps_total.household_id.isin(hh_lc.index.values)]
        pp_lc = pd.merge(pp_lc, hh_lc[['new_hhid', 'SAMPLEGEO']],
                                        left_on = 'household_id', right_index =True, how ='left')
        pp_lc.index = range(ppind, ppind + len(pp_lc))
        pplst.append(pp_lc)
        ppind += len(pp_lc)

    # nothing to append when no target overlaps several sample areas
    # (pd.concat raises on an empty list)
    if hhlst:
        dfhhs = pd.concat(hhlst)
        dfhhs['WGTP'] = 1
        dfhhs.index = dfhhs['new_hhid']
        dfhhs = dfhhs.drop('new_hhid', axis=1)
        hhs = pd.concat([hhs, dfhhs])

    hhs.index.name = 'household_id'
    hhs = hhs.astype('int64')

    hhs.to_csv(prj_name + '_seed_households.csv')
    print('seed households table: ', prj_name + '_seed_households.csv')

    if pplst:
        dfpps = pd.concat(pplst)
        dfpps['PWGTP'] = 1
        # point each copied person at the new id of its copied household
        dfpps['household_id'] = dfpps['new_hhid']
        dfpps = dfpps.drop(['new_hhid'], axis=1)
        pps = pd.concat([pps, dfpps])

    pps.index.name = 'person_id'
    pps = pps.astype('int64')

    pps.to_csv(prj_name + '_seed_persons.csv')
    print('seed persons table: ', prj_name + '_seed_persons.csv')

In [None]:
# display the seed persons table produced by the sampling cells above
pps