In [1]:
# Produce a synthpop output summary table in the same format as the populationsim (popsim) summary table.
# Required inputs: synthpop households and persons, the populationsim controls table, and the popsim summary table.

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Labels for the study area and the geography level; both are used to build
# the name of the CSV written at the end of the notebook.
region, geo = "oakland", "BLKGRP"

# Inputs from the populationsim run: the controls table and the per-geography summary.
popsim_controls, popsim_summary = 'configs/controls.csv', 'output/summary_BLKGRP.csv'

# Inputs from the synthpop run: synthesized households and persons.
synpop_out_hh, synpop_out_pop = 'synthpop/oak_all_households.csv', 'synthpop/oak_all_persons.csv'


In [4]:
# Add a query_str column by rewriting each control expression into a string
# that DataFrame.query() can evaluate against the households / persons tables.

# popsim_controls starts out as a file path and is rebound to the loaded table;
# the guard keeps the cell re-runnable once the name already holds the DataFrame.
if not isinstance(popsim_controls, pd.DataFrame):
    popsim_controls = pd.read_csv(popsim_controls)

# Literal substitutions: strip the table prefixes, spell infinity the way
# query() understands it, and turn `.isin(...)` into the `in` operator.
repd = {"households.":"", "persons.": "", "np.inf":"inf",".isin":" in " }

query_str = popsim_controls['expression']
for old, new in repd.items():
    # regex=False: the keys contain '.', which would act as a wildcard if the
    # pattern were interpreted as a regular expression (the default differs
    # between pandas versions).
    query_str = query_str.str.replace(old, new, regex=False)
popsim_controls['query_str'] = query_str

In [5]:
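# (Disabled) From the popsim controls, extract the variables needed in households and persons.
# Note: the commented-out code below uses `re`, which is not imported in the imports cell.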
# vsplit = re.split('[^a-zA-Z0-9]', " ".join(popsim_controls['expression']))
# indhh = [i+1 for i, x in enumerate(vsplit) if x == "households"]
# vhh = set([vsplit[x] for x in indhh])
# indpp = [i+1 for i, x in enumerate(vsplit) if x == "persons"]
# vpp = set([vsplit[x] for x in indpp])
# print(vhh, vpp)

In [6]:

# Load the synthesized households and persons, reading only the listed columns.
hcols = [
    'Unnamed: 0', 'serialno', 'RT', 'DIVISION', 'puma10', 'REGION', 'ST',
    'ADJHSG', 'ADJINC', 'WGTP', 'NP', 'R18', 'HINCP', 'VEH', 'TYPE',
    'race_of_head', 'hispanic_head', 'age_of_head', 'workers',
    'hh_age_of_head', 'hh_cars', 'hh_children', 'hh_income',
    'hh_race_of_head', 'hh_size', 'hh_workers', 'cat_id',
    'state', 'county', 'tract', 'block group',
]

pcols = [
    'RT', 'serialno', 'DIVISION', 'SPORDER', 'puma10', 'REGION', 'ST',
    'ADJINC', 'RELP', 'PWGTP', 'AGEP', 'RAC1P', 'HISP', 'ESR', 'SEX',
    'puma00', 'person_age', 'person_sex', 'race', 'cat_id', 'hh_id',
]

# Upper-case every column name so both tables can be referenced uniformly.
households = pd.read_csv(synpop_out_hh, usecols = hcols).rename(columns = str.upper)
persons = pd.read_csv(synpop_out_pop, usecols = pcols).rename(columns = str.upper)

# The first column read from the household file becomes the household index
# (presumably the unnamed id column -- confirm against the file layout).
first_hh_col = households.columns[0]
households = households.set_index(first_hh_col)

print ('hhs', len(households), 'pop', len(persons))

hhs 499379 pop 1182819


In [7]:
# prepare HH and Persons variables needed for query

# Pack state, county, tract and block group into a single integer id:
# county occupies 3 digits, tract 6 digits and block group the last digit.
households['BLKGRP'] = (
    households.STATE * 10000000000
    + households.COUNTY * 10000000
    + households.TRACT * 10
    + households['BLOCK GROUP']
)

# Attach each person's block group via the household id. Any BLKGRP column left
# from a previous run of this cell is dropped first so a re-run does not
# produce BLKGRP_x / BLKGRP_y.
persons = pd.merge(
    persons.drop(columns = 'BLKGRP', errors = 'ignore'),
    households[['BLKGRP']],
    left_on = 'HH_ID', right_index = True, how = 'left',
)

# Persons with RELP == 0 are treated here as the household head.
heads = persons.loc[persons.RELP == 0].groupby('HH_ID')

hage = heads['AGEP'].max()
households.loc[hage.index, 'AGEHOH'] = hage.values

# add HRACE to PUMS sample
# Align on the household index instead of assigning raw values positionally:
# positional assignment is only right when households is sorted by id and every
# household has a head record; otherwise it fails or mislabels households.
head_race = heads['RAC1P'].max().reindex(households.index)
# RAC1P 1 -> 1, 2 -> 2, 6 -> 3; every other value (or no head record) -> 4.
households["HRACE"] = head_race.map({1: 1, 2: 2, 6: 3}).fillna(4).astype(int)

# add HHISP to PUMS sample
head_hisp = heads['HISP'].max().reindex(households.index)
# HISP > 1 -> 1, otherwise (including no head record) 0.
households["HHISP"] = (head_hisp > 1).astype(int)

# add HWORKERS to PUMS sample
# Count persons per household whose ESR is in [1, 2, 4, 5]; households with no
# such person count as 0, and the category is capped at 2 (meaning "2 or more").
workers = persons.loc[persons.ESR.isin([1,2,4,5])].groupby('HH_ID').size()
households["HWORKERS"] = workers.reindex(households.index, fill_value = 0).clip(upper = 2)


In [8]:
#summarized by BLKGRP
# One row per block group that appears in the synthesized households.
dfgrp = pd.DataFrame(households['BLKGRP'].unique(), columns = ['BLKGRP'])
dfgrp = dfgrp.set_index('BLKGRP')

# Resolve the seed_table name from the controls file through an explicit lookup
# rather than eval(): eval would execute whatever text the CSV contains, and a
# typo would surface as an unrelated NameError.
seed_tables = {"households": households, "persons": persons}

for ind, r in popsim_controls.iterrows():
    if r.seed_table not in seed_tables:
        raise ValueError(
            "unknown seed_table %r for target %r; expected one of %s"
            % (r.seed_table, r.target, sorted(seed_tables))
        )
    tb = seed_tables[r.seed_table]
    # Count the rows matching this control's expression within each block group;
    # block groups with no matching rows come back as NaN after alignment.
    dfgrp[r.target + '_synpop'] = tb.query(r.query_str).groupby('BLKGRP').size()

# No matching rows means a count of zero.
dfgrp = dfgrp.fillna(0)

In [9]:
#merge Census marginal controls and synthpop summary and compute differences
dfsum_popsim = pd.read_csv(popsim_summary, index_col = 'id')

# Field names are the summary columns that carry the '_control' marker.
flds = [x.replace('_control', '') for x in dfsum_popsim.columns.values if '_control' in x]
controls = [ x + '_control' for x in flds]
dfsum_popsim = dfsum_popsim[controls]

# Take only the synthpop count columns from dfgrp so that re-running this cell
# (when dfgrp already holds the merged result) gives the same table again.
synpop_cols = [c for c in dfgrp.columns if c.endswith('_synpop')]

# Left join: keep every geography that has controls. With the default inner
# join, geographies without any synthesized household would silently vanish
# from the summary instead of showing a synthpop count of 0.
dfgrp = pd.merge(dfsum_popsim, dfgrp[synpop_cols],
                 left_index = True, right_index = True, how = 'left')
dfgrp = dfgrp.fillna({c: 0 for c in synpop_cols})

for fld in flds:
    dfgrp[fld + "_diff"] = dfgrp[fld + "_synpop"] - dfgrp[fld + "_control"]

# Any remaining non-finite or missing value is written out as 0.
dfgrp = dfgrp.replace([np.inf, -np.inf], np.nan).fillna(0)
dfgrp.index.name = 'geography'

dfgrp.to_csv(region + '_' + geo + '_summary.csv')