In [None]:
### Generate the control totals needed by PopulationSim (RSG)


In [None]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np
import re
from census import Census
import yaml


In [None]:
# Load the region / group-quarters configuration read by the cells below.
# The file is opened in a `with` block so the handle is closed after parsing.
# NOTE(review): yaml.Loader can construct arbitrary Python objects; it is kept so
# existing configs load unchanged. Prefer yaml.safe_load if the config is plain YAML.
with open("./region_gq.yml", "r") as conf_file:
    conf = yaml.load(conf_file, Loader=yaml.Loader)


In [None]:
# The Census API key is read from the environment so the secret is not stored
# in the notebook (a key that was previously committed should be revoked).
census_api_key = os.environ.get("CENSUS_API_KEY")
if not census_api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to a valid Census API key "
        "before running this notebook."
    )
c = Census(census_api_key, year=2010).sf1

# Region settings and input locations taken from the YAML configuration.
prj_name = conf["region"]["name"]
state = conf["region"]["state"][0]
counties = conf["region"]["counties"]
pre_folder = conf["preprocess"]["folder"]
gq_buildings = conf["preprocess"]["gq_buildings"]
h_pums_csv = "../" + pre_folder + conf["preprocess"]["h_pums_csv"]
p_pums_csv = "../" + pre_folder + conf["preprocess"]["p_pums_csv"]

# Output files written by the cells below.
# `ouptut_geo_cross` keeps its (misspelled) name because other cells reference it.
ouptut_geo_cross = "../{}{}_geo_cross_walk.csv".format(pre_folder, prj_name)
output_control = "../{}{}_control_totals_.csv".format(pre_folder, prj_name)
output_seed_hhs = "../{}{}_seed_households.csv".format(pre_folder, prj_name)
output_seed_persons = "../{}{}_seed_persons.csv".format(pre_folder, prj_name)

In [None]:
class census_downloader:
    """Download Census variables at state, county, tract or block-group level.

    The level used by ``download`` follows from which identifiers are set:
    no counties -> state, no tracts -> county, no block groups -> tract,
    otherwise block group.

    Parameters
    ----------
    census_reader : object
        Reader exposing ``get(fields, geo=...)``.
    states : str, int or list
        State FIPS code(s), or values of the FIPS table's ``state`` column
        (resolved to FIPS codes through ``fips_lookup``).
    counties : str, int, list or None
        County FIPS code(s), county names (when states are non-numeric),
        ``'*'`` for all counties, or None.
    tract_ids, blockgroup_ids : str or None
        Tract / block-group identifiers, typically ``'*'``.
    """

    # Zero-padded widths of state and county FIPS codes.
    _FIPS_WIDTHS = (2, 3)
    _FIPS_TABLE_URL = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"

    def __init__(self, census_reader, states, counties = None, tract_ids = None, blockgroup_ids = None):
        self.states = states
        self.counties = counties
        self.tracts = tract_ids
        self.blockgroups = blockgroup_ids
        self.cread = census_reader
        # Replaces self.states / self.counties with comma-joined FIPS strings.
        self.udpate_states_counties([self.states, self.counties])

    @staticmethod
    def _is_wildcard(geo):
        """Return True when ``geo`` is the ``'*'`` wildcard, bare or as a one-item list/tuple."""
        if isinstance(geo, str):
            return geo == '*'
        if isinstance(geo, (list, tuple)):
            return len(geo) == 1 and geo[0] == '*'
        return False

    def state_download(self, vars):
        """Request ``vars`` for the configured state(s)."""
        return self.cread.get(vars, geo={'for': 'state:{}'.format(self.states)})

    def county_download(self, vars):
        """Request ``vars`` for the configured counties within the state(s)."""
        return self.cread.get(vars, geo={'for': 'county:{}'.format(self.counties),
                                        'in': 'state:{}'.format(self.states)})

    def tract_download(self, vars):
        """Request ``vars`` for the configured tracts within the state(s) and counties."""
        return self.cread.get(vars, geo={'for': 'tract:{}'.format(self.tracts),
                                    'in': 'state:{} county:{}'.format(self.states, self.counties)})

    def blockgroup_download(self, vars):
        """Request ``vars`` for block groups, issuing one request per county.

        Returns the concatenated list of records from all requests.
        """
        records = []
        for county in self.counties.split(','):
            records += self.cread.get(
                vars,
                geo={'for': 'block group:{}'.format(self.blockgroups),
                     'in': 'state:{} county:{} tract:{}'.format(self.states, county, self.tracts)})
        return records

    def fips_lookup(self, states, counties = None):
        """Translate state (and optionally county) labels into FIPS codes.

        Reads the national county FIPS table from census.gov (network access
        required) and matches ``states`` against its ``state`` column and
        ``counties`` against its ``county`` column.

        Returns
        -------
        tuple
            ``(state_fips_list, county_fips_list)``; the second item is None
            when ``counties`` is None, empty or the ``'*'`` wildcard.
        """
        if self._is_wildcard(counties):
            counties = None
        if isinstance(states, str):
            states = [states]
        fips_table = pd.read_csv(
                self._FIPS_TABLE_URL,
                header=None, names=['state','state_fips', 'county_fips', 'county' ,'type'],
                dtype=str)
        selected = fips_table[fips_table['state'].isin(states)]
        if counties:
            if isinstance(counties, str):
                counties = [counties]
            selected = selected[selected['county'].isin(counties)]
            return list(selected.state_fips.unique()), list(selected.county_fips.unique())
        return list(selected.state_fips.unique()), None

    def udpate_states_counties(self, geos):
        """Normalise ``geos = [states, counties]`` in place and store the result.

        Each entry ends up as None, the ``'*'`` wildcard, or a comma-joined
        string of zero-padded FIPS codes (2 digits for states, 3 for counties).
        Non-numeric states are resolved through ``fips_lookup``. The wildcard is
        kept verbatim: padding it would produce ``'0*'`` / ``'00*'``.
        """
        for i in (0, 1):
            if geos[i] is None:
                continue
            if self._is_wildcard(geos[i]):
                geos[i] = '*'
            elif isinstance(geos[i], (list, tuple)):
                geos[i] = [str(x) for x in geos[i]]
            else:
                geos[i] = [str(geos[i])]

        # Only a non-empty list of non-numeric states needs the FIPS lookup.
        if isinstance(geos[0], list) and geos[0] and not geos[0][0].isdigit():
            wildcard_counties = geos[1] == '*'
            geos[0], geos[1] = self.fips_lookup(geos[0], geos[1])
            if wildcard_counties:
                # fips_lookup reports "no county filter" as None; keep the request for all counties.
                geos[1] = '*'

        for i in (0, 1):
            if geos[i] and geos[i] != '*':
                geos[i] = ','.join(str(x).zfill(self._FIPS_WIDTHS[i]) for x in geos[i])
        self.states = geos[0]
        self.counties = geos[1]

    def download(self, variables):
        """Download ``variables`` at the finest configured level and return a DataFrame."""
        if not self.counties:
            downv = self.state_download(variables)
        elif not self.tracts:
            downv = self.county_download(variables)
        elif not self.blockgroups:
            downv = self.tract_download(variables)
        else:
            downv = self.blockgroup_download(variables)
        return pd.DataFrame(downv)
 

# make GQ cross walk

In [None]:
# Build the building-level geographic cross walk for group quarters (GQ).
# The model store is opened read-only and closed as soon as both tables are read.
with pd.HDFStore("../" + pre_folder + conf["preprocess"]["model_hdf"], 'r') as stm:
    blgs = pd.merge(stm['buildings'][['parcel_id','b_zone_id','b_city_id']],
                    stm['parcels'][['semmcd', 'county_id','census_bg_id']],
                    left_on='parcel_id', right_index=True, how='left')
# presumably the buildings index is named 'building_id' (it is renamed below) - confirm
blgs = blgs.reset_index()

blgs.rename(columns = {'b_zone_id':'ZONE','b_city_id':'B_CITY_ID',
                        'census_bg_id':'BLKGRPCE', 'parcel_id':'PARCEL',
                        'semmcd': 'MCD', 'county_id':'COUNTYFP',
                        'building_id':'BUILDING_ID'}, inplace=True)
# Missing parcel attributes become 0 so every column can be cast to int.
blgs = blgs.fillna(0)
blgs = blgs.astype(int)
# NOTE(review): the state FIPS code is hard-coded rather than taken from the config.
blgs['STATEFP'] = '26'
blgs['COUNTYFP'] = blgs['COUNTYFP'].astype(str).str.zfill(3)
blgs['BLKGRPCE'] = blgs['BLKGRPCE'].astype(str).str.zfill(7)
# The tract code is the block-group code without its last digit.
blgs['TRACTCE'] = blgs['BLKGRPCE'].str[:-1]
blgs['REGION'] = '1'
blgs['BLKGRPID'] = blgs['STATEFP'] + blgs['COUNTYFP'] + blgs['BLKGRPCE']
blgs['TRACTID'] = blgs['STATEFP'] + blgs['COUNTYFP'] + blgs['TRACTCE']

df_tract_puma = pd.read_csv(conf["geographies"]["tract_puma_file"], dtype=str)

# Attach the PUMA of each building's tract.
GQ_cross_walk = pd.merge(blgs, df_tract_puma, on=['STATEFP', 'COUNTYFP', 'TRACTCE'], how='left')
GQ_cross_walk.rename(columns={'PUMA5CE':'PUMA'}, inplace = True)

In [None]:
# Show buildings without a block group. Missing values were filled with 0 before the
# codes were zero-padded into strings, so test the numeric block-group code;
# BLKGRPID is a concatenated string and never equals the integer 0.
blgs.loc[blgs.BLKGRPCE.astype(int) == 0]

# building controls


In [None]:
# Read the GQ building inventory and label every record with a building-level GQ type.
gq_buildings = pd.read_csv("../" + pre_folder + conf["preprocess"]["gq_buildings"])
gq_code = gq_buildings.GQ_CODE

# Codes 100-699 fall into six bands of 100 (types 1-6); codes of 700 and above are type 7.
# Codes below 100 receive no type.
for type_no, band_start in enumerate(range(100, 700, 100), start=1):
    in_band = (gq_code >= band_start) & (gq_code < band_start + 100)
    gq_buildings.loc[in_band, 'type'] = 'GQTYPE{}B'.format(type_no)
gq_buildings.loc[gq_code >= 700, 'type'] = 'GQTYPE7B'

In [None]:
# Residents per building and GQ type; combinations that do not occur count as 0.
type_totals = pd.crosstab(index=gq_buildings.BUILDING_ID, columns=gq_buildings['type'],
                          values=gq_buildings.RESIDENT_COUNT, aggfunc='sum').fillna(0)
# HHBASE is the building total over all GQ types (computed before BUILDING_ID becomes a column).
type_totals['HHBASE'] = type_totals.sum(axis=1, skipna=True)

# Attach block group, region and PUMA; buildings without a PUMA are dropped.
building_geo = GQ_cross_walk[['BUILDING_ID','BLKGRPID', 'REGION','PUMA']]
bldgctrl = type_totals.reset_index().merge(building_geo, on='BUILDING_ID', how='left')
bldgctrl = bldgctrl[bldgctrl.PUMA.notnull()]
bldgctrl.to_csv(output_control.replace(".csv", "building.csv"))

In [None]:
# Block groups that contain at least one GQ building control.
BLKGRPlist = bldgctrl.BLKGRPID.unique()

# Restrict the cross walk to those block groups and write it, keyed by building.
in_gq_blockgroup = GQ_cross_walk.BLKGRPID.isin(BLKGRPlist)
GQ_cross_walk = GQ_cross_walk[in_gq_blockgroup]
cross_walk_columns = ['BUILDING_ID', 'COUNTYFP', 'ZONE', 'BLKGRPCE', 'PUMA','REGION','BLKGRPID', 'TRACTID']
GQ_cross_walk[cross_walk_columns].set_index('BUILDING_ID').to_csv(ouptut_geo_cross)

# Census block group controls

In [None]:
# List the Census marginal variables from the controls_pre table (same as the "controls"
# table with an additional "acs_variables" field). "acs_variables" holds evaluation
# expressions that combine Census API variables with arithmetic operations.
dfc = pd.read_csv("../" + pre_folder + conf["preprocess"]["pre_control"])
dfc = dfc.loc[dfc.acs_variables.notnull()]

# Per geography: the (tract_ids, blockgroup_ids) wildcards for the downloader and the
# identifier columns that index the downloaded table.
geo_settings = {
    'BLKGRP': (("*", "*"), ['state', 'county', 'tract', 'block group']),
    'TRACT': (("*",), ['state', 'county', 'tract']),
}
census_var_pattern = re.compile(r'P[0-9]{3}[A-Z]{0,1}[0-9]{3}')

dic_margs = {}
for geo, dfgeo in dfc.groupby('geography'):
    # Distinct Census variable names used by this geography's expressions, in a stable order.
    full_vars = sorted(set(census_var_pattern.findall(str(list(dfgeo.acs_variables)))))
    if not full_vars:
        continue
    if geo not in geo_settings:
        # Fail loudly; otherwise the previous geography's downloader would be reused.
        raise ValueError(
            "Unsupported geography {!r}; expected one of {}".format(geo, sorted(geo_settings)))
    level_ids, geo_cols = geo_settings[geo]
    ac5 = census_downloader(c, state, counties, *level_ids)
    dic_margs[geo] = ac5.download(full_vars).set_index(geo_cols)


In [None]:
# Compile the downloaded Census marginals into popsim control variables.
# NOTE: this replaces each dic_margs table by its control fields only, so the download
# cell has to be re-run before this cell can be run again.
for geo, dfg in dfc.groupby('geography'):
    if geo not in dic_margs:
        # Nothing was downloaded for this geography, so there is nothing to evaluate.
        continue
    for ind, r in dfg.iterrows():
        print(r['acs_variables'])
        dic_margs[geo] = dic_margs[geo].astype(float).fillna(0)
        # NOTE(review): the expression comes from the controls CSV and is evaluated by
        # DataFrame.eval; only use control tables from a trusted source.
        dic_margs[geo][r['control_field']] = dic_margs[geo].eval(r['acs_variables'].replace('"', ''))
    dic_margs[geo] = dic_margs[geo][list(dfg.control_field)] #keep only control fields

In [None]:
# Add unique geoids and PUMA, keep the geographies present in the GQ cross walk,
# and write one control file per geography.
# NOTE: later cells use `geo` and `dfm` as left by the last loop iteration.
dfcross = pd.read_csv(ouptut_geo_cross, dtype = str)
for geo, dfm in dic_margs.items():

    if dfm.index.nlevels == 3:
        # (state, county, tract) index -> tract-level table
        id_col = 'TRACTID'
        dfm[id_col] = ['{}{}{}'.format(l1.zfill(2),l2.zfill(3),l3.zfill(6))
                                                                for l1, l2, l3 in dfm.index]
    elif dfm.index.nlevels == 4:
        # (state, county, tract, block group) index -> block-group-level table
        id_col = 'BLKGRPID'
        dfm[id_col] = ['{}{}{}{}'.format(l1.zfill(2),l2.zfill(3),l3.zfill(6), l4)
                                                                for l1, l2, l3, l4 in dfm.index]
    else:
        raise ValueError(
            "Unexpected number of index levels ({}) for geography {!r}".format(dfm.index.nlevels, geo))

    dfcr = dfcross.drop_duplicates(id_col)
    dfm = pd.merge(dfm.reset_index(), dfcr[[id_col,'PUMA']], on = id_col, how = 'left')
    dfm.columns = [col.upper() for col in dfm.columns]
    # Filter on the id column this table actually has; tract tables have no BLKGRPID.
    keep_ids = BLKGRPlist if id_col == 'BLKGRPID' else dfcross[id_col].unique()
    dfm = dfm.loc[dfm[id_col].isin(keep_ids)]
    dfm.to_csv(output_control.replace(".csv", geo.lower() + ".csv"))
    

In [None]:
# Write the control file for the last processed geography without the GQTYPE1-7 columns.
gq_type_columns = ['GQTYPE1','GQTYPE2','GQTYPE3','GQTYPE4','GQTYPE5','GQTYPE6','GQTYPE7']
controls_without_gq_types = dfm.drop(columns=gq_type_columns)
controls_without_gq_types.to_csv(output_control.replace(".csv", geo.lower() + ".csv"))

In [None]:
# Write the full control table (all columns) to the same per-geography file.
geo_control_path = output_control.replace(".csv", geo.lower() + ".csv")
dfm.to_csv(geo_control_path)

In [None]:
# Display the base control-totals path from which the per-geography file names are derived.
output_control

# Adjust Census marginals

In [None]:
# Settings for the two GQ population groups handled below:
#   TYPE  - Census GQ type columns belonging to the group
#   POP   - group population column in the Census marginals
#   BPOP  - group population column derived from the building controls
#   BTYPE - building-level counterpart of each TYPE column (suffix 'B')
# GQTYPE6 is not part of either group.
dict_gq = {
    "INST": {
        'TYPE': ['GQTYPE1', 'GQTYPE2', 'GQTYPE3', 'GQTYPE4'],
        'POP': 'INSTPOP',
        'BPOP': 'INSTPOP2',
    },
    "NONI": {
        'TYPE': ['GQTYPE5', 'GQTYPE7'],
        'POP': 'NONIPOP',
        'BPOP': 'NONIPOP2',
    },
}
for gq_spec in dict_gq.values():
    gq_spec['BTYPE'] = [gq_type + 'B' for gq_type in gq_spec['TYPE']]

In [None]:
# Work on a copy so the adjustments in the following cells leave dfm untouched.
dfmsel2 = dfm.copy()

In [None]:
# Per group, collect the marginal columns whose name contains the group key.
# The first match is skipped - presumably the group's population total; confirm
# against the column order of the control table.
for gq_key in ('INST', 'NONI'):
    matching_columns = [name for name in dfmsel2.columns if gq_key in name]
    dict_gq[gq_key]['VARS'] = matching_columns[1:]
print(dict_gq['INST']['VARS'] , dict_gq['NONI']['VARS'])

In [None]:
# Building-based GQ residents per block group, one column per building GQ type.
building_type_columns = dict_gq["INST"]['BTYPE'] + dict_gq["NONI"]['BTYPE']
bldg_blk = bldgctrl.groupby('BLKGRPID')[building_type_columns].sum()

In [None]:
# Attach the building-based counts to the Census marginals and total them per group.
# NOTE: running this cell twice merges the building columns a second time.
dfmsel2 = dfmsel2.merge(bldg_blk, how='left', left_on='BLKGRPID', right_index=True)
for gq_spec in dict_gq.values():
    dfmsel2[gq_spec['BPOP']] = dfmsel2[gq_spec['BTYPE']].sum(axis=1)


In [None]:
# For every GQ type, average the Census variable shares (age, race, sex) over the block
# groups whose whole group population belongs to that single GQ type.
dic_mean = {}
for gq_key, gq_spec in dict_gq.items():
    print(gq_key)
    group_pop = dfmsel2[gq_spec['POP']]
    for col in gq_spec['TYPE']:
        print(col)
        only_this_type = (dfmsel2[col] > 0) & (dfmsel2[col] == group_pop)
        selind = dfmsel2.index[only_this_type]
        # Share of each variable in the group population, row by row.
        shares = dfmsel2.loc[selind, gq_spec['VARS']].div(dfmsel2.loc[selind, gq_spec['POP']], axis=0)
        dic_mean[col + 'B'] = shares.mean()


In [None]:
# Assign the means computed above to marginal rows with zero Census population, by building GQTYPE


In [None]:
# Rows with no Census population for a group but with building residents of some type
# get that type's mean shares added to the group's marginal columns.
for gq_key, gq_spec in dict_gq.items():
    print(gq_key)
    # Rows whose Census group population is zero, fixed before the columns are modified.
    zero_pop_rows = dfmsel2.index.isin(dfmsel2.index[dfmsel2[gq_spec['POP']] == 0])
    for building_col in gq_spec['BTYPE']:
        target_rows = dfmsel2.index[zero_pop_rows & (dfmsel2[building_col] > 0)]
        # One copy of the mean-share vector per target row.
        mean_block = np.repeat([dic_mean[building_col]], len(target_rows), axis=0)
        dfmsel2.loc[target_rows, gq_spec['VARS']] += mean_block

In [None]:
# Rescale each group's AGE / RACE / SEX marginals so every row sums to the building-based
# population, then replace the Census population and GQ type columns by the building values.
for gq_key, gq_spec in dict_gq.items():
    for dimension in ('AGE', 'RACE', 'SEX'):
        selc = [name for name in dfmsel2.columns if gq_key in name and dimension in name]
        row_totals = dfmsel2[selc].sum(axis=1)
        dfmsel2[selc] = dfmsel2[selc].div(row_totals, axis=0)
        # Missing values (e.g. 0 / 0 from rows without population) become 0.
        dfmsel2.fillna(0, inplace = True)
        dfmsel2[selc] = dfmsel2[selc].multiply(dfmsel2[gq_spec['BPOP']], axis=0)
    dfmsel2[gq_spec['POP']] = dfmsel2[gq_spec['BPOP']]

    for census_col, building_col in zip(gq_spec['TYPE'], gq_spec['BTYPE']):
        dfmsel2[census_col] = dfmsel2[building_col]

In [None]:
# Column totals of the adjusted table, displayed for a manual check.
dfmsel2.sum()

In [None]:
# List the columns whose name contains 'GQ'.
[c for c in dfmsel2.columns if ('GQ' in c)]

In [None]:
# Display all column names of the adjusted table.
dfmsel2.columns

In [None]:
# Preview the adjusted table without the GQ type columns and the building-based
# '...POP2' helper columns (dfmsel2 itself is not modified).
helper_columns = [name for name in dfmsel2.columns if 'GQ' in name or 'POP2' in name]
dfmsel2.drop(columns=helper_columns)

In [None]:
# Write the adjusted controls, without GQ type and '...POP2' helper columns, to the
# per-geography control file.
helper_columns = [name for name in dfmsel2.columns if 'GQ' in name or 'POP2' in name]
adjusted_controls = dfmsel2.drop(columns=helper_columns)
adjusted_controls.to_csv(output_control.replace(".csv", geo.lower() + ".csv"))

In [None]:
# Display the adjusted table, including the helper columns.
dfmsel2

# GQ seed HHs and persons

In [None]:
# Distinct PUMA codes in the cross walk as integers; a missing PUMA becomes 0.
puma_codes = GQ_cross_walk.PUMA.fillna(0)
puma_lst = puma_codes.astype(int).unique()

In [None]:
# Display the PUMA codes used to filter the PUMS seed records.
puma_lst

In [None]:
# Seed persons: PUMS person records in the region's PUMAs with RELP 16 or 17
# (presumably the group-quarters relationship codes - confirm against the PUMS dictionary).
gq_p_pums = pd.read_csv(p_pums_csv)
in_region = gq_p_pums.PUMA.isin(puma_lst)
has_gq_relp = gq_p_pums["RELP"].isin([16,17])
# hh_id repeats SERIALNO.
gq_p_pums = gq_p_pums[in_region & has_gq_relp].assign(hh_id=lambda persons: persons['SERIALNO'])
gq_p_pums.to_csv(output_seed_persons)

In [None]:
# Seed households: PUMS housing records in the region's PUMAs with TYPE > 1 and at
# least one person, weighted by the matching seed person's weight.
gq_h_pums = pd.read_csv(h_pums_csv, index_col="SERIALNO")
is_seed_record = gq_h_pums.PUMA.isin(puma_lst) & (gq_h_pums.TYPE > 1) & (gq_h_pums.NP > 0)
# hh_id repeats the SERIALNO index.
gq_h_pums = gq_h_pums[is_seed_record].assign(hh_id=lambda hhs: hhs.index.values)

person_weights = gq_p_pums[['SERIALNO', 'PWGTP']]
gq_h_pums = gq_h_pums.merge(person_weights, on = 'SERIALNO', how='left')
# The household weight is replaced by the person weight.
gq_h_pums = gq_h_pums.assign(WGTP=gq_h_pums['PWGTP'])
gq_h_pums.to_csv(output_seed_hhs)

In [None]:
# cm = dic_margs['BLKGRP'].copy()
# for k in ['POP', 'RACE', 'AGE', 'SEX']:
#     for c in ['INST', 'NONI']:
#         cols= []
#         for col in cm.columns:
#             if (k in col) & (c in col):
#                 cols.append(col)
#         print(cols)
#         print('target', target2015[c], 'before',cm[cols].sum().sum(), )
#         cm[cols] = round((cm[cols]/(cm[cols].sum().sum()) * target2015[c]), 0)
#         print('after', cm[cols].sum().sum())


In [None]:
# Open the pipeline store read-only for inspection: the default append mode would
# create the file when it is missing and open it for writing.
st = pd.HDFStore("../output/pipeline.h5", mode="r")


In [None]:
# List the keys stored in the pipeline HDF store.
st.keys()

In [None]:
# Show the current working directory; the relative paths used in this notebook are resolved against it.
%pwd