In [None]:
### Generate the control totals needed by PopulationSim (RSG)

In [None]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import re
from census import Census


In [None]:
class ACS5_downloader:
    """Download ACS 5-year variables at state, county, tract or block-group level.

    The geography level is implied by which selectors are supplied (see
    ``download``). State abbreviations and county names are translated to FIPS
    codes once, at construction time, using the Census national county table.

    Parameters
    ----------
    census_reader : object
        Client exposing ``acs5.get(variables, geo=...)``.
    states : list of str, str or '*'
        State abbreviations as they appear in the first column of the FIPS
        table (e.g. ``['MI']``), or ``'*'`` for every state.
    counties : list of str, str, '*' or None
        County names as they appear in the FIPS table (e.g.
        ``['Oakland County']``), ``'*'`` for every county, or ``None`` to stay
        at state level.
    tract_ids, blockgroup_ids : str or None
        Selectors inserted verbatim into the query (``'*'`` for all).
    """

    # Reference table with one row per county: state abbreviation, state FIPS,
    # county FIPS, county name, class code.
    FIPS_TABLE_URL = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"
    FIPS_TABLE_COLUMNS = ['state', 'state.fips', 'county.fips', 'county', 'type']

    def __init__(self, census_reader, states, counties = None, tract_ids = None, blockgroup_ids = None):
        self.state = states
        self.counties = counties
        self.tracts = tract_ids
        self.blockgroups = blockgroup_ids
        self.cread = census_reader
        # Replaces self.state / self.counties with comma-joined FIPS strings.
        self.fips_lookup()

    def fips_lookup(self):
        """Translate ``self.state`` / ``self.counties`` into comma-joined FIPS codes.

        Wildcards (``'*'``) and an empty county selection are left untouched.

        Raises
        ------
        ValueError
            If a requested state or county is not found in the FIPS table, or
            if counties are named while the state selector is ``'*'``.
        """
        # Series.isin rejects a bare string, so promote a single name to a list.
        if isinstance(self.state, str) and self.state != '*':
            self.state = [self.state]
        if isinstance(self.counties, str) and self.counties != '*':
            self.counties = [self.counties]

        named_states = self.state != '*'
        named_counties = bool(self.counties) and self.counties != '*'
        if not (named_states or named_counties):
            # Only wildcards were given: nothing to translate, skip the download.
            return
        if named_counties and not named_states:
            # County names are matched within the requested states, so they
            # cannot be resolved against a wildcard state.
            raise ValueError("county names can only be resolved when states are listed explicitly")

        fips_table = pd.read_csv(self.FIPS_TABLE_URL, header=None,
                                 names=self.FIPS_TABLE_COLUMNS, dtype=str)

        in_states = fips_table.state.isin(self.state)
        unknown_states = sorted(set(self.state) - set(fips_table.loc[in_states, 'state']))
        if unknown_states:
            raise ValueError("states not found in FIPS table: {}".format(unknown_states))

        if named_counties:
            county_rows = fips_table.loc[in_states & fips_table.county.isin(self.counties)]
            # An unmatched name would otherwise shrink (or empty) the selection
            # silently, and an empty selection makes download() fall back to
            # state level.
            unknown_counties = sorted(set(self.counties) - set(county_rows.county))
            if unknown_counties:
                raise ValueError("counties not found in FIPS table for the requested states: {}".format(
                    unknown_counties))
            self.counties = ','.join(county_rows['county.fips'].unique())

        self.state = ','.join(fips_table.loc[in_states, 'state.fips'].unique())

    def state_download(self, vars):
        """Request ``vars`` for the selected state(s); returns the client's response."""
        return self.cread.acs5.get(vars, geo={'for': 'state:{}'.format(self.state)})

    def county_download(self, vars):
        """Request ``vars`` for the selected counties within the selected state(s)."""
        return self.cread.acs5.get(vars, geo={'for': 'county:{}'.format(self.counties),
                                              'in': 'state:{}'.format(self.state)})

    def tract_download(self, vars):
        """Request ``vars`` for the selected tracts within the selected state/counties."""
        return self.cread.acs5.get(vars, geo={'for': 'tract:{}'.format(self.tracts),
                                              'in': 'state:{} county:{}'.format(self.state, self.counties)})

    def blockgroup_download(self, vars):
        """Request ``vars`` for block groups, issuing one query per county.

        Returns the concatenation of the per-county responses.
        """
        records = []
        for county_fips in self.counties.split(','):
            records.extend(self.cread.acs5.get(
                vars,
                geo={'for': 'block group:{}'.format(self.blockgroups),
                     'in': 'state:{} county:{} tract:{}'.format(self.state, county_fips, self.tracts)}))
        return records

    def download(self, variables):
        """Download ``variables`` at the finest level for which a selector was given.

        No counties -> state level; counties but no tracts -> county level;
        tracts but no block groups -> tract level; otherwise block-group level.

        Returns
        -------
        pandas.DataFrame
            One row per record returned by the client.
        """
        if not self.counties:
            records = self.state_download(variables)
        elif not self.tracts:
            records = self.county_download(variables)
        elif not self.blockgroups:
            records = self.tract_download(variables)
        else:
            records = self.blockgroup_download(variables)
        return pd.DataFrame(records)
 

In [None]:
# Study area: state abbreviation(s) and county names, spelled as in the Census
# FIPS reference table that ACS5_downloader matches them against.
state, counties = ['MI'], ['Oakland County', 'Washtenaw County']

# Census API client. Export CENSUS_API_KEY in the environment instead of
# committing a key to the notebook; the placeholder is kept as a fallback so
# replacing it in place still works.
c = Census(os.environ.get('CENSUS_API_KEY', 'add_your_key'), year=2017)

# Inputs: geography crosswalk (read later for the PUMA lookup) and the controls
# table extended with an "acs_variables" column.
geo_cross_csv = "../data/sem_geo_cross_walk.csv"
control_csv = "../preprocess/controls_pre.csv"


In [None]:
# Download the Census marginal variables referenced by the controls_pre table
# (same as the "controls" table, with an additional "acs_variables" field).
# "acs_variables" holds evaluation expressions that combine Census API variable
# codes with arithmetic operations.
dfc = pd.read_csv(control_csv)

# Supported control geographies:
#   geography -> (tract selector, block-group selector, geography columns of the download)
geo_levels = {
    'BLKGRP': ("*", "*", ['state', 'county', 'tract', 'block group']),
    'TRACT': ("*", None, ['state', 'county', 'tract']),
}

dic_margs = {}
for geo, dfgeo in dfc.groupby('geography'):
    if geo not in geo_levels:
        # Without this check an unsupported geography would silently reuse the
        # downloader and index columns left over from the previous iteration
        # (or fail with a NameError if it came first).
        raise ValueError("unsupported control geography {!r}; expected one of {}".format(
            geo, sorted(geo_levels)))
    tract_ids, blockgroup_ids, geo_cols = geo_levels[geo]

    # Unique ACS variable codes mentioned anywhere in this geography's
    # expressions; sorted so the request is the same on every run.
    full_vars = sorted(set(re.findall(r'B[0-9]{5}[A-Z]{0,1}_[0-9]{3}E',
                                      str(list(dfgeo.acs_variables)))))

    ac5 = ACS5_downloader(c, state, counties, tract_ids, blockgroup_ids)
    dic_margs[geo] = ac5.download(full_vars).set_index(geo_cols)


In [None]:
# Compile the downloaded Census marginals into the popsim control variables.
for geo, dfg in dfc.groupby('geography'):
    # Make every downloaded column numeric and treat missing values as 0.
    # Done once per geography instead of once per control row.
    margs = dic_margs[geo].astype(float).fillna(0)

    for control_field, expression in zip(dfg.control_field, dfg.acs_variables):
        print(expression)
        # NOTE(review): the expression comes from the controls CSV and is passed
        # to DataFrame.eval; only run this against a trusted controls file.
        margs[control_field] = margs.eval(expression.replace('"', ''))
        # Normalise the new field right away so that every control field,
        # including the last one of the geography, is float with NaN -> 0
        # before later expressions or the export use it.
        margs[control_field] = margs[control_field].astype(float).fillna(0)

    dic_margs[geo] = margs[list(dfg.control_field)]  # keep only control fields

In [None]:
# Add unique geography ids and the PUMA of each geography, then write one CSV
# of control totals per geography.
dfcross = pd.read_csv(geo_cross_csv, dtype = str)  # same file for every geography: read once

fips_widths = (2, 3, 6)  # zero-padded widths of the state, county and tract codes
id_column_by_depth = {3: 'TRACTID', 4: 'BLKGRPID'}  # index depth -> id column to build
county_tag = '_'.join([name[:3].lower() for name in counties])

for geo, dfm in dic_margs.items():
    id_col = id_column_by_depth.get(dfm.index.nlevels)
    if id_col is not None:
        # Concatenate the padded state/county/tract codes; a block-group code,
        # when present, is appended as-is.
        dfm[id_col] = [
            ''.join(code.zfill(width) for code, width in zip(key, fips_widths))
            + ''.join(str(code) for code in key[len(fips_widths):])
            for key in dfm.index
        ]
        # The crosswalk carries both TRACTID and BLKGRPID, so it presumably has
        # several rows per tract; de-duplicate the lookup so the left merge
        # cannot repeat control rows.
        puma_lookup = dfcross[[id_col, 'PUMA']].drop_duplicates()
        dfm = pd.merge(dfm.reset_index(), puma_lookup, on = id_col, how = 'left')

    dfm.columns = [col.upper() for col in dfm.columns]
    dfm.to_csv('{}_control_totals_{}.csv'.format(county_tag, geo.lower()))
    