In [1]:
### Generate the control totals needed by PopulationSim (RSG)

In [2]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import re
from census import Census


In [3]:
# Target geography: state postal abbreviation and county name. Both are compared
# verbatim against the `state` / `county` columns of the lookup table at fips_url.
state, county = 'MI', 'Oakland County'

# Census API client for the 2017 vintage. The key is read from the CENSUS
# environment variable (a KeyError here means the variable is not set).
# NOTE(review): a literal API key used to sit in a comment on the next line; it was
# removed and should be treated as compromised and rotated. Never paste keys here.
c=Census(os.environ["CENSUS"], year=2017)

# Input files (relative to the notebook's working directory) and the FIPS lookup URL.
geo_cross_csv = "data/sem_geo_cross_walk.csv"
control_csv = "preprocess/controls_pre.csv"
fips_url = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"


In [4]:
def download_tract(fields, state_fips, county_fips, tract_fips = None):
    """Download tract-level marginals through the module-level client ``c``.

    Args:
        fields: variable names, passed through unchanged to ``c.acs5.get``.
        state_fips: state FIPS code, formatted into the ``state:`` predicate.
        county_fips: county FIPS code, formatted into the ``county:`` predicate.
        tract_fips: tract code; ``None`` (the default) requests every tract
            in the county by using the ``*`` wildcard.

    Returns:
        Whatever ``c.acs5.get`` returns for the query.
    """
    # Identity test: `is None` is the idiomatic check and, unlike `== None`,
    # cannot be fooled by an argument type that overrides __eq__.
    if tract_fips is None:
        tract_fips = '*'

    geo = {
        'for': 'tract:{}'.format(tract_fips),
        'in': 'state:{} county:{}'.format(state_fips, county_fips),
    }
    return c.acs5.get(fields, geo=geo)

In [5]:
def download_blockgroup(fields, state_fips, county_fips, tract_fips = None, blockgroup_fips = None):
    """Download block-group-level marginals through the module-level client ``c``.

    Args:
        fields: variable names, passed through unchanged to ``c.acs5.get``.
        state_fips: state FIPS code, formatted into the ``state:`` predicate.
        county_fips: county FIPS code, formatted into the ``county:`` predicate.
        tract_fips: tract code; ``None`` (the default) selects every tract, and
            in that case ``blockgroup_fips`` is ignored and also set to ``*``.
        blockgroup_fips: block group code within ``tract_fips``; ``None``
            selects every block group of that tract.

    Returns:
        Whatever ``c.acs5.get`` returns for the query.
    """
    # Identity tests (`is None`) instead of `== None`: idiomatic, and not
    # affected by argument types that override __eq__.
    if tract_fips is None:
        # Without a tract there is nothing to scope a block group to.
        tract_fips, blockgroup_fips = '*', '*'
    elif blockgroup_fips is None:
        blockgroup_fips = '*'

    geo = {
        'for': 'block group:{}'.format(blockgroup_fips),
        'in': 'state:{} county:{} tract:{}'.format(state_fips, county_fips, tract_fips),
    }
    return c.acs5.get(fields, geo=geo)


In [6]:
def get_marginals(dic_vars, state_fips = 26, county_fips = None):
    """Download block group and/or tract marginal summaries from Census.

    Args:
        dic_vars: dict keyed by geography ('BLKGRP' or 'TRACT'); each value is
            an iterable of variable-name chunks, one download request per chunk.
        state_fips: state FIPS code handed to the download helper.
        county_fips: county FIPS code handed to the download helper.
            NOTE(review): the helpers in this notebook format this value
            straight into the query, so the default ``None`` would be sent as
            'county:None' -- callers should always pass a county.

    Returns:
        dict mapping each geography key to a float DataFrame indexed by the
        geography id columns, with one column per downloaded variable and
        missing values replaced by 0.

    Raises:
        ValueError: if ``dic_vars`` contains a key other than 'BLKGRP'/'TRACT'.
    """
    geos = ['state', 'county', 'tract', 'block group']
    # geography key -> (download helper, id columns used as the frame index)
    downloaders = {
        'BLKGRP': (download_blockgroup, geos),
        'TRACT': (download_tract, geos[:3]),
    }

    # Fail before any request is made. Previously an unknown key either hit an
    # unbound local or silently reused the previous geography's downloader.
    unknown = [k for k in dic_vars if k not in downloaders]
    if unknown:
        raise ValueError(
            'unsupported geography key(s) {}; expected one of {}'.format(
                unknown, sorted(downloaders)))

    geo_margs = {}
    for k, v in dic_vars.items():
        download_geo, geoids = downloaders[k]

        dfms = []
        for fields in v:
            dfm = pd.DataFrame.from_dict(download_geo(fields, state_fips, county_fips))
            # Index by geography ids so the per-chunk frames align on concat.
            dfm = dfm.set_index(geoids).astype(float).fillna(0)
            dfms.append(dfm)
        # Chunks hold different variables for the same rows: join column-wise.
        geo_margs[k] = pd.concat(dfms, axis=1)

    return geo_margs


In [7]:
def makechunks(lst, csize):
    """Yield consecutive slices of ``lst`` holding at most ``csize`` items each.

    Used to split a variable list into sized chunks; the Census API default
    limit is 50.
    """
    starts = range(0, len(lst), csize)
    yield from (lst[begin:begin + csize] for begin in starts)
        

In [8]:
# List the Census marginal variables referenced by the controls_pre table (the same
# as the "controls" table, plus an extra "acs_variables" field).
# The "acs_variables" field holds evaluation expressions that combine Census API
# variable names with arithmetic operators.
dfc = pd.read_csv(control_csv)

# Census API variable names as they appear inside the expressions, e.g. B25007_003E
# or B25003I_001E (optional single-letter table suffix).
acs_var_pattern = re.compile(r'B[0-9]{5}[A-Z]{0,1}_[0-9]{3}E')

dic_vars = {}
for geo, dfgeo in dfc.groupby('geography'):
    # sorted(set(...)) de-duplicates and fixes the order, so the chunking (and the
    # downloaded column order) is the same on every run. The list is no longer
    # named `vars`, which shadowed the builtin of that name.
    acs_vars = sorted(set(acs_var_pattern.findall(str(list(dfgeo.acs_variables)))))
    # One chunk per API request; 50 is the per-request variable limit noted on makechunks.
    dic_vars[geo] = list(makechunks(acs_vars, 50))



In [9]:
# look up state and county fips
# The FIPS columns are read as strings: parsed as integers they lose their leading
# zeros (e.g. state '06' -> 6, county '001' -> 1), and these values are formatted
# directly into the Census API geography predicates downstream.
fips_table = pd.read_csv(fips_url, header=None,
                         names=['state','state.fips', 'county.fips', 'county' ,'type'],
                         dtype={'state.fips': str, 'county.fips': str})
# Exact match on state abbreviation and county name from the config cell.
geofips = fips_table.loc[(fips_table.state == state ) & (fips_table.county == county)]


In [10]:
# download marginal variables by geo fips
# Fail early with a readable message instead of leaving `final_marg` undefined
# when the state/county pair matched nothing in the FIPS table.
if geofips.empty:
    raise ValueError('no FIPS record found for {!r} in state {!r}'.format(county, state))

# Collect per-geography frames across ALL matched counties. The accumulator lives
# outside the loop: re-creating it per row kept only the last county's data.
marg_parts = {k: [] for k in dic_vars.keys()}
for ind, r in geofips.iterrows():
    print(r['state.fips'], r['county.fips'])
    geo_margs = get_marginals(dic_vars, r['state.fips'], r['county.fips'])
    for k, v in geo_margs.items():
        marg_parts[k].append(v)

# One frame per geography key, counties stacked row-wise.
final_marg = {k: pd.concat(parts, axis=0) for k, parts in marg_parts.items()}
    
    

26 125


In [11]:
# compile Census marginals to popsim control variables
# NOTE(security): the expressions come from control_csv and are run through
# DataFrame.eval -- only use a controls file you trust.
for geo, dfg in dfc.groupby('geography'):
    marg = final_marg[geo]
    # Evaluate in file order and store each result as a column right away, so the
    # frame the next expression is evaluated against already holds earlier controls.
    for control_field, expr in zip(dfg['control_field'], dfg['acs_variables']):
        print(expr)
        # Some expressions are wrapped in double quotes in the CSV; strip them.
        marg[control_field] = marg.eval(expr.replace('"', ''))
    # Keep only the control columns. Ratio expressions (e.g. ".../B01001_001E")
    # give 0/0 = NaN for geographies with a zero denominator; a control total for
    # such a geography is 0, so fill the NaNs rather than exporting blanks.
    final_marg[geo] = marg[list(dfg.control_field)].fillna(0)

B01001_001E
 "B25007_003E + B25007_013E"
B25007_004E + B25007_005E + B25007_014E + B25007_015E
B25007_006E + B25007_007E + B25007_008E + B25007_016E + B25007_017E + B25007_018E 
B25007_009E + B25007_010E + B25007_011E + B25007_019E + B25007_020E + B25007_021E
B25006_002E
B25006_003E
B25006_005E
B25006_004E + B25006_006E + B25006_007E + B25006_008E
B11005_001E - B25003I_001E
B25003I_001E
B11005_002E
B11005_011E
 "B19001_002E + B19001_003E + B19001_004E + B19001_005E + B19001_006E"
 "B19001_007E + B19001_008E + B19001_009E + B19001_010E + B19001_011E"
B19001_012E + B19001_013E
B19001_014E + B19001_015E + B19001_016E + B19001_017E
B25009_003E + B25009_011E
B25009_004E + B25009_012E
B25009_005E + B25009_013E
B25009_006E + B25009_014E
B25009_007E + B25009_015E
B25009_008E + B25009_016E
B25009_009E + B25009_017E
(B01001_003E  + B01001_027E + B01001_004E + B01001_005E + B01001_006E + B01001_028E + B01001_029E + B01001_030E)* B11002_001E*1.0/B01001_001E
 "(B01001_007E + B01001_008E + B01001_00

In [12]:
# add unique geoids and PUMA
# Read the crosswalk once (it does not change between geographies); everything is
# kept as strings so the ids retain their leading zeros.
dfcross = pd.read_csv(geo_cross_csv, dtype = str)

for geo, dfm in final_marg.items():
    # The number of index levels tells the geography apart:
    # (state, county, tract) vs (state, county, tract, block group).
    if dfm.index.nlevels == 3:
        id_col = 'TRACTID'
        ids = ['{}{}{}'.format(l1.zfill(2), l2.zfill(3), l3.zfill(6))
               for l1, l2, l3 in dfm.index]
    elif dfm.index.nlevels == 4:
        id_col = 'BLKGRPID'
        ids = ['{}{}{}{}'.format(l1.zfill(2), l2.zfill(3), l3.zfill(6), l4)
               for l1, l2, l3, l4 in dfm.index]
    else:
        # Unknown index shape: exported as-is, without an id or PUMA column.
        id_col = None

    if id_col is not None:
        dfm[id_col] = ids
        # The crosswalk carries both BLKGRPID and TRACTID, so a TRACTID can repeat
        # across its rows; de-duplicate the lookup so the left merge cannot
        # multiply control rows.
        puma_lookup = dfcross[[id_col, 'PUMA']].drop_duplicates()
        dfm = pd.merge(dfm.reset_index(), puma_lookup, on = id_col, how = 'left')

    # File name: first three letters of the county name + geography, lower-cased.
    dfm.to_csv('{}_control_totals_{}.csv'.format(county[:3].lower(), geo.lower()))
    