In [1]:
# Preparation
# Download the 2017 ACS 5-year PUMS files for Michigan:
#   households: https://www2.census.gov/programs-surveys/acs/data/pums/2017/5-Year/csv_hmi.zip
#   persons:    https://www2.census.gov/programs-surveys/acs/data/pums/2017/5-Year/csv_pmi.zip
# Unzip them and rename the extracted CSVs to "acs2017_psam_h26.csv" and "acs2017_psam_p26.csv";
# this notebook reads them from preprocess/ACS/.

In [2]:
import pandas as pd

In [3]:
# Target geography: the state code and county names used to filter the FIPS table.
state = 'MI'
counties = ['Oakland County', 'Washtenaw County']

# Geographic reference inputs.
geo_cross_csv = "data/sem_geo_cross_walk.csv"
fips_url = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"
puma_county_csv = "preprocess/2010_Census_Tract_to_2010_PUMA.txt"

# ACS PUMS inputs: household records and person records.
h_pums_csv = "preprocess/ACS/acs2017_psam_h26.csv"
p_pums_csv = "preprocess/ACS/acs2017_psam_p26.csv"

In [4]:
# look up state and county fips
# Read the headerless county FIPS table and keep only the rows for the
# configured state and counties.
fips_columns = ['state', 'state.fips', 'county.fips', 'county', 'type']
fips_table = pd.read_csv(fips_url, names=fips_columns, header=None)

in_state = fips_table['state'] == state
in_counties = fips_table['county'].isin(counties)
geofips = fips_table.loc[in_state & in_counties]

In [5]:
# Collect the distinct PUMA codes listed for the selected state and counties.
puma_county = pd.read_csv(puma_county_csv)

state_fips = geofips['state.fips'].values[0]
county_fips = geofips['county.fips'].values
in_study_area = (puma_county.STATEFP == state_fips) & puma_county.COUNTYFP.isin(county_fips)
puma_lst = puma_county.loc[in_study_area].PUMA5CE.unique()

print(puma_lst)

[2901 2902 2903 2904 2905 2906 2907 2908 2701 2702 2703]


In [6]:
# Load the PUMS household and person records, both indexed by SERIALNO.
h_pums = pd.read_csv(h_pums_csv, index_col="SERIALNO")
p_pums = pd.read_csv(p_pums_csv, index_col="SERIALNO")

# Households: keep the selected PUMAs and remove group quarters and empty units.
keep_household = h_pums.PUMA.isin(puma_lst) & (h_pums.TYPE == 1) & (h_pums.NP > 0)
h_pums = h_pums.loc[keep_household]

# Persons: keep the selected PUMAs, then only members of the retained households.
p_pums = p_pums.loc[p_pums.PUMA.isin(puma_lst)]
p_pums = p_pums.loc[h_pums.index]

print(len(h_pums), len(p_pums))

27432 65324


In [7]:
def preprocess_pums(h_pums, p_pums):
    """Add householder and worker summary columns to the household sample.

    Both frames must be indexed by ``SERIALNO`` (index name ``SERIALNO``).
    ``h_pums`` is modified in place and also returned. Every derived value is
    aligned to ``h_pums`` by SERIALNO, so the result does not depend on the
    row order of either frame.

    Columns added to ``h_pums``:

    * ``AGEHOH``   - max ``AGEP`` over the household's ``RELP == 0`` person
      rows (treated here as the householder); NaN if there is no such row.
    * ``HRACE``    - householder ``RAC1P`` recoded 1 -> 1, 2 -> 2, 6 -> 3,
      anything else (including a missing householder) -> 4.
    * ``HHISP``    - 1 if the householder's ``HISP`` is greater than 1, else 0
      (also 0 when the householder is missing).
    * ``HWORKERS`` - number of persons with ``ESR`` in {1, 2, 4, 5}, capped
      at 2 (so the values are 0, 1 or 2).

    Parameters
    ----------
    h_pums : pandas.DataFrame
        Household records indexed by SERIALNO.
    p_pums : pandas.DataFrame
        Person records indexed by SERIALNO, with columns RELP, AGEP, RAC1P,
        HISP and ESR.

    Returns
    -------
    tuple
        ``(h_pums, p_pums)``; ``p_pums`` is returned unchanged.
    """
    # One group per household, restricted to the RELP == 0 person rows.
    householders = p_pums.loc[p_pums.RELP == 0].groupby('SERIALNO')

    # reindex() lines each per-household series up with h_pums row by row, so
    # the positional .values assignment below is safe. Households without a
    # householder row come back as NaN instead of shifting every later value.
    hh_age = householders.AGEP.max().reindex(h_pums.index)
    hh_race = householders.RAC1P.max().reindex(h_pums.index)
    hh_hisp = householders.HISP.max().reindex(h_pums.index)

    # add AGEHOH to PUMS sample
    h_pums['AGEHOH'] = hh_age.values

    # add HRACE to PUMS sample (4 is the catch-all category)
    h_pums['HRACE'] = 4
    for rac1p_code, hrace_code in ((1, 1), (2, 2), (6, 3)):
        h_pums.loc[(hh_race == rac1p_code).values, 'HRACE'] = hrace_code

    # add HHISP to PUMS sample (NaN > 1 is False, so missing maps to 0)
    h_pums['HHISP'] = (hh_hisp > 1).astype('int64').values

    # add HWORKERS to PUMS sample; households with no matching person get 0,
    # and person rows whose household is not in h_pums are ignored.
    worker_counts = (
        p_pums.loc[p_pums.ESR.isin([1, 2, 4, 5])]
        .groupby('SERIALNO')
        .size()
        .reindex(h_pums.index, fill_value=0)
    )
    h_pums['HWORKERS'] = worker_counts.clip(upper=2).values

    # Diagnostic summary of each derived column.
    for v in ["AGEHOH", "HRACE", "HHISP", "HWORKERS"]:
        print(sorted(h_pums[v].unique()))
        print(h_pums.groupby(v).size())

    return h_pums, p_pums

In [8]:
# Derive the household-level columns, then copy the SERIALNO index of each
# frame into an explicit hh_id column.
h_pums, p_pums = preprocess_pums(h_pums, p_pums)
h_pums['hh_id'] = h_pums.index.values
p_pums['hh_id'] = p_pums.index.values

# Both output files share a prefix built from the first three letters of
# each county name, lower-cased and joined with underscores.
seed_prefix = '_'.join(name[:3].lower() for name in counties)

print('save../')
h_pums.to_csv('{}_seed_households.csv'.format(seed_prefix))
print('save../')
p_pums.to_csv('{}_seed_persons.csv'.format(seed_prefix))

[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 93, 94]
AGEHOH
16      2
17      5
18     15
19     55
20     85
     ... 
88    107
89    105
90     80
93     47
94    220
Length: 77, dtype: int64
[1, 2, 3, 4]
HRACE
1    22306
2     2846
3     1622
4      658
dtype: int64
[0, 1]
HHISP
0    26759
1      673
dtype: int64
[0, 1, 2]
HWORKERS
0     6599
1    10350
2    10483
dtype: int64
save../
save../
