In [1]:
import pandas as pd

In [2]:
# Input/output folder: the synthetic household/person CSVs are read from here
# and the processed CSVs are written back to the same location.
outfolder = "../output/2018_run2/"

In [3]:
# Households are keyed by household_id so that per-household aggregates
# computed from the persons table can be aligned onto this frame later.
dfh = pd.read_csv(
    outfolder + 'synthetic_households.csv',
    index_col='household_id',
)

# Persons keep the default positional index; household_id stays a column.
dfp = pd.read_csv(
    outfolder + 'synthetic_persons.csv',
)

In [6]:
# Show the index name and column labels of the household table, then of the
# persons table.
for frame in (dfh, dfp):
    print(frame.index.name)
    print(frame.columns)

household_id
Index(['puma', 'tract', 'blkgrp', 'hh_id', 'persons', 'cars', 'hincp', 'r18',
       'age_of_head', 'race_id', 'hhisp', 'hworkers', 'adjinc', 'children',
       'type', 'ybl', 'bld', 'valp', 'rent', 'adjhsg', 'ten', 'hht'],
      dtype='object')
None
Index(['puma', 'tract', 'blkgrp', 'hh_id', 'household_id', 'member_id', 'age',
       'sex', 'esr', 'race_id', 'relate', 'hisp', 'naicsp', 'industry', 'wkhp',
       'wkw', 'schg', 'mil', 'pincp'],
      dtype='object')


In [5]:
# Lower-case every column label first, then map the source field names onto
# the names used in the rest of this notebook.
# NOTE(review): both 'race_of_head' and 'hrace' map to 'race_id'; presumably
# only one of them is present in any given input - confirm, otherwise the
# household table ends up with two 'race_id' columns.
dfh = (
    dfh
    .rename(columns=str.lower)
    .rename(columns={
        'grntp': 'rent',
        'np': 'persons',
        'race_of_head': 'race_id',
        'veh': 'cars',
        'noc': 'children',
        'agehoh': 'age_of_head',
        'hrace': 'race_id',
    })
)

dfp = (
    dfp
    .rename(columns=str.lower)
    .rename(columns={
        'agep': 'age',
        'rac1p': 'race_id',
        'relp': 'relate',
        'per_num': 'member_id',
    })
)



In [7]:
# Person key: household_id * 100 + member_id, used as the persons index.
dfp = dfp.assign(
    person_id=dfp['household_id'] * 100 + dfp['member_id']
).set_index('person_id')

# County code: characters 2-4 of the tract identifier rendered as text.
# NOTE(review): assumes the tract value starts with a 2-digit state prefix
# followed by a 3-digit county code - confirm against the input data.
for frame in (dfh, dfp):
    frame['county'] = frame['tract'].astype(str).str.slice(2, 5).astype(int)


# Input coding  - hrace/race_id: 1 white, 2 black, 3 asian, 4 others
#               - hhisp:         0 not Hispanic, 1 Hispanic
# Output coding - race_id: 1 non-Hispanic white, 2 non-Hispanic black,
#                          3 Hispanic, 4 others
# Order matters: collapse everything above 2 into "others" first, then let
# the Hispanic flag override whatever race code the household had.
dfh.loc[dfh['race_id'].gt(2), 'race_id'] = 4
dfh.loc[dfh['hhisp'].eq(1), 'race_id'] = 3

# Household income = hincp * adjinc / 1,000,000.
# 'hincp' and 'adjinc' are left in place because the ABM still needs them.
dfh['income'] = dfh['hincp'] * (dfh['adjinc'] / 1e6)

# Household rent = grntp * adjhsg / 1,000,000 (grntp is held in 'rent').
# NOTE: this rescales 'rent' in place, so running the cell a second time
# applies the adjustment twice.
dfh['rent'] = dfh['rent'] * (dfh['adjhsg'] / 1e6)

# Recompute the household worker count from the persons table, because the
# household table only counts family workers.
# In the regional forecast a worker is anyone in the labor force, which is
# different from the travel model and the Census (worker = employed).
#
# ESR - employment status recode (1 character):
#   b  N/A (less than 16 years old)
#   1  Civilian employed, at work
#   2  Civilian employed, with a job but not at work
#   3  Unemployed
#   4  Armed forces, at work
#   5  Armed forces, with a job but not at work
#   6  Not in labor force
#
# Codes 1-5 are flagged as workers; everything else (including missing) is 0.
dfp['worker'] = dfp['esr'].isin(range(1, 6)).astype('int64')

# Align the per-household sum onto the household index explicitly and fill
# with 0, so a household without any matching person rows gets a count of 0
# instead of NaN (a NaN would also turn the whole column into floats).
dfh['workers'] = (
    dfp.groupby('household_id')['worker']
    .sum()
    .reindex(dfh.index, fill_value=0)
)

# Recompute the number of children (age < 18) per household from the persons
# table instead of relying on the household NOC field.
# NOTE(review): the original comment explaining NOC's limitation was cut off;
# presumably NOC only counts the householder's own children - confirm.
dfp['child'] = dfp['age'].lt(18).astype('int64')

# Align onto the household index and fill with 0, so a household without any
# matching person rows gets 0 children rather than NaN (which would also turn
# the column into floats).
dfh['children'] = (
    dfp.groupby('household_id')['child']
    .sum()
    .reindex(dfh.index, fill_value=0)
)

# Recode person race_id into the same four classes as the household table:
# codes above 2 become 4 (others), then Hispanic persons become 3.
# NOTE(review): the Hispanic test here is hisp > 1 while the household table
# uses hhisp == 1; presumably the two fields are coded differently - confirm.
dfp.loc[dfp['race_id'].gt(2), 'race_id'] = 4
dfp.loc[dfp['hisp'].gt(1), 'race_id'] = 3

In [8]:
# Columns written to the processed household file (the household_id index is
# written as well).
household_output_columns = [
    'county', 'tract', 'blkgrp', 'persons', 'cars', 'income', 'hincp', 'adjinc',
    'age_of_head', 'race_id', 'children',
    'type', 'ybl', 'bld', 'valp', 'rent', 'ten', 'hht',
    'workers',
]
dfh[household_output_columns].to_csv(outfolder + 'synthetic_households_processed.csv')

# Columns written to the processed persons file (the person_id index is
# written as well).
person_output_columns = [
    'county', 'tract', 'blkgrp', 'household_id', 'member_id', 'age',
    'sex', 'race_id', 'relate', 'naicsp', 'industry', 'wkhp',
    'wkw', 'schg', 'mil', 'pincp',
]
dfp[person_output_columns].to_csv(outfolder + 'synthetic_persons_processed.csv')