Skip to content

Commit

Permalink
Script to generate the 9-county Bay Area population in parallel
Browse files Browse the repository at this point in the history
  • Loading branch information
mxndrwgrdnr committed Apr 9, 2018
1 parent dd648f4 commit af117fd
Showing 1 changed file with 42 additions and 46 deletions.
88 changes: 42 additions & 46 deletions scripts/sfbay_synth.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,18 @@
import time
import os
import pandas as pd, numpy as np
import pandas as pd
from glob import glob
import warnings

from synthpop.census_helpers import Census
from synthpop.recipes.starter2 import Starter
from synthpop.synthesizer import synthesize_all, synthesize_all_in_parallel, enable_logging
from synthpop.synthesizer import synthesize_all_in_parallel

pd.set_option('display.max_columns', 500)
import warnings
warnings.filterwarnings('ignore')




# Names of the nine California counties this script synthesizes a
# population for (the "9 county bay area" named in the commit title).
counties = [
"Napa County", "Santa Clara County", "Solano County", "San Mateo County",
"Marin County", "San Francisco County", "Sonoma County",
"Contra Costa County", "Alameda County"]
# county_tuples = []
# for county in counties:
# print('Starting {0}'.format(county))
# starter = Starter(os.environ["CENSUS"], "CA", county)
# county_dfs = synthesize_all(starter)
# county_tuples.append(county_dfs)
# hh_all = pd.concat([county[0] for county in county_tuples])
# p_all = pd.concat([county[1] for county in county_tuples])
# fits_all = {}
# for county in county_tuples:
# fits_all.update(county[2])


if __name__ == '__main__':

Expand All @@ -44,42 +28,49 @@

hh_all.index.name = 'household_id'
p_all.index.name = 'person_id'
p_all.rename(columns = {'hh_id':'household_id'}, inplace = True)

hh_all['age_of_head'] = p_all[p_all.RELP == 0].groupby('household_id').AGEP.max()
hh_all['race_of_head'] = p_all[p_all.RELP == 0].groupby('household_id').RAC1P.max()
hh_all['workers'] = p_all[p_all.ESR.isin([1, 2, 4, 5])].groupby('household_id').size()
hh_all['children'] = p_all[p_all.AGEP < 18].groupby('household_id').size()
p_all.rename(columns={'hh_id': 'household_id'}, inplace=True)

hh_all['age_of_head'] = p_all[p_all.RELP == 0].groupby(
'household_id').AGEP.max()
hh_all['race_of_head'] = p_all[p_all.RELP == 0].groupby(
'household_id').RAC1P.max()
hh_all['workers'] = p_all[p_all.ESR.isin([1, 2, 4, 5])].groupby(
'household_id').size()
hh_all['children'] = p_all[p_all.AGEP < 18].groupby(
'household_id').size()
hh_all['tenure'] = 2
hh_all.tenure[hh_all.TEN < 3] = 1 #tenure coded 1:own, 2:rent
hh_all.tenure[hh_all.TEN < 3] = 1 # tenure coded 1:own, 2:rent
hh_all['recent_mover'] = 0
hh_all.recent_mover[hh_all.MV < 4] = 1 #1 if recent mover (within last five years)
hh_all = hh_all.rename(columns = {'VEH':'cars', 'HINCP':'income', 'NP':'persons', 'BLD':'building_type'})

hh_all.recent_mover[hh_all.MV < 4] = 1 # 1 if recent mover
hh_all = hh_all.rename(columns={
'VEH': 'cars', 'HINCP': 'income', 'NP': 'persons',
'BLD': 'building_type'})

for col in hh_all.columns:
if col not in ['persons', 'income',
'age_of_head', 'race_of_head', 'hispanic_head',
'workers', 'children', 'cars',
'tenure', 'recent_mover', 'building_type', 'serialno',
'state', 'county', 'tract', 'block group']:
del hh_all[col]

p_all.rename(columns = {'AGEP':'age', 'RAC1P':'race_id', 'NP':'persons', 'SPORDER':'member_id', 'HISP': 'hispanic',
'RELP':'relate', 'SEX':'sex', 'WKHP':'hours', 'SCHL':'edu', 'PERNP':'earning'},
inplace = True)
if col not in [
'persons', 'income', 'age_of_head', 'race_of_head',
'hispanic_head', 'workers', 'children', 'cars', 'tenure',
'recent_mover', 'building_type', 'serialno', 'state',
'county', 'tract', 'block group']:
del hh_all[col]

p_all.rename(columns={
'AGEP': 'age', 'RAC1P': 'race_id', 'NP': 'persons',
'SPORDER': 'member_id', 'HISP': 'hispanic', 'RELP': 'relate',
'SEX': 'sex', 'WKHP': 'hours', 'SCHL': 'edu', 'PERNP': 'earning'},
inplace=True)
p_all['student'] = 0
p_all.student[p_all.SCH.isin([2,3])] = 1
p_all.student[p_all.SCH.isin([2, 3])] = 1
p_all['work_at_home'] = 0
p_all.work_at_home[p_all.JWTR == 11] = 1
p_all['worker'] = 0
p_all.worker[p_all.ESR.isin([1, 2, 4, 5])] = 1

for col in p_all.columns:
if col not in ['household_id', 'member_id',
'relate', 'age', 'sex', 'race_id', 'hispanic',
'student', 'worker', 'hours',
'work_at_home', 'edu', 'earning']:
'relate', 'age', 'sex', 'race_id', 'hispanic',
'student', 'worker', 'hours',
'work_at_home', 'edu', 'earning']:
del p_all[col]

hh_all.to_csv('{0}_hh_synth_parallel.csv'.format(county))
Expand All @@ -96,7 +87,8 @@
for hh_file in hh_fnames:
county = hh_file.split('_hh')[0]
hh_df = pd.read_csv(hh_file, index_col='household_id', header=0)
p_df = pd.read_csv(glob(county + '_p*.csv')[0], index_col='person_id', header=0)
p_df = pd.read_csv(
glob(county + '_p*.csv')[0], index_col='person_id', header=0)
print(county + ': {0}'.format(str(hh_df.iloc[0].county)))
hh_df.index += hh_index_start
p_df.household_id += hh_index_start
Expand All @@ -105,7 +97,11 @@
p_df_list.append(p_df)
hh_index_start = hh_df.index.values[-1] + 1
p_index_start = p_df.index.values[-1] + 1

hh_all = pd.concat(hh_df_list)
p_all = pd.concat(p_df_list)
print(len(hh_all.iloc[hh_all.index.duplicated(keep=False)]))
print(len(p_all.iloc[p_all.index.duplicated(keep=False)]))
p_all.to_csv('sfbay_persons_2018_04_08.csv')
hh_all.to_csv('sfbay_households_2018_04_08.csv')
hh_all.to_csv('sfbay_households_2018_04_08.csv')

0 comments on commit af117fd

Please sign in to comment.