diff --git a/synthpop/census_helpers.py b/synthpop/census_helpers.py index 3b04825..3516cfc 100644 --- a/synthpop/census_helpers.py +++ b/synthpop/census_helpers.py @@ -6,15 +6,23 @@ # code to retry when census api fails sess = requests.Session() -adapter = requests.adapters.HTTPAdapter(max_retries = 100) +adapter = requests.adapters.HTTPAdapter(max_retries=100) sess.mount('https://', adapter) # TODO DOCSTRING!! + + class Census: - def __init__(self, key): - self.c = census.Census(key, session = sess) - self.base_url = "https://s3-us-west-1.amazonaws.com/synthpop-data2/" + def __init__(self, key, acsyear=2016): + self.c = census.Census(key, session=sess) + + if acsyear >= 2018: + storage = "https://storage.googleapis.com/synthpop-public/PUMS2018/pums_2018_acs5/" + else: + storage = "https://s3-us-west-1.amazonaws.com/synthpop-data2/" + self.base_url = storage + self.acsyear_files = acsyear self.pums_relationship_file_url = self.base_url + "tract10_to_puma.csv" self.pums_relationship_df = None self.pums10_population_base_url = \ @@ -180,7 +188,7 @@ def download_population_pums(self, state, puma10=None, puma00=None, **kargs): if (puma10 is None) & (puma00 is None): return self._read_csv(self.pums_population_state_base_url % (state), **kargs) pums = self._read_csv(self.pums10_population_base_url % (state, puma10), **kargs) - if puma00 is not None: + if (puma00 is not None) & (self.acsyear_files < 2018): pums00 = self._read_csv(self.pums00_population_base_url % (state, puma00), **kargs) pums = pd.concat([pums, pums00], ignore_index=True) return pums @@ -190,7 +198,7 @@ def download_household_pums(self, state, puma10=None, puma00=None, **kargs): if (puma10 is None) & (puma00 is None): return self._read_csv(self.pums_household_state_base_url % (state), **kargs) pums = self._read_csv(self.pums10_household_base_url % (state, puma10), **kargs) - if puma00 is not None: + if (puma00 is not None) & (self.acsyear_files < 2018): pums00 = self._read_csv(self.pums00_household_base_url % (state, puma00), **kargs) pums = pd.concat([pums, pums00], ignore_index=True) diff --git a/synthpop/recipes/starter.py b/synthpop/recipes/starter.py index ba61140..0814072 100644 --- a/synthpop/recipes/starter.py +++ b/synthpop/recipes/starter.py @@ -38,8 +38,9 @@ class Starter: tract_to_puma_map : dictionary keys are tract ids and pumas are puma ids """ + def __init__(self, key, state, county, tract=None, acsyear=2016): - self.c = c = Census(key) + self.c = c = Census(key, acsyear) self.state = state self.county = county self.tract = tract @@ -117,9 +118,17 @@ def __init__(self, key, state, county, tract=None, acsyear=2016): # Put the needed PUMS variables here. These are also the PUMS variables # that will be in the outputted synthetic population - self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', + self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP', 'TYPE', 'VEH', 'WIF', 'NOC', 'FINCP') - self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'AGEP', 'RAC1P', 'SEX') + self.p_pums_cols = ('serialno', 'PUMA10', 'AGEP', 'RAC1P', 'SEX') + + if self.acsyear < 2018: + self.h_pums_cols = list(self.h_pums_cols) + self.h_pums_cols.insert(1, 'PUMA00') + self.h_pums_cols = tuple(self.h_pums_cols) + self.p_pums_cols = list(self.p_pums_cols) + self.p_pums_cols.insert(1, 'PUMA00') + self.p_pums_cols = tuple(self.p_pums_cols) def get_geography_name(self): # this synthesis is at the block group level for most variables diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py index 8008d9e..6826dda 100644 --- a/synthpop/recipes/starter2.py +++ b/synthpop/recipes/starter2.py @@ -42,8 +42,9 @@ class Starter: tract_to_puma_map : dictionary keys are tract ids and pumas are puma ids """ + def __init__(self, key, state, county, tract=None, acsyear=2016): - self.c = c = Census(key) + self.c = c = Census(key, acsyear) self.state = state self.county = county self.tract = tract @@ -179,11 +180,18 @@ def __init__(self, key, state, county, tract=None, acsyear=2016): # Put the needed PUMS variables here. These are also the PUMS variables # that will be in the outputted synthetic population - self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE', + self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP', 'TYPE', 'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18') - self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP', + self.p_pums_cols = ('serialno', 'PUMA10', 'RELP', 'AGEP', 'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER', 'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH') + if self.acsyear < 2018: + self.h_pums_cols = list(self.h_pums_cols) + self.h_pums_cols.insert(1, 'PUMA00') + self.h_pums_cols = tuple(self.h_pums_cols) + self.p_pums_cols = list(self.p_pums_cols) + self.p_pums_cols.insert(1, 'PUMA00') + self.p_pums_cols = tuple(self.p_pums_cols) def get_geography_name(self): # this synthesis is at the block group level for most variables