Merge f041f89 into 218bffe

UDST · Jul 28, 2020 · 6ad75f1 · 6ad75f1
2 parents 218bffe + f041f89
commit 6ad75f1
Show file tree

Hide file tree

Showing 9 changed files with 2,099 additions and 12 deletions.
diff --git a/demos/Synthesis workflow.ipynb b/demos/Synthesis workflow.ipynb
diff --git a/demos/img/Starter.png b/demos/img/Starter.png
diff --git a/demos/img/download_pums.png b/demos/img/download_pums.png
diff --git a/demos/img/pums_tracts.png b/demos/img/pums_tracts.png
diff --git a/demos/img/subject_table.png b/demos/img/subject_table.png
diff --git a/demos/synthesize_state_v4.py b/demos/synthesize_state_v4.py
@@ -0,0 +1,56 @@
+import os
+import sys
+import argparse
+
+from synthpop.recipes.starter3 import Starter
+from synthpop.synthesizer import synthesize_all, enable_logging
+import pandas as pd
+import numpy as np
+
+
+def run(state='AL', county='all', year=2013):
+    """Synthesize households and persons for the selected counties of a state.
+
+    `county` is 'all' or comma-separated 'County ANSI' codes; os.environ["CENSUS"]
+    is passed to Starter. Output goes to ./<state_fips>/{hh,p}_*.csv per county.
+    """
+    def synthesize_county(state, county, year):
+        starter = Starter(os.environ["CENSUS"], state, county, acsyear=year)
+        synthetic_population = synthesize_all(starter)
+        hh, p = synthetic_population[0], synthetic_population[1]
+        state_fips = hh['state'].unique()[0]
+        county_fips = hh['county'].unique()[0]
+        # exist_ok replaces the racy isdir()/mkdir() pair; str() yields the same
+        # folder name that format() produces for the CSV paths below.
+        os.makedirs(str(state_fips), exist_ok=True)
+        p.to_csv('./{}/p_{}_{}_{}.csv'.format(state_fips, state_fips, county_fips, year))
+        hh.to_csv('./{}/hh_{}_{}_{}.csv'.format(state_fips, state_fips, county_fips, year))
+
+    if year >= 2018:
+        url_national_data = "https://storage.googleapis.com/synthpop-public/PUMS2018/pums_2018_acs5/"
+    else:
+        url_national_data = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
+    national_data = pd.read_csv(url_national_data + 'national_county.txt', dtype='str')
+    state_data = national_data[national_data['State'] == state]
+    if county != 'all':
+        state_data = state_data[state_data['County ANSI'].isin(county.split(','))]
+
+    for _, row in state_data.iterrows():
+        synthesize_county(state, row['County Name'], year)
+
+
+if __name__ == '__main__':
+    # Command-line entry point. Every flag is optional: a flag that is omitted
+    # (or given an empty/zero value) falls back to the same default that run()
+    # declares, so invoking the script with no arguments is the same as run().
+    cli = argparse.ArgumentParser()
+    cli.add_argument("-s", "--state", type=str, help="State to synthesize")
+    cli.add_argument("-c", "--county", type=str,
+                     help="County or list of counties to synthesize in ANSI")
+    cli.add_argument("-y", "--year", type=int, help="Year to synthesize")
+    options = cli.parse_args()
+    run(
+        options.state or 'AL',
+        options.county or 'all',
+        options.year or 2013,
+    )
diff --git a/synthpop/census_helpers.py b/synthpop/census_helpers.py
@@ -6,15 +6,23 @@
 
 # code to retry when census api fails
 sess = requests.Session()
-adapter = requests.adapters.HTTPAdapter(max_retries = 100)
+adapter = requests.adapters.HTTPAdapter(max_retries=100)
 sess.mount('https://', adapter)
 
 # TODO DOCSTRING!!
+
+
 class Census:
 
-    def __init__(self, key):
-        self.c = census.Census(key, session = sess)
-        self.base_url = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
+    def __init__(self, key, acsyear=2016):
+        self.c = census.Census(key, session=sess)
+
+        if acsyear >= 2018:
+            storage = "https://storage.googleapis.com/synthpop-public/PUMS2018/pums_2018_acs5/"
+        else:
+            storage = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
+        self.base_url = storage
+        self.acsyear_files = acsyear
         self.pums_relationship_file_url = self.base_url + "tract10_to_puma.csv"
         self.pums_relationship_df = None
         self.pums10_population_base_url = \
@@ -176,21 +184,25 @@ def _read_csv(self, loc, **kargs):
         return self.pums_cache[loc]
 
     def download_population_pums(self, state, puma10=None, puma00=None, **kargs):
+        print('Downloading population pums from %s' % (self.base_url))
         state = self.try_fips_lookup(state)
         if (puma10 is None) & (puma00 is None):
             return self._read_csv(self.pums_population_state_base_url % (state), **kargs)
         pums = self._read_csv(self.pums10_population_base_url % (state, puma10), **kargs)
-        if puma00 is not None:
+        if (puma00 is not None) & (self.acsyear_files < 2018):
+            print('Reading PUMS00 from %s' % (self.base_url))
             pums00 = self._read_csv(self.pums00_population_base_url % (state, puma00), **kargs)
             pums = pd.concat([pums, pums00], ignore_index=True)
         return pums
 
     def download_household_pums(self, state, puma10=None, puma00=None, **kargs):
+        print('Downloading households pums from %s' % (self.base_url))
         state = self.try_fips_lookup(state)
         if (puma10 is None) & (puma00 is None):
             return self._read_csv(self.pums_household_state_base_url % (state), **kargs)
         pums = self._read_csv(self.pums10_household_base_url % (state, puma10), **kargs)
-        if puma00 is not None:
+        if (puma00 is not None) & (self.acsyear_files < 2018):
+            print('Reading PUMS00 from %s' % (self.base_url))
             pums00 = self._read_csv(self.pums00_household_base_url % (state, puma00), **kargs)
             pums = pd.concat([pums, pums00], ignore_index=True)
 

diff --git a/synthpop/recipes/starter.py b/synthpop/recipes/starter.py
@@ -38,8 +38,9 @@ class Starter:
     tract_to_puma_map : dictionary
         keys are tract ids and pumas are puma ids
     """
+
     def __init__(self, key, state, county, tract=None, acsyear=2016):
-        self.c = c = Census(key)
+        self.c = c = Census(key, acsyear)
         self.state = state
         self.county = county
         self.tract = tract
@@ -117,9 +118,17 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
 
         # Put the needed PUMS variables here.  These are also the PUMS variables
         # that will be in the outputted synthetic population
-        self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP',
+        self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP',
                             'TYPE', 'VEH', 'WIF', 'NOC', 'FINCP')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'AGEP', 'RAC1P', 'SEX')
+        self.p_pums_cols = ('serialno', 'PUMA10', 'AGEP', 'RAC1P', 'SEX')
+
+        if self.acsyear < 2018:
+            self.h_pums_cols = list(self.h_pums_cols)
+            self.h_pums_cols.insert(1, 'PUMA00')
+            self.h_pums_cols = tuple(self.h_pums_cols)
+            self.p_pums_cols = list(self.p_pums_cols)
+            self.p_pums_cols.insert(1, 'PUMA00')
+            self.p_pums_cols = tuple(self.p_pums_cols)
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables

diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py
@@ -42,8 +42,9 @@ class Starter:
     tract_to_puma_map : dictionary
         keys are tract ids and pumas are puma ids
     """
+
     def __init__(self, key, state, county, tract=None, acsyear=2016):
-        self.c = c = Census(key)
+        self.c = c = Census(key, acsyear)
         self.state = state
         self.county = county
         self.tract = tract
@@ -179,11 +180,18 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
 
         # Put the needed PUMS variables here.  These are also the PUMS variables
         # that will be in the outputted synthetic population
-        self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE',
+        self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP', 'TYPE',
                             'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
+        self.p_pums_cols = ('serialno', 'PUMA10', 'RELP', 'AGEP',
                             'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER',
                             'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH')
+        if self.acsyear < 2018:
+            self.h_pums_cols = list(self.h_pums_cols)
+            self.h_pums_cols.insert(1, 'PUMA00')
+            self.h_pums_cols = tuple(self.h_pums_cols)
+            self.p_pums_cols = list(self.p_pums_cols)
+            self.p_pums_cols.insert(1, 'PUMA00')
+            self.p_pums_cols = tuple(self.p_pums_cols)
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables