PUMS geographies based on ACS and support for the latest pandas release (#67)

* Reading PUMS files based on different ACS years (geographies based on > and < 2010 Census) * Config class to tidy up bucket URLs. * Casting serialno as str for group quarter cases in PUMS 2018 based synthesis * Fix pandas issue #63 in the drop_zeros function of the ipu module
UDST · Nov 3, 2020 · 1c772b7 · 1c772b7
1 parent 0bc36f8
commit 1c772b7
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 19 deletions.
diff --git a/README.rst b/README.rst
@@ -22,3 +22,15 @@ The paper is available here:
 http://www.scag.ca.gov/Documents/PopulationSynthesizerPaper_TRB.pdf
 
 .. _PopGen: http://urbanmodel.asu.edu/popgen.html
+
+# Installation
+
+```
+virtualenv venv --python=python3.7
+source venv/bin/activate
+pip install -r requirements.txt
+cd synthpop/
+python setup.py develop
+```
+To run `Synthpop` you need a Census API key, which you can get [here](https://api.census.gov/data/key_signup.html). After you get and validate the API key, you can add it as an environment variable by adding the following line to `/venv/bin/activate`: 
+`export CENSUS='yourApiKey'`
diff --git a/setup.py b/setup.py
@@ -12,7 +12,8 @@
         'Development Status :: 4 - Beta',
         'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3.5',
-        'Programming Language :: Python :: 3.6'
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7'
     ],
     packages=find_packages(exclude=['*.tests']),
     install_requires=[

diff --git a/synthpop/census_helpers.py b/synthpop/census_helpers.py
@@ -3,18 +3,22 @@
 import numpy as np
 import us
 import requests
+from .config import synthpop_config
 
 # code to retry when census api fails
 sess = requests.Session()
-adapter = requests.adapters.HTTPAdapter(max_retries = 100)
+adapter = requests.adapters.HTTPAdapter(max_retries=100)
 sess.mount('https://', adapter)
 
 # TODO DOCSTRING!!
+
+
 class Census:
 
-    def __init__(self, key):
-        self.c = census.Census(key, session = sess)
-        self.base_url = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
+    def __init__(self, key, acsyear=2016):
+        self.c = census.Census(key, session=sess)
+        self.base_url = synthpop_config(acsyear).pums_storage()
+        self.acsyear_files = acsyear
         self.pums_relationship_file_url = self.base_url + "tract10_to_puma.csv"
         self.pums_relationship_df = None
         self.pums10_population_base_url = \
@@ -165,7 +169,9 @@ def _read_csv(self, loc, **kargs):
             pums_df = pd.read_csv(loc, dtype={
                 "PUMA10": "object",
                 "PUMA00": "object",
-                "ST": "object"
+                "ST": "object",
+                "SERIALNO": 'str',
+                "serialno": 'str',
             }, **kargs)
             pums_df = pums_df.rename(columns={
                 'PUMA10': 'puma10',
@@ -180,7 +186,7 @@ def download_population_pums(self, state, puma10=None, puma00=None, **kargs):
         if (puma10 is None) & (puma00 is None):
             return self._read_csv(self.pums_population_state_base_url % (state), **kargs)
         pums = self._read_csv(self.pums10_population_base_url % (state, puma10), **kargs)
-        if puma00 is not None:
+        if (puma00 is not None) & (self.acsyear_files < 2018):
             pums00 = self._read_csv(self.pums00_population_base_url % (state, puma00), **kargs)
             pums = pd.concat([pums, pums00], ignore_index=True)
         return pums
@@ -190,7 +196,7 @@ def download_household_pums(self, state, puma10=None, puma00=None, **kargs):
         if (puma10 is None) & (puma00 is None):
             return self._read_csv(self.pums_household_state_base_url % (state), **kargs)
         pums = self._read_csv(self.pums10_household_base_url % (state, puma10), **kargs)
-        if puma00 is not None:
+        if (puma00 is not None) & (self.acsyear_files < 2018):
             pums00 = self._read_csv(self.pums00_household_base_url % (state, puma00), **kargs)
             pums = pd.concat([pums, pums00], ignore_index=True)
 

diff --git a/synthpop/config.py b/synthpop/config.py
@@ -0,0 +1,14 @@
+class synthpop_config:
+
+    def __init__(self, acsyear=2013):
+        self.acsyear = acsyear
+
+    def pums_storage(self):
+        if self.acsyear >= 2018:
+            storage = "https://storage.googleapis.com/synthpop-public/PUMS2018/pums_2018_acs5/"
+        else:
+            storage = "https://s3-us-west-1.amazonaws.com/synthpop-data2/"
+        return storage
+
+    def __call__(self):
+        return self.pums_storage()
diff --git a/synthpop/ipu/ipu.py b/synthpop/ipu/ipu.py
@@ -22,11 +22,11 @@ def _drop_zeros(df):
 
     """
     def for_each_col(col):
-        nz = col.nonzero()[0]
-        return col[nz], nz
+        nz = col.values.nonzero()[0]
+        return col.iloc[nz], nz
 
-    for (col_idx, (col, nz)) in df.apply(for_each_col, axis=0, raw=True).items():
-        yield (col_idx, col, nz)
+    for (col_idx, (col, nz)) in df.apply(for_each_col, axis=0, raw=False).items():
+        yield (col_idx, col.values, nz)
 
 
 class _FrequencyAndConstraints(object):
@@ -63,8 +63,10 @@ class _FrequencyAndConstraints(object):
         Total number of columns across household and person classes.
 
     """
+
     def __init__(self, household_freq, household_constraints, person_freq=None,
                  person_constraints=None):
+
         hh_cols = ((key, col, household_constraints[key], nz)
                    for key, col, nz in _drop_zeros(household_freq))
 
@@ -161,7 +163,7 @@ def _average_fit_quality(freq_wrap, weights):
     return sum(
         _fit_quality(col, weights[nz], constraint)
         for _, col, constraint, nz in freq_wrap.iter_columns()
-        ) / freq_wrap.ncols
+    ) / freq_wrap.ncols
 
 
 def _update_weights(column, weights, constraint):

diff --git a/synthpop/recipes/starter.py b/synthpop/recipes/starter.py
@@ -38,8 +38,9 @@ class Starter:
     tract_to_puma_map : dictionary
         keys are tract ids and pumas are puma ids
     """
+
     def __init__(self, key, state, county, tract=None, acsyear=2016):
-        self.c = c = Census(key)
+        self.c = c = Census(key, acsyear)
         self.state = state
         self.county = county
         self.tract = tract
@@ -117,9 +118,17 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
 
         # Put the needed PUMS variables here.  These are also the PUMS variables
         # that will be in the outputted synthetic population
-        self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP',
+        self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP',
                             'TYPE', 'VEH', 'WIF', 'NOC', 'FINCP')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'AGEP', 'RAC1P', 'SEX')
+        self.p_pums_cols = ('serialno', 'PUMA10', 'AGEP', 'RAC1P', 'SEX')
+
+        if self.acsyear < 2018:
+            self.h_pums_cols = list(self.h_pums_cols)
+            self.h_pums_cols.insert(1, 'PUMA00')
+            self.h_pums_cols = tuple(self.h_pums_cols)
+            self.p_pums_cols = list(self.p_pums_cols)
+            self.p_pums_cols.insert(1, 'PUMA00')
+            self.p_pums_cols = tuple(self.p_pums_cols)
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables

diff --git a/synthpop/recipes/starter2.py b/synthpop/recipes/starter2.py
@@ -42,8 +42,9 @@ class Starter:
     tract_to_puma_map : dictionary
         keys are tract ids and pumas are puma ids
     """
+
     def __init__(self, key, state, county, tract=None, acsyear=2016):
-        self.c = c = Census(key)
+        self.c = c = Census(key, acsyear)
         self.state = state
         self.county = county
         self.tract = tract
@@ -179,11 +180,18 @@ def __init__(self, key, state, county, tract=None, acsyear=2016):
 
         # Put the needed PUMS variables here.  These are also the PUMS variables
         # that will be in the outputted synthetic population
-        self.h_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RT', 'NP', 'TYPE',
+        self.h_pums_cols = ('serialno', 'PUMA10', 'RT', 'NP', 'TYPE',
                             'R65', 'HINCP', 'VEH', 'MV', 'TEN', 'BLD', 'R18')
-        self.p_pums_cols = ('serialno', 'PUMA00', 'PUMA10', 'RELP', 'AGEP',
+        self.p_pums_cols = ('serialno', 'PUMA10', 'RELP', 'AGEP',
                             'ESR', 'RAC1P', 'HISP', 'SEX', 'SPORDER',
                             'PERNP', 'SCHL', 'WKHP', 'JWTR', 'SCH')
+        if self.acsyear < 2018:
+            self.h_pums_cols = list(self.h_pums_cols)
+            self.h_pums_cols.insert(1, 'PUMA00')
+            self.h_pums_cols = tuple(self.h_pums_cols)
+            self.p_pums_cols = list(self.p_pums_cols)
+            self.p_pums_cols.insert(1, 'PUMA00')
+            self.p_pums_cols = tuple(self.p_pums_cols)
 
     def get_geography_name(self):
         # this synthesis is at the block group level for most variables