In [1]:
""" Makes tax units from the ASEC.

Based on Sam Portnow's code at 
https://users.nber.org/~taxsim/to-taxsim/cps/cps-portnow/TaxSimRScriptForDan.R
"""
import numpy as np
import pandas as pd

# Personal exemption amount by year. Only 2018 is tabulated, with an
# exemption of zero.
pexemp = pd.DataFrame([(2018, 0)], columns=['year', 'pexemp'])


# Load the person-level extract and lower-case every column name in one step.
# Previously used path, kept for reference:
#   ~/MaxGhenis/datarepo/asec_2019_ipums.csv.gz
ipum = pd.read_csv('~/UBICenter/covid_ui/asec_2019_ipums.csv.gz').rename(
    columns=str.lower)

# Income and tax fields whose missing values are replaced by zero, so that
# non-filers and similar records contribute zeroes rather than sentinels.
VARS_MISSING_ZERO = [
    'eitcred', 'fedretir', 'fedtax', 'statetax', 'adjginc', 'taxinc',
    'fedtaxac', 'fica', 'stataxac',
    'incdivid', 'incint', 'incrent', 'incother', 'incasist', 'incss',
    'incwelfr', 'incwkcom', 'incvet', 'incchild', 'incunemp', 'inceduc',
    'gotveduc', 'gotvothe', 'gotvpens', 'gotvsurv',
    'incssi', 'incwage', 'incbus', 'incfarm', 'incsurv', 'incdisab',
    'incretir', 'inccapg',
]

# Sentinel values treated as missing: runs of four to seven nines, their
# negatives, and the matching codes that end in 7.
# NOTE(review): confirm this covers every sentinel width present in the
# extract (for example, eight-digit codes would not be caught).
_ALL_NINES = [9999, 99999, 999999, 9999999]
MISSING_CODES = (_ALL_NINES
                 + [-code for code in _ALL_NINES]
                 + [code - 2 for code in _ALL_NINES])

for income_col in VARS_MISSING_ZERO:
    values = ipum[income_col]
    is_missing = values.isna() | values.isin(MISSING_CODES)
    ipum.loc[is_missing, income_col] = 0

# Sanity checks: no sentinel and no NaN may remain in the cleaned columns.
assert not ipum[VARS_MISSING_ZERO].isin(MISSING_CODES).any().any()
assert not ipum[VARS_MISSING_ZERO].isna().any().any()

In [2]:
# In the mother/father/spouse location pointers a zero is replaced by NaN,
# so that later isna()/fmin logic sees "no pointer" instead of a person number.
COLS_ZERO_TO_NA = ['momloc', 'poploc', 'sploc']
for pointer_col in COLS_ZERO_TO_NA:
    no_pointer = ipum[pointer_col].eq(0)
    ipum.loc[no_pointer, pointer_col] = np.nan


# x2: the year before the survey year.
ipum['x2'] = ipum['year'].sub(1)

# x3: state FIPS code.
ipum['x3'] = ipum['statefip']

# x4: each person starts at 1; marital status is later obtained by summing
# x4 over the spouses in a tax unit.
ipum['x4'] = 1

# RELATE code list: https://cps.ipums.org/cps-action/variables/RELATE#codes_section
# Cohabiting partners who are not married have their spouse pointer cleared,
# so they are handled as unmarried from here on.
RELATE_COHABITORS_NOT_MARRIED_CODE = 1114
is_cohabitor = ipum['relate'].eq(RELATE_COHABITORS_NOT_MARRIED_CODE)
ipum.loc[is_cohabitor, 'sploc'] = np.nan

# A person is "primary" when they have no spouse pointer, or when their
# spouse's person number is higher than their own (sploc > pernum).
has_no_spouse = ipum['sploc'].isna()
spouse_listed_after = ipum['sploc'].gt(0) & ipum['sploc'].gt(ipum['pernum'])
ipum['unmarried_primary_married'] = has_no_spouse | spouse_listed_after

# x6 carries the age of primary persons and is 0 for everyone else;
# x24 carries the age of everyone else and is 0 for primary persons.
ipum['x6'] = np.where(ipum['unmarried_primary_married'], ipum['age'], 0)
ipum['x24'] = ipum['age'] - ipum['x6']


# Earnings = wages + business income + farm income. They go to x7 for
# primary persons and to x8 for everyone else, mirroring the x6/x24 age split.
earnings_cols = ['incwage', 'incbus', 'incfarm']
ipum['incwagebusfarm'] = ipum[earnings_cols].sum(axis='columns')
ipum['x7'] = np.where(
    ipum['unmarried_primary_married'], ipum['incwagebusfarm'], 0)
ipum['x8'] = ipum['incwagebusfarm'] - ipum['x7']

# Income components copied or summed from the cleaned income fields.
ipum['x9'] = ipum['incdivid']
ipum['x10'] = ipum[['incrent', 'incother']].sum(axis='columns')
ipum['x11'] = ipum['incretir']
ipum['x12'] = ipum['incss']
ipum['x27'] = ipum['incint']
ipum['x28'] = 0

# x13 pools the remaining income sources (welfare, workers' compensation,
# veterans, survivor, disability, child support, education, SSI, assistance).
transfer_cols = ['incwelfr', 'incwkcom', 'incvet', 'incsurv', 'incdisab',
                 'incchild', 'inceduc', 'incssi', 'incasist']
ipum['x13'] = ipum[transfer_cols].sum(axis='columns')

# NOTE(review): x14 is labelled 'rentpaid' in the final output, yet it is
# filled from incrent, which is also part of x10 - confirm this is intended.
ipum['x14'] = ipum['incrent']
ipum['x15'] = 0


# Use the Census imputation of itemized deductions where available.
# The exemption table is keyed on x2 first so it can be attached to each
# person; the default inner merge keeps only years present in the table.
pexemp = pexemp.rename(columns={'year': 'x2'})
ipum = pd.merge(ipum, pexemp, on='x2')

# x16 = adjusted gross income minus (personal exemption + state tax +
# taxable income), floored at zero.
subtracted = ipum[['pexemp', 'statetax', 'taxinc']].sum(axis='columns')
ipum['x16'] = (ipum['adjginc'] - subtracted).clip(lower=0)

# x17 and x19-x21 are not populated from the survey and are set to zero;
# x18 is unemployment income.
ipum['x17'] = 0
ipum['x18'] = ipum['incunemp']
for zero_field in ('x19', 'x20', 'x21'):
    ipum[zero_field] = 0

# All capital gains are assumed to be long term (caploss is no longer in
# IPUMS CPS).
ipum['x22'] = ipum['inccapg']


# Here we output a record for each person, so that tax units can be formed
# later by summing over person records. The taxunit id is the minimum of
# the pernum or sploc, so spouses will get the same id. For children
# it is the minimum of the momloc or poploc. Other relatives are made
# dependent on the household head (which may be incorrect) and non-relatives
# are separate tax units.

RELATE_HEAD_OF_HOUSEHOLD_CODE = 101
# hnum is the household head's pernum, shared by every member of the same
# household (x2, serial). Setting it only on the head's own row would leave
# NaN for everyone else, so dependent relatives would receive a NaN tax-unit
# id and be dropped when person records are grouped by that id.
# Households without a head row keep NaN.
head_pernum = ipum['pernum'].where(
    ipum['relate'] == RELATE_HEAD_OF_HOUSEHOLD_CODE)
ipum['hnum'] = head_pernum.groupby(
    [ipum['x2'], ipum['serial']]).transform('min')

# Total income attributed to each person across the x-variables below.
# People with income above the personal exemption must file separately, so
# notself is 1 only when claimed income does not exceed pexemp.
claimed_cols = ['x7', 'x8', 'x9', 'x10', 'x11', 'x12', 'x13', 'x22']
ipum['claimed_income'] = ipum[claimed_cols].sum(axis='columns')
within_exemption = ipum['claimed_income'].le(ipum['pexemp'])
ipum['notself'] = np.where(within_exemption, 1, 0)

# A person whose dependency pointer names their own spouse is not treated
# as a dependent: the pointer is reset to 0.
claimed_by_spouse = (ipum['sploc'].notna()
                     & ipum['depstat'].gt(0)
                     & ipum['depstat'].eq(ipum['sploc']))
ipum.loc[claimed_by_spouse, 'depstat'] = 0

# Anyone with a remaining positive dependency pointer is a dependent.
ipum['is_dep'] = ipum['depstat'].gt(0)

# A dependent child is a dependent who has a parent pointer (momloc or
# poploc) and who is under age 18, or under age 24 with schlcoll > 0.
# NOTE(review): schlcoll > 0 counts every non-zero code as "in school";
# confirm against the SCHLCOLL codebook that no non-zero code means
# "not enrolled".
has_parent = ipum['momloc'].notna() | ipum['poploc'].notna()
is_minor = ipum['age'].lt(18)
is_young_student = ipum['age'].lt(24) & ipum['schlcoll'].gt(0)
ipum['depchild'] = np.where(
    ipum['is_dep'] & has_parent & (is_minor | is_young_student), 1, 0)

# Dependent relatives are the dependents that did not qualify as dependent
# children.
not_depchild = ipum['depchild'].eq(0)
ipum['deprel'] = np.where(ipum['is_dep'] & not_depchild, 1, 0)

# dep13 / dep17 / dep18 flag dependents (of either kind) younger than the
# given age.
for age_limit in (13, 17, 18):
    under_limit = ipum['is_dep'] & ipum['age'].lt(age_limit)
    ipum['dep%d' % age_limit] = np.where(under_limit, 1, 0)

# Displayed as the cell output: person counts by dependency category.
ipum.groupby(['is_dep', 'depchild', 'deprel']).size()

is_dep  depchild  deprel
False   0         0         126480
True    0         1           2517
        1         0          51104
dtype: int64

In [3]:
# Dependents get their own frame; taxpayers are handled separately.
deps = ipum.loc[ipum['is_dep']].copy(deep=True)

# Tax-unit id = 100 * serial (household) + person number of the claimant:
# the lower-numbered parent for dependent children, the household head
# (hnum) for other dependents.
parent_pernum = np.fmin(deps['momloc'], deps['poploc'])
claimant_pernum = np.where(deps['depchild'], parent_pernum, deps['hnum'])
deps['x1'] = 100 * deps['serial'] + claimant_pernum

# A dependent counts once in x5 and contributes nothing to marital status
# (x4), ages (x6, x24) or x19/x23.
for field, value in (('x4', np.nan), ('x5', 1), ('x6', 0),
                     ('x19', np.nan), ('x23', np.nan), ('x24', 0)):
    deps[field] = value

In [4]:
# Everyone who is not a dependent is a taxpayer. Spouses share one id because
# it is built from the lower of the person's own pernum and the spouse
# pointer (fmin ignores a NaN sploc).
txpyrs = ipum.loc[~ipum['is_dep']].copy(deep=True)
lower_pernum = np.fmin(txpyrs['pernum'], txpyrs['sploc'])
txpyrs['x1'] = txpyrs['serial'] * 100 + lower_pernum
txpyrs['x5'] = 0
txpyrs['x23'] = np.nan


# Blank x3, x4, x6-x22, x27 and x28 on dependent rows so that they add
# nothing to their tax unit's totals for those fields.
# NOTE: `vars` shadows the builtin of the same name; the name is kept because
# it is a module-level variable.
vars = ['x%d' % i for i in [3, 4, 27, 28, *range(6, 23)]]
deps[vars] = np.nan

# Stack taxpayers and dependents back into one person-level frame.
ipum2 = pd.concat([txpyrs, deps])


# Prepare for summing over tax-unit ids: n counts person records, and the
# under-17 / under-18 dependent flags take their x-variable names.
ipum2 = ipum2.assign(n=1).rename(columns={'dep17': 'x25', 'dep18': 'x26'})

# Collapse person records to one row per tax unit (x2 = year, x1 = unit id).
unit_groups = ipum2.groupby(['x2', 'x1'])

# Amounts and counters add up across the members of a unit.
concat_sum = unit_groups[
    ['n'] + ['x' + str(i) for i in range(3, 29)]].sum()
# State code: x3 is blanked on dependent rows while n counts every member,
# so sum(x3) / n divides the state code down for any unit with dependents.
# Take the group maximum instead (NaN rows are skipped).
concat_sum['x3'] = unit_groups['x3'].max()
# x6 and x24 should be max not sum, and n is no longer necessary.
concat_sum.drop(['x6', 'x24', 'n'], axis=1, inplace=True)

concat_max = unit_groups[['x6', 'x24']].max()
# Lowest serial and pernum in the unit, carried along as x29 and x30.
concat_min = unit_groups[['serial', 'pernum']].min()
concat_min.columns = ['x29', 'x30']

concat = concat_sum.join(concat_max).join(concat_min).reset_index()

# Keep units with non-negative x19 and at least one taxpayer. x4 is 1 on
# taxpayer rows and NaN on dependent rows, so a summed x4 of 0 means the unit
# has dependents only. Each comparison is parenthesised on its own because
# `&` binds more tightly than `>`.
keep_unit = (concat.x19 >= 0) & (concat.x4 > 0)
concat = concat[keep_unit]

# Order the columns x1..x30, then give them their descriptive names.
concat = concat[['x' + str(i) for i in range(1, 31)]]

concat.columns = ['taxsimid', 'year', 'state', 'mstat', 'depx', 'page',
                  'pwages', 'swages', 'dividends', 'otherprop', 'pensions',
                  'gssi', 'transfers', 'rentpaid', 'proptax', 'otheritem',
                  'childcare', 'ui', 'depchild', 'mortgage', 'stcg', 'ltcg',
                  'dep13', 'sage', 'dep17', 'dep18', 'intrec', 'nonprop',
                  'serial', 'pernum']

# Identifier columns only: unit id plus the unit's lowest serial and pernum.
ids = concat[['taxsimid', 'serial', 'pernum']]

In [5]:
# Person-to-tax-unit lookup: one row per person record with the id of the
# unit that person was assigned to.
crosswalk = (ipum2[['serial', 'pernum', 'x1']]
             .copy(deep=True)
             .rename(columns={'x1': 'taxsimid'}))