-
Notifications
You must be signed in to change notification settings - Fork 30
/
finalprep.py
325 lines (283 loc) · 11.2 KB
/
finalprep.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
import numpy as np
import pandas as pd
import taxcalc as tc
from .impute_pencon import impute_pension_contributions
from .constants import UNUSED_READ_VARS
from pathlib import Path
CUR_PATH = Path(__file__).resolve().parent
USABLE_VARS = tc.Records(data=None).USABLE_READ_VARS
USABLE_VARS.add("filer")
def finalprep(data):
"""
Contains all the logic of the puf_data/finalprep.py script.
"""
# - Check the PUF year
max_flpdyr = max(data["flpdyr"])
if max_flpdyr == 2008:
data = transform_2008_varnames_to_2009_varnames(data)
else: # if PUF year is 2009+
data = age_consistency(data)
# - Make recid variable be a unique integer key:
data = create_new_recid(data)
# - Make several variable names be uppercase as in SOI PUF:
data = capitalize_varnames(data)
# - Impute cmbtp variable to estimate income on Form 6251 but not in AGI:
cmbtp_standard = data["e62100"] - data["e00100"] + data["e00700"]
zero = np.zeros(len(data.index))
medical_limit = np.maximum(
zero, data["e17500"] - np.maximum(zero, data["e00100"]) * 0.075
)
med_adj = np.minimum(medical_limit, 0.025 * np.maximum(zero, data["e00100"]))
stx_adj = np.maximum(zero, data["e18400"])
cmbtp_itemizer = (
cmbtp_standard
+ data["p04470"]
+ data["e21040"]
- data["e18500"]
- data["e20800"]
- stx_adj
- med_adj
)
cmbtp = np.where(data["FDED"] == 1, cmbtp_itemizer, cmbtp_standard)
data["cmbtp"] = np.where(data["f6251"] == 1, cmbtp, 0.0)
# - Split earnings variables into taxpayer (p) and spouse (s) amounts:
data = split_earnings_variables(data, max_flpdyr)
# - Add AGI bin indicator used for adjustment factors:
data = add_agi_bin(data)
# - Replace e20500 with g20500:
data = replace_20500(data)
data["s006"] = data["matched_weight"] * 100
# - Remove variables not expected by Tax-Calculator:
data = data.filter(USABLE_VARS, axis=1)
data["blind_head"] = np.where(data["blind_head"] == 1, 1, 0)
data["blind_spouse"] = np.where(data["blind_spouse"] == 1, 1, 0)
# - Convert data to integers:
data = data.round(0).astype("int64")
# - Impute pension contributions:
data = impute_pension_contributions(data.copy(), max_flpdyr)
# - Rename 'filer' to 'data_source'
data = data.rename(columns={"filer": "data_source"})
# - Sort columns to ensure every PUF is the same
data.sort_index(axis=1, inplace=True)
return data
# end of main function code
def create_new_recid(data):
"""
Construct unique recid.
"""
data["recid"] = data.index + 1
return data
def age_consistency(data):
"""
Construct age_head from agerange if available; otherwise use CPS value.
Construct age_spouse as a normally-distributed agediff from age_head.
"""
# set random-number-generator seed so that always get same random numbers
np.random.seed(seed=123456789)
# generate random integers to smooth age distribution in agerange
shape = data["age_head"].shape
agefuzz8 = np.random.randint(0, 9, size=shape)
agefuzz9 = np.random.randint(0, 10, size=shape)
agefuzz10 = np.random.randint(0, 11, size=shape)
agefuzz15 = np.random.randint(0, 16, size=shape)
# assign age_head using agerange midpoint or CPS age if agerange absent
data["age_head"] = np.where(
data["agerange"] == 0,
data["age_head"],
(data["agerange"] + 1 - data["dsi"]) * 10,
)
# smooth the agerange-based age_head within each agerange
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 1, data["dsi"] == 0),
data["age_head"] - 3 + agefuzz9,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 2, data["dsi"] == 0),
data["age_head"] - 4 + agefuzz9,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 3, data["dsi"] == 0),
data["age_head"] - 5 + agefuzz10,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 4, data["dsi"] == 0),
data["age_head"] - 5 + agefuzz10,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 5, data["dsi"] == 0),
data["age_head"] - 5 + agefuzz10,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 6, data["dsi"] == 0),
data["age_head"] - 5 + agefuzz15,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 1, data["dsi"] == 1),
data["age_head"] - 0 + agefuzz8,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 2, data["dsi"] == 1),
data["age_head"] - 2 + agefuzz8,
data["age_head"],
)
data["age_head"] = np.where(
np.logical_and(data["agerange"] == 3, data["dsi"] == 1),
data["age_head"] - 4 + agefuzz10,
data["age_head"],
)
# convert zero age_head to one
data["age_head"] = np.where(data["age_head"] == 0, 1, data["age_head"])
# assign age_spouse relative to age_head if married;
# if head is not married, set age_spouse to zero;
# if head is married but has unknown age, set age_spouse to one;
# do not specify age_spouse values below 15
adiff = np.random.normal(0.0, 4.0, size=shape)
agediff = np.int_(adiff.round())
age_sp = data["age_head"] + agediff
age_spouse = np.where(age_sp < 15, 15, age_sp)
data["age_spouse"] = np.where(
data["mars"] == 2, np.where(data["age_head"] == 1, 1, age_spouse), 0
)
return data
def capitalize_varnames(data):
"""
Capitalize some variable names.
"""
renames = {
"dsi": "DSI",
"eic": "EIC",
"fded": "FDED",
"flpdyr": "FLPDYR",
"mars": "MARS",
"midr": "MIDR",
"xtot": "XTOT",
"recid": "RECID",
}
data = data.rename(columns=renames)
return data
def transform_2008_varnames_to_2009_varnames(data):
"""
Convert 2008 IRS-SOI PUF variable names into 2009 PUF variable names.
"""
data["e18400"] = data["e18425"] + data["e18450"]
# drop unused variables only existing in 2008 IRS-SOI PUF
UNUSED = {
"e18425",
"e18450",
"e25370",
"e25380",
"state",
"e87500",
"e87510",
"e87520",
"e87540",
}
data = data.drop(UNUSED, 1)
# drop variables not expected by Tax-Calculator
data = data.drop(UNUSED_READ_VARS, 1)
return data
def split_earnings_variables(data, data_year):
"""
Split earnings subject to FICA or SECA taxation between taxpayer and spouse
"""
# split wage-and-salary earnings subject to FICA taxation
# the two e00200x variables come from the CPS. We'll use them just for
# the wage ratio that we split up the PUF wages from
total = np.where(data["MARS"] == 2, data["e00200p"] + data["e00200s"], 0).astype(
float
)
frac_p = np.where(total != 0, data["e00200p"] / total, 1.0)
frac_s = 1.0 - frac_p
data["e00200p"] = np.around(frac_p * data["e00200"], 2)
data["e00200s"] = np.around(frac_s * data["e00200"], 2)
# specify FICA-SECA maximum taxable earnings (mte) for data_year
# TODO: add these to a parameter file
if data_year == 2008:
mte = 102000
elif data_year == 2009:
mte = 106800
elif data_year == 2011:
mte = 106800
else:
raise ValueError("illegal SOI PUF data year {}".format(data_year))
# total self-employment earnings subject to SECA taxation
# (minimum handles a few secatip values slightly over the mte cap)
secatip = np.minimum(mte, data["e30400"]) # for taxpayer
secatis = np.minimum(mte, data["e30500"]) # for spouse
# split self-employment earnings subject to SECA taxation
# ... compute secati?-derived frac_p and frac_s
total = np.where(data["MARS"] == 2, secatip + secatis, 0).astype(float)
frac_p = np.where(total != 0, secatip / total, 1.0)
frac_s = 1.0 - frac_p
# ... split e00900 (Schedule C) and e02100 (Schedule F) net earnings/loss
data["e00900p"] = np.around(frac_p * data["e00900"], 2)
data["e00900s"] = np.around(frac_s * data["e00900"], 2)
data["e02100p"] = np.around(frac_p * data["e02100"], 2)
data["e02100s"] = np.around(frac_s * data["e02100"], 2)
# ... estimate Schedule K-1 box 14 self-employment earnings/loss
# ... Note: secati? values fall in the [0,mte] range.
# ... So, if sum of e00900? and e02100? is negative and secati? is
# ... zero, we make a conservative assumption and set box14 to zero
# ... (rather than to a positive number), but we allow the estimate
# ... of box 14 to be negative (that is, represent a loss).
nonbox14 = data["e00900p"] + data["e02100p"]
box14 = np.where(
np.logical_and(nonbox14 <= 0, secatip <= 0), 0.0, secatip - nonbox14
)
data["k1bx14p"] = box14.round(2)
nonbox14 = data["e00900s"] + data["e02100s"]
box14 = np.where(
np.logical_and(nonbox14 <= 0, secatis <= 0), 0.0, secatis - nonbox14
)
data["k1bx14s"] = box14.round(2)
# ... check consistency of self-employment earnings estimates
raw = data["e00900p"] + data["e02100p"] + data["k1bx14p"]
estp = np.where(raw < 0, 0.0, np.where(raw > mte, mte, raw))
raw = data["e00900s"] + data["e02100s"] + data["k1bx14s"]
ests = np.where(raw < 0, 0.0, np.where(raw > mte, mte, raw))
assert np.allclose(estp, secatip, rtol=0.0, atol=0.01)
assert np.allclose(ests, secatis, rtol=0.0, atol=0.01)
return data
def add_agi_bin(data):
"""
Add an AGI bin indicator used in Tax-Calc to apply adjustment factors
"""
agi = pd.Series([0] * len(data.e00100))
agi[data.e00100 < 0] = 0
agi[(data.e00100 >= 0) & (data.e00100 < 5000)] = 1
agi[(data.e00100 >= 5000) & (data.e00100 < 10000)] = 2
agi[(data.e00100 >= 10000) & (data.e00100 < 15000)] = 3
agi[(data.e00100 >= 15000) & (data.e00100 < 20000)] = 4
agi[(data.e00100 >= 20000) & (data.e00100 < 25000)] = 5
agi[(data.e00100 >= 25000) & (data.e00100 < 30000)] = 6
agi[(data.e00100 >= 30000) & (data.e00100 < 40000)] = 7
agi[(data.e00100 >= 40000) & (data.e00100 < 50000)] = 8
agi[(data.e00100 >= 50000) & (data.e00100 < 75000)] = 9
agi[(data.e00100 >= 75000) & (data.e00100 < 100000)] = 10
agi[(data.e00100 >= 100000) & (data.e00100 < 200000)] = 11
agi[(data.e00100 >= 200000) & (data.e00100 < 500000)] = 12
agi[(data.e00100 >= 500000) & (data.e00100 < 1e6)] = 13
agi[(data.e00100 >= 1e6) & (data.e00100 < 1.5e6)] = 14
agi[(data.e00100 >= 1.5e6) & (data.e00100 < 2e6)] = 15
agi[(data.e00100 >= 2e6) & (data.e00100 < 5e6)] = 16
agi[(data.e00100 >= 5e6) & (data.e00100 < 1e7)] = 17
agi[(data.e00100 >= 1e7)] = 18
data["agi_bin"] = agi
return data
def replace_20500(data):
"""
Replace e20500, net casualty losses, with g20500, gross casualty losses
(gross loss values less than 10% AGI are unknown and assumed to be zero)
"""
gross = np.where(
data.e20500 > 0.0, data.e20500 + 0.10 * np.maximum(0.0, data.e00100), 0.0
)
data["g20500"] = np.int_(gross.round())
return data