Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streamline logic of create_distribution_table utility and improve tbi fuzzing #1972

Merged
merged 3 commits into from Apr 17, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions taxcalc/taxcalcio.py
Expand Up @@ -606,6 +606,7 @@ def write_decile_table(dfx, tfile, tkind='Totals'):
Write to tfile the tkind decile table using dfx DataFrame.
"""
dfx = add_quantile_table_row_variable(dfx, 'expanded_income', 10,
decile_details=False,
weight_by_income_measure=False)
gdfx = dfx.groupby('table_row', as_index=False)
rtns_series = gdfx.apply(unweighted_sum, 's006')
Expand Down
95 changes: 49 additions & 46 deletions taxcalc/tbi/tbi_utils.py
Expand Up @@ -8,6 +8,7 @@
from __future__ import print_function
import os
import time
import copy
import hashlib
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -289,20 +290,16 @@ def chooser(agg):
mask indices with the output for those NUM_TO_FUZZ indices being zero and
the output for all the other indices being one.
"""
# select indices of records with change in tax liability after
# a one dollar increase in income
# mark the records chosen to be fuzzed (that is, have ans value of zero)
ans = [1] * len(agg)
# select indices of records from agg
indices = np.where(agg)
if len(indices[0]) >= NUM_TO_FUZZ:
num = min(len(indices[0]), NUM_TO_FUZZ)
if num > 0:
choices = np.random.choice(indices[0], # pylint: disable=no-member
size=NUM_TO_FUZZ, replace=False)
else:
msg = ('Not enough differences in income tax when adding '
'one dollar for chunk with name: {}')
raise ValueError(msg.format(agg.name))
# mark the records chosen to be fuzzed (ans=0)
ans = [1] * len(agg)
for idx in choices:
ans[idx] = 0
size=num, replace=False)
for idx in choices:
ans[idx] = 0
return ans


Expand Down Expand Up @@ -346,13 +343,15 @@ def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
# pylint: disable=too-many-arguments
assert bin_type == 'dec' or bin_type == 'bin' or bin_type == 'agg'
if bin_type == 'dec':
df2 = add_quantile_table_row_variable(df2, imeasure, 10)
df2 = add_quantile_table_row_variable(df2, imeasure,
10, decile_details=True)
gdf2 = df2.groupby('table_row')
elif bin_type == 'bin':
df2 = add_income_table_row_variable(df2, imeasure,
bins=STANDARD_INCOME_BINS)
gdf2 = df2.groupby('table_row')
else:
df2 = add_quantile_table_row_variable(df2, imeasure, 1)
gdf2 = df2.groupby('table_row')
gdf2 = df2
if do_fuzzing:
df2['nofuzz'] = gdf2['mask'].transform(chooser)
else: # never do any results fuzzing
Expand All @@ -373,11 +372,14 @@ def create(df1, df2, bin_type, imeasure, suffix, cols_to_fuzz, do_fuzzing):
df2['expanded_income_baseline'] = df1['expanded_income']
create(df1, df2, 'dec', 'expanded_income_baseline', '_xdec',
columns_to_create, do_fuzzing)
df2_xdec = copy.deepcopy(df2)
create(df1, df2, 'bin', 'expanded_income_baseline', '_xbin',
columns_to_create, do_fuzzing)
df2_xbin = copy.deepcopy(df2)
create(df1, df2, 'agg', 'expanded_income_baseline', '_agg',
columns_to_create, do_fuzzing)
return df2
df2_aggr = copy.deepcopy(df2)
return (df2_xdec, df2_xbin, df2_aggr)


AGGR_ROW_NAMES = ['ind_tax', 'payroll_tax', 'combined_tax']
Expand All @@ -392,17 +394,18 @@ def summary(df1, df2, mask):
"""
# pylint: disable=too-many-statements,too-many-locals

df2 = create_results_columns(df1, df2, mask)
df2_xdec, df2_xbin, df2_aggr = create_results_columns(df1, df2, mask)
del df2

summ = dict()

# tax difference totals between reform and baseline
tdiff = df2['iitax_agg'] - df1['iitax']
aggr_itax_d = (tdiff * df2['s006']).sum()
tdiff = df2['payrolltax_agg'] - df1['payrolltax']
aggr_ptax_d = (tdiff * df2['s006']).sum()
tdiff = df2['combined_agg'] - df1['combined']
aggr_comb_d = (tdiff * df2['s006']).sum()
tdiff = df2_aggr['iitax_agg'] - df1['iitax']
aggr_itax_d = (tdiff * df1['s006']).sum()
tdiff = df2_aggr['payrolltax_agg'] - df1['payrolltax']
aggr_ptax_d = (tdiff * df1['s006']).sum()
tdiff = df2_aggr['combined_agg'] - df1['combined']
aggr_comb_d = (tdiff * df1['s006']).sum()
aggrd = [aggr_itax_d, aggr_ptax_d, aggr_comb_d]
summ['aggr_d'] = pd.DataFrame(data=aggrd, index=AGGR_ROW_NAMES)

Expand All @@ -414,54 +417,54 @@ def summary(df1, df2, mask):
summ['aggr_1'] = pd.DataFrame(data=aggr1, index=AGGR_ROW_NAMES)

# totals for reform
aggr_itax_2 = (df2['iitax_agg'] * df2['s006']).sum()
aggr_ptax_2 = (df2['payrolltax_agg'] * df2['s006']).sum()
aggr_comb_2 = (df2['combined_agg'] * df2['s006']).sum()
aggr_itax_2 = (df2_aggr['iitax_agg'] * df1['s006']).sum()
aggr_ptax_2 = (df2_aggr['payrolltax_agg'] * df1['s006']).sum()
aggr_comb_2 = (df2_aggr['combined_agg'] * df1['s006']).sum()
aggr2 = [aggr_itax_2, aggr_ptax_2, aggr_comb_2]
summ['aggr_2'] = pd.DataFrame(data=aggr2, index=AGGR_ROW_NAMES)

# create difference tables grouped by xdec
df2['iitax'] = df2['iitax_xdec']
df2_xdec['iitax'] = df2_xdec['iitax_xdec']
summ['diff_itax_xdec'] = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xdec,
groupby='weighted_deciles',
income_measure='expanded_income',
tax_to_diff='iitax')

df2['payrolltax'] = df2['payrolltax_xdec']
df2_xdec['payrolltax'] = df2_xdec['payrolltax_xdec']
summ['diff_ptax_xdec'] = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xdec,
groupby='weighted_deciles',
income_measure='expanded_income',
tax_to_diff='payrolltax')

df2['combined'] = df2['combined_xdec']
df2_xdec['combined'] = df2_xdec['combined_xdec']
summ['diff_comb_xdec'] = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xdec,
groupby='weighted_deciles',
income_measure='expanded_income',
tax_to_diff='combined')

# create difference tables grouped by xbin
df2['iitax'] = df2['iitax_xbin']
df2_xbin['iitax'] = df2_xbin['iitax_xbin']
diff_itax_xbin = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xbin,
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='iitax')
summ['diff_itax_xbin'] = diff_itax_xbin

df2['payrolltax'] = df2['payrolltax_xbin']
df2_xbin['payrolltax'] = df2_xbin['payrolltax_xbin']
diff_ptax_xbin = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xbin,
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='payrolltax')
summ['diff_ptax_xbin'] = diff_ptax_xbin

df2['combined'] = df2['combined_xbin']
df2_xbin['combined'] = df2_xbin['combined_xbin']
diff_comb_xbin = \
create_difference_table(df1, df2,
create_difference_table(df1, df2_xbin,
groupby='standard_income_bins',
income_measure='expanded_income',
tax_to_diff='combined')
Expand All @@ -474,13 +477,13 @@ def summary(df1, df2, mask):
result_type='weighted_sum')

suffix = '_xdec'
df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)]
df2_cols_with_suffix = [c for c in list(df2_xdec) if c.endswith(suffix)]
for col in df2_cols_with_suffix:
root_col_name = col.replace(suffix, '')
df2[root_col_name] = df2[col]
df2['expanded_income_baseline'] = df1['expanded_income']
df2_xdec[root_col_name] = df2_xdec[col]
df2_xdec['expanded_income_baseline'] = df1['expanded_income']
summ['dist2_xdec'] = \
create_distribution_table(df2, groupby='weighted_deciles',
create_distribution_table(df2_xdec, groupby='weighted_deciles',
income_measure='expanded_income_baseline',
result_type='weighted_sum')

Expand All @@ -492,13 +495,13 @@ def summary(df1, df2, mask):
summ['dist1_xbin'] = dist1_xbin

suffix = '_xbin'
df2_cols_with_suffix = [c for c in list(df2) if c.endswith(suffix)]
df2_cols_with_suffix = [c for c in list(df2_xbin) if c.endswith(suffix)]
for col in df2_cols_with_suffix:
root_col_name = col.replace(suffix, '')
df2[root_col_name] = df2[col]
df2['expanded_income_baseline'] = df1['expanded_income']
df2_xbin[root_col_name] = df2_xbin[col]
df2_xbin['expanded_income_baseline'] = df1['expanded_income']
dist2_xbin = \
create_distribution_table(df2, groupby='standard_income_bins',
create_distribution_table(df2_xbin, groupby='standard_income_bins',
income_measure='expanded_income_baseline',
result_type='weighted_sum')
summ['dist2_xbin'] = dist2_xbin
Expand Down