In [55]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import norm

### Betas used in SAS scripts

In [2]:
# coefficients of the logit equations, read from the SAS data set
logits = pd.read_sas('../SASFiles/betalogit2.sas7bdat', format='sas7bdat')

In [3]:
# display the logit coefficient table loaded from the SAS file
logits

Unnamed: 0,Equation_Number,Name,LNINCOME,MARS,FAMSIZE,AGEDE,LNINTST,LNDBE,LNPENSIONS,CONSTANT
0,1.0,Capital Gains in AGI,-0.099938,0.627291,-0.046338,-0.111985,0.138229,0.301582,0.007249,-2.904612
1,2.0,Taxable IRA Distributions,0.015322,0.823563,-0.379812,1.698492,0.0,0.0,0.0,-3.144786
2,3.0,IRA Contributions,0.416388,1.158114,-0.31264,-0.402224,0.0,0.0,0.0,-7.887568
3,4.0,"Self-employed SEP, Simple",1.029694,0.363567,-0.189241,-0.61088,0.0,0.0,0.0,-14.43913
4,5.0,Self-employed Health Ins.,0.228681,0.312261,-0.14415,-0.305424,0.0,0.0,0.0,-4.218427
5,6.0,Student Loan Interest,0.288347,0.572776,-0.156224,-2.01507,0.0,0.0,0.0,-5.552857
6,7.0,Charitable Deduction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8.0,Miscellaneous Deduction,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,9.0,Other Miscellaneous Deductions,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,10.0,Child and Dependent Care Expenses,0.543203,-0.278586,-0.044728,-1.511192,0.0,0.0,0.0,-7.297078


In [4]:
# coefficients of the OLS equations, read from the SAS data set
ols = pd.read_sas('../SASFiles/betaols2.sas7bdat', format='sas7bdat')

In [5]:
# display the OLS coefficient table loaded from the SAS file
ols

Unnamed: 0,Equation_Number,Name,LNINCOME,MARS,FAMSIZE,AGEDE,LNINTST,LNDBE,LNPENSIONS,CONSTANT
0,1.0,Capital Gains in AGI,-0.097798,0.409067,0.052662,0.043151,0.216368,0.066569,-0.032706,7.011154
1,2.0,Taxable IRA Distributions,-0.04559,0.645743,-0.149783,-0.163632,0.0,0.0,0.0,8.868776
2,3.0,IRA Contributions,0.142279,0.401322,-0.12144,0.212282,0.0,0.0,0.0,6.124615
3,4.0,"Self-employed SEP, Simple",0.460756,-0.251884,0.051566,-0.106968,0.0,0.0,0.0,3.678588
4,5.0,Self-employed Health Ins.,0.110074,0.384159,0.091397,-0.14856,0.0,0.0,0.0,6.439997
5,6.0,Student Loan Interest,0.063095,0.276323,-0.050471,-0.227629,0.0,0.0,0.0,4.946671
6,7.0,Charitable Deduction,15350.81,3041.134,-659.364,-8823.67,0.0,0.0,0.0,-184018.7
7,8.0,Miscellaneous Deduction,3995.944,-258.5037,-414.3326,-3780.358,0.0,0.0,0.0,-49210.57
8,9.0,Other Miscellaneous Deductions,1756.956,-4840.531,-588.8901,-11339.82,0.0,0.0,0.0,-37577.58
9,10.0,Child and Dependent Care Expenses,0.102497,-0.3175,0.054443,-0.265133,0.0,0.0,0.0,6.439979


## Data Prep

In [6]:
# Load the 2011 PUF extract and drop every record whose MARS value is 0.
puf = pd.read_csv('../taxdata/puf_data/StatMatch/Matching/puf2011.csv')
# Take an explicit copy of the filtered rows: later cells add many new columns
# to `puf`, and assigning into a boolean-filtered frame triggers pandas'
# SettingWithCopyWarning (the assignment may target a temporary object).
puf = puf[puf['MARS'] != 0].copy()

In [7]:
# ---- regressors shared by the logit and OLS models ----

# total income: sum of the listed income items, floored at zero before the log
total_income = (puf['E00200'] + puf['E00300'] + puf['E00600'] + puf['E00650'] +
                puf['E00800'] + puf['E00900'] + puf['E01700'] + puf['E02000'] +
                puf['E02100'] + puf['E02300'] + puf['E02400'])
puf['totinc'] = total_income
puf['tincx'] = np.where(total_income <= 0, 0, total_income)
puf['lnincome'] = np.log(1. + puf['tincx'])

# interest income (E00300), floored at zero, and its log
puf['intst'] = np.where(puf['E00300'] <= 0., 0., puf['E00300'])
puf['lnintst'] = np.log(1. + puf['intst'])

# dividends (E00600 + E00650), floored at zero, and their log
dividends = puf['E00600'] + puf['E00650']
puf['dbe'] = np.where(dividends <= 0., 0., dividends)
puf['lndbe'] = np.log(1. + puf['dbe'])

# pensions (E01700), floored at zero, and their log
puf['pensions'] = np.where(puf['E01700'] <= 0., 0., puf['E01700'])
puf['lnpensions'] = np.log(1. + puf['pensions'])

# total charitable giving
puf['totcharitable'] = puf['E19800'] + puf['E20100']

# marital-status regressor: 0 when MARS is 1 or 4 (single / HOH), 1 for every
# other MARS value (treated as married filers)
puf['mars_reg'] = np.where(puf['MARS'].isin([1, 4]), 0, 1)

# family size is taken directly from XTOT
# TODO: run this by John -- believed to be the same as his method
puf['famsize'] = puf['XTOT']

# AGEDE (0, 1 or 2) is inferred from deduction amounts.
# FDED == 2 records are compared against P04470 thresholds per filing status;
# FDED == 1 records are classified by E02400 instead.
# NOTE(review): joint filers are selected with MARS == 3 here -- confirm this
# matches the MARS coding of the input file.
is_single = puf['MARS'] == 1
is_hoh = puf['MARS'] == 4
is_joint = puf['MARS'] == 3
fded_two = puf['FDED'] == 2
fded_one = puf['FDED'] == 1
deduction = puf['P04470']
e02400 = puf['E02400']

agede_one = (
    (fded_two & ((is_single & (deduction >= 7250)) |
                 (is_hoh & (deduction >= 9950)) |
                 (is_joint & (deduction >= 12750) & (deduction < 13900)))) |
    (fded_one & (((is_single | is_hoh) & (e02400 > 0)) |
                 (is_joint & (e02400 > 0) & (e02400 <= 25000))))
)
agede_two = is_joint & ((fded_two & (deduction >= 13900)) |
                        (fded_one & (e02400 > 25000)))
puf['agede'] = np.where(agede_one, 1, np.where(agede_two, 2, 0))

# intercept column for the regressions
puf['constant'] = 1.0

In [8]:
# 0/1 indicators for claiming certain deductions/income sources: a record is
# flagged when any of the listed source columns is strictly positive
indicator_sources = [
    ('cg_agi', ['E01100']),                # capital gains in agi
    ('ira_dist', ['E01400']),              # taxable ira distributions
    ('ira_con', ['E03150']),               # ira contributions
    ('sep', ['E03300']),                   # self-employed sep, simple
    ('se_hi', ['E03270']),                 # self-employed health insurance deduction
    ('charitable', ['E19800', 'E20100']),  # charitable contributions
    ('misc', ['E20400']),                  # misc. deductions
    ('cdc', ['E07220']),                   # child and dependent care credit
    ('med_exp', ['E17500']),               # medical expense deduction
    ('sl_ded', ['E03210']),                # student loan interest deduction
]
for flag_name, source_cols in indicator_sources:
    puf[flag_name] = np.where((puf[source_cols] > 0).any(axis=1), 1, 0)

## Logit Models

Statsmodels

In [9]:
# capital gains in agi: logit for cg_agi == 1; this is the only logit that
# also includes the interest, dividend and pension regressors
cg_model = sm.Logit(
    puf['cg_agi'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede',
         'lnintst', 'lndbe', 'lnpensions', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(cg_model.summary())

Optimization terminated successfully.
         Current function value: 0.068302
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 cg_agi   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163778
Method:                           MLE   Df Model:                            7
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                 0.05712
Time:                        15:22:18   Log-Likelihood:                -11187.
converged:                       True   LL-Null:                       -11865.
                                        LLR p-value:                1.800e-288
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome      -0.0079      0.008     -0.988      0.323      -0.023       0.008
mars_reg       0.1594      0.

In [10]:
# taxable ira distributions: logit for ira_dist == 1
ira_model = sm.Logit(
    puf['ira_dist'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(ira_model.summary())

Optimization terminated successfully.
         Current function value: 0.330910
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:               ira_dist   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                  0.1227
Time:                        15:22:18   Log-Likelihood:                -54198.
converged:                       True   LL-Null:                       -61779.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0258      0.003      8.577      0.000       0.020       0.032
mars_reg       2.4286      0.

In [11]:
# ira contributions: logit for ira_con == 1
irac_model = sm.Logit(
    puf['ira_con'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(irac_model.summary())

Optimization terminated successfully.
         Current function value: 0.110921
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                ira_con   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                 0.02580
Time:                        15:22:18   Log-Likelihood:                -18167.
converged:                       True   LL-Null:                       -18648.
                                        LLR p-value:                5.322e-207
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0720      0.007     10.086      0.000       0.058       0.086
mars_reg       0.9169      0.

In [12]:
# self-employed sep, simple: logit for sep == 1
sep_model = sm.Logit(
    puf['sep'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sep_model.summary())

Optimization terminated successfully.
         Current function value: 0.165334
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                    sep   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                  0.1732
Time:                        15:22:18   Log-Likelihood:                -27079.
converged:                       True   LL-Null:                       -32751.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.5554      0.007     74.206      0.000       0.541       0.570
mars_reg       0.7370      0.

In [13]:
# self-employed health insurance: logit for se_hi == 1
sehi_model = sm.Logit(
    puf['se_hi'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sehi_model.summary())

Optimization terminated successfully.
         Current function value: 0.319280
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                  se_hi   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                 0.09923
Time:                        15:22:19   Log-Likelihood:                -52294.
converged:                       True   LL-Null:                       -58054.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.2546      0.005     54.431      0.000       0.245       0.264
mars_reg       1.0688      0.

In [14]:
# child and dependent care credit: logit for cdc == 1
cdc_model = sm.Logit(
    puf['cdc'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(cdc_model.summary())

Optimization terminated successfully.
         Current function value: 0.275738
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                    cdc   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                  0.1952
Time:                        15:22:19   Log-Likelihood:                -45162.
converged:                       True   LL-Null:                       -56117.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome      -0.0632      0.003    -21.483      0.000      -0.069      -0.057
mars_reg      -1.3876      0.

In [15]:
# medical expense deduction: logit for med_exp == 1
medex_model = sm.Logit(
    puf['med_exp'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(medex_model.summary())

Optimization terminated successfully.
         Current function value: 0.217480
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                med_exp   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                 0.06202
Time:                        15:22:19   Log-Likelihood:                -35620.
converged:                       True   LL-Null:                       -37975.
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome      -0.0435      0.003    -12.509      0.000      -0.050      -0.037
mars_reg       1.1598      0.

In [16]:
# student loan interest deduction: logit for sl_ded == 1
sl_model = sm.Logit(
    puf['sl_ded'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sl_model.summary())

Optimization terminated successfully.
         Current function value: 0.182581
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 sl_ded   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                 0.01438
Time:                        15:22:19   Log-Likelihood:                -29904.
converged:                       True   LL-Null:                       -30341.
                                        LLR p-value:                1.280e-187
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome      -0.0332      0.004     -8.202      0.000      -0.041      -0.025
mars_reg      -0.3322      0.

In [17]:
# charitable contribution deduction: logit for charitable == 1
char_model = sm.Logit(
    puf['charitable'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(char_model.summary())

Optimization terminated successfully.
         Current function value: 0.453089
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:             charitable   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                  0.3429
Time:                        15:22:20   Log-Likelihood:                -74210.
converged:                       True   LL-Null:                   -1.1294e+05
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.8410      0.005    155.125      0.000       0.830       0.852
mars_reg       0.8683      0.

In [18]:
# misc. deduction: logit for misc == 1
misc_model = sm.Logit(
    puf['misc'],
    puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(misc_model.summary())

Optimization terminated successfully.
         Current function value: 0.510833
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   misc   No. Observations:               163786
Model:                          Logit   Df Residuals:                   163781
Method:                           MLE   Df Model:                            4
Date:                Mon, 16 Oct 2017   Pseudo R-squ.:                  0.1984
Time:                        15:22:20   Log-Likelihood:                -83667.
converged:                       True   LL-Null:                   -1.0438e+05
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.5405      0.004    130.367      0.000       0.532       0.549
mars_reg       0.6770      0.

## OLS Models

In [19]:
# capital gains in agi: OLS on log(1 + E01100), fitted only on the records
# flagged cg_agi == 1; uses the extended regressor set of the matching logit
puf['lncgagi'] = np.log(1. + puf.E01100)
sub_puf = puf[puf['cg_agi'] == 1]
cg_ols = sm.OLS(
    sub_puf.lncgagi,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede',
             'lnintst', 'lndbe', 'lnpensions', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(cg_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                lncgagi   R-squared:                       0.185
Model:                            OLS   Adj. R-squared:                  0.183
Method:                 Least Squares   F-statistic:                     72.68
Date:                Mon, 16 Oct 2017   Prob (F-statistic):           5.32e-95
Time:                        15:22:20   Log-Likelihood:                -4506.1
No. Observations:                2246   AIC:                             9028.
Df Residuals:                    2238   BIC:                             9074.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0283      0.021      1.320      0.1

In [20]:
# taxable ira distributions: OLS on log(1 + E01400), fitted only on the
# records flagged ira_dist == 1
puf['lnira_dist'] = np.log(1. + puf.E01400)
sub_puf = puf[puf['ira_dist'] == 1]
ira_ols = sm.OLS(
    sub_puf.lnira_dist,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(ira_ols.summary())

                            OLS Regression Results                            
Dep. Variable:             lnira_dist   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     129.8
Date:                Mon, 16 Oct 2017   Prob (F-statistic):          1.13e-109
Time:                        15:22:21   Log-Likelihood:                -43156.
No. Observations:               20509   AIC:                         8.632e+04
Df Residuals:                   20504   BIC:                         8.636e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0297      0.005      6.057      0.0

In [21]:
# ira contributions: OLS on log(1 + E03150), fitted only on the records
# flagged ira_con == 1
puf['lnira_con'] = np.log(1. + puf.E03150)
sub_puf = puf[puf['ira_con'] == 1]
irac_ols = sm.OLS(
    sub_puf.lnira_con,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(irac_ols.summary())

                            OLS Regression Results                            
Dep. Variable:              lnira_con   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.145
Method:                 Least Squares   F-statistic:                     168.5
Date:                Mon, 16 Oct 2017   Prob (F-statistic):          2.05e-133
Time:                        15:22:21   Log-Likelihood:                -5286.8
No. Observations:                3959   AIC:                         1.058e+04
Df Residuals:                    3954   BIC:                         1.062e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0853      0.006     13.834      0.0

In [22]:
# self-employed sep: OLS on log(1 + E03300), fitted only on the records
# flagged sep == 1
puf['lnsep'] = np.log(1. + puf.E03300)
sub_puf = puf[puf['sep'] == 1]
sep_ols = sm.OLS(
    sub_puf.lnsep,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sep_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                  lnsep   R-squared:                       0.060
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     131.5
Date:                Mon, 16 Oct 2017   Prob (F-statistic):          4.89e-109
Time:                        15:22:21   Log-Likelihood:                -12282.
No. Observations:                8270   AIC:                         2.457e+04
Df Residuals:                    8265   BIC:                         2.461e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.1552      0.007     22.228      0.0

In [23]:
# self-employed health insurance deduction: OLS on log(1 + E03270), fitted
# only on the records flagged se_hi == 1
puf['lnhi'] = np.log(1. + puf.E03270)
sub_puf = puf[puf['se_hi'] == 1]
sehi_ols = sm.OLS(
    sub_puf.lnhi,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sehi_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                   lnhi   R-squared:                       0.128
Model:                            OLS   Adj. R-squared:                  0.127
Method:                 Least Squares   F-statistic:                     681.9
Date:                Mon, 16 Oct 2017   Prob (F-statistic):               0.00
Time:                        15:22:22   Log-Likelihood:                -24044.
No. Observations:               18644   AIC:                         4.810e+04
Df Residuals:                   18639   BIC:                         4.814e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.0527      0.002     24.042      0.0

In [24]:
# child and dependent care credit: OLS on log(1 + E07220), fitted only on the
# records flagged cdc == 1
puf['lncdc'] = np.log(1. + puf.E07220)
sub_puf = puf[puf['cdc'] == 1]
cdc_ols = sm.OLS(
    sub_puf.lncdc,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(cdc_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                  lncdc   R-squared:                       0.165
Model:                            OLS   Adj. R-squared:                  0.165
Method:                 Least Squares   F-statistic:                     877.5
Date:                Mon, 16 Oct 2017   Prob (F-statistic):               0.00
Time:                        15:22:22   Log-Likelihood:                -23491.
No. Observations:               17713   AIC:                         4.699e+04
Df Residuals:                   17708   BIC:                         4.703e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.2973      0.009     32.668      0.0

In [25]:
# medical expenses: OLS on log(1 + E17500), fitted only on the records
# flagged med_exp == 1
puf['lnmed_exp'] = np.log(1. + puf.E17500)
sub_puf = puf[puf['med_exp'] == 1]
medex_ols = sm.OLS(
    sub_puf.lnmed_exp,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(medex_ols.summary())

                            OLS Regression Results                            
Dep. Variable:              lnmed_exp   R-squared:                       0.122
Model:                            OLS   Adj. R-squared:                  0.122
Method:                 Least Squares   F-statistic:                     351.7
Date:                Mon, 16 Oct 2017   Prob (F-statistic):          6.34e-284
Time:                        15:22:22   Log-Likelihood:                -12306.
No. Observations:               10120   AIC:                         2.462e+04
Df Residuals:                   10115   BIC:                         2.466e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.1226      0.005     23.908      0.0

In [26]:
# student loan interest deduction: OLS on log(1 + E03210), fitted only on the
# records flagged sl_ded == 1
puf['lnsl'] = np.log(1. + puf.E03210)
sub_puf = puf[puf['sl_ded'] == 1]
sl_ols = sm.OLS(
    sub_puf.lnsl,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(sl_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                   lnsl   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     4.539
Date:                Mon, 16 Oct 2017   Prob (F-statistic):            0.00116
Time:                        15:22:23   Log-Likelihood:                -12276.
No. Observations:                7463   AIC:                         2.456e+04
Df Residuals:                    7458   BIC:                         2.460e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome      -0.0099      0.010     -1.019      0.3

In [27]:
# charitable contributions deduction: OLS on log(1 + totcharitable), fitted
# only on the records flagged charitable == 1
puf['lncharitable'] = np.log(1. + puf.totcharitable)
sub_puf = puf[puf['charitable'] == 1]
char_ols = sm.OLS(
    sub_puf.lncharitable,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(char_ols.summary())

                            OLS Regression Results                            
Dep. Variable:           lncharitable   R-squared:                       0.146
Model:                            OLS   Adj. R-squared:                  0.146
Method:                 Least Squares   F-statistic:                     3193.
Date:                Mon, 16 Oct 2017   Prob (F-statistic):               0.00
Time:                        15:22:23   Log-Likelihood:            -1.4932e+05
No. Observations:               74946   AIC:                         2.986e+05
Df Residuals:                   74941   BIC:                         2.987e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.2993      0.003     94.082      0.0

In [28]:
# misc. deduction: OLS on log(1 + E20400), fitted only on the records
# flagged misc == 1
puf['lnmisc'] = np.log(1. + puf.E20400)
sub_puf = puf[puf['misc'] == 1]
misc_ols = sm.OLS(
    sub_puf.lnmisc,
    sub_puf[['lnincome', 'mars_reg', 'famsize', 'agede', 'constant']],
).fit()
# print() with a single argument works on both Python 2 and Python 3
print(misc_ols.summary())

                            OLS Regression Results                            
Dep. Variable:                 lnmisc   R-squared:                       0.062
Model:                            OLS   Adj. R-squared:                  0.062
Method:                 Least Squares   F-statistic:                     904.1
Date:                Mon, 16 Oct 2017   Prob (F-statistic):               0.00
Time:                        15:22:23   Log-Likelihood:            -1.2104e+05
No. Observations:               54774   AIC:                         2.421e+05
Df Residuals:                   54769   BIC:                         2.421e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
lnincome       0.2628      0.005     58.073      0.0

In [29]:
# Collect the fitted coefficients into one table per model family: one column
# per equation, one row per regressor. Regressors that an equation does not
# use come out as NaN and are replaced by 0.
logit_fits = [
    ('cg_logit', cg_model),
    ('ira_logit', ira_model),
    ('irac_logit', irac_model),
    ('sep_logit', sep_model),
    ('sehi_logit', sehi_model),
    ('cdc_logit', cdc_model),
    ('medex_logit', medex_model),
    ('sl_logit', sl_model),
    ('char_logit', char_model),
    ('misc_logit', misc_model),
]
ols_fits = [
    ('cg_ols', cg_ols),
    ('ira_ols', ira_ols),
    ('irac_ols', irac_ols),
    ('sep_ols', sep_ols),
    ('sehi_ols', sehi_ols),
    ('cdc_ols', cdc_ols),
    ('medex_ols', medex_ols),
    ('sl_ols', sl_ols),
    ('char_ols', char_ols),
    ('misc_ols', misc_ols),
]
logit_betas = pd.DataFrame(
    {label: fit.params for label, fit in logit_fits}
).fillna(0.)
ols_betas = pd.DataFrame(
    {label: fit.params for label, fit in ols_fits}
).fillna(0.)

In [30]:
# write both coefficient tables to CSV files under data/
for betas, out_path in ((logit_betas, 'data/logit_betas.csv'),
                        (ols_betas, 'data/ols_betas.csv')):
    betas.to_csv(out_path)