In [7]:
import numpy as np
import pandas as pd

import statsmodels.api as sm
# Load the dataset from the Stata file 'eitc.dta' into a pandas DataFrame.
# The path is relative, so the file is looked up from the kernel's current
# working directory.
data = pd.read_stata('eitc.dta')
# Bare expression on the cell's last line: the notebook renders the frame.
data

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.000000
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203
2,13.0,1991.0,6.4,2,0,8178.193833,0.000000,33,11,0,8.178194
3,14.0,1991.0,9.1,0,1,9369.570485,0.000000,43,11,0,9.369570
4,15.0,1991.0,8.6,3,1,14706.607930,14706.607930,23,7,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
13741,95.0,1996.0,6.4,0,0,6370.898662,0.000000,41,9,0,6.370899
13742,95.0,1996.0,6.4,0,0,30704.703633,30688.336520,42,10,1,0.016367
13743,95.0,1996.0,6.4,2,0,43495.602294,43475.143403,53,3,0,0.020459
13744,95.0,1996.0,6.4,0,0,46850.860421,41326.959847,28,11,1,5.523901


In [15]:
# Build the 0/1 indicator columns used by the regressions below.
# post93     -> 1 when `year` is strictly greater than 1993, else 0
# mom        -> 1 when `children` is strictly greater than 0, else 0
# mom_post93 -> interaction term: 1 only when both indicators are 1
data['post93'] = data['year'].gt(1993).astype(int)
data['mom'] = data['children'].gt(0).astype(int)
data['mom_post93'] = data['mom'].mul(data['post93'])
# Show the frame with the three new columns appended.
data

Unnamed: 0,state,year,urate,children,nonwhite,finc,earn,age,ed,work,unearn,post93,mom,mom_post93
0,11.0,1991.0,7.6,0,1,18714.394273,18714.394273,26,10,1,0.000000,0,0,0
1,12.0,1991.0,7.2,1,0,4838.568282,471.365639,22,9,1,4.367203,0,1,0
2,13.0,1991.0,6.4,2,0,8178.193833,0.000000,33,11,0,8.178194,0,1,0
3,14.0,1991.0,9.1,0,1,9369.570485,0.000000,43,11,0,9.369570,0,0,0
4,15.0,1991.0,8.6,3,1,14706.607930,14706.607930,23,7,1,0.000000,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13741,95.0,1996.0,6.4,0,0,6370.898662,0.000000,41,9,0,6.370899,1,0,0
13742,95.0,1996.0,6.4,0,0,30704.703633,30688.336520,42,10,1,0.016367,1,0,0
13743,95.0,1996.0,6.4,2,0,43495.602294,43475.143403,53,3,0,0.020459,1,1,1
13744,95.0,1996.0,6.4,0,0,46850.860421,41326.959847,28,11,1,5.523901,1,0,0


In [16]:
# Pull the outcome and the regressors out of the frame as plain NumPy arrays:
# `y` is the `work` column, `X` holds the two indicators and their interaction.
y = data['work'].to_numpy()
X = data[['post93', 'mom', 'mom_post93']].to_numpy()

In [17]:
# Logistic regression of `y` on `X`.
# An intercept column is added to the design matrix first; `X` is rebound to
# the augmented matrix.
X = sm.add_constant(X)
model1 = sm.Logit(y, X).fit()

# Render the fit summary with readable labels. The labels are positional:
# the intercept comes first, then the columns of X in their original order.
model1.summary(
    yname="Work",
    xname=("intercept", "After 1993", "Is mom", "Mom after 1993"),
    title="Impact of tax credit on employment - model1",
)

Optimization terminated successfully.
         Current function value: 0.686491
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13742.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 25 Aug 2022",Pseudo R-squ.:,0.009118
Time:,14:38:01,Log-Likelihood:,-9436.5
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,2.058e-37

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.3042,0.036,8.443,0.000,0.234,0.375
After 1993,-0.0085,0.053,-0.161,0.872,-0.112,0.095
Is mom,-0.5212,0.047,-10.985,0.000,-0.614,-0.428
Mom after 1993,0.1885,0.070,2.708,0.007,0.052,0.325


In [18]:
# Second specification: the three indicator columns used before plus two
# additional regressors, `nonwhite` and `ed`.
# Only the design matrix is rebuilt here; the outcome vector `y` is not
# reassigned in this cell and is taken as-is from the surrounding namespace.
covariates = ['post93', 'mom', 'mom_post93', 'nonwhite', 'ed']
X = data[covariates].to_numpy()

# Add the intercept column, then fit the logit.
X = sm.add_constant(X)
model2 = sm.Logit(y, X).fit()

# Labels are positional: intercept first, then one per entry of `covariates`.
model2.summary(
    yname="Work",
    xname=(
        "intercept",
        "After 1993",
        "Is mom",
        "Mom after 1993",
        "Hispanic or Black",
        "Years of education",
    ),
    title="Impact of tax credit on employment - model2",
)

Optimization terminated successfully.
         Current function value: 0.680664
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,13746.0
Model:,Logit,Df Residuals:,13740.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 25 Aug 2022",Pseudo R-squ.:,0.01753
Time:,14:38:01,Log-Likelihood:,-9356.4
converged:,True,LL-Null:,-9523.3
Covariance Type:,nonrobust,LLR p-value:,5.205e-70

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.1687,0.071,-2.367,0.018,-0.308,-0.029
After 1993,-0.0046,0.053,-0.086,0.932,-0.108,0.099
Is mom,-0.5287,0.048,-10.986,0.000,-0.623,-0.434
Mom after 1993,0.1973,0.070,2.817,0.005,0.060,0.335
Hispanic or Black,-0.2199,0.036,-6.129,0.000,-0.290,-0.150
Years of education,0.0687,0.007,10.270,0.000,0.056,0.082


In [20]:
# Placebo set-up: move the cut-off one year earlier.
# post92     -> 1 when `year` is strictly greater than 1992, else 0
# mom_post92 -> interaction of the existing `mom` indicator with post92
data['post92'] = data['year'].gt(1992).astype(int)
data['mom_post92'] = data['mom'].mul(data['post92'])

# Keep only rows whose `year` is strictly below 1994. The subset is taken
# after the two columns above are created, so it carries them as well.
placebo = data.loc[data['year'] < 1994]

In [21]:
# Outcome and regressors for the placebo regression, as NumPy arrays taken
# from the restricted `placebo` frame.
y_placebo = placebo['work'].to_numpy()
X_placebo = placebo[['post92', 'mom', 'mom_post92']].to_numpy()

In [22]:
# Placebo logistic regression: same form as the first model, but using the
# 1992 cut-off columns and the restricted sample.
X_placebo = sm.add_constant(X_placebo)
model_placebo = sm.Logit(y_placebo, X_placebo).fit()

# Labels are positional: intercept first, then the columns of X_placebo.
model_placebo.summary(
    yname="Work",
    xname=("intercept", "After 1992", "Is mom", "Mom after 1992"),
    title="Impact of tax credit on employment - model placebo",
)


Optimization terminated successfully.
         Current function value: 0.684872
         Iterations 4


0,1,2,3
Dep. Variable:,Work,No. Observations:,7401.0
Model:,Logit,Df Residuals:,7397.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 25 Aug 2022",Pseudo R-squ.:,0.01193
Time:,14:44:03,Log-Likelihood:,-5068.7
converged:,True,LL-Null:,-5130.0
Covariance Type:,nonrobust,LLR p-value:,2.29e-26

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,0.3124,0.044,7.154,0.000,0.227,0.398
After 1992,-0.0259,0.077,-0.335,0.737,-0.177,0.126
Is mom,-0.5138,0.057,-8.950,0.000,-0.626,-0.401
Mom after 1992,-0.0239,0.102,-0.234,0.815,-0.224,0.176
