In [1]:
# imports
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
# allow plots to appear directly in the notebook
%matplotlib inline

### Question 2

In [2]:
# Display option: never truncate the column list when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Load the Stata file into a DataFrame; later cells all work from `data`.
data = pd.read_stata('progresa.dta')

In [3]:
# Recode the labelled columns as 0/1 indicators.
# Each lookup table lists the labels to replace; any value that is not a key
# is handed back unchanged (the `.get(value, value)` default).
data['poor'] = data['poor'].apply(
    lambda label: {'pobre': 1, 'no pobre': 0}.get(label, label)
)
data['progresa'] = data['progresa'].apply(
    lambda label: {'basal': 1}.get(label, label)
)
# year_dummy is a new column derived from `year`: 98 -> 1, 97 -> 0.
data['year_dummy'] = data['year'].apply(
    lambda yr: {98: 1, 97: 0}.get(yr, yr)
)

In [4]:
# Preview the first 10 rows; every column is shown because
# display.max_columns was set to None when the data was loaded.
data.head(10)

Unnamed: 0,year,sex,indig,dist_sec,sc,grc,fam_n,min_dist,dist_cap,poor,progresa,hohedu,hohwag,welfare_index,hohsex,hohage,age,village,folnum,grc97,sc97,year_dummy
0,97,1.0,0.0,4.473,1.0,6.0,7,21.168384,21.168384,1,0,6,0.0,583.0,1.0,35,12,163,2.0,6.0,1.0,0
1,98,1.0,0.0,4.473,1.0,7.0,7,21.168384,21.168384,1,0,6,0.0,583.0,1.0,35,13,163,2.0,6.0,1.0,1
2,97,0.0,0.0,3.154,0.0,6.0,6,127.11478,154.196003,1,1,4,0.0,684.0,1.0,85,14,271,4.0,6.0,0.0,0
3,98,0.0,0.0,3.154,0.0,6.0,6,127.11478,154.196003,1,1,4,0.0,684.0,1.0,85,15,271,4.0,6.0,0.0,1
4,97,1.0,1.0,1.935,1.0,5.0,10,127.657608,333.048731,1,1,0,500.0,660.0,1.0,60,16,418,10.0,5.0,1.0,0
5,98,1.0,1.0,1.935,0.0,6.0,10,127.657608,333.048731,1,1,0,500.0,660.0,1.0,60,17,418,10.0,5.0,1.0,1
6,97,1.0,1.0,1.935,0.0,6.0,9,127.657608,333.048731,1,1,0,500.0,595.599976,1.0,50,14,418,15.0,6.0,0.0,0
7,98,1.0,1.0,1.935,,,9,127.657608,333.048731,1,1,0,500.0,595.599976,1.0,50,15,418,15.0,6.0,0.0,1
8,97,1.0,1.0,1.935,1.0,5.0,7,127.657608,333.048731,1,1,6,480.0,618.5,1.0,36,14,418,23.0,5.0,1.0,0
9,98,1.0,1.0,1.935,0.0,5.0,7,127.657608,333.048731,1,1,6,480.0,618.5,1.0,36,15,418,23.0,5.0,1.0,1


In [5]:
# shape of the DataFrame as a (rows, columns) tuple
data.shape

(29096, 22)

In [6]:
# Print each column's dtype and non-null count; columns whose count is below
# the total number of entries contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29096 entries, 0 to 29095
Data columns (total 22 columns):
year             29096 non-null int8
sex              29088 non-null float64
indig            29034 non-null float64
dist_sec         29096 non-null float32
sc               26436 non-null float64
grc              26521 non-null float64
fam_n            29096 non-null int8
min_dist         29096 non-null float64
dist_cap         29096 non-null float64
poor             29096 non-null category
progresa         29096 non-null category
hohedu           29096 non-null int8
hohwag           29096 non-null float32
welfare_index    29002 non-null float32
hohsex           29084 non-null float64
hohage           29096 non-null int8
age              29096 non-null int8
village          29096 non-null int16
folnum           29096 non-null float32
grc97            29096 non-null float32
sc97             28906 non-null float32
year_dummy       29096 non-null int64
dtypes: category(2), float32

In [7]:
# Sanity check: list the distinct values left in each recoded column.
for column in ('poor', 'progresa', 'year_dummy'):
    print(data[column].unique())

[1, 0]
Categories (2, int64): [0 < 1]
[0, 1]
Categories (2, int64): [0 < 1]
[0 1]


In [8]:
#(a)
data_poor=data[data['poor']==1]
f1='sc ~ progresa + year_dummy + progresa:year_dummy + sex+ indig+dist_sec+ fam_n+ min_dist+ dist_cap+ hohedu+ hohwag+ welfare_index+ hohsex+hohage+ age '
results1 = smf.ols(f1, data=data_poor).fit(cov_type='HC3')
print(results1.summary())

                            OLS Regression Results                            
Dep. Variable:                     sc   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.361
Method:                 Least Squares   F-statistic:                     1062.
Date:                Wed, 11 Dec 2019   Prob (F-statistic):               0.00
Time:                        11:07:08   Log-Likelihood:                -9353.0
No. Observations:               21934   AIC:                         1.874e+04
Df Residuals:                   21918   BIC:                         1.887e+04
Df Model:                          15                                         
Covariance Type:                  HC3                                         
                               coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------
Intercept               

Intercept: the baseline. It is the predicted enrollment rate of the control group in 1997 (with all other covariates equal to zero).

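progresa: It is the difference in enrollment between the treatment group and the control group in 1997, i.e. the baseline gap before the program, holding the other covariates fixed.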

year_dummy: It is the change in enrollment from 1997 to 1998 for the control group.

progresa:year_dummy: It is the average treatment effect for poor households, estimated by difference-in-differences using data from 1997 and 1998.

Compared with the estimates based on the simple difference, these estimates also account for the change over time, and we assume that this change would have been the same for the treatment group and the control group. Therefore, we can say that the extra change in the treatment group over the period is the treatment effect of the program.
Yes, we find a difference: progresa:year_dummy is the treatment effect, its coefficient is about 0.06, and its p-value is essentially 0.

The counterfactual assumption underlying this regression is that, in the absence of the program, the treatment group and the control group would have followed the same trend (parallel trends), and that assignment to the groups is uncorrelated with the outcome. However, this is not always the case in reality.

In [10]:
#(b)
data_98=data[data['year']==98]
f2='sc ~ progresa + poor+ progresa:poor+ sex+ indig+dist_sec+ fam_n+ min_dist+ dist_cap+ hohedu+ hohwag+ welfare_index+ hohsex+hohage+ age '
results2 = smf.ols(f2, data=data_98).fit(cov_type='HC3')
print(results2.summary())

                            OLS Regression Results                            
Dep. Variable:                     sc   R-squared:                       0.316
Model:                            OLS   Adj. R-squared:                  0.315
Method:                 Least Squares   F-statistic:                     444.2
Date:                Wed, 11 Dec 2019   Prob (F-statistic):               0.00
Time:                        15:45:37   Log-Likelihood:                -5354.3
No. Observations:               11914   AIC:                         1.074e+04
Df Residuals:                   11898   BIC:                         1.086e+04
Df Model:                          15                                         
Covariance Type:                  HC3                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

We can estimate this regression model by restricting the data to 1998 only, and then adding progresa, poor, and the interaction of progresa and poor to the regression.

The counterfactual assumption underlying this regression is that poverty status is uncorrelated with receiving Progresa; however, this is not always the case in reality. In addition, we have to assume that the missing data are missing at random. In reality, the missing observations may belong to children who do not go to school. In that case, the regression can show a significant effect even though, in reality, giving these households the grant may not increase their likelihood of going to school.

Compared with the estimates from the previous question, these treatment effects are estimated from the 1998 data only, and they capture the differences between (1) poor and non-poor households and (2) the treatment group and the control group.

Possible explanation for differences:
The coefficient of progresa, 0.028, means that students who get the grant are more likely to go to school than those who do not get it (because of the interaction term, this main effect refers to non-poor students), perhaps because they can now go to school even though they could not afford the fees before.

The coefficient of poor, -0.031, means that poor students are less likely to go to school than students who are not poor, perhaps because poor students need to work to earn money. Thus, their enrollment is lower.

The coefficient of progresa:poor, 0.039, means that the grant raises the likelihood of going to school by an additional amount for poor students, perhaps because they can now go to school when they originally could not afford the fees.