In [11]:
### load required packages
import pandas
import numpy
import statsmodels.api
import sklearn.linear_model

# Low dimensional data

In [12]:
### read data
lowDim_data = pandas.read_csv('../data/lowDim_dataset.csv')


## 1. Propensity score estimation
## 1.1 Select variables to estimate propensity score

Note: Select criteria is t-statistics. We keep the covariate with has t-statistics larger than t_prop=2. 

In [13]:
### perfom K logistic regression to select for V (covariates for propensity score estimation)

K = lowDim_data.shape[1]-2
t_prop = 2 # threshold to select covariate
T = lowDim_data[['A']]

t_stat_prop = numpy.array(())
for i in range(1, K+1):
    idx = 'V'+str(i)
    X = lowDim_data[[idx]]

    log_reg = statsmodels.api.Logit(T, X).fit() 
    cov = log_reg.cov_params()
    std_err = numpy.sqrt(numpy.diag(cov))
    t_stat_prop = numpy.append(t_stat_prop,numpy.array(round(log_reg.params / std_err,2)))


Optimization terminated successfully.
         Current function value: 0.684936
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.686798
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.692697
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.677862
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.687807
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.689411
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691128
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.665423
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.680357
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.691334
  

## 1.2 Estimate propensity score with L2 logistic

In [14]:
### propensity score estimation with logistic regression with L2 penalty
# V is a subset of covariates (selected by t_stat_prop)
V = lowDim_data.iloc[:,2:].iloc[:,abs(t_stat_prop)>=t_prop]
T = numpy.array(T).ravel()

clf = sklearn.linear_model.LogisticRegression(random_state=0, solver='lbfgs', penalty='l2', C=1.0).fit(V, T)
PS = clf.predict_proba(V)[:,1]
T

array([0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,

## 1.3 Calculate weights according to PS

In [15]:
### calculate weights
weights = T/PS + (1-T)/(1-PS)

## 2. ATE estimation
## 2.1 Select variables to estimate ATE

Note: Select criteria is t-statistics. We keep the covariate with has t-statistics larger than t_reg=2. 

In [16]:
### perform K linear regressions to select for Z
t_reg = 2 # threshold to select covariate
Y = lowDim_data[['Y']]

t_stat_reg = numpy.array(())
for i in range(1, K+1):
    idx = 'V'+str(i)
    X = lowDim_data[['A',idx]]
    X = statsmodels.api.add_constant(X) # adding a constant
    
    linear_reg = statsmodels.api.OLS(Y, X).fit() 
    cov = linear_reg.cov_params()
    std_err = numpy.sqrt(numpy.diag(cov))
    t_stat_reg = numpy.append(t_stat_reg,numpy.array(round(linear_reg.params[2] / std_err[2],2)))

## 2.2 Estimate ATE with weighted linear regression 

$Y_i = \alpha_0+\tau T_i+\alpha_{11}Z_{i1}+\alpha_{12}Z_{i2}+ ... +\alpha_{1m}Z_{im}+\alpha_{21}(Z_{i1}-\bar{Z_1})T_i+\alpha_{22}(Z_{i2}-\bar{Z_2})T_i+...+\alpha_{2m}(Z_{im}-\bar{Z_m})T_i+\epsilon_i$

In [17]:
### weighted least square linear regression on selected covariates Z

# construct required independent variables

T = lowDim_data['A']

Z = lowDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg]
centralized_Z = lowDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg] - \
                lowDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg].mean(axis=0)
centralized_Z_T = centralized_Z.multiply(T, axis='index')

X_train = pandas.concat([T,Z,centralized_Z,centralized_Z_T],axis=1)
Y_train = numpy.array(Y).ravel()

In [19]:
reg = sklearn.linear_model.LinearRegression(fit_intercept=True).fit(X_train, Y_train, sample_weight=weights)
ATE = reg.coef_[0]
print("Estimated ATE for low-dim data is {:.2f}".format(ATE))

Estimated ATE for low-dim data is 2.51


# Repeat for high dimensional data

In [20]:
### read data
highDim_data = pandas.read_csv('../data/highDim_dataset.csv')
highDim_data

Unnamed: 0,Y,A,V1,V2,V3,V4,V5,V6,V7,V8,...,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185
0,-11.682472,1,0,1,2,16,3,-1,13,-0.13,...,5,7,8,6,8,-1,-1,-1,-1,-1
1,-13.176546,0,1,1,12,14,14,14,13,0.24,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,-2.195401,1,0,1,21,22,10,10,14,0.27,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,-0.005454,1,1,1,9,20,11,2,10,0.09,...,-10,-10,-10,-10,-10,-10,-10,-10,-10,-10
4,-1.987538,1,1,1,7,16,16,11,6,0.15,...,70,70,80,70,80,-10,-10,-10,-10,-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-15.054763,0,0,1,14,18,19,19,10,0.36,...,7,8,9,10,9,6,8,8,10,8
1996,-8.310797,0,0,1,21,22,15,15,12,0.02,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1997,-15.645703,0,1,1,14,20,5,5,3,0.56,...,6,6,8,7,5,6,5,7,7,6
1998,-18.302366,1,0,1,2,16,7,-1,11,0.50,...,7,8,8,7,6,-1,-1,-1,-1,-1


In [21]:
### perfom K logistic regression to select for V (covariates for propensity score estimation)

K = highDim_data.shape[1]-2
t_prop = 2 # threshold to select covariate
T = highDim_data[['A']]

t_stat_prop = numpy.array(())
for i in range(1, K+1):
    idx = 'V'+str(i)
    X = highDim_data[[idx]]

    log_reg = statsmodels.api.Logit(T, X).fit() 
    cov = log_reg.cov_params()
    std_err = numpy.sqrt(numpy.diag(cov))
    t_stat_prop = numpy.append(t_stat_prop,numpy.array(round(log_reg.params / std_err,2)))

Optimization terminated successfully.
         Current function value: 0.688663
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.689171
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.688522
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.688663
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690168
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690028
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.689965
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.691124
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.691320
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.686929
  

Optimization terminated successfully.
         Current function value: 0.683647
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.681713
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684674
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.683030
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.679895
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.684122
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.690207
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.687970
         Iterations 4
Optimization terminated successfully.
         Current function value: 0.692716
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692743
  

Optimization terminated successfully.
         Current function value: 0.692573
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692676
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692664
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692434
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.692385
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690669
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690289
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690468
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.690611
         Iterations 3
Optimization terminated successfully.
         Current function value: 0.691122
  

In [22]:
### propensity score estimation with logistic regression with L2 penalty
# V is a subset of covariates (selected by t_stat_prop)
V = highDim_data.iloc[:,2:].iloc[:,abs(t_stat_prop)>=t_prop]
T = numpy.array(T).ravel()

clf = sklearn.linear_model.LogisticRegression(random_state=0, solver='newton-cg', penalty='l2', C=1.0).fit(V, T)
PS = clf.predict_proba(V)[:,1]

In [23]:
### calculate weights
weights = T/PS + (1-T)/(1-PS)

In [24]:
### perform K linear regressions to select for Z
t_reg = 2 # threshold to select covariate
Y = highDim_data[['Y']]

t_stat_reg = numpy.array(())
for i in range(1, K+1):
    idx = 'V'+str(i)
    X = highDim_data[['A',idx]]
    X = statsmodels.api.add_constant(X) # adding a constant
    
    linear_reg = statsmodels.api.OLS(Y, X).fit() 
    cov = linear_reg.cov_params()
    std_err = numpy.sqrt(numpy.diag(cov))
    t_stat_reg = numpy.append(t_stat_reg,numpy.array(round(linear_reg.params[2] / std_err[2],2)))

In [29]:
### weighted least square linear regression on selected covariates Z

# construct required independent variables

T = highDim_data['A']

Z = highDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg]
centralized_Z = highDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg] - \
                highDim_data.iloc[:,2:].iloc[:,abs(t_stat_reg)>t_reg].mean(axis=0)
centralized_Z_T = centralized_Z.multiply(T, axis='index')

X_train = pandas.concat([T,Z,centralized_Z,centralized_Z_T],axis=1)
Y_train = numpy.array(Y).ravel()
X_train

Unnamed: 0,A,V3,V6,V7,V10,V11,V12,V14,V16,V17,...,V176,V177,V178,V179,V180,V181,V182,V183,V184,V185
0,1,2,-1,13,21,2,15.00,15.00,5.00,15.00,...,-5.229,-4.2,-4.06,-4.7535,-2.689,-0.231,-0.4165,-0.496,-0.346,-0.3035
1,0,12,14,13,22,2,10.00,20.00,15.00,15.00,...,-0.000,-0.0,-0.00,-0.0000,-0.000,-0.000,-0.0000,-0.000,-0.000,-0.0000
2,1,21,10,14,24,2,20.00,20.00,10.00,10.00,...,-11.229,-12.2,-13.06,-11.7535,-11.689,-0.231,-0.4165,-0.496,-0.346,-0.3035
3,1,9,2,10,22,2,16.00,18.00,18.00,8.00,...,-20.229,-21.2,-22.06,-20.7535,-20.689,-9.231,-9.4165,-9.496,-9.346,-9.3035
4,1,7,11,6,28,4,14.29,16.33,18.37,16.33,...,59.771,58.8,67.94,59.2465,69.311,-9.231,-9.4165,-9.496,-9.346,-9.3035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,14,19,10,29,2,25.00,20.00,10.00,10.00,...,-0.000,-0.0,-0.00,-0.0000,-0.000,0.000,0.0000,0.000,0.000,0.0000
1996,0,21,15,12,22,2,30.00,20.00,15.00,10.00,...,-0.000,-0.0,-0.00,-0.0000,-0.000,-0.000,-0.0000,-0.000,-0.000,-0.0000
1997,0,14,5,3,23,4,20.00,20.00,20.00,0.00,...,-0.000,-0.0,-0.00,-0.0000,-0.000,0.000,0.0000,0.000,0.000,0.0000
1998,1,2,-1,11,28,2,40.00,20.00,5.00,10.00,...,-3.229,-3.2,-4.06,-3.7535,-4.689,-0.231,-0.4165,-0.496,-0.346,-0.3035


In [27]:
reg = sklearn.linear_model.LogisticRegression(fit_intercept=True).fit(X_train, Y_train, sample_weight=weights)
ATE = reg.coef_[0]
print("Estimated ATE for high-dim data is {:.2f}".format(ATE))

ValueError: Unknown label type: 'continuous'