In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import copy
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
def Stratification(data, ps, n_strata=5):
    """Stratified difference-in-means estimate of the effect of ``A`` on ``Y``.

    Rows are binned into ``n_strata`` equal-frequency strata by ``ps`` using
    ``pd.qcut``. Within each stratum the mean of ``Y`` over rows with
    ``A == 0`` is subtracted from the mean of ``Y`` over rows with ``A == 1``;
    the per-stratum differences are then combined with weights ``Nj / N``
    (stratum size over total number of rows).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'A'`` and ``'Y'``. It is not modified.
    ps : array-like
        One score per row of ``data``, used only to form the strata.
        The stratum labels returned by ``pd.qcut`` are assigned to a copy of
        ``data`` as-is, so an array is matched by position while a Series is
        matched by index.
    n_strata : int, default 5
        Number of quantile strata. The default reproduces the original
        quintile split ``[0, .2, .4, .6, .8, 1.]``.

    Returns
    -------
    float
        The weighted sum of per-stratum mean differences.

    Raises
    ------
    ValueError
        If ``n_strata`` is smaller than 1. ``pd.qcut`` may also raise
        ``ValueError`` when ties in ``ps`` produce duplicate bin edges.
    ZeroDivisionError
        If a stratum has no rows with ``A == 1`` or no rows with ``A == 0``,
        because the corresponding mean is undefined.
    """
    if n_strata < 1:
        raise ValueError("n_strata must be a positive integer, got %r" % (n_strata,))

    # Select the two needed columns *before* copying so that wide covariate
    # matrices are not duplicated just to be thrown away.
    frame = data[['A', 'Y']].copy()

    # k / n_strata yields exactly the literals 0, .2, .4, .6, .8, 1. for n_strata=5.
    quantiles = [k / n_strata for k in range(n_strata + 1)]
    frame['stratum'] = pd.qcut(x=ps, q=quantiles, labels=False)

    total_rows = frame.shape[0]
    ATE = 0.0
    for j in range(n_strata):
        block = frame[frame['stratum'] == j]
        treated_y = block.loc[block['A'] == 1, 'Y'].to_numpy()
        control_y = block.loc[block['A'] == 0, 'Y'].to_numpy()

        if treated_y.size == 0 or control_y.size == 0:
            # Same exception type as the original bare division, but with
            # enough context to see which stratum lacks one of the groups.
            raise ZeroDivisionError(
                "stratum %d has %d rows with A == 1 and %d rows with A == 0; "
                "both groups must be non-empty to compute the mean difference"
                % (j, treated_y.size, control_y.size)
            )

        # With A fixed to 1 (resp. 0) inside each group, sum(A * Y) / N1j and
        # sum((1 - A) * Y) / N0j reduce to plain group means of Y.
        # ndarray.mean propagates NaN, as the original builtin sum did.
        weight = block.shape[0] / total_rows
        ATE = ATE + weight * (treated_y.mean() - control_y.mean())
    return ATE

In [3]:
# Load the two datasets analysed in this notebook (paths are relative to the
# notebook's working directory).
lowDim_data = pd.read_csv('../data/lowDim_dataset.csv')
highDim_data = pd.read_csv('../data/highDim_dataset.csv')

In [4]:
# Low-dimensional data: tune an AdaBoost classifier that predicts column A
# from the columns at position 2 onward.
A = lowDim_data['A'].values
Y = lowDim_data['Y'].values
X = lowDim_data.iloc[:, 2:].values

# Hyper-parameter grid searched with 5-fold cross-validation.
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'n_estimators': [50, 100, 150],
}
gscv = GridSearchCV(AdaBoostClassifier(), params, cv=5)
gscv.fit(X, A)
gscv.best_params_

{'learning_rate': 0.01, 'n_estimators': 50}

In [5]:
# Refit AdaBoost on the low-dimensional data. The hyper-parameters are
# hard-coded to the values the recorded grid-search output reported.
gbm = AdaBoostClassifier(n_estimators=50, learning_rate=0.01)
gbm.fit(X, A)

# Column 1 of predict_proba is the predicted probability of the second class.
# The scores are then replaced by their ranks; method='first' breaks ties by
# order of appearance, so every rank is distinct (presumably so that pd.qcut
# inside Stratification gets unique bin edges).
ps_low = pd.Series(gbm.predict_proba(X)[:, 1]).rank(method='first').to_numpy()

In [6]:
# Stratified estimate on the low-dimensional data, using the ranked scores.
Stratification(data=lowDim_data, ps=ps_low)

4.5606235232602526

In [7]:
# High-dimensional data: tune an AdaBoost classifier that predicts column A
# from the columns at position 2 onward. X, A and Y are rebound here, so they
# no longer refer to the low-dimensional data after this cell runs.
A = highDim_data['A'].values
Y = highDim_data['Y'].values
X = highDim_data.iloc[:, 2:].values

# Hyper-parameter grid searched with 5-fold cross-validation.
params = {
    'learning_rate': [0.01, 0.05, 0.1, 0.5],
    'n_estimators': [50, 100, 150],
}
gscv = GridSearchCV(AdaBoostClassifier(), params, cv=5)
gscv.fit(X, A)
gscv.best_params_

{'learning_rate': 0.1, 'n_estimators': 150}

In [8]:
# Refit AdaBoost on the high-dimensional data. The hyper-parameters are
# hard-coded to the values the recorded grid-search output reported.
gbm = AdaBoostClassifier(n_estimators=150, learning_rate=0.1)
gbm.fit(X, A)

# Column 1 of predict_proba is the predicted probability of the second class.
# The scores are then replaced by their ranks; method='first' breaks ties by
# order of appearance, so every rank is distinct (presumably so that pd.qcut
# inside Stratification gets unique bin edges).
ps_high = pd.Series(gbm.predict_proba(X)[:, 1]).rank(method='first').to_numpy()

In [9]:
# Stratified estimate on the high-dimensional data, using the ranked scores.
Stratification(data=highDim_data, ps=ps_high)

-54.47440250301423

In [10]:
# High Dim: logistic-regression scores for column A under three penalties.
# Each ps_high_* array holds column 1 of predict_proba on the training data.
X_H = highDim_data.iloc[:, 2:]
T_H = highDim_data['A'].to_numpy(copy=True)

# none penalty
# NOTE(review): the string 'none' was deprecated in scikit-learn 1.2 in favour
# of penalty=None - confirm the installed version before changing it.
lgr_H_none = LogisticRegression(penalty='none', max_iter=10000)
lgr_H_none.fit(X_H, T_H)
ps_high_none = lgr_H_none.predict_proba(X_H)[:, 1]

# OVR + L1 penalty (liblinear, since the default solver does not support L1)
lgr_H_l1 = LogisticRegression(solver='liblinear', penalty='l1', max_iter=10000)
lgr_H_l1.fit(X_H, T_H)
ps_high_l1 = lgr_H_l1.predict_proba(X_H)[:, 1]

# l2 penalty
lgr_H_l2 = LogisticRegression(penalty='l2', max_iter=10000)
lgr_H_l2.fit(X_H, T_H)
ps_high_l2 = lgr_H_l2.predict_proba(X_H)[:, 1]

In [11]:
# Low Dim: logistic-regression scores for column A under three penalties.
# Each ps_low_* array holds column 1 of predict_proba on the training data.
X_L = lowDim_data.iloc[:, 2:]
T_L = lowDim_data['A'].to_numpy(copy=True)

# none penalty
# NOTE(review): the string 'none' was deprecated in scikit-learn 1.2 in favour
# of penalty=None - confirm the installed version before changing it.
lgr_L_none = LogisticRegression(penalty='none', max_iter=10000)
lgr_L_none.fit(X_L, T_L)
ps_low_none = lgr_L_none.predict_proba(X_L)[:, 1]

# L1 penalty (liblinear, since the default solver does not support L1)
lgr_L_l1 = LogisticRegression(solver='liblinear', penalty='l1', max_iter=10000)
lgr_L_l1.fit(X_L, T_L)
ps_low_l1 = lgr_L_l1.predict_proba(X_L)[:, 1]

# l2 penalty
lgr_L_l2 = LogisticRegression(penalty='l2', max_iter=10000)
lgr_L_l2.fit(X_L, T_L)
ps_low_l2 = lgr_L_l2.predict_proba(X_L)[:, 1]