In [None]:
%reset

In [None]:

import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestRegressor


In [None]:
# Switch to the directory holding the .npz datasets so the relative file names
# passed to np.load further down resolve.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir('/home/adam/adam/cfrnet/data/')
os.getcwd()

In [None]:
def calculate_r2(y_true, y_pred):
    """Return the coefficient of determination, R^2 = 1 - RSS / TSS.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        1 minus the ratio of the residual sum of squares to the total sum of
        squares of ``y_true`` around its mean.

    Notes
    -----
    When both inputs hold the same number of elements they are flattened before
    subtracting. Without this, a column vector of shape (n, 1) minus a 1-D
    vector of shape (n,) broadcasts to an (n, n) matrix and the result is
    silently wrong. Inputs of different sizes keep NumPy's usual broadcasting.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    rss = np.sum((y_true - y_pred)**2)
    tss = np.sum((y_true - np.mean(y_true))**2)
    return 1 - np.true_divide(rss, tss)

def calculate_rmse(y_true, y_pred):
    """Return the root-mean-square error between ``y_true`` and ``y_pred``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of the mean squared difference.

    Notes
    -----
    When both inputs hold the same number of elements they are flattened first.
    Otherwise a (n, 1) column minus a (n,) vector broadcasts to an (n, n)
    matrix and the mean is taken over all pairs instead of matching rows.
    Inputs of different sizes keep NumPy's usual broadcasting.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()

    return np.sqrt(np.mean((y_true - y_pred)**2))

def ipw_weights(t, X, clip=0.1):
    """Compute inverse-probability-of-treatment weights from a logistic model.

    A logistic regression of ``t`` on ``X`` gives each row's estimated
    probability of being treated. Each row's weight is the inverse of the
    estimated probability of the treatment status it actually has.

    Parameters
    ----------
    t : array-like
        Treatment indicator, one entry per row of ``X``. Coerced to a flat
        boolean mask, so 0/1 integers or floats work as well as booleans.
    X : ndarray of shape (n_samples, n_features)
        Covariates used to fit the propensity model.
    clip : float, default 0.1
        Lower bound applied to the probabilities before inverting, which caps
        the weights at ``1 / clip``.

    Returns
    -------
    ndarray of shape (n_samples,)
        The weights.
    """
    # `~t` is a logical NOT only for boolean arrays; on integers it is a bitwise
    # NOT that would index the wrong rows, so build an explicit mask.
    treated = np.asarray(t).ravel().astype(bool)

    # NOTE(review): penalty='none' is deprecated in recent scikit-learn releases
    # in favor of penalty=None — confirm against the installed version.
    # NOTE(review): class_weight='balanced' reweights the classes during fitting,
    # which presumably shifts the estimated probabilities — confirm this is intended.
    clf = LogisticRegression(random_state=1234,
                             class_weight='balanced',
                             penalty='none',
                             max_iter=2000,
                             n_jobs=-1,
                             solver='newton-cg').fit(X, treated)

    propensity = clf.predict_proba(X)[:, 1]
    # Probability of the treatment status each row actually has.
    prob_received = np.where(treated, propensity, 1 - propensity)
    prob_received = np.maximum(prob_received, clip)  # clipping

    return 1 / prob_received

# Propensity-score edges shared by training and prediction: four strata.
_PSCORE_BINS = (0, 0.4, 0.6, 0.75, 1)


def _split_by_propensity(y, X, pscore, bins):
    """Partition the rows of ``y`` and ``X`` into consecutive propensity strata.

    Stratum k holds the rows with ``bins[k] < pscore <= bins[k + 1]``. The
    first stratum also includes its lower edge so that no row is dropped.
    """
    y_strata, X_strata = [], []
    for k, (low, high) in enumerate(zip(bins[:-1], bins[1:])):
        above_low = (pscore >= low) if k == 0 else (pscore > low)
        in_stratum = above_low & (pscore <= high)
        y_strata.append(y[in_stratum])
        X_strata.append(X[in_stratum])
    return y_strata, X_strata


def stratify(y, t, X):
    """Fit a propensity model and split the training data into four strata.

    Parameters
    ----------
    y : ndarray
        Outcomes, one row per sample.
    t : ndarray of shape (n_samples, 1)
        Treatment indicator as a column; it is prepended to ``X`` with
        ``np.hstack`` so each stratum's design matrix is ``[t, X]``.
    X : ndarray of shape (n_samples, n_features)
        Covariates used to fit the propensity model.

    Returns
    -------
    (list, list, LogisticRegression)
        Per-stratum outcomes, per-stratum ``[t, X]`` matrices, and the fitted
        propensity model (reused by ``stratify_predict``).

    Notes
    -----
    The strata are disjoint. Previously the first stratum used the fourth edge
    (0.75) as its upper bound and therefore also contained every row of the
    second and third strata.
    """
    # NOTE(review): penalty='none' is deprecated in recent scikit-learn releases
    # in favor of penalty=None — confirm against the installed version.
    model = LogisticRegression(random_state=1234,
                               class_weight='balanced',
                               penalty='none',
                               max_iter=2000,
                               n_jobs=-1,
                               solver='newton-cg').fit(X, np.ravel(t))

    pscore = model.predict_proba(X)[:, 1]
    design = np.hstack((t, X))
    y_strata, X_strata = _split_by_propensity(y, design, pscore, _PSCORE_BINS)

    return y_strata, X_strata, model


def stratify_predict(y, t, X, model):
    """Split new data into the same propensity strata using a fitted model.

    Parameters
    ----------
    y : ndarray
        Outcomes, one row per sample.
    t : ndarray of shape (n_samples, 1)
        Treatment indicator as a column; prepended to ``X``.
    X : ndarray of shape (n_samples, n_features)
        Covariates passed to ``model.predict_proba``.
    model : object with ``predict_proba``
        The propensity model returned by ``stratify``.

    Returns
    -------
    (list, list, list)
        Per-stratum outcomes, per-stratum ``[t, X]`` matrices, and the number
        of rows in each stratum.
    """
    pscore = model.predict_proba(X)[:, 1]
    design = np.hstack((t, X))
    y_strata, X_strata = _split_by_propensity(y, design, pscore, _PSCORE_BINS)
    sizes = [len(y_k) for y_k in y_strata]

    return y_strata, X_strata, sizes

def _percentile(values, q, method):
    """Call np.percentile with the estimation method under either keyword name."""
    try:
        return np.percentile(values, q=q, method=method)
    except TypeError:
        # Older NumPy releases only know this keyword as `interpolation`.
        return np.percentile(values, q=q, interpolation=method)


def results_summary(df_results):
    """Summarize per-experiment results as mean and a 95% percentile interval.

    Parameters
    ----------
    df_results : pandas.DataFrame
        Must contain the columns 'ate', 'rmse' and 'r2', one row per experiment.

    Returns
    -------
    pandas.DataFrame
        One row per metric ('ate', 'rmse', 'r2') and three columns (0, 1, 2):
        the mean, the 2.5th percentile (rounded up to an observed value) and
        the 97.5th percentile (rounded down to an observed value). All values
        are rounded to two decimals.
    """
    summary = {}
    for metric in ('ate', 'rmse', 'r2'):
        values = df_results[metric]
        summary[metric] = [np.mean(values),
                           _percentile(values, 2.5, 'higher'),
                           _percentile(values, 97.5, 'lower')]

    return pd.DataFrame(data=summary).T.round(2)

In [None]:
# Open the train and test archives and list the array names stored in each.
train_data = np.load('bfpguerin_12_24.train.npz')
print(train_data.files)
test_data = np.load('bfpguerin_12_24.test.npz')
print(test_data.files)

In [None]:
# Pull the 'yf', 't' and 'x' arrays out of each archive (passed further down as
# the outcome, treatment and covariate arguments of run_experiment_ols).
y_train, t_train, x_train = (train_data[key] for key in ('yf', 't', 'x'))
y_test, t_test, x_test = (test_data[key] for key in ('yf', 't', 'x'))

In [None]:
# Report array shapes, train before test, for y, then t, then x.
for train_arr, test_arr in ((y_train, y_test), (t_train, t_test), (x_train, x_test)):
    print(train_arr.shape)
    print(test_arr.shape)


In [None]:
from sklearn.metrics import r2_score

def run_experiment_ols(y_train,
                       t_train,
                       X_train,
                       y_test,
                       t_test,
                       X_test,
                       n_of_experiments,
                       method):
    """Fit and evaluate one baseline estimator on each experiment realization.

    For every index ``i`` in ``range(n_of_experiments)`` the column ``[:, i]``
    of the outcome/treatment arrays and the slice ``[:, :, i]`` of the covariate
    arrays are taken; a model is fitted on the training slice and scored on the
    test slice.

    Parameters
    ----------
    y_train, y_test : ndarray, indexed as [:, i]
        Outcomes.
    t_train, t_test : ndarray, indexed as [:, i]
        Treatment indicator. Wherever it is used to select rows it is first
        coerced to a boolean mask.
    X_train, X_test : ndarray, indexed as [:, :, i]
        Covariates.
    n_of_experiments : int
        Number of realizations to process.
    method : str
        'ols'      : OLS of y on [const, t, X]; ATE is the coefficient of t.
        'ipw'      : same design, WLS with weights from ``ipw_weights``.
        '2-ols'    : separate OLS for treated and control rows; ATE is the mean
                     difference of the two models' test predictions.
        'stratify' : one OLS per propensity stratum; ATE, RMSE and R2 are
                     averaged over strata weighted by test-stratum size.
        'rf'       : random forest on [t, X]; ATE is the mean of
                     predict(t=1) - predict(t=0) on the test rows.
        'raw'      : no model, training data only. 'ate' is the difference in
                     mean outcome; the 'rmse' column holds the difference in
                     medians and the 'r2' column the 95th percentile of y.

    Returns
    -------
    pandas.DataFrame
        Columns 'ate', 'rmse', 'r2', one row per experiment.

    Raises
    ------
    ValueError
        If ``method`` is not one of the names above (previously an unknown
        name silently produced an empty frame).
    """
    if method not in ('ols', 'ipw', '2-ols', 'stratify', 'rf', 'raw'):
        raise ValueError("Unknown method: %r" % (method,))

    ate_list = []
    rmse_list = []
    r2_list = []
    r2_training = []

    for i in range(n_of_experiments):

        ### TRAIN ###
        y, t, X = y_train[:, i], t_train[:, i], X_train[:, :, i]
        y, t = y.reshape(len(y), 1), t.reshape(len(t), 1)
        # `y[t]` / `~t` select by treatment status only when t is boolean.
        treated = t.ravel().astype(bool)

        ### PROPENSITY ESTIMATION ###
        # Original author's note: the paper at https://arxiv.org/pdf/1804.05146.pdf
        # estimates the propensity on the test set; so far the training data
        # is used here.

        # has_constant='add' always prepends the intercept. The default 'skip'
        # omits it when the data already contains a constant column, which
        # would make params[1] no longer the treatment coefficient and can make
        # the train and test design matrices differ in width.
        if method == 'ols':
            design = np.hstack((t, X))
            model = sm.OLS(y, sm.add_constant(design, has_constant='add')).fit()
            r2_training.append(model.rsquared)

        elif method == 'ipw':
            weights = ipw_weights(treated, X)
            design = np.hstack((t, X))
            model = sm.WLS(y, sm.add_constant(design, has_constant='add'),
                           weights=weights).fit()
            print("Model fitted")
            r2_training.append(model.rsquared)

        elif method == '2-ols':
            model_treated = sm.OLS(
                y[treated], sm.add_constant(X[treated, :], has_constant='add')).fit()
            model_control = sm.OLS(
                y[~treated], sm.add_constant(X[~treated, :], has_constant='add')).fit()
            r2_training.append([model_treated.rsquared, model_control.rsquared])

        elif method == 'stratify':
            y_list, X_list, pmodel = stratify(y, t, X)
            # A stratum without training rows gets no model (None) instead of
            # crashing the fit; it is skipped at evaluation time.
            model_list = [
                sm.OLS(y_k, sm.add_constant(X_k, has_constant='add')).fit()
                if len(y_k) > 0 else None
                for y_k, X_k in zip(y_list, X_list)
            ]

        elif method == 'rf':
            # 1-D target avoids scikit-learn's column-vector conversion warning.
            model = RandomForestRegressor(max_depth=2, random_state=0).fit(
                np.hstack((t, X)), y.ravel())

        elif method == 'raw':
            # The result columns are reused for other statistics here.
            ate_list.append(np.mean(y[treated]) - np.mean(y[~treated]))
            rmse_list.append(np.median(y[treated]) - np.median(y[~treated]))
            r2_list.append(np.percentile(y, q=95))

        ### TEST ###
        y, t, X = y_test[:, i], t_test[:, i], X_test[:, :, i]
        y, t = y.reshape(len(y), 1), t.reshape(len(t), 1)
        treated = t.ravel().astype(bool)
        # Predictions are 1-D. Comparing them with the (n, 1) column would
        # broadcast to an (n, n) matrix inside calculate_rmse, so the metrics
        # use a flat copy of y.
        y_flat = y.ravel()

        if method in ('ols', 'ipw'):
            design = np.hstack((t, X))
            y_pred = model.predict(sm.add_constant(design, has_constant='add'))

            ate_list.append(model.params[1])
            rmse_list.append(calculate_rmse(y_pred=y_pred, y_true=y_flat))
            r2_list.append(r2_score(y_flat, y_pred))

        elif method == '2-ols':
            exog = sm.add_constant(X, has_constant='add')
            y_treated = model_treated.predict(exog)
            y_control = model_control.predict(exog)
            ate_list.append(np.mean(y_treated - y_control))

            # Factual prediction: each row uses the model of its own arm.
            y_pred = np.where(treated, y_treated, y_control)
            rmse_list.append(calculate_rmse(y_pred=y_pred, y_true=y_flat))
            r2_list.append(r2_score(y_flat, y_pred))

        elif method == 'rf':
            ite = (model.predict(np.hstack((np.ones(t.shape), X)))
                   - model.predict(np.hstack((np.zeros(t.shape), X))))
            ate_list.append(np.mean(ite))

            y_pred = model.predict(np.hstack((t, X)))
            rmse_list.append(calculate_rmse(y_pred=y_pred, y_true=y_flat))
            r2_list.append(r2_score(y_flat, y_pred))

        elif method == 'stratify':
            y_list, X_list, n_list = stratify_predict(y, t, X, pmodel)

            ate, rmse, r2, n_used = [], [], [], []
            for k, stratum_model in enumerate(model_list):
                # Skip strata that have no model or no test rows.
                if stratum_model is None or n_list[k] == 0:
                    continue
                y_k = y_list[k].ravel()
                y_pred_k = stratum_model.predict(
                    sm.add_constant(X_list[k], has_constant='add'))
                ate.append(stratum_model.params[1])
                rmse.append(calculate_rmse(y_pred=y_pred_k, y_true=y_k))
                r2.append(r2_score(y_k, y_pred_k))
                n_used.append(n_list[k])

            print(ate, n_list)
            if n_used:
                ate_list.append(np.average(ate, weights=n_used))
                rmse_list.append(np.average(rmse, weights=n_used))
                r2_list.append(np.average(r2, weights=n_used))
            else:
                # No stratum could be evaluated for this realization.
                ate_list.append(np.nan)
                rmse_list.append(np.nan)
                r2_list.append(np.nan)

    if method in ('ols', 'ipw'):
        print("Mean training R2", np.mean(r2_training))
    if method == '2-ols':
        print("Mean training R2 (treated, control)",
              np.mean([item[0] for item in r2_training]),
              np.mean([item[1] for item in r2_training]))

    results = {'ate': ate_list, 'rmse': rmse_list, 'r2': r2_list}
    df_results = pd.DataFrame(data=results)

    return df_results

In [None]:
# Number of realizations to process; run_experiment_ols indexes the last axis of
# every array with range(n_of_experiments).
# NOTE(review): presumably matches the number of realizations in the .npz files — confirm.
n_of_experiments = 100

In [None]:
# From here on, relative file names (CSV results, plots) resolve inside the
# results directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir('/home/adam/adam/data/results/')
os.getcwd()

#### RAW

In [None]:
# Unadjusted baseline, computed from the training data only. For method='raw'
# the 'rmse' column holds the treated-minus-control difference in medians and
# the 'r2' column the 95th percentile of the outcome, not an RMSE or R2.
results_raw = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                             t_train=train_data['t'], t_test=test_data['t'],
                             X_train=train_data['x'], X_test=test_data['x'],
                             n_of_experiments=n_of_experiments,
                             method='raw')

summary_raw = results_summary(results_raw)
print(summary_raw)

#### OLS

In [None]:
# Single OLS regression of the outcome on [const, t, X] for every realization.
results_ols = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                             t_train=train_data['t'], t_test=test_data['t'],
                             X_train=train_data['x'], X_test=test_data['x'],
                             n_of_experiments=n_of_experiments,
                             method='ols')


In [None]:
# Persist the per-experiment results without a header row. fmt='%1.2f' rounds
# every value to two decimals, so the file is less precise than results_ols.
np.savetxt("results_ols_pf_ratio_12h_24h_manual_outcome.csv", results_ols, delimiter=",", fmt='%1.2f')

In [None]:
# Reload the saved (two-decimal) results; the file has no header row, so the
# column names are restored by hand.
results_ols = pd.read_csv('results_ols_pf_ratio_12h_24h_manual_outcome.csv', header=None)
results_ols.columns = ['ate', 'rmse', 'r2']

In [None]:
# Mean and 2.5 / 97.5 percentiles of each metric across realizations, printed
# and written to CSV (values only, no row or column labels).
summary_ols = results_summary(results_ols)
print(summary_ols)
np.savetxt("summary_ols_pf_ratio_12h_24h_manual_outcome.csv", summary_ols, delimiter=",", fmt='%1.2f')

#### IPW

In [None]:
# Weighted least squares with inverse-probability weights from ipw_weights.
results_ipw = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                                 t_train=train_data['t'], t_test=test_data['t'],
                                 X_train=train_data['x'], X_test=test_data['x'],
                                 n_of_experiments=n_of_experiments,
                                 method='ipw')

# Saved without a header row; fmt='%1.2f' rounds values to two decimals.
np.savetxt("results_ipw_pf_ratio_12h_24h_manual_outcome.csv", results_ipw, delimiter=",", fmt='%1.2f')

In [None]:
# Reload the saved (two-decimal) results and restore the column names.
results_ipw = pd.read_csv('results_ipw_pf_ratio_12h_24h_manual_outcome.csv', header=None)
results_ipw.columns = ['ate', 'rmse', 'r2']

In [None]:
# Summarize the IPW results across realizations, print and save them.
summary_ipw = results_summary(results_ipw)
print(summary_ipw)
np.savetxt("summary_ipw_pf_ratio_12h_24h_manual_outcome.csv", summary_ipw, delimiter=",", fmt='%1.2f')

#### Stratification

In [None]:
# One OLS per propensity-score stratum; per-stratum metrics are averaged with
# the test-stratum sizes as weights.
results_stratify = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                             t_train=train_data['t'], t_test=test_data['t'],
                             X_train=train_data['x'], X_test=test_data['x'],
                             n_of_experiments=n_of_experiments,
                             method='stratify')

# Saved without a header row; fmt='%1.2f' rounds values to two decimals.
np.savetxt("results_stratify_pf_ratio_12h_24h_manual_outcome.csv", results_stratify, delimiter=",", fmt='%1.2f')

In [None]:
# Reload the saved (two-decimal) results and restore the column names.
results_stratify = pd.read_csv('results_stratify_pf_ratio_12h_24h_manual_outcome.csv', header=None)
results_stratify.columns = ['ate', 'rmse', 'r2']

In [None]:
# Summarize the stratification results across realizations, print and save them.
summary_stratify = results_summary(results_stratify)
print(summary_stratify)
np.savetxt("summary_stratify_pf_ratio_12h_24h_manual_outcome.csv", summary_stratify, delimiter=",", fmt='%1.2f')

#### 2-OLS

In [None]:
# Separate OLS models for treated and control rows.
results_2ols = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                             t_train=train_data['t'], t_test=test_data['t'],
                             X_train=train_data['x'], X_test=test_data['x'],
                             n_of_experiments=n_of_experiments,
                             method='2-ols')

# Saved without a header row; fmt='%1.2f' rounds values to two decimals.
np.savetxt("results_2ols_pf_ratio_12h_24h_manual_outcome.csv", results_2ols, delimiter=",", fmt='%1.2f')

In [None]:
# Unlike the OLS / IPW / stratification sections, the results are not reloaded
# from CSV first, so this summary is computed from the unrounded in-memory values.
summary_2ols = results_summary(results_2ols)
print(summary_2ols)
np.savetxt("summary_2ols_pf_ratio_12h_24h_manual_outcome.csv", summary_2ols, delimiter=",", fmt='%1.2f')

#### RF

In [None]:
# Random forest on [t, X]; the ATE is the mean difference between predictions
# with the treatment column set to one and to zero.
results_rf = run_experiment_ols(y_train=train_data['yf'], y_test=test_data['yf'],
                             t_train=train_data['t'], t_test=test_data['t'],
                             X_train=train_data['x'], X_test=test_data['x'],
                             n_of_experiments=n_of_experiments,
                             method='rf')

# Saved without a header row; fmt='%1.2f' rounds values to two decimals.
np.savetxt("results_rf_pf_ratio_12h_24h_manual_outcome.csv", results_rf, delimiter=",", fmt='%1.2f')

In [None]:
# Summary computed from the unrounded in-memory results (no CSV reload here).
summary_rf = results_summary(results_rf)
print(summary_rf)
np.savetxt("summary_rf_pf_ratio_12h_24h_manual_outcome.csv", summary_rf, delimiter=",", fmt='%1.2f')

## Saving the propensity-score plot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def save_propensity_plot(t, X, path, experiment=0):
    """Plot the estimated propensity-score distribution by group and save it.

    A logistic regression of treatment on the covariates is fitted for one
    realization, and the predicted treatment probabilities of treated and
    control rows are drawn as overlaid histograms.

    Parameters
    ----------
    t : ndarray, indexed as [:, experiment]
        Treatment indicator; coerced to a boolean mask.
    X : ndarray, indexed as [:, :, experiment]
        Covariates.
    path : str
        File name passed to ``plt.savefig``.
    experiment : int, default 0
        Which realization to plot.
    """
    # A boolean mask makes `pscore[mask]` / `pscore[~mask]` select rows by
    # treatment status regardless of how the indicator is stored.
    treated = np.asarray(t[:, experiment]).ravel().astype(bool)
    covariates = X[:, :, experiment]

    # NOTE(review): penalty='none' is deprecated in recent scikit-learn releases
    # in favor of penalty=None — confirm against the installed version.
    pscore = LogisticRegression(random_state=1234,
                                class_weight='balanced',
                                penalty='none',
                                max_iter=10000).fit(covariates, treated).predict_proba(covariates)[:, 1]

    # Label the groups with strings from the start; writing strings into a
    # numeric column afterwards is a dtype-incompatible assignment in pandas.
    df_treated = pd.DataFrame({'Propensity_score': pscore[treated], 'Group': 'Treated'})
    df_control = pd.DataFrame({'Propensity_score': pscore[~treated], 'Group': 'Control'})
    df_plot = pd.concat([df_treated, df_control])

    sns.displot(df_plot, x="Propensity_score", hue="Group", stat="probability")
    plt.savefig(path)

In [None]:
# Relative file name: the image is written to the current working directory.
path = 'pscore_12_24_outcome.png'
save_propensity_plot(t=train_data['t'], X=train_data['x'], path=path)


## Additional

In [None]:
import numpy as np

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

import xgboost as xgb

import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
# Gradient-boosted regression baseline.
# NOTE(review): X_train and X_test are not defined in any visible cell (only the
# lowercase x_train / x_test are), and y_train / y_test as loaded earlier hold
# one column per realization. This cell presumably relied on variables from a
# deleted cell — confirm the intended inputs before running.
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

# Take the square root explicitly rather than passing squared=False, which
# recent scikit-learn releases deprecate and then remove.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(rmse)

In [None]:
from sklearn.metrics import r2_score

# R2 of the y_pred computed in the XGBoost cell; relies on that cell's
# y_test and y_pred.
coefficient_of_dermination = r2_score(y_test, y_pred)
print(coefficient_of_dermination)


### Saving results for CfR

In [None]:
# Same results directory as used for the baseline outputs earlier in the notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir('/home/adam/adam/data/results/')
os.getcwd()

In [None]:
# Read header-less result files for TARNet and CFR; no cell in this notebook
# writes them.
# NOTE(review): presumably produced by a separate training run — confirm.
ate_tarnet = pd.read_csv('results_tarnet_pf_ratio_12h_24h_manual_outcome.csv', header=None)
ate_cfr = pd.read_csv('results_cfr_pf_ratio_12h_24h_manual_outcome.csv', header=None)

In [None]:
# Give the frames the column names results_summary expects.
# NOTE(review): assumes the files store exactly three columns in this order — confirm.
ate_cfr.columns = ['ate', 'rmse', 'r2']
ate_tarnet.columns = ['ate', 'rmse', 'r2']

In [None]:
# Mean and 2.5 / 97.5 percentiles of each metric for CFR (displayed as cell output).
results_summary(ate_cfr)

In [None]:
# Mean and 2.5 / 97.5 percentiles of each metric for TARNet (displayed as cell output).
results_summary(ate_tarnet)