In [None]:
%reset

In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from importlib import reload

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
def process_data(df, outcome, thresh=0.6):
    """Keep a single outcome, drop sparse columns and one-hot encode the rest.

    The frame passed in is never modified; every step works on a new frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Every column whose name contains 'outcome' is treated as
        a candidate outcome column.
    outcome : str
        Outcome column to keep. Rows where it is missing are dropped.
    thresh : float, default 0.6
        Minimum fraction of non-missing values (counted after the row filter
        above) that a column needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        Encoded frame without the other outcome columns, without 'gender_M'
        and without dummy columns whose name contains 'False'.

    Raises
    ------
    ValueError
        If `outcome` is not a column of `df`.
    """
    if outcome not in df.columns:
        raise ValueError('outcome column {!r} not found in df'.format(outcome))

    # prepare outcomes: discard all other outcome columns, then rows without the chosen one.
    # drop()/dropna() return new frames, so the caller's frame stays intact.
    other_outcomes = [column
                      for column in df.filter(regex='outcome').columns
                      if column != outcome]
    df = df.drop(columns=other_outcomes)
    df = df.dropna(subset=[outcome])

    # keep only columns with at least `thresh` * n_rows non-missing values
    min_non_missing = round(thresh * len(df.index))
    df = df.dropna(thresh=min_non_missing, axis=1)

    # get dummies, then remove the redundant reference levels;
    # errors='ignore' tolerates frames that have no 'gender_M' column
    df = pd.get_dummies(df)
    columns_to_drop = ['gender_M'] + df.filter(regex='False').columns.to_list()
    df = df.drop(columns=columns_to_drop, errors='ignore')

    # convert to bool: pandas < 2.0 emits uint8 dummies; newer versions already
    # produce bool columns, in which case this loop finds nothing to convert
    for column in df.select_dtypes(include=['uint8']).columns.to_list():
        df[column] = df[column] == 1

    return df

def get_training_data(df, treatment_col, outcome_col):
    """Split an encoded frame into outcome, treatment and covariate arrays.

    float64 columns (except the outcome) are mean-imputed and standardised;
    uint8/bool columns (except the treatment) are appended unchanged. The
    column order of X is numeric columns first, then boolean columns, which
    matches the order returned by get_covariate_names.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded data; must contain `treatment_col` and `outcome_col`.
    treatment_col : str
        Name of the treatment indicator column.
    outcome_col : str
        Name of the outcome column.

    Returns
    -------
    y : numpy.ndarray
        Values of `outcome_col`.
    t : numpy.ndarray
        Values of `treatment_col`.
    X : numpy.ndarray
        Scaled numeric covariates stacked with the boolean covariates.
    """
    cols_num = df.select_dtypes(include=['float64']).columns.to_list()
    if outcome_col in cols_num:
        cols_num.remove(outcome_col)
    cols_bool = df.select_dtypes(include=['uint8', 'bool']).columns.to_list()
    if treatment_col in cols_bool:
        cols_bool.remove(treatment_col)

    # Read from the frame that was passed in (not from a notebook global), so
    # the column lists above and the arrays below always describe the same data.
    t = df.loc[:, treatment_col].values
    X_bool = df[cols_bool].values
    X_num = df[cols_num].values
    y = df.loc[:, outcome_col].values

    # Fill missing numeric values with the column mean, then standardise.
    X_num = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(X_num)
    X_num = StandardScaler().fit_transform(X_num)
    X = np.hstack((X_num, X_bool))

    return y, t, X

def get_covariate_names(df, treatment_col, outcome_col):
    """Return the covariate column names: float64 columns, then uint8/bool columns.

    The outcome column is left out of the numeric names and the treatment
    column is left out of the boolean names.
    """

    def _names_without(dtypes, excluded):
        # Column labels of the requested dtypes, minus `excluded` when present.
        names = df.select_dtypes(include=dtypes).columns.tolist()
        try:
            names.remove(excluded)
        except ValueError:
            pass
        return names

    numeric_names = _names_without(['float64'], outcome_col)
    boolean_names = _names_without(['uint8', 'bool'], treatment_col)
    return numeric_names + boolean_names

In [None]:
# Switch the working directory to the project checkout before importing from it
# (presumably so the `causal_inference` package resolves from the cwd — confirm).
# NOTE(review): absolute, machine-specific path; this cell fails on any other machine.
os.chdir('/home/adam/files/causal_inference')
# First import registers the module in sys.modules so it can be reloaded below.
from causal_inference.model.propensity_model import PropensityModel
# Re-execute the module source so edits made on disk are picked up without a kernel restart.
reload(sys.modules['causal_inference.model.propensity_model'])
# Re-import to rebind PropensityModel to the class object from the reloaded module.
from causal_inference.model.propensity_model import PropensityModel

In [None]:
os.chdir('/home/adam/files/data/13012020/')
df = pd.read_csv('data_guerin_rct.csv')
df.info(max_cols=200)

In [None]:
df = df[(df.pf_ratio > 0) & (df.fio2 > 60)]
df.info(max_cols=200)

In [None]:

df.loc[df.treated, 'pf_ratio'].describe()

In [None]:
df.loc[~df.treated, 'pf_ratio'].describe()

In [None]:
OUTCOME = 'pf_ratio_12h_outcome'

# Encode the filtered data, keeping OUTCOME as the only outcome column.
df_encoded = process_data(df=df, outcome=OUTCOME)
df_encoded.info()

# Both helpers take the same column roles; pass them from one place.
column_roles = dict(treatment_col='treated', outcome_col=OUTCOME)
y, t, X = get_training_data(df=df_encoded, **column_roles)
covariates = get_covariate_names(df=df_encoded, **column_roles)

## 3. Causal modelling

## a. All variables

In [None]:
# Build the model on the full covariate matrix X.
# NOTE(review): treatment_name is 'proned' while the treatment column is 'treated' —
# presumably only a display label; confirm against PropensityModel.
propensity_model_all = PropensityModel(outcome=y,
                                   treatment=t,
                                   covariates=X,
                                   outcome_name=OUTCOME,
                                   treatment_name='proned',
                                   covariates_name=covariates)
# Summary statistics of the sample as constructed (before estimating propensity or trimming).
propensity_model_all.summary_stats()


In [None]:
# Estimate the propensity score from all covariates and display it.
# NOTE(review): the meaning of method='balanced' is defined in PropensityModel — confirm.
propensity_model_all.est_propensity(X=X, t=t, method='balanced')
propensity_model_all.show_propensity()

In [None]:
# Mean absolute 'ndiff' over covariates before trimming
# (presumably causalinference's normalized difference in covariate means — confirm).
np.mean(abs(propensity_model_all.causal_model.summary_stats['ndiff']))

In [None]:
# Trim the sample; the same balance metric is recomputed in the next cell for comparison.
propensity_model_all.trim()

In [None]:
# Same metric as above, now after trimming.
np.mean(abs(propensity_model_all.causal_model.summary_stats['ndiff']))

In [None]:
# Manually specified propensity-score block boundaries.
# NOTE(review): the rationale for these cut points is not recorded in the notebook.
propensity_model_all.causal_model.blocks = [0, 0.45, 0.6, 0.74, 1]
propensity_model_all.access_balance(method='default')

In [None]:
propensity_model_all.est_treatment_effect()

In [None]:
# NOTE(review): raw_effect=9 and true_effect=15 are hard-coded reference values;
# their source is not shown in this notebook.
propensity_model_all.print_models(raw_effect=9, true_effect=15)

In [None]:
propensity_model_all.access_stability_via_matching()


## b. All with polynomial features

In [None]:
# Second model on the same outcome, treatment and covariates; its propensity
# score is fitted on the interaction-expanded matrix further down.
poly_model_inputs = dict(
    outcome=y,
    treatment=t,
    covariates=X,
    outcome_name=OUTCOME,
    treatment_name='proned',
    covariates_name=covariates,
)
propensity_model_all_poly = PropensityModel(**poly_model_inputs)
propensity_model_all_poly.summary_stats()

In [None]:
# Imbalanced covariates, taken from the model built in the previous cell (it has
# not been trimmed at this point). The name `propensity_model` used here before
# is not defined anywhere in the notebook.
imbalanced_cols = propensity_model_all_poly.get_imbalanced_covariates(thresh=0.14)

# Covariates deliberately left out of the reduced set. Filtering into a new list
# avoids mutating the list object returned by the model.
EXCLUDED_COVARIATES = ('gender_V', 'ph', 'pco2', 'po2')
propensity_cols = [col for col in imbalanced_cols if col not in EXCLUDED_COVARIATES]

# get_training_data / get_covariate_names look up the treatment and outcome
# columns in the frame they receive, so the subset frame has to carry them too.
# Both helpers exclude these two columns from the covariates again.
role_cols = [col for col in ('treated', OUTCOME) if col not in propensity_cols]
df_subset = df_encoded[propensity_cols + role_cols]

y_subset, t_subset, X_subset = get_training_data(df=df_subset,
                                                 treatment_col='treated',
                                                 outcome_col=OUTCOME)

# Stored under its own name so the full `covariates` list used by the models
# above is not overwritten when cells are re-run.
covariates_subset = get_covariate_names(df=df_subset,
                                        treatment_col='treated',
                                        outcome_col=OUTCOME)


print(covariates_subset)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with interaction terms only: no bias column and no pure powers.
# The degree is left at the PolynomialFeatures default (2 per the scikit-learn docs).
# The earlier full degree-2 fit was removed: its output was discarded and `poly`
# was immediately overwritten, so it only cost time and memory.
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
X_poly = poly.fit_transform(X)

In [None]:
propensity_model_all_poly.est_propensity(X=X_poly, t=t, method='balanced')
#propensity_model.est_propensity(X=X_subset, t=t_subset, method='balanced')
propensity_model_all_poly.show_propensity()

In [None]:
np.mean(abs(propensity_model_all_poly.causal_model.summary_stats['ndiff']))

In [None]:
propensity_model_all_poly.trim()

In [None]:
np.mean(abs(propensity_model_all_poly.causal_model.summary_stats['ndiff']))

In [None]:
propensity_model_all_poly.causal_model.blocks = [0, 0.5, 0.65, 0.8, 1]
propensity_model_all_poly.access_balance(method='default')

In [None]:
propensity_model_all_poly.est_treatment_effect()

In [None]:
propensity_model_all_poly.print_models(raw_effect=9, true_effect=15)

In [None]:
propensity_model.access_stability_via_matching()

In [None]:
propensity_model.access_stability_via_ols()

In [None]:
n_of_control = []
n_of_treated = []
p_bin = []
effect = []

for stratum in propensity_model.causal_model.strata:
    p_min = stratum.summary_stats['p_min'].round(2)
    p_max = stratum.summary_stats['p_max'].round(2)
    index = '[{},{}]'.format(p_min, p_max)
    p_bin.append(index)

    stratum.est_via_matching(bias_adj=True)
    ate = stratum.estimates['matching']['ate']
    effect.append(ate)

    n_of_control.append(stratum.summary_stats['N_c'])
    n_of_treated.append(stratum.summary_stats['N_t'])

df = pd.DataFrame({'n_of_treated': n_of_treated,
                   'n_of_control': n_of_control,
                   'ate': effect}, index=p_bin)

In [None]:
ax =  df.plot.bar(y=['n_of_control', 'n_of_treated'], ylabel='Frequency', figsize=(10, 5))
df.plot(y='ate', c='k', ax=ax, use_index=False, secondary_y=True, mark_right=False)
ax.right_ax.set_ylabel('ATE')

In [None]:
# https://seaborn.pydata.org/examples/pairgrid_dotplot.html


In [None]:
df_balance = pd.DataFrame([], columns=['raw_diff', 'all', 'all_poly', 'subset', 'subset_poly'])
df_balance.loc[:,'raw_diff'] = propensity_model.causal_model.summary_stats['ndiff']
df_balance = df_balance.abs().round(2)

In [None]:
COLUMN_NAME = 'all'
df_balance[COLUMN_NAME] = 0

In [None]:
df_strat_balance = pd.DataFrame([])
weights = []
n_of_strata = 0

for stratum in propensity_model.causal_model.strata:
    n_of_strata += 1
    column = 'strata_{}'.format(n_of_strata)
    df_strat_balance[column] = stratum.summary_stats['ndiff']
    weights.append(stratum.summary_stats['N'])
df_strat_balance = df_strat_balance.abs()


for row, column in df.iterrows():
    print(row)
    df_balance.loc[row, COLUMN_NAME] = np.average(column, weights=np.asarray(weights)).round(2)

In [None]:
df_balance.loc['mean'] = df.mean()
df_balance.index = propensity_model.covariates_name

In [None]:
df_balance


In [None]:
# What if I train using an XGBoost?


def compare_balance()