In [None]:
%reset

In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from importlib import reload

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
def process_data(df, outcome, thresh=0.6):
    """Prepare the raw frame for modelling without modifying the input.

    Steps, in order:
      1. drop every column whose name matches ``'outcome'`` except `outcome`,
      2. drop rows where `outcome` is missing,
      3. drop columns with fewer than ``round(thresh * n_rows)`` non-missing
         values,
      4. one-hot encode with ``pd.get_dummies`` and drop the redundant
         ``gender_M`` / ``*False*`` dummy columns,
      5. convert ``uint8`` columns to boolean.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is NOT mutated; a processed copy is returned.
    outcome : str
        Name of the outcome column to keep.
    thresh : float, default 0.6
        Minimum fraction of non-missing values a column needs to be kept.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Raises
    ------
    ValueError
        If `outcome` is not a column of `df`.
    """
    if outcome not in df.columns:
        raise ValueError("outcome column {!r} not found in df".format(outcome))

    # prepare outcomes: keep only the requested one and the rows where it is known
    other_outcomes = [col for col in df.filter(regex='outcome').columns
                      if col != outcome]
    data = df.drop(columns=other_outcomes).dropna(subset=[outcome])

    # drop columns with missing values exceeding the thresh
    # (separate name so the `thresh` fraction is not silently turned into a count)
    min_non_missing = round(thresh * len(data.index))
    data = data.dropna(thresh=min_non_missing, axis=1)

    # get dummies, then remove the redundant half of each binary pair
    data = pd.get_dummies(data)
    columns_to_drop = data.filter(regex='False').columns.to_list()
    if 'gender_M' in data.columns:
        # the gender column may be absent or may have been dropped in step 3
        columns_to_drop.append('gender_M')
    data = data.drop(columns=columns_to_drop)

    # convert to bool (older pandas emits uint8 dummies, newer emits bool already)
    for column in data.select_dtypes(include=['uint8']).columns.to_list():
        data[column] = data[column] == 1

    return data

def get_training_data(df, treatment_col, outcome_col):
    """Build the outcome vector, treatment vector and covariate matrix from `df`.

    Covariates are the ``float64`` columns (outcome excluded) followed by the
    ``uint8``/``bool`` columns (treatment excluded). Numeric covariates are
    mean-imputed and standardised; boolean covariates are passed through as
    0/1 floats.

    All values are read from the `df` argument. `df` must therefore contain
    `treatment_col` and `outcome_col` in addition to the covariates.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded data containing covariates, treatment and outcome.
    treatment_col : str
        Name of the treatment column.
    outcome_col : str
        Name of the outcome column.

    Returns
    -------
    y : numpy.ndarray
        Values of `outcome_col`.
    t : numpy.ndarray
        Values of `treatment_col`.
    X : numpy.ndarray
        Covariate matrix, numeric columns first, then boolean columns.

    Raises
    ------
    KeyError
        If `treatment_col` or `outcome_col` is not a column of `df`.
    """
    missing = [col for col in (treatment_col, outcome_col)
               if col not in df.columns]
    if missing:
        raise KeyError(
            'get_training_data: column(s) {} not found in df; pass a frame '
            'that includes the treatment and outcome columns'.format(missing))

    cols_num = df.select_dtypes(include=['float64']).columns.to_list()
    if outcome_col in cols_num:
        cols_num.remove(outcome_col)
    cols_bool = df.select_dtypes(include=['uint8', 'bool']).columns.to_list()
    if treatment_col in cols_bool:
        cols_bool.remove(treatment_col)

    # Read from the `df` argument, not from a notebook-level global.
    t = df[treatment_col].to_numpy()
    y = df[outcome_col].to_numpy()
    X_bool = df[cols_bool].to_numpy(dtype=float)
    X_num = df[cols_num].to_numpy(dtype=float)

    # The sklearn transformers reject zero-column input, so skip them when the
    # frame has no numeric covariates.
    if cols_num:
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
        X_num = imp.fit_transform(X_num)
        X_num = StandardScaler().fit_transform(X_num)

    X = np.hstack((X_num, X_bool))

    return y, t, X

def get_covariate_names(df, treatment_col, outcome_col):
    """Return covariate column names: float64 columns, then uint8/bool columns.

    `outcome_col` is removed from the float64 group and `treatment_col` from
    the uint8/bool group when present. Column order within each group follows
    `df`.
    """
    numeric_names = list(df.select_dtypes(include=['float64']).columns)
    binary_names = list(df.select_dtypes(include=['uint8', 'bool']).columns)

    # Remove the first matching entry, if any, from each group.
    for names, excluded in ((numeric_names, outcome_col),
                            (binary_names, treatment_col)):
        try:
            names.remove(excluded)
        except ValueError:
            pass

    return numeric_names + binary_names

In [None]:
# Switch the working directory to the project checkout before importing the
# local `causal_inference` package.
# NOTE(review): absolute, user-specific path; presumably this should come from
# a configurable constant so the notebook runs on other machines.
os.chdir('/home/adam/adam/causal_inference')
from causal_inference.model.propensity_model import PropensityModel
# Reload the already-imported module and re-import the class so that edits to
# propensity_model.py are picked up without restarting the kernel.
reload(sys.modules['causal_inference.model.propensity_model'])
from causal_inference.model.propensity_model import PropensityModel

In [None]:
# Switch to the data directory and load the raw table.
# NOTE(review): absolute, user-specific path; this also changes the working
# directory for every later cell.
os.chdir('/home/adam/adam/data/19012021/')
df = pd.read_csv('data_guerin_rct.csv')
# Show dtypes and non-null counts for up to 200 columns.
df.info(max_cols=200)

In [None]:
# Summary statistics of fio2 / peep / po2 / pf_ratio for the treated rows.
df.loc[df.treated, ['fio2', 'peep', 'po2', 'pf_ratio']].describe()

In [None]:
# Same summary for the untreated rows.
df.loc[~df.treated, ['fio2', 'peep', 'po2', 'pf_ratio']].describe()

In [None]:
# Outcome column used by the cells below.
OUTCOME = 'pf_ratio_2h_8h_manual_outcome'

In [None]:
# Outcome distribution among the treated rows.
df.loc[df.treated, OUTCOME].describe()

In [None]:
# Outcome distribution among the untreated rows.
df.loc[~df.treated, OUTCOME].describe()


In [None]:
# Clean and one-hot encode the raw frame, keeping only the OUTCOME column
# among the outcome columns.
df_encoded = process_data(df=df, outcome=OUTCOME)
df_encoded.info()

# Outcome vector, treatment vector and covariate matrix over all covariates.
y, t, X = get_training_data(df=df_encoded,
                            treatment_col='treated',
                            outcome_col=OUTCOME)

# Covariate names, built with the same column selection as get_training_data.
covariates = get_covariate_names(df=df_encoded,
                                 treatment_col='treated',
                                 outcome_col=OUTCOME)

## 3. Causal modelling

#### V1: All variables

In [None]:
# V1: model built on the full covariate matrix X.
propensity_model_all = PropensityModel(outcome=y,
                                   treatment=t,
                                   covariates=X,
                                   outcome_name=OUTCOME,
                                   treatment_name='proned',
                                   covariates_name=covariates)
propensity_model_all.summary_stats()


In [None]:
# Estimate the propensity score from all covariates and display it.
# NOTE(review): the meaning of method='balanced' is defined in PropensityModel,
# which is not visible here.
propensity_model_all.est_propensity(X=X, t=t, method='balanced')
propensity_model_all.show_propensity()

In [None]:
# Mean absolute 'ndiff' across covariates before trimming.
np.mean(abs(propensity_model_all.causal_model.summary_stats['ndiff']))

In [None]:
propensity_model_all.trim()

In [None]:
# Same balance metric after trimming, for comparison with the value above.
np.mean(abs(propensity_model_all.causal_model.summary_stats['ndiff']))

In [None]:
# Hand-picked propensity-score block boundaries for stratification.
propensity_model_all.causal_model.blocks = [0, 0.4, 0.6, 0.75, 1]
propensity_model_all.access_balance(method='default')


In [None]:
propensity_model_all.est_treatment_effect()

In [None]:
# NOTE(review): raw_effect / true_effect are hard-coded reference values whose
# source is not shown in this notebook - confirm.
propensity_model_all.print_models(raw_effect=14.4, true_effect=15)

#### V2: Medically relevant features

Balance only the potential confounders.

In [None]:
# V2: the model object is still built on the full covariate matrix X; only the
# propensity estimation further down uses the reduced matrix X_subset.
propensity_model_subset = PropensityModel(outcome=y,
                                          treatment=t,
                                          covariates=X,
                                          outcome_name=OUTCOME,
                                          treatment_name='proned',
                                          covariates_name=covariates)
propensity_model_subset.summary_stats()


In [None]:
# Columns used to estimate the V2 propensity score.
propensity_cols = ['pf_ratio',
                   'peep',
                   'fio2',
                   'tidal_volume_per_kg',
                   'nice_copd_True',
                   'nice_hem_malign_True']

In [None]:
# Keep the treatment and outcome columns in the subset frame: both helpers
# need them to exclude them from the covariates and to read `t` and `y`.
subset_cols = propensity_cols + ['treated', OUTCOME]

y_subset, t_subset, X_subset = get_training_data(df=df_encoded[subset_cols],
                                                 treatment_col='treated',
                                                 outcome_col=OUTCOME)

# Use a dedicated name so `covariates` keeps describing the full matrix `X`;
# overwriting it here would mislabel the columns of any model built later
# with covariates=X.
covariates_subset = get_covariate_names(df=df_encoded[subset_cols],
                                        treatment_col='treated',
                                        outcome_col=OUTCOME)


print(covariates_subset)

In [None]:
# Estimate the propensity score from the reduced covariate set. The treatment
# vector passed is the full-data `t`, not `t_subset`.
propensity_model_subset.est_propensity(X=X_subset, t=t, method='balanced')
#propensity_model.est_propensity(X=X_subset, t=t_subset, method='balanced')
propensity_model_subset.show_propensity()

In [None]:
# Mean absolute 'ndiff' across covariates before trimming.
np.mean(abs(propensity_model_subset.causal_model.summary_stats['ndiff']))

In [None]:
propensity_model_subset.trim()

In [None]:
# Same balance metric after trimming.
np.mean(abs(propensity_model_subset.causal_model.summary_stats['ndiff']))

In [None]:
# Hand-picked propensity-score block boundaries for stratification.
propensity_model_subset.causal_model.blocks = [0, 0.4, 0.5, 0.6, 0.7, 1]
propensity_model_subset.access_balance(method='default')

In [None]:
propensity_model_subset.est_treatment_effect()

In [None]:
# NOTE(review): raw_effect / true_effect are hard-coded reference values whose
# source is not shown in this notebook - confirm.
propensity_model_subset.print_models(raw_effect=14.3, true_effect=15)

#### V3: All variables + pairwise interactions (polynomial features)

In [None]:
propensity_model_poly = PropensityModel(outcome=y,
                                          treatment=t,
                                          covariates=X,
                                          outcome_name=OUTCOME,
                                          treatment_name='proned',
                                          covariates_name=covariates)
propensity_model_poly.summary_stats()


In [None]:
y, t, X = get_training_data(df=df_encoded,
                            treatment_col='treated',
                            outcome_col=OUTCOME)

covariates = get_covariate_names(df=df_encoded,
                                 treatment_col='treated',
                                 outcome_col=OUTCOME)


print(covariates)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand X with pairwise interaction terms only (default degree 2, no squared
# terms, no bias column). The earlier throw-away PolynomialFeatures(2) fit was
# removed: its result was discarded and `poly` was overwritten immediately.
poly = PolynomialFeatures(include_bias=False, interaction_only=True)
X_poly = poly.fit_transform(X)

In [None]:
# Estimate the propensity score from the interaction-expanded covariates.
propensity_model_poly.est_propensity(X=X_poly, t=t, method='balanced')
#propensity_model.est_propensity(X=X_subset, t=t_subset, method='balanced')
propensity_model_poly.show_propensity()

In [None]:
# Mean absolute 'ndiff' across covariates before trimming.
np.mean(abs(propensity_model_poly.causal_model.summary_stats['ndiff']))

In [None]:
propensity_model_poly.trim()

In [None]:
# Same balance metric after trimming.
np.mean(abs(propensity_model_poly.causal_model.summary_stats['ndiff']))

In [None]:
# Hand-picked propensity-score block boundaries for stratification.
propensity_model_poly.causal_model.blocks = [0, 0.4, 0.6, 0.7, 0.8, 1]
propensity_model_poly.access_balance(method='default')

In [None]:
propensity_model_poly.est_treatment_effect()

In [None]:
# NOTE(review): raw_effect / true_effect are hard-coded reference values whose
# source is not shown in this notebook - confirm.
propensity_model_poly.print_models(raw_effect=14.3, true_effect=15)




## Other computations

In [None]:
n_of_control = []
n_of_treated = []
p_bin = []
effect = []

for stratum in propensity_model_all.causal_model.strata:
    p_min = stratum.summary_stats['p_min'].round(2)
    p_max = stratum.summary_stats['p_max'].round(2)
    index = '[{},{}]'.format(p_min, p_max)
    p_bin.append(index)

    stratum.est_via_matching(bias_adj=True)
    ate = stratum.estimates['matching']['ate']
    effect.append(ate)

    n_of_control.append(stratum.summary_stats['N_c'])
    n_of_treated.append(stratum.summary_stats['N_t'])

df = pd.DataFrame({'n_of_treated': n_of_treated,
                   'n_of_control': n_of_control,
                   'ate': effect}, index=p_bin)

In [None]:
ax =  df.plot.bar(y=['n_of_control', 'n_of_treated'], ylabel='Frequency', figsize=(10, 5))
df.plot(y='ate', c='k', ax=ax, use_index=False, secondary_y=True, mark_right=False)
ax.right_ax.set_ylabel('ATE')

In [None]:
# https://seaborn.pydata.org/examples/pairgrid_dotplot.html


In [None]:
df_balance = pd.DataFrame([], columns=['raw_diff', 'all', 'all_poly', 'subset', 'subset_poly'])
df_balance.loc[:,'raw_diff'] = propensity_model_all.causal_model.summary_stats['ndiff']
df_balance = df_balance.abs().round(2)

In [None]:
COLUMN_NAME = 'all'
df_balance[COLUMN_NAME] = 0

In [None]:
df_strat_balance = pd.DataFrame([])
weights = []
n_of_strata = 0

for stratum in propensity_model_all.causal_model.strata:
    n_of_strata += 1
    column = 'strata_{}'.format(n_of_strata)
    df_strat_balance[column] = stratum.summary_stats['ndiff']
    weights.append(stratum.summary_stats['N'])
df_strat_balance = df_strat_balance.abs()


for row, column in df.iterrows():
    print(row)
    df_balance.loc[row, COLUMN_NAME] = np.average(column, weights=np.asarray(weights)).round(2)

In [None]:
df_balance.loc['mean'] = df.mean()
df_balance.index = propensity_model.covariates_name

In [None]:
df_balance


In [None]:
# What if I train using an XGBoost?


def compare_balance():
    """Placeholder for comparing covariate balance across propensity models.

    The original cell contained only an unfinished, syntactically invalid
    header; this stub keeps the name defined and fails explicitly when called.

    Raises
    ------
    NotImplementedError
        Always; the comparison has not been written yet.
    """
    raise NotImplementedError('compare_balance is not implemented yet')

#### OLS estimation

In [None]:
import statsmodels.api as sm
import numpy as np

In [None]:
# Shape checks on the design matrix, treatment column vector and outcome.
X.shape

In [None]:
t.reshape((len(t), 1)).shape

In [None]:
y.shape

In [None]:
# Element-wise product of the treatment indicator with covariate column 14.
# NOTE(review): index 14 is a magic number - presumably the morbid-obesity
# indicator, judging by the variable name; confirm against `covariates`.
X_morbid = np.multiply(t, X[:, 14])

In [None]:
from sklearn.preprocessing import PolynomialFeatures
X_new = sm.add_constant(np.hstack((t.reshape((len(t), 1)),X)))
#X_new = np.hstack((X_new, X_morbid.reshape((len(t), 1))))

In [None]:
poly = PolynomialFeatures(2)
poly.fit_transform(X_new)
poly = PolynomialFeatures(include_bias=True, interaction_only=False)
X_new = poly.fit_transform(X_new)

In [None]:
poly.get_feature_names()

In [None]:
# Covariate column 14 on its own, and its column sum.
# NOTE(review): index 14 is a magic number; the numeric columns of X are
# standardised upstream, so this sum is a count only if column 14 is one of
# the boolean covariates - confirm.
X_morbid = X[:, 14]
sum(X_morbid)

In [None]:
# Boolean mask of rows where column 14 is positive.
obese = X[:, 14] > 0

In [None]:
obese

In [None]:
# Rows of the OLS design matrix belonging to the masked subgroup.
X_obese = X_new[obese]

X_obese.shape

In [None]:
# Share of treated rows inside the subgroup ...
sum(t[obese]) / len(t[obese])

In [None]:
# ... and outside it.
sum(t[~obese]) / len(t[~obese])

In [None]:
# Ordinary least squares of the outcome on the design matrix X_new.
model = sm.OLS(y, X_new)

results = model.fit()

print('R2: ', results.rsquared)
print(results.summary())

In [None]:
# Coefficient / p-value table. The index has 2 + len(covariates) labels, so
# X_new must have exactly that many columns for this cell to succeed.
df_ols_summary = pd.DataFrame({'OLS_coef_':results.params.round(2).tolist()},
                              index=['intercept', 'treated'] + covariates)
df_ols_summary['OLS_pvalues_'] = results.pvalues.round(2)
df_ols_summary

In [None]:
from statsmodels.compat import lzip

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms

# Jarque-Bera test on the OLS residuals, paired with labels for display.
name = ['Jarque-Bera', 'Chi^2 two-tail prob.', 'Skew', 'Kurtosis']
test = sms.jarque_bera(results.resid)
lzip(name, test)

In [None]:
# Add interactions

# Logistic regression of treatment on the covariates, fitted WITHOUT an
# intercept (no constant column is added here).
log_reg = sm.Logit(t, X).fit()

In [None]:
print(log_reg.summary())

In [None]:
covariates

In [None]:
import statsmodels.api as sm

# Same model refitted with an intercept column added.
log_reg = sm.Logit(t, sm.add_constant(X)).fit()

print(log_reg.summary())

In [None]:
# Restrict to the hand-picked confounders, but keep the treatment and outcome
# columns in the frame: both helpers need them to read `t` / `y` and to
# exclude them from the covariates.
# NOTE(review): this list uses 'tidal_volume' while `propensity_cols` uses
# 'tidal_volume_per_kg' - confirm which column is intended.
df_subset = df_encoded[['pf_ratio',
                        'peep',
                        'fio2',
                        'tidal_volume',
                        'nice_copd_True',
                        'nice_hem_malign_True',
                        'treated',
                        OUTCOME]]

y, t, X = get_training_data(df=df_subset,
                            treatment_col='treated',
                            outcome_col=OUTCOME)

# Names come from the same frame as X so they line up column-for-column
# (previously taken from the full df_encoded, which did not match X).
covariates = get_covariate_names(df=df_subset,
                                 treatment_col='treated',
                                 outcome_col=OUTCOME)

In [None]:
# Stack element-wise powers X**1 ... X**k side by side.
# NOTE(review): `k` was never defined anywhere in this notebook; 2 (linear and
# squared terms) is an assumed placeholder - confirm the intended degree.
k = 2
# np.hstack requires a real sequence; passing a generator is rejected by
# current NumPy, hence the list comprehension.
new_X = np.hstack([X**(i+1) for i in range(k)])

In [None]:
import numpy as np

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split

import xgboost as xgb

In [None]:
# Shapes of covariates, treatment and outcome before building the XGBoost input.
print(X.shape)

print(t.shape)

print(y.shape)

In [None]:
# Append the treatment indicator as the LAST column of the feature matrix;
# later cells rely on that position when they overwrite column -1.
X_all = np.concatenate((X, t.reshape(len(t), 1)), axis=1)
print(X_all.shape)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Fixed seed (same value as the XGBoost model below) so the split, and every
# metric derived from it, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, random_state=42)

In [None]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

rmse=mean_squared_error(y_test, y_pred, squared=False)

print(rmse)

In [None]:
from sklearn.metrics import r2_score

coefficient_of_dermination = r2_score(y_test, y_pred)
print(coefficient_of_dermination)

In [None]:
# Predict every test row twice: once with the treatment column (last column
# of X_all) forced to 1 and once forced to 0. Work on copies so X_test itself
# is left untouched; the original aliased X_test and overwrote its treatment
# column in place.
test_treated = X_test.copy()
test_treated[:, -1] = 1
m_1 = xgb_model.predict(test_treated)

test_control = X_test.copy()
test_control[:, -1] = 0
m_0 = xgb_model.predict(test_control)

# Mean difference between the two sets of predictions.
print(np.mean(m_1 - m_0))

In [None]:
def logit_ip_f(y, X):
    """
    Compute the f(y|X) denominators of IP weights with a logistic regression.

    A logit model of `y` on `X` is fitted as given (no constant is added
    here). Rows with ``y == 1`` receive the predicted probability, rows with
    ``y == 0`` receive one minus the predicted probability, and any other row
    keeps the initial value 0.

    Parameters
    ----------
    y : Pandas Series
    X : Pandas DataFrame

    Returns
    -------
    Numpy array with one value per row of X

    """
    fitted = sm.Logit(y, X).fit()

    is_one = y == 1
    is_zero = y == 0

    ip_values = np.zeros(X.shape[0])
    ip_values[is_one] = fitted.predict(X[is_one])
    ip_values[is_zero] = 1 - fitted.predict(X[is_zero])
    return ip_values

In [None]:
# logit_ip_f fits the logit on the matrix exactly as passed, so the intercept
# column has to be supplied here; without it the propensity model is forced
# through the origin of the standardised covariates.
denoms = logit_ip_f(t, sm.add_constant(X))
# Inverse-probability weights: 1 / P(received treatment status | covariates).
weights = 1 / denoms

In [None]:
# Weighted least squares of the outcome on intercept + covariates, using the
# inverse-probability weights.
# NOTE(review): the regressors do not include the treatment indicator `t`, so
# no coefficient in this summary is a treatment effect - confirm whether the
# design was meant to be the treatment column instead of X.
wls = sm.WLS(y, sm.add_constant(X), weights=weights)
res = wls.fit()
print(res.summary())