In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
# Directory holding the study extract. The default is the original author's
# local path; set the GUERIN_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
DATA_DIR = os.environ.get('GUERIN_DATA_DIR', '/home/adam/files/data/04012020/')

# The working directory is still switched so that any later relative paths
# keep resolving exactly as before.
os.chdir(DATA_DIR)
df = pd.read_csv('data_guerin_rct.csv')

In [None]:
# Remove the `pf_ratio_4h_outcome` column and keep only rows where
# `pf_ratio_12h_outcome` is present.
# Written without inplace=True and with errors='ignore' so that re-running the
# cell after the column has already been removed no longer raises KeyError.
df = (
    df
    .drop(columns=['pf_ratio_4h_outcome'], errors='ignore')
    .dropna(subset=['pf_ratio_12h_outcome'])
)

In [None]:
# Per-column summary of df; max_cols=200 keeps the verbose (untruncated)
# listing for frames of up to 200 columns.
df.info(max_cols=200)

In [None]:
# Numeric covariates used in the analysis.
# 'lung_compliance_static' is left out on purpose: tidal volume is highly
# correlated with lung compliance, is more imbalanced and has more
# observations.
COLS = [
    'lactate',
    'tidal_volume',
    'respiratory_rate_measured',
    'peep',
    'fio2',
]

# Every column of df whose name matches the regex 'med'.
COLS_bool = list(df.filter(regex='med').columns)

In [None]:
#df_one_hot_encoded = pd.get_dummies(df[COLS_bool])
#df_one_hot_encoded.drop(columns=df_one_hot_encoded.
#                        filter(regex='False').
#                        filter(regex="nice").
#                        columns.
#                        to_list(),
#                        inplace=True)
#
#df_one_hot_encoded.info()

In [None]:
# Pairwise correlation matrix of the COLS columns, rounded to 2 decimals.
df[COLS].corr().round(2)

In [None]:
# First column of df cast to integers (kept under the name `treated`).
treated = df.iloc[:, 0].values.astype('int')

# Treatment indicator and outcome as arrays.
t = df['treated'].values
y = df['pf_ratio_12h_outcome'].values

# Covariate blocks: the COLS columns and the COLS_bool columns.
X_num = df[COLS].values
X_bool = df[COLS_bool].values

In [None]:
# Replace missing numeric values with the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_num = imp.fit_transform(X_num)

# Standardize the predictors
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)

In [None]:
# Report the shape of each covariate block before joining them.
for block in (X_num, X_bool):
    print(block.shape)

# Join the numeric and boolean covariates column-wise into one design matrix.
X = np.concatenate((X_num, X_bool), axis=1)
print(X.shape)

## 3. Causal modelling

In [None]:
# Build the causal model from the outcome (Y), the treatment indicator (D)
# and the covariate matrix (X), then print its summary statistics.
causal = CausalModel(Y=y, D=t, X=X)
print(causal.summary_stats)

In [None]:
# Covariate names in the same column order as X: the COLS block first, then
# the COLS_bool block.
# The previous version referenced an undefined `COLS_num` and tried to drop
# 'treated' / 'pf_ratio_12h_outcome', which are not among the covariates, so
# the cell could not run on a fresh kernel. COLS is deliberately not
# reassigned here, which keeps this cell (and earlier ones) re-runnable.
X_names = list(COLS) + list(COLS_bool)

ndiff = causal.summary_stats['ndiff']

# zip() would silently truncate on a mismatch and mislabel the values.
assert len(X_names) == len(ndiff), (
    'covariate names (%d) and ndiff entries (%d) differ in length'
    % (len(X_names), len(ndiff))
)

new_dict = {name: round(value, 2) for name, value in zip(X_names, ndiff)}
print(new_dict)


In [None]:
# One-column table of the ndiff values, indexed by covariate name and rounded
# to two decimals.
df_summary = (
    pd.DataFrame({'ndiff': ndiff}, index=X_names)
    .assign(ndiff=lambda frame: frame['ndiff'].map(lambda value: round(value, 2)))
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

SEED = 1234

# Unpenalized logistic regression of the treatment indicator on the covariates.
# NOTE(review): class_weight='balanced' reweights the classes, so the fitted
# probabilities may not be calibrated treatment probabilities — confirm this
# is intended for a propensity score.
clf = LogisticRegression(penalty='none',
                         class_weight='balanced',
                         random_state=SEED)
clf.fit(X, t)

# Predicted probability of the second class (column 1 of predict_proba).
pscore = clf.predict_proba(X)[:, 1]

In [None]:
# Boolean mask of the treated rows. Indexing with `t` directly is only correct
# when `t` is already boolean: with 0/1 integers `pscore[t]` turns into
# positional indexing and `~t` into a bitwise NOT, which silently plots the
# wrong samples.
treated_mask = np.asarray(t).astype(bool)

# Propensity-score distribution of each arm on the same axes.
sns.distplot(pscore[treated_mask],
             hist=True,
             kde=True,
             label='Prone')

sns.distplot(pscore[~treated_mask],
             hist=True,
             kde=True,
             label='Supine')

# Plot formatting
plt.legend(prop={'size': 12})
plt.title('Pscore')
plt.xlabel('pscore')
plt.ylabel('Density');

In [None]:
# Store the logistic-regression scores as the model's 'pscore' entry
# (this writes to the private `_dict` of causal.raw_data).
causal.raw_data._dict['pscore'] = pscore

In [None]:
# Display the fitted logistic-regression coefficients
# (presumably one per column of X, in X's column order — verify).
clf.coef_

In [None]:
# We assign the new p-score
# (same assignment as in the earlier cell; it writes to the private `_dict`
# of causal.raw_data).
# NOTE(review): presumably this has to run before trim_s(); re-running it
# after trimming would attach full-length scores to the trimmed data — confirm.
causal.raw_data._dict['pscore'] = pscore

We trim samples with extreme propensity scores to ensure positivity (overlap between the treated and control groups).

In [None]:
# Trim the sample, then print the cutoff and the summary statistics that
# the model reports afterwards.
causal.trim_s()
for trimmed_result in (causal.cutoff, causal.summary_stats):
    print(trimmed_result)

In [None]:
# Stratify the sample with stratify_s() and print the resulting strata.
causal.stratify_s()
print(causal.strata)


In [None]:
# Worst covariate imbalance within each stratum. Normalized differences are
# signed, so the absolute value is taken first: a plain max() would overlook
# a large negative difference and understate the imbalance.
for stratum in causal.strata:
    print(np.abs(stratum.summary_stats['ndiff']).max())

Now we see that the imbalance has decreased a little, but there is
still a lot to do.

In [None]:
#causal.reset()

#### Model


In [None]:
# Run the OLS estimator on the model and print the estimates it holds.
causal.est_via_ols()
print(causal.estimates)

In [None]:
# Within-stratum least squares estimates: the per-bin building blocks that the
# blocking estimator averages.
# est_via_blocking() is not called on the individual strata because it needs a
# model that has itself been stratified, which a single stratum has not been.
# NOTE(review): adj=1 is chosen to match the default adjustment used by
# est_via_blocking() — confirm against the installed causalinference version.
for stratum in causal.strata:
    stratum.est_via_ols(adj=1)
[stratum.estimates['ols']['ate'] for stratum in causal.strata]

Taking the sample-weighted average of the above within-bin least squares estimates results in a propensity score
matching estimator that is commonly known as the blocking estimator.

In [None]:
# sample-weighted average of the within-bin least squares estimates
# (est_via_blocking adds its result to causal.estimates, printed below)

causal.est_via_blocking()
print(causal.estimates)


In [None]:
# Matching estimator with bias_adj=True; prints the estimates held by the
# model afterwards.
causal.est_via_matching(bias_adj=True)
print(causal.estimates)

In [None]:
# Matching estimate computed separately inside every stratum; the list of
# per-stratum ATEs is the cell's displayed value.
stratum_matching_ates = []
for stratum in causal.strata:
    stratum.est_via_matching()
    stratum_matching_ates.append(stratum.estimates['matching']['ate'])
stratum_matching_ates


In [None]:
# Weighting estimator; prints the estimates held by the model afterwards.
causal.est_via_weighting()
print(causal.estimates)

In [None]:
# Compare the ATE of every estimator stored on the model, with standard errors.
# Dedicated names are used instead of `y` / `x` so the outcome vector `y`
# passed to CausalModel is not overwritten and earlier cells stay re-runnable.
RAW_DIFFERENCE = 3      # hard-coded "raw" effect, plotted without an error bar
REFERENCE_EFFECT = 13   # level of the dashed reference line — TODO document its source

method_names = []
ate_values = []
ate_errors = []

for method, result in dict(causal.estimates).items():
    method_names.append(method)
    ate_values.append(result["ate"])
    ate_errors.append(result["ate_se"])

method_names.append("raw")
ate_values.append(RAW_DIFFERENCE)
ate_errors.append(0)

positions = np.arange(len(ate_values))

plt.errorbar(x=positions, y=ate_values, yerr=ate_errors,
             linestyle="none", capsize=5, marker="o")
plt.xticks(positions, method_names)
plt.title("Estimated Effect Size", fontsize=18)
# The reference line spans however many points are plotted (the previous fixed
# xmax of 4.5 was only right for exactly five points).
plt.hlines(y=REFERENCE_EFFECT, xmin=-0.5, xmax=len(ate_values) - 0.5,
           linestyles="dashed");
#plt.xlim(-0.5,3.5);
