In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
# Location of the RCT extract. The default is the original author's local
# directory; set the GUERIN_DATA_DIR environment variable (to an absolute path)
# to run the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get('GUERIN_DATA_DIR', '/home/adam/files/data/04012020/')
DATA_FILE = 'data_guerin_rct.csv'

# The working directory is still switched to DATA_DIR so that any relative
# path used elsewhere in the notebook keeps resolving against the data folder.
os.chdir(DATA_DIR)
df = pd.read_csv(DATA_FILE)

In [None]:
# Discard the 4h outcome column and keep only rows with an observed 12h outcome.
# Written as a chained reassignment (no inplace=True) with errors='ignore' so
# that re-running this cell yields the same frame instead of raising KeyError
# on the column that was already dropped by the first run.
df = (
    df
    .drop(columns=['pf_ratio_4h_outcome'], errors='ignore')
    .dropna(subset=['pf_ratio_12h_outcome'])
)

In [None]:
# Summary of the frame; the column limit passed to info() is named here so it
# is easy to adjust if the extract grows wider.
INFO_MAX_COLS = 200
df.info(max_cols=INFO_MAX_COLS)

In [None]:
# Covariate columns of interest.
# 'lung_compliance_static' is left out: tidal volume is highly correlated with
# lung compliance, is more imbalanced, and has more observations.
COLS = [
    'lactate',
    'tidal_volume',
    'respiratory_rate_measured',
    'peep',
    'fio2',
]

# Every column whose name matches the regex 'med'
# (presumably boolean medication flags, given the variable name — TODO confirm).
med_columns = df.filter(regex='med').columns
COLS_bool = med_columns.to_list()

In [None]:
# NOTE(review): this disabled cell is the only place in the visible part of the
# notebook where `df_one_hot_encoded` would be assigned, yet later cells read
# it. On a fresh kernel (Restart & Run All) those cells raise NameError.
# Restore or replace this definition before relying on the cells below.
# TODO confirm which columns it must contain: later cells select 'treated' and
# 'pf_ratio_12h_outcome' from it, which get_dummies(df[COLS_bool]) alone would
# not obviously provide.
#df_one_hot_encoded = pd.get_dummies(df[COLS_bool])
#df_one_hot_encoded.drop(columns=df_one_hot_encoded.
#                        filter(regex='False').
#                        filter(regex="nice").
#                        columns.
#                        to_list(),
#                        inplace=True)
#
#df_one_hot_encoded.info()

In [None]:
# Pairwise correlations between the COLS columns, shown to two decimals.
corr_matrix = df[COLS].corr()
corr_matrix.round(2)

In [None]:
# First column of the encoded frame, cast to int.
treated = df_one_hot_encoded.iloc[:, 0].values.astype('int')

# Split the encoded frame into treatment (t), covariates (X) and outcome (y).
non_feature_cols = ['treated', 'pf_ratio_12h_outcome']
t = df_one_hot_encoded['treated'].values
X = df_one_hot_encoded.drop(columns=non_feature_cols).values
y = df_one_hot_encoded['pf_ratio_12h_outcome'].values

# Report the array shapes in the order t, X, y.
for _arr in (t, X, y):
    print(_arr.shape)

In [None]:
# Replace missing covariate values (NaN) using the 'mean' strategy.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(X)

# Standardize the predictors
scaler = StandardScaler()
X = scaler.fit_transform(X)

## 3. Causal modelling

In [None]:
# Build the causal model from the outcome (Y), the treatment indicator (D) and
# the covariate matrix (X), then print its summary statistics.
causal = CausalModel(Y=y, D=t, X=X)
print(causal.summary_stats)

In [None]:
# Covariate names, taken from the encoded frame minus treatment and outcome.
feature_frame = df_one_hot_encoded.drop(columns=['treated', 'pf_ratio_12h_outcome'])
X_names = feature_frame.columns.to_list()

# 'ndiff' entry of the summary statistics, paired with the covariate names and
# rounded to two decimals for printing.
ndiff = causal.summary_stats['ndiff']
new_dict = dict(zip(X_names, (round(value, 2) for value in ndiff)))
print(new_dict)


In [None]:
# One row per covariate name, holding its 'ndiff' value rounded to two decimals.
df_summary = (
    pd.DataFrame({'ndiff': ndiff}, index=X_names)
    .assign(ndiff=lambda frame: frame['ndiff'].map(lambda value: round(value, 2)))
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

SEED = 1234

# Propensity model: unpenalized, class-balanced logistic regression of the
# treatment indicator t on the covariate matrix X.
# NOTE(review): penalty='none' is deprecated in recent scikit-learn releases in
# favour of penalty=None — switch if the installed version rejects the string.
clf = LogisticRegression(random_state=SEED,
                         class_weight='balanced',
                         penalty='none').fit(X, t)

# The classifier was fitted on t, so its predictions are scored against t.
# (Scoring against the outcome y compares predicted treatment labels with
# outcome values and is not a meaningful report.)
t_pred = clf.predict(X)
print(classification_report(t, t_pred))

# Predicted probability of the second class in clf.classes_, used as the
# propensity score.
pscore = clf.predict_proba(X)[:, 1]

# Flatten the coefficient array instead of reshaping to a fixed length of 20,
# so the cell keeps working when the number of covariates changes.
df_summary['lr_coef'] = clf.coef_.round(2).ravel()

In [None]:
# Confusion matrix of the propensity model: true treatment (t) vs. predicted
# treatment. clf is fitted on t, so t — not the outcome y — is the reference.
confusion_matrix(t, clf.predict(X))

In [None]:
# Boolean mask of treated units. Indexing pscore with the raw t array only
# selects the groups correctly when t is boolean; for a 0/1 integer array
# pscore[t] would pick positions 0 and 1, and ~t would yield negative indices.
is_treated = np.asarray(t).astype(bool)

# Overlay the propensity-score distribution of each group on the same axes.
for group_scores, group_label in ((pscore[is_treated], 'Prone'),
                                  (pscore[~is_treated], 'Supine')):
    sns.distplot(group_scores,
                 hist=True,
                 kde=True,
                 label=group_label)

# Plot formatting
plt.legend(prop={'size': 12})
plt.title('Pscore')
plt.xlabel('pscore')
plt.ylabel('Density');

In [None]:
# Store the externally computed propensity scores on the causal model by
# writing directly into the private `_dict` of `causal.raw_data`.
# NOTE(review): this bypasses the library's public API; presumably it makes
# later propensity-based steps use these scores — confirm against the
# installed causalinference version.
causal.raw_data._dict['pscore'] = pscore

In [None]:
# extract obesity as a feature