In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
def visualize_balance(df,
                      treated_column_name='treated',
                      covariate_column_name=None):
    """Plot and summarise one covariate in the treated and untreated rows.

    Overlays a histogram + KDE of ``covariate_column_name`` for the rows
    where ``treated_column_name`` is True (legend label 'Prone') and for the
    remaining rows (legend label 'Supine'), prints the mean of the covariate
    in each group rounded to two decimals, and shows the figure.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the treatment indicator and the covariate.
    treated_column_name : str, default 'treated'
        Name of the treatment-indicator column. It is used directly as a
        row mask and negated with ``~``, so it has to be boolean.
    covariate_column_name : str
        Name of the column to plot. Although the default is ``None``, a
        real column name must be supplied.

    Returns
    -------
    None
    """
    is_treated = df[treated_column_name]
    groups = (('Prone', df[is_treated]),
              ('Supine', df[~is_treated]))

    xlim = None
    for label, group in groups:
        values = group[covariate_column_name]
        # NOTE: distplot is deprecated in seaborn >= 0.11 (histplot/kdeplot
        # are the successors); it is kept so the notebook still runs on the
        # seaborn version it was written for.
        sns.distplot(values,
                     hist=True,
                     kde=True,
                     label=label)
        # Right-hand axis limit: the 98th percentile plus twice the gap
        # between the 96th and 98th percentiles, so the extreme tail does
        # not stretch the plot. The larger of the two groups' limits wins.
        q96 = values.quantile(q=0.96)
        q98 = values.quantile(q=0.98)
        group_limit = q98 + 2 * (q98 - q96)
        xlim = group_limit if xlim is None else max(group_limit, xlim)

    # Plot formatting
    plt.legend(prop={'size': 12})
    plt.title('Distribution of {} in treated and control subpopulations.'.format(covariate_column_name))
    plt.xlabel(str(covariate_column_name))
    plt.ylabel('Density')
    plt.xlim(left=0, right=xlim)

    treated_mean = df.loc[is_treated, covariate_column_name].mean()
    supine_mean = df.loc[~is_treated, covariate_column_name].mean()
    print("Mean value of {} in the treated subpopulation: {}.".format(
          covariate_column_name, round(treated_mean, 2)))
    # The precision argument belongs to round(), not to format(): previously
    # the misplaced parenthesis rounded this mean to a whole number.
    print("Mean value of {} in the supine subpopulation: {}.".format(
          covariate_column_name, round(supine_mean, 2)))
    plt.show()



In [None]:
# Make the data directory the working directory so the CSV can be opened by
# its bare file name.
# NOTE(review): this is an absolute, machine-specific path.
os.chdir('/home/adam/files/data/13012020/')
df = pd.read_csv(filepath_or_buffer='data_guerin_rct.csv')

# Print the column overview; max_cols=200 keeps the per-column listing for
# frames of up to 200 columns instead of a truncated summary.
df.info(max_cols=200)

In [None]:
# Treated vs. untreated distribution and group means of the `pf_ratio` column.
visualize_balance(df=df, covariate_column_name='pf_ratio')

In [None]:
# Treated vs. untreated distribution and group means of the
# `pf_ratio_4h_outcome` column.
visualize_balance(df=df, covariate_column_name='pf_ratio_4h_outcome')

In [None]:
# Treated vs. untreated distribution and group means of the
# `pf_ratio_12h_outcome` column.
visualize_balance(df=df, covariate_column_name='pf_ratio_12h_outcome')


In [None]:
# Treated vs. untreated distribution and group means of the `pco2` column.
visualize_balance(df=df, covariate_column_name='pco2')

In [None]:
# Treated vs. untreated distribution and group means of the `fio2` column.
visualize_balance(df=df, covariate_column_name='fio2')

In [None]:
# Treated vs. untreated distribution and group means of the
# `driving_pressure` column.
visualize_balance(df=df, covariate_column_name='driving_pressure')

In [None]:
# Correlation of every column with the last column of the correlation
# matrix, rounded to two decimals.
(
    df.corr()
      .iloc[:, -1]
      .round(decimals=2)
)

In [None]:
# Scatter `pf_ratio` against `pf_ratio_12h_outcome`, restricted to rows where
# both values are below 400.
df.loc[df['pf_ratio'].lt(400) & df['pf_ratio_12h_outcome'].lt(400)].plot.scatter(
    x='pf_ratio',
    y='pf_ratio_12h_outcome')

In [None]:
# Scatter `pf_ratio_12h_outcome` against `pf_ratio_4h_outcome`, restricted to
# rows where both values are below 500.
df.loc[df['pf_ratio_4h_outcome'].lt(500) & df['pf_ratio_12h_outcome'].lt(500)].plot.scatter(
    x='pf_ratio_12h_outcome',
    y='pf_ratio_4h_outcome')

In [None]:
# Scatter `fio2` against `pf_ratio_4h_outcome`, restricted to rows where the
# 4h outcome is below 500.
df.loc[df['pf_ratio_4h_outcome'].lt(500)].plot.scatter(
    x='fio2',
    y='pf_ratio_4h_outcome')

In [None]:
# Heatmap of the pairwise column correlations. The colour scale is capped at
# 0.8 and the cells are drawn square on a 12x9 inch figure.
corrmat = df.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(data=corrmat, vmax=0.8, square=True, ax=ax)


In [None]:
# Pairwise relationship grid for a hand-picked subset of columns.
df_plot = df[['lactate',
              'tidal_volume',
              'respiratory_rate_measured',
              'peep',
              'fio2']]
sns.set()
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was deprecated in seaborn 0.9 and is rejected by recent releases.
sns.pairplot(df_plot, height=2.5)
plt.show()

In [None]:
# Pairwise relationship grid for a second hand-picked subset of columns.
df_plot = df[['ph',
              'pco2',
              'po2',
              'driving_pressure',
              'fio2']]
sns.set()
# `height` is the current name of the per-facet size argument; the old `size`
# keyword was deprecated in seaborn 0.9 and is rejected by recent releases.
sns.pairplot(df_plot, height=2.5)
plt.show()


In [None]:
# https://stats.stackexchange.com/questions/404775/calculate-earth-movers-distance-for-two-grayscale-images

# https://stats.stackexchange.com/questions/276497/maximum-mean-discrepancy-distance-distribution/276618

# https://scikit-learn.org/stable/modules/feature_selection.html

# TODO: read the book on how to access covariate strata.

In [None]:
# Display pandas' default descriptive statistics for df.
df.describe()