In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt

from causalinference import CausalModel

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
# Load the inclusion export and convert both timestamp columns to datetime64.
data_dir = '/home/adam/files/data/13012020/'
os.chdir(data_dir)
df = pd.read_csv('use_case_inclusion_8h_all_outputs_cvvh.csv')
for timestamp_column in ('start_timestamp', 'end_timestamp'):
    df[timestamp_column] = df[timestamp_column].astype('datetime64[ns]')
df.info(max_cols=200)

In [None]:
# P/F ratio: po2 / fio2, scaled by 100 and rounded to the nearest integer.
# Computed in a single vectorised step so that no float is ever written into
# an int-initialised column (pandas deprecates that silent upcast) and the
# resulting column really is integer-typed.
# Rows where po2 or fio2 is missing keep the placeholder value 0.
# NOTE(review): 0 is indistinguishable from a real measurement downstream
# (e.g. in filters such as `pf_ratio < 500`) - confirm that this is intended.
pf_ratio_is_na = df.po2.isna() | df.fio2.isna()
pf_ratio_scaled = (df['po2'] / df['fio2'] * 100).round()
# A non-finite ratio (e.g. fio2 == 0) still raises here, as it did before.
df['pf_ratio'] = pf_ratio_scaled.where(~pf_ratio_is_na, 0).astype('int64')

In [None]:
# Use the arterial pCO2 where present, otherwise fall back to the unspecified
# measurement; the combined column is then exposed as `pco2`.
df['pco2_arterial'] = df['pco2_arterial'].fillna(df['pco2_unspecified'])
df = df.rename(columns={'pco2_arterial': 'pco2'})

In [None]:
# Coalesce lactate in order of preference: arterial, then blood, then
# unspecified. The combined column is then exposed as `lactate`.
df['lactate_arterial'] = (
    df['lactate_arterial']
    .fillna(df['lactate_blood'])
    .fillna(df['lactate_unspecified'])
)
df = df.rename(columns={'lactate_arterial': 'lactate'})


In [None]:
# Use the arterial pH where present, otherwise fall back to the unspecified
# measurement; the combined column is then exposed as `ph`.
df['ph_arterial'] = df['ph_arterial'].fillna(df['ph_unspecified'])
df = df.rename(columns={'ph_arterial': 'ph'})

In [None]:
# Flag rows with a BMI above 35; a missing BMI compares as False.
df['nice_morbid_obesity'] = df['bmi'] > 35
df['nice_morbid_obesity'].value_counts()

In [None]:
# Vasopressor exposure: element-wise OR over the individual ATC-code flags.
# NOTE(review): per the WHO ATC index these codes should be norepinephrine
# (C01CA03), dopamine (C01CA04), epinephrine (C01CA24), vasopressin (H01BA01)
# and terlipressin (H01BA04) - confirm against the data dictionary.
df['med_vasopressors'] = (
    df['atc_C01CA03']
    | df['atc_C01CA04']
    | df['atc_C01CA24']
    | df['atc_H01BA01']
    | df['atc_H01BA04']
)

In [None]:
# Expose the ATC H02A flag under a descriptive name.
# NOTE(review): H02A is presumably "corticosteroids for systemic use, plain"
# in the WHO ATC index - confirm.
df['med_glucocorticoids'] = df['atc_H02A']

In [None]:
# Expose the ATC M03 flag under a descriptive name.
# NOTE(review): M03 is presumably the "muscle relaxants" group in the WHO ATC
# index - confirm.
df['med_muscle_relaxants'] = df['atc_M03']

In [None]:
from data_warehouse_utils.dataloader import DataLoader

# Attach the per-patient `nice_aki` flag from the patient table.
dl = DataLoader()
df_aki = dl.get_patients()
# Collapse exact duplicates so that each patient contributes one lookup row.
df_aki = df_aki[['hash_patient_id', 'nice_aki']].drop_duplicates()

# Drop a `nice_aki` column left over from a previous run of this cell, so that
# re-running does not produce `nice_aki_x` / `nice_aki_y`.
df = df.drop(columns=['nice_aki'], errors='ignore')

# validate='many_to_one' makes the merge fail loudly if a patient still has
# several (conflicting) `nice_aki` rows, instead of silently duplicating the
# rows of `df`.
df = pd.merge(df, df_aki, how='left', on='hash_patient_id',
              validate='many_to_one')

In [None]:
# A row counts as renal replacement therapy when either blood-flow column
# holds a value.
has_cvvh_flow = df['cvvh_blood_flow'].notna()
has_cvvhd_flow = df['cvvhd_blood_flow'].notna()
df['renal_replacement_therapy'] = has_cvvh_flow | has_cvvhd_flow
df['renal_replacement_therapy'].value_counts()

In [None]:
# TODO: derive df['low_tidal_volume_rs'] (definition not yet decided).

In [None]:
# Columns exported for the RCT-style analysis: treatment indicator, baseline
# covariates, and the P/F-ratio outcome columns.
VARIABLES_RCT = ['treated',
                 'age',
                 'gender',
                 'bmi',
                 'nice_diabetes',
                 'nice_aki',
                 'nice_cirrhosis',
                 'nice_hem_malign',
                 'nice_copd',
                 'nice_imm_insuf',
                 'nice_cardio_vasc_insuf',
                 'nice_morbid_obesity',
                 'sofa_score',
                 'lactate',
                 'tidal_volume',
                 'respiratory_rate_measured',
                 'peep',
                 'fio2',
                 'po2',
                 'pco2',
                 'ph',
                 'plateau_pressure',
                 'driving_pressure',
                 'lung_compliance_static',
                 'med_muscle_relaxants',
                 'med_vasopressors',
                 'med_glucocorticoids',
                 'pf_ratio',
                 'pf_ratio_2h_outcome',
                 'pf_ratio_12h_outcome',
                 'pf_ratio_16h_outcome']

# Take an explicit copy so that later modifications of `df_rct` neither touch
# `df` nor trigger pandas' SettingWithCopyWarning.
df_rct = df[VARIABLES_RCT].copy()
df_rct.info(max_cols=200)

# TODO: add a late outcome (18h-22h).

In [None]:
# Preview the first five rows of the export frame.
df_rct.head(n=5)

In [None]:
# Relabel the outcome columns for the export. Reassigning instead of using
# inplace=True avoids modifying a frame derived from `df` in place (which can
# raise SettingWithCopyWarning) and keeps the cell safe to re-run.
# NOTE(review): 2h -> 8h and 16h -> 24h are different offsets, and
# 'pf_ratio_12h_outcome' is left unchanged - confirm the intended mapping.
df_rct = df_rct.rename(columns={'pf_ratio_2h_outcome': 'pf_ratio_8h_outcome',
                                'pf_ratio_16h_outcome': 'pf_ratio_24h_outcome'})

In [None]:
# Write the RCT frame as CSV (without the index) into the data directory.
output_dir = '/home/adam/files/data/13012020/'
os.chdir(output_dir)
df_rct.to_csv(path_or_buf='data_guerin_rct.csv', index=False)


#### Not ready:

In [None]:
# Fetch the 'fluid_in' and 'fluid_out' range measurements, restricted to the
# columns listed below.
fluid_columns = ['hash_patient_id',
                 'start_timestamp',
                 'end_timestamp',
                 'unit_name',
                 'effective_value',
                 'pacmed_name']
fluid_parameters = ['fluid_in', 'fluid_out']
df_fluid = dl.get_range_measurements(columns=fluid_columns,
                                     parameters=fluid_parameters)

In [None]:
# Show the fluid records whose unit is 'ml/h'.
df_fluid.loc[df_fluid['unit_name'] == 'ml/h']

In [None]:
# NOTE(review): unfinished draft (this part of the notebook is marked
# "Not ready"). The cell does not parse as written; the code is left untouched
# because its intent cannot be determined from the notebook. Known problems:
#   * `DataFrame.iterrows()` yields (index, row) pairs, so `row` is the index
#     label and `column` is the row Series; `row.hash_patient_id` further down
#     therefore cannot work.
#   * the second `fluid_balance = ...` overwrites the first instead of
#     narrowing it, and compares `start_timestamp` with a patient id.
#   * `column.` is an incomplete expression (SyntaxError).
#   * `patient_id = ...` is dedented out of the loop while the following lines
#     are indented again (IndentationError).
#   * `timedelta`, `first_outcome_hours` and `last_outcome_hours` are not
#     defined in the visible cells.
#   * the trailing filters reassign `df`, which would clobber the main frame,
#     and use `effective_timestamp`, a column not requested for `df_fluid`.
for row, column in df_fluid.iterrows():
    fluid_balance = df_fluid[df_fluid.hash_patient_id == column.hash_patient_id]
    fluid_balance = df_fluid[df_fluid.start_timestamp == column.hash_patient_id]

    column.
patient_id = row.hash_patient_id

    start_outcome = row.start_timestamp + timedelta(hours=first_outcome_hours)
    end_outcome = start_outcome + timedelta(hours=last_outcome_hours)
    end_session = row.end_timestamp

    df = df[df.hash_patient_id == patient_id]
    df = df[df.effective_timestamp >= start_outcome]
    df = df[df.effective_timestamp <= end_outcome]
    df = df[df.effective_timestamp <= end_session]

In [None]:
# Summary statistics for the `*_inclusion_8h` columns next to their
# unsuffixed counterparts.
summary_columns = ['fio2_inclusion_8h',
                   'po2_inclusion_8h',
                   'peep_inclusion_8h',
                   'fio2',
                   'po2',
                   'peep']
df[summary_columns].describe()

In [None]:
# Column overview restricted to the rows where `treated` is True.
df.loc[df['treated']].info()

In [None]:
# Column overview for treated rows whose `fio2_inclusion_8h` exceeds 50.
treated_high_fio2 = df['treated'] & df['fio2_inclusion_8h'].gt(50)
df.loc[treated_high_fio2].info()

In [None]:
# Overlay the distribution of `pf_ratio_inclusion_8h` and of `pf_ratio`
# (the latter restricted to values below 500).
# The title and x-label previously described a propensity-score plot, which
# is not what is drawn here; they now describe the plotted variable.
# NOTE(review): both series come from the full frame (no split on `treated`),
# so the former 'Prone'/'Supine' legend entries could not be verified; the
# legend now names the plotted columns - confirm.
fig, ax = plt.subplots()

sns.distplot(df['pf_ratio_inclusion_8h'],
             hist=True,
             bins=10,
             kde=True,
             label='pf_ratio_inclusion_8h',
             norm_hist=True,
             ax=ax)

sns.distplot(df.loc[df.pf_ratio < 500, 'pf_ratio'],
             hist=True,
             bins=10,
             kde=True,
             label='pf_ratio (< 500)',
             norm_hist=True,
             ax=ax)

# Plot formatting
ax.legend(prop={'size': 12})
ax.set_title('Distribution of P/F ratio: pf_ratio_inclusion_8h vs. pf_ratio')
ax.set_xlabel('P/F ratio')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Overlay the distribution of `fio2_inclusion_8h` and of `fio2`.
# The title and x-label previously described a propensity-score plot, which
# is not what is drawn here; they now describe the plotted variable.
# NOTE(review): both series come from the full frame (no split on `treated`),
# so the former 'Prone'/'Supine' legend entries could not be verified; the
# legend now names the plotted columns - confirm.
fig, ax = plt.subplots()

sns.distplot(df['fio2_inclusion_8h'],
             hist=True,
             bins=10,
             kde=True,
             label='fio2_inclusion_8h',
             norm_hist=True,
             ax=ax)

sns.distplot(df['fio2'],
             hist=True,
             bins=10,
             kde=True,
             label='fio2',
             norm_hist=True,
             ax=ax)

# Plot formatting
ax.legend(prop={'size': 12})
ax.set_title('Distribution of FiO2: fio2_inclusion_8h vs. fio2')
ax.set_xlabel('FiO2')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Overlay the distribution of `po2_inclusion_8h` and of `po2`, each restricted
# to values below 200.
# The title and x-label previously described a propensity-score plot, which
# is not what is drawn here; they now describe the plotted variable.
# NOTE(review): both series come from the full frame (no split on `treated`),
# so the former 'Prone'/'Supine' legend entries could not be verified; the
# legend now names the plotted columns - confirm.
fig, ax = plt.subplots()

sns.distplot(df.loc[df.po2_inclusion_8h < 200, 'po2_inclusion_8h'],
             hist=True,
             bins=10,
             kde=True,
             label='po2_inclusion_8h (< 200)',
             norm_hist=True,
             ax=ax)

sns.distplot(df.loc[df.po2 < 200, 'po2'],
             hist=True,
             bins=10,
             kde=True,
             label='po2 (< 200)',
             norm_hist=True,
             ax=ax)

# Plot formatting
ax.legend(prop={'size': 12})
ax.set_title('Distribution of pO2: po2_inclusion_8h vs. po2')
ax.set_xlabel('pO2')
ax.set_ylabel('Density')
plt.show()


In [None]:
# Overlay the distribution of `peep_inclusion_8h` and of `peep`.
# The title and x-label previously described a propensity-score plot, which
# is not what is drawn here; they now describe the plotted variable.
# NOTE(review): both series come from the full frame (no split on `treated`),
# so the former 'Prone'/'Supine' legend entries could not be verified; the
# legend now names the plotted columns - confirm.
fig, ax = plt.subplots()

sns.distplot(df['peep_inclusion_8h'],
             hist=True,
             bins=10,
             kde=True,
             label='peep_inclusion_8h',
             norm_hist=True,
             ax=ax)

sns.distplot(df['peep'],
             hist=True,
             bins=10,
             kde=True,
             label='peep',
             norm_hist=True,
             ax=ax)

# Plot formatting
ax.legend(prop={'size': 12})
ax.set_title('Distribution of PEEP: peep_inclusion_8h vs. peep')
ax.set_xlabel('PEEP')
ax.set_ylabel('Density')
plt.show()

In [None]:
# Summary statistics for `po2_inclusion_8h` next to `po2`.
df.loc[:, ['po2_inclusion_8h', 'po2']].describe()