In [None]:
import pandas as pd
import numpy as np

import sys, os

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
# Load the session-level CSV from the local data directory.
# NOTE(review): the directory is a hardcoded absolute path; consider a configurable data directory.
os.chdir('/home/adam/adam/data/19012021/')
df = pd.read_csv('observational_data_8h_inclusion_all_outputs_1.csv')

# Both session boundaries are cast to datetime64[ns]; later cells do datetime arithmetic on them.
for timestamp_column in ('start_timestamp', 'end_timestamp'):
    df[timestamp_column] = df[timestamp_column].astype('datetime64[ns]')

df.info(max_cols=200)

In [None]:
# Summarise the raw inclusion measurements before filtering.
# print() is required here: a notebook only displays a cell's LAST expression, and this statement is
# followed by others, so the bare expression was silently discarded.
print(df[['fio2_inclusion_8h', 'peep_inclusion_8h', 'po2_inclusion_8h', 'pf_ratio_inclusion_8h']].describe())

# Row-level inclusion criteria on the 8h inclusion measurements:
# 0 < pf_ratio < 150, peep >= 5 and fio2 >= 60.
INCLUDED = (
    (df.pf_ratio_inclusion_8h > 0)
    & (df.pf_ratio_inclusion_8h < 150)
    & (df.peep_inclusion_8h >= 5)
    & (df.fio2_inclusion_8h >= 60)
)

print(len(df.index))  # number of rows before filtering
# .copy() detaches the filtered frame from the original, so columns assigned in later cells
# do not trigger SettingWithCopyWarning.
df = df[INCLUDED].copy()
print(len(df.index))  # number of rows after filtering

In [None]:
# Start timestamps of artificial sessions with fio2 above 60.
# NOTE(review): this reads `df.fio2`, whereas the inclusion filter uses `fio2_inclusion_8h`;
# confirm that a `fio2` column exists in the loaded CSV at this point.
df_start = df.loc[df.artificial_session & (df.fio2 > 60), ['start_timestamp']]
print(len(df_start.index))

# From here on 'start_timestamp' holds the hour of day, not the full timestamp.
df_start['start_timestamp'] = df_start['start_timestamp'].dt.hour

# A row is flagged when its start hour lies in [7, 12] (both ends inclusive).
# The original first assigned a constant False column that was overwritten on the next line;
# that dead store is removed.
df_start['is_randomization_point'] = df_start['start_timestamp'].between(7, 12)
df_start['is_randomization_point'].value_counts()

In [None]:
# Number of distinct patient ids among rows where `treated` is False.
df.loc[~df['treated'], 'hash_patient_id'].nunique()

In [None]:
# Summary statistics of `duration_hours` for rows flagged as artificial sessions.
df.loc[df['artificial_session'].eq(True), 'duration_hours'].describe()

In [None]:
# Note: artificial sessions account for a big share of the sessions.

# TODO: quantify this, e.g. on average one patient translates to x proning, y supine and z artificial supine sessions.

In [None]:
from data_warehouse_utils.dataloader import DataLoader

In [None]:
dl = DataLoader()

# Sub-parameters requested from the data warehouse.
subparameters = ['tidal_volume_per_kg', 'tidal_volume_per_kg_set']

# `df` can hold several rows per patient, so the ids are de-duplicated before being sent to the
# loader (the rotation query further down does the same). The original passed one id per row.
# NOTE(review): assumes the loader treats `patients` as a set of ids to filter on — confirm.
patients = df['hash_patient_id'].unique().tolist()

tidal_volume_per_kg = dl.get_single_timestamp(
    patients=patients,
    sub_parameters=subparameters,
    columns=[
        'hash_patient_id',
        'effective_timestamp',
        'effective_value',
        'pacmed_name',
        'pacmed_subname',
    ],
)

In [None]:
from datetime import timedelta

def foo(x, y, z, df_meas):
    """Return a patient's most recent measurement in the 8 hours up to ``y``.

    Parameters
    ----------
    x
        Patient identifier, compared against ``df_meas['hash_patient_id']``.
    y
        Timestamp that closes the look-back window. The window is
        ``[y - 8 hours, y]`` with both ends inclusive.
    z
        Ignored. The original implementation overwrote it with ``y`` before use;
        it is kept in the signature so existing ``foo(x, y, z, df_meas)`` calls work.
    df_meas
        DataFrame with the columns ``hash_patient_id``, ``effective_timestamp``
        and ``effective_value``.

    Returns
    -------
    The ``effective_value`` of the matching row with the latest
    ``effective_timestamp``, or ``np.nan`` when no row falls inside the window.
    """
    window_end = y
    window_start = y - timedelta(hours=8)

    in_window = (
        (df_meas['hash_patient_id'] == x)
        & (df_meas['effective_timestamp'] >= window_start)
        & (df_meas['effective_timestamp'] <= window_end)
    )
    values = df_meas.loc[in_window].sort_values(by='effective_timestamp', ascending=True)['effective_value']

    if len(values) == 0:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # Note carried over from the original: taking the last value of both po2 and fio2 is different
    # from extracting the po2_over_fio2 parameter.
    return values.iloc[-1]

In [None]:
# One look-up per row of `df`: patient id, session start and session end are handed to foo()
# together with the tidal-volume measurements.
session_keys = df[['hash_patient_id', 'start_timestamp', 'end_timestamp']]
outcomes = [
    foo(patient_id, session_start, session_end, tidal_volume_per_kg)
    for patient_id, session_start, session_end in session_keys.itertuples(index=False, name=None)
]

In [None]:
# Attach the looked-up values row by row (same order as `df`) and store them as float64.
df['tidal_volume_per_kg'] = pd.Series(outcomes, index=df.index).astype('float64')

In [None]:
# Column overview after adding 'tidal_volume_per_kg'; max_cols=200 keeps the per-column listing for wide frames.
df.info(max_cols=200)

In [None]:
# Copy the 8h inclusion measurements into the short column names used by the rest of the notebook.
for target_column, source_column in (
    ('fio2', 'fio2_inclusion_8h'),
    ('peep', 'peep_inclusion_8h'),
    ('po2', 'po2_inclusion_8h'),
):
    df[target_column] = df[source_column]

In [None]:
# pf_ratio = po2 / fio2 * 100, rounded to a whole number.
# Computed in one vectorised step. The original created an integer column of zeros and then wrote
# floats into it through .loc, which relies on silent dtype upcasting (deprecated in recent pandas),
# and it rounded row by row with a Python lambda.
pf_ratio_is_na = df['po2'].isna() | df['fio2'].isna()

# NOTE(review): rows with a missing po2 or fio2 keep the value 0, exactly as before, rather than NaN —
# confirm that 0 is the intended placeholder for "unknown".
df['pf_ratio'] = (df['po2'] / df['fio2'] * 100).round().where(~pf_ratio_is_na, 0)

In [None]:
# Where the arterial pco2 is missing, fall back to the unspecified pco2, then expose the result as 'pco2'.
df['pco2_arterial'] = df['pco2_arterial'].fillna(df['pco2_unspecified'])
df = df.rename(columns={'pco2_arterial': 'pco2'})

In [None]:
# Fallback order for lactate: arterial first, then blood, then unspecified.
lactate_with_fallbacks = df['lactate_arterial'].fillna(df['lactate_blood'])
lactate_with_fallbacks = lactate_with_fallbacks.fillna(df['lactate_unspecified'])
df['lactate_arterial'] = lactate_with_fallbacks
df = df.rename(columns={'lactate_arterial': 'lactate'})


In [None]:
# Where the arterial pH is missing, fall back to the unspecified pH, then expose the result as 'ph'.
df['ph_arterial'] = df['ph_arterial'].fillna(df['ph_unspecified'])
df = df.rename(columns={'ph_arterial': 'ph'})

In [None]:
# Flag rows with bmi strictly above 35; a missing bmi compares as False.
df['nice_morbid_obesity'] = df['bmi'].gt(35)
df['nice_morbid_obesity'].value_counts()

In [None]:
# A row counts as "on vasopressors" when any of the listed ATC columns is set.
# The columns are OR-ed together left to right.
vasopressor_columns = ['atc_C01CA03', 'atc_C01CA04', 'atc_C01CA24', 'atc_H01BA01', 'atc_H01BA04']
any_vasopressor = df[vasopressor_columns[0]]
for vasopressor_column in vasopressor_columns[1:]:
    any_vasopressor = any_vasopressor | df[vasopressor_column]
df['med_vasopressors'] = any_vasopressor

In [None]:
# Expose the 'atc_H02A' column under a readable name.
df['med_glucocorticoids'] = df['atc_H02A']

In [None]:
# Expose the 'atc_M03' column under a readable name.
df['med_muscle_relaxants'] = df['atc_M03']

In [None]:
from data_warehouse_utils.dataloader import DataLoader

dl = DataLoader()

# Keep only the join key and the 'nice_aki' column of the patient table, then left-join it onto the
# sessions so every row of `df` is preserved.
df_aki = dl.get_patients().loc[:, ['hash_patient_id', 'nice_aki']]
df = df.merge(df_aki, how='left', on='hash_patient_id')

In [None]:
# A row is flagged when at least one of the two blood-flow columns has a value.
df['renal_replacement_therapy'] = df[['cvvh_blood_flow', 'cvvhd_blood_flow']].notna().any(axis=1)
df['renal_replacement_therapy'].value_counts()

In [None]:
# Row count, followed by summary statistics of the renamed inclusion measurements.
print(df.shape[0])
df.loc[:, ['fio2', 'peep', 'po2', 'pf_ratio']].describe()


In [None]:
# Summary statistics of `duration_hours` for rows flagged as artificial sessions.
# NOTE(review): the same statistic is already computed in an earlier cell of this notebook.
df.loc[df['artificial_session'].eq(True), 'duration_hours'].describe()

In [None]:
# Note: artificial sessions account for a big share of the sessions.

# TODO: quantify this, e.g. on average one patient translates to x proning, y supine and z artificial supine sessions.

In [None]:
from data_warehouse_utils.dataloader import DataLoader

In [None]:
# NOTE(review): this cell repeats an earlier cell of this notebook that issues the same query.
dl = DataLoader()

# Sub-parameters requested from the data warehouse.
subparameters = ['tidal_volume_per_kg', 'tidal_volume_per_kg_set']

# `df` can hold several rows per patient, so the ids are de-duplicated before being sent to the
# loader (the rotation query further down does the same). The original passed one id per row.
# NOTE(review): assumes the loader treats `patients` as a set of ids to filter on — confirm.
patients = df['hash_patient_id'].unique().tolist()

tidal_volume_per_kg = dl.get_single_timestamp(
    patients=patients,
    sub_parameters=subparameters,
    columns=[
        'hash_patient_id',
        'effective_timestamp',
        'effective_value',
        'pacmed_name',
        'pacmed_subname',
    ],
)

In [None]:
from datetime import timedelta

def foo(x, y, z, df_meas):
    """Return a patient's most recent measurement in the 8 hours up to ``y``.

    NOTE(review): ``foo`` is also defined in an earlier cell of this notebook;
    this later definition shadows it.

    Parameters
    ----------
    x
        Patient identifier, compared against ``df_meas['hash_patient_id']``.
    y
        Timestamp that closes the look-back window. The window is
        ``[y - 8 hours, y]`` with both ends inclusive.
    z
        Ignored. The original implementation overwrote it with ``y`` before use;
        it is kept in the signature so existing ``foo(x, y, z, df_meas)`` calls work.
    df_meas
        DataFrame with the columns ``hash_patient_id``, ``effective_timestamp``
        and ``effective_value``.

    Returns
    -------
    The ``effective_value`` of the matching row with the latest
    ``effective_timestamp``, or ``np.nan`` when no row falls inside the window.
    """
    window_end = y
    window_start = y - timedelta(hours=8)

    in_window = (
        (df_meas['hash_patient_id'] == x)
        & (df_meas['effective_timestamp'] >= window_start)
        & (df_meas['effective_timestamp'] <= window_end)
    )
    values = df_meas.loc[in_window].sort_values(by='effective_timestamp', ascending=True)['effective_value']

    if len(values) == 0:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan

    # Note carried over from the original: taking the last value of both po2 and fio2 is different
    # from extracting the po2_over_fio2 parameter.
    return values.iloc[-1]

In [None]:
# One look-up per row of `df`: patient id, session start and session end are handed to foo()
# together with the tidal-volume measurements.
session_keys = df[['hash_patient_id', 'start_timestamp', 'end_timestamp']]
outcomes = [
    foo(patient_id, session_start, session_end, tidal_volume_per_kg)
    for patient_id, session_start, session_end in session_keys.itertuples(index=False, name=None)
]

In [None]:
# Attach the looked-up values row by row (same order as `df`) and store them as float64.
df['tidal_volume_per_kg'] = pd.Series(outcomes, index=df.index).astype('float64')

In [None]:
# Column overview of the current frame; max_cols=200 keeps the per-column listing for wide frames.
df.info(max_cols=200)

In [None]:
# Copy the 8h inclusion measurements into the short column names used by the rest of the notebook.
for target_column, source_column in (
    ('fio2', 'fio2_inclusion_8h'),
    ('peep', 'peep_inclusion_8h'),
    ('po2', 'po2_inclusion_8h'),
):
    df[target_column] = df[source_column]

In [None]:
# pf_ratio = po2 / fio2 * 100, rounded to a whole number.
# Computed in one vectorised step. The original created an integer column of zeros and then wrote
# floats into it through .loc, which relies on silent dtype upcasting (deprecated in recent pandas),
# and it rounded row by row with a Python lambda.
pf_ratio_is_na = df['po2'].isna() | df['fio2'].isna()

# NOTE(review): rows with a missing po2 or fio2 keep the value 0, exactly as before, rather than NaN —
# confirm that 0 is the intended placeholder for "unknown".
df['pf_ratio'] = (df['po2'] / df['fio2'] * 100).round().where(~pf_ratio_is_na, 0)

In [None]:
# An earlier cell of this notebook already applies this fallback and renames 'pco2_arterial' to 'pco2'.
# On a top-to-bottom run the source column is therefore gone and the original `df.pco2_arterial`
# access raised AttributeError. The guard makes this cell a no-op in that case.
if 'pco2_arterial' in df.columns:
    # Where the arterial pco2 is missing, fall back to the unspecified pco2.
    df['pco2_arterial'] = df['pco2_arterial'].fillna(df['pco2_unspecified'])
    df = df.rename(columns={'pco2_arterial': 'pco2'})

In [None]:
# An earlier cell of this notebook already applies this fallback and renames 'lactate_arterial' to
# 'lactate'. On a top-to-bottom run the source column is therefore gone and the original
# `df.lactate_arterial` access raised AttributeError. The guard makes this cell a no-op in that case.
if 'lactate_arterial' in df.columns:
    # Fallback order for lactate: arterial first, then blood, then unspecified.
    df['lactate_arterial'] = (
        df['lactate_arterial']
        .fillna(df['lactate_blood'])
        .fillna(df['lactate_unspecified'])
    )
    df = df.rename(columns={'lactate_arterial': 'lactate'})


In [None]:
# An earlier cell of this notebook already applies this fallback and renames 'ph_arterial' to 'ph'.
# On a top-to-bottom run the source column is therefore gone and the original `df.ph_arterial`
# access raised AttributeError. The guard makes this cell a no-op in that case.
if 'ph_arterial' in df.columns:
    # Where the arterial pH is missing, fall back to the unspecified pH.
    df['ph_arterial'] = df['ph_arterial'].fillna(df['ph_unspecified'])
    df = df.rename(columns={'ph_arterial': 'ph'})

In [None]:
# Flag rows with bmi strictly above 35; a missing bmi compares as False.
df['nice_morbid_obesity'] = df['bmi'].gt(35)
df['nice_morbid_obesity'].value_counts()

In [None]:
# A row counts as "on vasopressors" when any of the listed ATC columns is set.
# The columns are OR-ed together left to right.
vasopressor_columns = ['atc_C01CA03', 'atc_C01CA04', 'atc_C01CA24', 'atc_H01BA01', 'atc_H01BA04']
any_vasopressor = df[vasopressor_columns[0]]
for vasopressor_column in vasopressor_columns[1:]:
    any_vasopressor = any_vasopressor | df[vasopressor_column]
df['med_vasopressors'] = any_vasopressor

In [None]:
# Expose the 'atc_H02A' column under a readable name.
df['med_glucocorticoids'] = df['atc_H02A']

In [None]:
# Expose the 'atc_M03' column under a readable name.
df['med_muscle_relaxants'] = df['atc_M03']

In [None]:
from data_warehouse_utils.dataloader import DataLoader

# An earlier cell of this notebook already merges 'nice_aki' into `df`. Merging the same column a
# second time makes pandas suffix both copies ('nice_aki_x' / 'nice_aki_y'), so the plain 'nice_aki'
# column selected later in the notebook would no longer exist. Merge only when it is still missing.
if 'nice_aki' not in df.columns:
    dl = DataLoader()
    df_aki = dl.get_patients()
    df_aki = df_aki[['hash_patient_id', 'nice_aki']]
    df = pd.merge(df, df_aki, how='left', on='hash_patient_id')

In [None]:
# A row is flagged when at least one of the two blood-flow columns has a value.
df['renal_replacement_therapy'] = df[['cvvh_blood_flow', 'cvvhd_blood_flow']].notna().any(axis=1)
df['renal_replacement_therapy'].value_counts()

In [None]:
# Row count, followed by summary statistics of the renamed inclusion measurements.
print(df.shape[0])
df.loc[:, ['fio2', 'peep', 'po2', 'pf_ratio']].describe()


In [None]:
from data_warehouse_utils.dataloader import DataLoader

dl = DataLoader()

# Fetch the 'position' range measurements once per distinct patient id.
patients = pd.unique(df['hash_patient_id']).tolist()
df_rotation = dl.get_range_measurements(
    parameters=['position'],
    patients=patients,
)

In [None]:
# Persist the raw position measurements next to the input data.
# NOTE(review): hardcoded absolute directory; consider a configurable data directory.
rotation_output_dir = '/home/adam/adam/data/19012021/'
os.chdir(rotation_output_dir)
df_rotation.to_csv('rotation_data.csv', index=False)

In [None]:
# Keep only rows whose position is one of the listed values, and only the two columns needed below.
# After this cell 'effective_value' is no longer part of df_rotation.
positions_to_keep = ['30_degrees', '45_degrees', 'bed_chair']
df_rotation = df_rotation.loc[
    df_rotation['effective_value'].isin(positions_to_keep),
    ['start_timestamp', 'hash_patient_id'],
]

In [None]:
# NOTE(review): the previous cell already filters df_rotation and keeps only 'start_timestamp' and
# 'hash_patient_id', so 'effective_value' no longer exists when this cell runs after it and the
# original attribute access raised AttributeError. The narrower '30_degrees' filter is therefore
# applied only while the column is still available — confirm which of the two filters is intended.
if 'effective_value' in df_rotation.columns:
    df_rotation = df_rotation.loc[
        df_rotation['effective_value'] == '30_degrees',
        ['start_timestamp', 'hash_patient_id'],
    ]

In [None]:
# Preview the first rows of the filtered position data.
df_rotation.head()

In [None]:
# Column dtypes and non-null counts of the filtered position data.
df_rotation.info()

In [None]:
def was_rotated(x, y, z, df):
    """Return how long after ``y`` the first matching row of ``df`` starts.

    Despite the name, the result is not a boolean.

    Parameters
    ----------
    x
        Patient identifier, compared against ``df.hash_patient_id``.
    y
        Lower bound of the window (exclusive).
    z
        Upper bound of the window (exclusive).
    df
        DataFrame with ``start_timestamp`` and ``hash_patient_id`` columns.

    Returns
    -------
    The integer ``0`` when no row of patient ``x`` starts strictly between ``y`` and ``z``;
    otherwise ``earliest matching start_timestamp - y``. Callers have to handle both types.
    """
    in_window = (y < df.start_timestamp) & (df.start_timestamp < z) & (df.hash_patient_id == x)
    matching_starts = df.loc[in_window, 'start_timestamp']

    if matching_starts.empty:
        return 0

    # Only the earliest start is needed, so take the minimum instead of sorting the whole subset.
    return matching_starts.min() - y

In [None]:
# One was_rotated() call per row of `df`, using the row's patient id and session boundaries.
session_windows = df.loc[:, ['hash_patient_id', 'start_timestamp', 'end_timestamp']]
rotation_list = [
    was_rotated(patient_id, session_start, session_end, df_rotation)
    for patient_id, session_start, session_end in session_windows.itertuples(index=False, name=None)
]

In [None]:
# Convert each entry of rotation_list to whole hours: the 0 placeholder stays 0, every other
# entry is rounded from seconds to the nearest hour.
# NOTE(review): a delay that rounds to 0 hours becomes indistinguishable from the 0 placeholder.
seconds_per_hour = 60 * 60
error = [
    0 if delay == 0 else int(round(delay.total_seconds() / seconds_per_hour))
    for delay in rotation_list
]

In [None]:
# Attach the per-row hour values from the `error` list as a new column (same row order as `df`).
df['error'] = error

In [None]:
# Summary statistics of the 'error' column over all rows.
df['error'].describe()

In [None]:
# Column overview after adding 'error'; max_cols=200 keeps the per-column listing for wide frames.
df.info(max_cols=200)

In [None]:
# Preview the first rows before the frame is written to disk.
df.head()

In [None]:
# Write the full frame, identifiers and timestamps included, before any columns are dropped.
# NOTE(review): hardcoded absolute directory; consider a configurable data directory.
raw_output_dir = '/home/adam/adam/data/19012021/'
os.chdir(raw_output_dir)
df.to_csv('data_raw.csv', index=False)

In [None]:
# Summary of 'error' for treated rows whose value lies strictly between 0 and the session duration.
error_within_session = (df['error'] > 0) & (df['error'] < df['duration_hours'])
df.loc[df['treated'] & error_within_session, 'error'].describe()

In [None]:
# Number of distinct patient ids in the frame.
df['hash_patient_id'].nunique()

In [None]:
# Columns removed before export: identifiers, timestamps, the original inclusion measurements
# (copied to short names earlier) and the outcome/mortality columns listed below.
COLUMNS_TO_DROP = [
    'hash_session_id', 'hash_patient_id',
    'start_timestamp', 'end_timestamp', 'duration_hours',
    'pacmed_origin_hospital',
    'fio2_inclusion_8h', 'peep_inclusion_8h', 'po2_inclusion_8h',
    'artificial_session',
    'death_timestamp', 'outcome', 'mortality', 'icu_mortality',
]

# Rebinding `df` instead of inplace=True; the resulting frame is the same.
df = df.drop(columns=COLUMNS_TO_DROP)
df.info(max_cols=200)

In [None]:
# Write the frame that remains after the column drop.
# NOTE(review): hardcoded absolute directory; consider a configurable data directory.
all_variables_output_dir = '/home/adam/adam/data/19012021/'
os.chdir(all_variables_output_dir)
df.to_csv('data_all_variables.csv', index=False)

In [None]:
# Columns kept for the RCT data set, in export order.
VARIABLES_RCT = [
    'treated',
    'age', 'gender', 'bmi',
    'nice_diabetes', 'nice_aki', 'nice_cirrhosis', 'nice_hem_malign', 'nice_copd',
    'nice_imm_insuf', 'nice_cardio_vasc_insuf', 'nice_morbid_obesity',
    'renal_replacement_therapy', 'sofa_score', 'lactate',
    'tidal_volume', 'tidal_volume_per_kg', 'respiratory_rate_measured',
    'peep', 'fio2', 'po2', 'pco2', 'ph',
    'plateau_pressure', 'driving_pressure', 'lung_compliance_static',
    'med_muscle_relaxants', 'med_vasopressors', 'med_glucocorticoids',
    'pf_ratio',
    'pf_ratio_2h_8h_outcome', 'pf_ratio_2h_8h_manual_outcome',
    'pf_ratio_12h_24h_outcome', 'pf_ratio_12h_24h_manual_outcome',
]

df_rct = df.loc[:, VARIABLES_RCT]
df_rct.info(max_cols=200)

# TODO: add a late outcome (18h-22h).

In [None]:
# Preview the first rows of the RCT selection.
df_rct.head()


In [None]:
# Column dtypes and non-null counts of the RCT selection.
df_rct.info()

In [None]:
# Remove 'plateau_pressure' from the RCT selection before export.
df_rct = df_rct.drop(columns='plateau_pressure')

In [None]:
# Write the RCT selection to disk.
# NOTE(review): hardcoded absolute directory; consider a configurable data directory.
rct_output_dir = '/home/adam/adam/data/19012021/'
os.chdir(rct_output_dir)
df_rct.to_csv('data_guerin_rct.csv', index=False)

In [None]:
# Summary statistics of the exported RCT columns.
df_rct.describe()

In [None]:

# Final check of column dtypes and non-null counts of the exported RCT selection.
df_rct.info()