In [None]:
import pandas as pd
import numpy as np

from typing import Optional

from data_warehouse_utils.dataloader import DataLoader

import math

import pandas as pd
import numpy as np

import sys, os
from datetime import timedelta

import seaborn as sns
import matplotlib.pyplot as plt

from data_warehouse_utils.dataloader import DataLoader

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from scipy.stats import wasserstein_distance
from scipy import stats

In [None]:
from causal_inference.make_data.make_raw_data import COLUMNS_POSITION

In [None]:
# Change the process working directory to the project checkout.
# NOTE(review): hard-coded, machine-specific absolute path; this cell raises on any machine
# where the directory does not exist.
os.chdir('/home/adam/adam/causal_inference')

In [None]:
# Absolute path constant; presumably the data directory of the project — confirm.
# NOTE(review): PATH is not referenced by any other cell visible in this notebook.
PATH = '/home/adam/adam/data/causal_inference/data'

In [None]:
# Module-level data-warehouse loader, used by the position query cell.
dl = DataLoader()


In [None]:
# Load the 'position' range measurements, restricted to the listed columns.
# No `patients` argument is passed. NOTE(review): presumably this queries every patient — confirm cost.
df_rotation = dl.get_range_measurements(parameters=['position'],
                                        columns=['hash_patient_id',
                                                 'end_timestamp',
                                                 'start_timestamp',
                                                 'unit_name',
                                                 'effective_value',
                                                 'numerical_value',
                                                 'is_correct_unit_yn',
                                                 'hospital',
                                                 'ehr',
                                                 'episode_id',
                                                 'pacmed_subname'])


In [None]:
# Preview the first rows of the position measurements.
df_rotation.head()

In [None]:
# Show only the measurements whose sub-parameter is 'position_body'.
is_position_body = df_rotation['pacmed_subname'].eq('position_body')
df_rotation.loc[is_position_body]

In [None]:
def remove_bed_rotation(df):
    """Cut treated sessions short at the first bed rotation and recompute their outcomes.

    For every row of ``df`` the first 'position_bed' measurement with value
    '30_degrees', '45_degrees' or 'bed_chair' that starts inside
    [start_timestamp, end_timestamp) is looked up with ``bed_rotated``. Treated
    rows whose rotation happens strictly between 0 and 24 whole hours after the
    start get ``duration_hours`` and ``end_timestamp`` shortened to that moment,
    and their four ``pf_ratio_*_outcome`` columns are replaced by the values
    returned by ``add_outcomes`` for the shortened rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'hash_patient_id', 'start_timestamp', 'end_timestamp',
        'treated' (boolean), 'duration_hours' and the four outcome columns.
        The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an added integer 'hours_to_rotation' column.
    """
    rotation_values = ['30_degrees', '45_degrees', 'bed_chair']
    outcome_columns = ['pf_ratio_2h_8h_outcome',
                       'pf_ratio_2h_8h_manual_outcome',
                       'pf_ratio_12h_24h_outcome',
                       'pf_ratio_12h_24h_manual_outcome']

    dl = DataLoader()
    patients = df['hash_patient_id'].unique().tolist()
    df_rotation = dl.get_range_measurements(patients=patients,
                                            parameters=['position'],
                                            sub_parameters=['position_bed'])
    df_rotation = df_rotation.loc[df_rotation.effective_value.isin(rotation_values),
                                  ['start_timestamp', 'hash_patient_id']]

    df['hours_to_rotation'] = [
        bed_rotated(hash_id, start, end, df_rotation)
        for hash_id, start, end in zip(df['hash_patient_id'],
                                       df['start_timestamp'],
                                       df['end_timestamp'])
    ]

    # bed_rotated returns 0 both for "no rotation" and for a rotation within the first
    # hour, so `> 0` excludes both cases.
    mask = (df.hours_to_rotation > 0) & (df.hours_to_rotation < 24) & (df.treated)
    df.loc[mask, 'duration_hours'] = df.loc[mask, 'hours_to_rotation']
    # Keep 'hours_to_rotation' numeric: writing timedelta objects into the integer column
    # forces a dtype upcast, which recent pandas versions deprecate.
    df.loc[mask, 'end_timestamp'] = (df.loc[mask, 'start_timestamp']
                                     + pd.to_timedelta(df.loc[mask, 'hours_to_rotation'], unit='h'))

    if not mask.any():
        # Nothing to recompute; avoid calling add_outcomes on an empty frame.
        print(0, "outcomes corrected.")
        return df

    # Explicit copy so add_outcomes never operates on a slice of df.
    df_new = add_outcomes(dl=dl, df=df.loc[mask].copy(), df_measurements=None)
    print(len(df_new), "outcomes corrected.")
    # Assignment aligns on the index; assumes add_outcomes preserves the row index — TODO confirm.
    for column in outcome_columns:
        df.loc[mask, column] = df_new.loc[:, column]
    return df

def bed_rotated(hash_id, start, end, df):
    """Return the whole hours between `start` and the patient's first rotation in [start, end).

    `df` needs the columns 'hash_patient_id' and 'start_timestamp'. Only rows of
    patient `hash_id` whose start_timestamp lies in the half-open window
    [start, end) are considered. The elapsed time to the earliest such row is
    floored to whole hours. 0 is returned when no row qualifies, which is also
    the result for a rotation within the first hour.
    """
    same_patient = df.hash_patient_id == hash_id
    inside_window = (df.start_timestamp >= start) & (df.start_timestamp < end)
    rotation_times = df.loc[same_patient & inside_window, 'start_timestamp']
    if rotation_times.empty:
        return 0
    first_rotation = rotation_times.sort_values(ascending=True).iloc[0]
    elapsed_seconds = (first_rotation - start).total_seconds()
    return int(math.floor(elapsed_seconds / (60 * 60)))

In [None]:
# NOTE(review): neither `remove_observations_corrupted` nor `df` is defined or imported in the
# visible cells, so this raises NameError on a fresh kernel unless both come from elsewhere.
# Possibly `remove_bed_rotation` (defined in this notebook) was intended — confirm.
df = remove_observations_corrupted(df)

In [None]:
# Write df to 'data_raw_fixed_prone.csv' inside the dated data directory.
# NOTE(review): changes the process working directory as a side effect and uses a
# hard-coded absolute path.
os.chdir('/home/adam/adam/data/19012021/')
df.to_csv('data_raw_fixed_prone.csv', index=False)


In [None]:
# Columns kept in the RCT-style extract, in output order.
VARIABLES_RCT = [
    'treated',
    'age', 'gender', 'bmi',
    # 'nice_*' columns
    'nice_diabetes', 'nice_aki', 'nice_cirrhosis', 'nice_hem_malign',
    'nice_copd', 'nice_imm_insuf', 'nice_cardio_vasc_insuf', 'nice_morbid_obesity',
    'renal_replacement_therapy', 'sofa_score', 'lactate',
    'tidal_volume', 'tidal_volume_per_kg', 'respiratory_rate_measured',
    'peep', 'fio2', 'po2', 'pco2', 'ph',
    'plateau_pressure', 'driving_pressure', 'lung_compliance_static',
    # 'med_*' columns
    'med_muscle_relaxants', 'med_vasopressors', 'med_glucocorticoids',
    'pf_ratio',
    # outcome columns
    'pf_ratio_2h_8h_outcome', 'pf_ratio_2h_8h_manual_outcome',
    'pf_ratio_12h_24h_outcome', 'pf_ratio_12h_24h_manual_outcome',
]

# Column subset of df; raises KeyError if any listed column is missing.
df_rct = df.loc[:, VARIABLES_RCT]
df_rct.info(max_cols=200)

In [None]:
# Write the column subset df_rct to 'data_guerin_rct_fixed_prone.csv' in the dated data directory.
# NOTE(review): changes the process working directory as a side effect and uses a
# hard-coded absolute path.
os.chdir('/home/adam/adam/data/19012021/')
df_rct.to_csv('data_guerin_rct_fixed_prone.csv', index=False)

In [None]:
# Column dtypes and non-null counts of the RCT extract.
df_rct.info()

In [None]:
# NOTE(review): the only module-level assignment of `df_new` visible in this notebook is in a
# later cell, so on a fresh top-to-bottom run this raises NameError — confirm where df_new
# is expected to come from.
df_new.info()

In [None]:
# Rows of df where the 12h-24h manual outcome is missing (shows their index).
df.loc[df['pf_ratio_12h_24h_manual_outcome'].isna(), 'pf_ratio_12h_24h_manual_outcome']

In [None]:
# Values in df_new for the rows where df has a missing 12h-24h manual outcome.
# NOTE(review): the boolean mask is built on df but applied to df_new; this relies on both
# frames sharing the same index — confirm.
df_new.loc[df['pf_ratio_12h_24h_manual_outcome'].isna(), 'pf_ratio_12h_24h_manual_outcome']

In [None]:
# Compare the 2h-8h manual outcome between df and df_new on the rows where df has a value.
outcome_column = 'pf_ratio_2h_8h_manual_outcome'
has_outcome = ~df[outcome_column].isna()
(df.loc[has_outcome, outcome_column] == df_new.loc[has_outcome, outcome_column]).describe()

In [None]:
# Patient ids that occur in more than 50 rows of df.
rows_per_patient = df.hash_patient_id.value_counts()
patients = rows_per_patient[rows_per_patient > 50].index.unique().tolist()
patients

In [None]:
# Rows of df that belong to the ids collected in `patients`.
in_patients = df['hash_patient_id'].isin(patients)
df_new = df.loc[in_patients]

In [None]:
# Number of treated and untreated rows in df_new.
n_treated = len(df_new.loc[df_new.treated])
n_untreated = len(df_new.loc[~df_new.treated])
print(n_treated)
print(n_untreated)

In [None]:
# Summary statistics of four columns over all rows of df_new.
df_new.loc[:, ['nice_aki',
               'renal_replacement_therapy',
               'pf_ratio_inclusion_8h',
               'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Summary statistics for rows whose patient id is NOT in `patients`.
df.loc[~df.hash_patient_id.isin(patients),
       ['nice_aki',
        'renal_replacement_therapy',
        'pf_ratio_inclusion_8h',
        'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Summary statistics for treated rows whose patient id is NOT in `patients`.
df.loc[df.treated & ~df.hash_patient_id.isin(patients),
       ['nice_aki',
        'renal_replacement_therapy',
        'pf_ratio_inclusion_8h',
        'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Summary statistics for treated rows whose patient id is in `patients`.
df.loc[df.treated & df.hash_patient_id.isin(patients),
       ['nice_aki',
        'renal_replacement_therapy',
        'pf_ratio_inclusion_8h',
        'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Summary statistics for untreated rows whose patient id is NOT in `patients`.
df.loc[~df.treated & ~df.hash_patient_id.isin(patients),
       ['nice_aki',
        'renal_replacement_therapy',
        'pf_ratio_inclusion_8h',
        'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Summary statistics for untreated rows whose patient id is in `patients`.
df.loc[~df.treated & df.hash_patient_id.isin(patients),
       ['nice_aki',
        'renal_replacement_therapy',
        'pf_ratio_inclusion_8h',
        'pf_ratio_12h_24h_manual_outcome']].describe()


In [None]:
# Frequency of each 'nice_aki' value in df_new.
df_new['nice_aki'].value_counts()

In [None]:
# Frequency of each 'renal_replacement_therapy' value in df_new.
df_new['renal_replacement_therapy'].value_counts()

In [None]:
# Treated-minus-untreated difference in mean 12h-24h manual outcome, rows with id in `patients`.
in_patients = df.hash_patient_id.isin(patients)
treated_mean = np.mean(df.loc[df.treated & in_patients, 'pf_ratio_12h_24h_manual_outcome'])
untreated_mean = np.mean(df.loc[~df.treated & in_patients, 'pf_ratio_12h_24h_manual_outcome'])
treated_mean - untreated_mean

In [None]:
# Treated-minus-untreated difference in mean 12h-24h manual outcome, rows with id NOT in `patients`.
not_in_patients = ~df.hash_patient_id.isin(patients)
treated_mean = np.mean(df.loc[df.treated & not_in_patients, 'pf_ratio_12h_24h_manual_outcome'])
untreated_mean = np.mean(df.loc[~df.treated & not_in_patients, 'pf_ratio_12h_24h_manual_outcome'])
treated_mean - untreated_mean

In [None]:
# Summary statistics of four columns over all rows of df_new (same query as an earlier cell).
df_new.loc[:, ['nice_aki',
               'renal_replacement_therapy',
               'pf_ratio_inclusion_8h',
               'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Number of treated rows in df.
len(df[df.treated])

In [None]:
# Untreated rows with duration_hours of at least 1460.
# NOTE(review): 1460 is an unexplained magic number — document its origin.
mask = (df.treated == False) & (df.duration_hours >= 1460)
df_long = df[mask]

In [None]:
# Column dtypes and non-null counts of the long untreated rows (up to 200 columns listed).
df_long.info(max_cols=200)

In [None]:
# Summary statistics of the inclusion value and both manual outcomes for all untreated rows.
df.loc[~df.treated, ['pf_ratio_inclusion_8h',
                    'pf_ratio_2h_8h_manual_outcome',
                    'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Same three columns, restricted to the rows in df_long.
df_long.loc[:, ['pf_ratio_inclusion_8h',
                'pf_ratio_2h_8h_manual_outcome',
                'pf_ratio_12h_24h_manual_outcome']].describe()

In [None]:
# Distinct session ids among the rows in df_long.
df_long.hash_session_id.unique().tolist()

In [None]:
# Count df_long session ids that end in '_<digits>_<digits>'.
# Raw string: '\_' and '\d' are invalid escape sequences in a normal string literal
# (the pattern value itself is unchanged).
df_long.hash_session_id.str.contains(r'\_\d+\_\d+$', regex=True).value_counts()

In [None]:
# Distinct df_long session ids that end in '<digits>_<digits>'.
# Raw string: '\d' is an invalid escape sequence in a normal string literal
# (the pattern value itself is unchanged).
df_long[df_long.hash_session_id.str.contains(r'\d+_\d+$', regex=True)].hash_session_id.unique()

In [None]:
# Count session ids in df that end in '_<digits>_<digits>'.
# A pasted JavaScript snippet (`str.replace(/\/$/, "")`) that followed this line was dropped:
# it is a SyntaxError in Python and prevented the cell from running at all.
df.hash_session_id.str.contains(r'_\d+\_\d+$', regex=True).value_counts()

In [None]:
# Drop the last four characters of every filename (values are converted with str() first).
# NOTE(review): presumably removes a four-character extension such as '.csv' — confirm.
# The cell is not idempotent: re-running it strips four more characters.
df['filename'] = df['filename'].map(lambda x: str(x)[:-4])

#### Extract all the session ids that are long

In [None]:
# Non-artificial, untreated rows with duration_hours of at least 1460.
mask = (df.artificial_session == False) & (df.treated == False) & (df.duration_hours >= 1460)
df_long = df[mask]
print(len(df_long))
# NOTE(review): despite the name, `patients` now holds session ids (hash_session_id), and
# overwrites any earlier `patients` list of patient ids.
patients = df_long.hash_session_id.unique().tolist()
patients

In [None]:
# Artificial, untreated rows with duration_hours of at least 1460.
mask = (df.artificial_session == True) & (df.treated == False) & (df.duration_hours >= 1460)
# Explicit copy: later cells rewrite hash_session_id on df_long via .loc; working on a
# bare slice of df triggers SettingWithCopyWarning and leaves the intent ambiguous.
df_long = df[mask].copy()
print(len(df_long))

In [None]:
# Session ids ending in '_<digits>_<digit>': report them, then drop the last two characters
# (the trailing '_<digit>').
# Raw string: '\_' and '\d' are invalid escape sequences in a normal string literal
# (the pattern value itself is unchanged).
mask = df_long.hash_session_id.str.contains(r'\_\d+_\d$', regex=True)
matched_ids = df_long.loc[mask, 'hash_session_id'].unique().tolist()
print(len(matched_ids))
print(matched_ids)
df_long.loc[mask, 'hash_session_id'] = df_long.loc[mask, 'hash_session_id'].map(lambda x: str(x)[:-2])

In [None]:
# Session ids ending in '_<digits>_<digits>': report them, then drop the last three characters.
# Raw string: '\_' and '\d' are invalid escape sequences in a normal string literal
# (the pattern value itself is unchanged).
# NOTE(review): the pattern accepts a final suffix of any length, but exactly three characters
# are removed; this assumes the suffix is '_' plus two digits — TODO confirm.
mask = df_long.hash_session_id.str.contains(r'\_\d+_\d+$', regex=True)
matched_ids = df_long.loc[mask, 'hash_session_id'].unique().tolist()
print(len(matched_ids))
print(matched_ids)
df_long.loc[mask, 'hash_session_id'] = df_long.loc[mask, 'hash_session_id'].map(lambda x: str(x)[:-3])

In [None]:
# Number of distinct session ids currently in df_long.
distinct_session_ids = df_long['hash_session_id'].unique().tolist()
print(len(distinct_session_ids))

In [None]:
# Append the distinct session ids of df_long to the list collected earlier.
# NOTE(review): re-running this cell appends the same ids again.
patients = patients + df_long['hash_session_id'].unique().tolist()

In [None]:
# Remove duplicate ids; the resulting order is arbitrary because a set is used.
patients = list(set(patients))

In [None]:
# Display the combined, de-duplicated id list.
patients

In [None]:
#### Select only sessions that are not generated by this

In [None]:
# Initialise 'session_origin' with the placeholder 0 on every row of df.
df.loc[:, 'session_origin'] = 0

In [None]:
# For non-artificial rows the origin is the session id itself.
df.loc[df.artificial_session == False, 'session_origin'] = df.loc[df.artificial_session == False, 'hash_session_id']

In [None]:
# Derive 'session_origin' by cutting the trailing numeric suffix off the session id.
# Raw strings: '\_' and '\d' are invalid escape sequences in normal string literals
# (the pattern values themselves are unchanged).

# Ids ending in '_<digits>_<digit>': drop the last two characters.
mask = df.hash_session_id.str.contains(r'\_\d+_\d$', regex=True)
print(mask)
df.loc[mask, 'session_origin'] = df.loc[mask, 'hash_session_id'].map(lambda x: str(x)[:-2])

# Ids ending in '_<digits>_<two digits>': drop the last three characters.
mask = df.hash_session_id.str.contains(r'\_\d+_\d\d$', regex=True)
print(df.loc[mask, 'hash_session_id'].unique().tolist())
df.loc[mask, 'session_origin'] = df.loc[mask, 'hash_session_id'].map(lambda x: str(x)[:-3])

In [None]:
# Index labels of the rows whose session_origin is one of the collected ids.
df[df.session_origin.isin(patients)].index

In [None]:
# Distinct session_origin values (rows never matched above still hold the placeholder 0).
df.session_origin.unique().tolist()

#### Second method

In [None]:
# Untreated rows with duration_hours of at least 1460, regardless of artificial_session.
mask = (df.treated == False) & (df.duration_hours >= 1460)
df_long = df[mask]

In [None]:
# One [patient id, start, end] triple per row of df_long.
long = [
    [patient_id, session_start, session_end]
    for patient_id, session_start, session_end in zip(df_long.hash_patient_id,
                                                      df_long.start_timestamp,
                                                      df_long.end_timestamp)
]

long

In [None]:
# Start with every row flagged False; `origin_long` is the Series the matching loop updates.
df['origin_long'] = False
origin_long = df.origin_long

In [None]:
# Flag every row of df that lies inside one of the intervals in `long`: same patient,
# start_timestamp not before the interval start, end_timestamp not after the interval end.
for idx, row in df.iterrows():
    for patient_id, long_start, long_end in long:
        if (row.hash_patient_id == patient_id
                and row.start_timestamp >= long_start
                and row.end_timestamp <= long_end):
            origin_long[idx] = True
            # One enclosing interval is enough; skip the remaining ones.
            break

In [None]:
# Write the updated flags back into df.
df['origin_long'] = origin_long

In [None]:
# Index labels of the rows flagged as lying inside a long interval.
df[df['origin_long'] == True].index