In [2]:
import os
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance
from GCS_map import GCS_int

# Function to load time-series data for a patient
def load_patient_data(patient_file, directory):
    """Read one patient's time-series CSV into a DataFrame.

    Args:
        patient_file: File name of the patient's CSV, relative to ``directory``.
        directory: Directory that contains the patient files.

    Returns:
        pandas.DataFrame parsed from ``directory/patient_file``.

    Raises:
        FileNotFoundError: If the joined path does not exist.
    """
    csv_path = os.path.join(directory, patient_file)
    # Fail early with an explicit message naming the full path.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"File {csv_path} not found.")
    return pd.read_csv(csv_path)

# Function to extract raw values for a given vital sign from a list of patient files
def extract_vital_data(df, directory, vital_index):
    """Pool the non-NaN values of one column across all patients listed in ``df``.

    Args:
        df: DataFrame whose 'stay' column holds one patient file name per row.
        directory: Directory passed to ``load_patient_data`` to locate each file.
        vital_index: Positional (``iloc``) index of the column to extract.

    Returns:
        numpy.ndarray of all collected values, in listing order. Patients whose
        file is missing are reported with ``print`` and skipped.
    """
    pooled_values = []

    for _, listing_row in df.iterrows():
        stay_file = listing_row['stay']
        try:
            # GCS_int comes from the project module GCS_map; presumably it converts
            # the GCS columns to integers -- TODO confirm against GCS_map.
            stay_frame = GCS_int(load_patient_data(stay_file, directory))
        except FileNotFoundError as missing:
            # Best effort: report the missing file and continue with the next stay.
            print(missing)
            continue
        # Select the column by position and drop missing measurements.
        pooled_values.extend(stay_frame.iloc[:, vital_index].dropna())

    return np.array(pooled_values)

# Function to compute Wasserstein distance for a set of vital signs and return in a DataFrame
def compute_wasserstein_for_vitals(df1, df2, directory, vital_indices, vital_names):
    """Compute the per-vital Wasserstein distance between two patient cohorts.

    For each (column index, name) pair the raw non-NaN values of that column are
    pooled over every patient of each cohort with ``extract_vital_data``, and the
    distance between the two pooled samples is computed with
    ``scipy.stats.wasserstein_distance``.

    Args:
        df1: Listing of the first cohort (its 'stay' column is read by
            ``extract_vital_data``).
        df2: Listing of the second cohort, same layout as ``df1``.
        directory: Directory containing the per-patient CSV files.
        vital_indices: Positional column indices of the vitals to compare.
        vital_names: Display names paired positionally with ``vital_indices``
            (``zip`` stops at the shorter of the two sequences).

    Returns:
        pandas.DataFrame with columns 'Vital' and 'Wasserstein Distance', one row
        per pair. The distance is NaN when either cohort has no values for that
        vital.
    """
    results = {'Vital': [], 'Wasserstein Distance': []}

    for i, vital_name in zip(vital_indices, vital_names):
        # Pool the raw values of this vital for both cohorts.
        data_cohort1 = extract_vital_data(df1, directory, i)
        data_cohort2 = extract_vital_data(df2, directory, i)

        if data_cohort1.size == 0 or data_cohort2.size == 0:
            # wasserstein_distance raises ValueError on an empty sample; record NaN
            # so one empty vital does not abort the whole comparison.
            print(f"No values for '{vital_name}' in at least one cohort; distance set to NaN.")
            distance = np.nan
        else:
            distance = wasserstein_distance(data_cohort1, data_cohort2)

        results['Vital'].append(vital_name)
        results['Wasserstein Distance'].append(distance)

    return pd.DataFrame(results)

# Define the vital sign indices and corresponding names (example vital signs)
# Display names of the vitals, one per extracted column, in column order.
vital_names = [
    'Cap rate',
    'Diastolic BP',
    'FO2',
    'GCS eye opening',
    'GCS motor response',
    'GCS total',
    'GCS verbal response',
    'Glucose',
    'Heart Rate',
    'Height',
    'Mean BP',
    'Oxygen saturation',
    'Respiratory rate',
    'Systolic BP',
    'Temperature',
    'Weight',
    'pH',
]

# Positional column indices 1..len(vital_names); column 0 is skipped (presumably
# the time column -- TODO confirm). Replace with the actual column indices of the
# vital signs (e.g., Oxygen Saturation, BP) if the file layout differs.
vital_indices = list(range(1, len(vital_names) + 1))



### Load patient cohort

In [12]:
# Directory holding the per-stay CSV files referenced by the listings below.
data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'

# Listing file of each cohort.
sepsis_file = 'data/task-data/in-hospital-mortality/train_listfile_sepsis.csv'
original_file = 'data/task-data/in-hospital-mortality/train_listfile.csv'
pneumonia_file = 'data/task-data/in-hospital-mortality/train_listfile_PNEUMONIA.csv'

# Read the three listings (sepsis, pneumonia, original -- same order as before).
df_sepsis, df_pneumonia, df_original = (
    pd.read_csv(listfile) for listfile in (sepsis_file, pneumonia_file, original_file)
)

# Split every cohort by its 'y_true' label: C1 holds rows with y_true == 1,
# C0 holds rows with y_true == 0.
df_sepsis_C1, df_sepsis_C0 = (
    df_sepsis[df_sepsis['y_true'] == label] for label in (1, 0)
)
df_pneumonia_C1, df_pneumonia_C0 = (
    df_pneumonia[df_pneumonia['y_true'] == label] for label in (1, 0)
)
df_original_C1, df_original_C0 = (
    df_original[df_original['y_true'] == label] for label in (1, 0)
)

#### Original VS Sepsis


In [11]:
# Compare the full sepsis cohort against the full original cohort.
data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'
sepsis_file = 'data/task-data/in-hospital-mortality/train_listfile_sepsis.csv'
original_file = 'data/task-data/in-hospital-mortality/train_listfile.csv'

# Re-read both listings so this cell does not depend on earlier filtering.
df_sepsis, df_original = (
    pd.read_csv(listfile) for listfile in (sepsis_file, original_file)
)

WD_original_sepsis_df = compute_wasserstein_for_vitals(
    df1=df_sepsis,
    df2=df_original,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
# Persist the table; the merge cell later names its column after this file's stem.
WD_original_sepsis_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_sepsis.csv",
    index=False,
)
WD_original_sepsis_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.001195
1,Diastolic BP,2.927264
2,FO2,0.024386
3,GCS eye opening,0.024153
4,GCS motor response,0.095532
5,GCS total,0.201229
6,GCS verbal response,0.075122
7,Glucose,7.463494
8,Heart Rate,2.067004
9,Height,0.416537


#### Original VS Pneumonia

In [12]:
# Compare the full pneumonia cohort against the full original cohort.
data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'
pneumonia_file = 'data/task-data/in-hospital-mortality/train_listfile_PNEUMONIA.csv'
original_file = 'data/task-data/in-hospital-mortality/train_listfile.csv'

# Re-read both listings so this cell does not depend on earlier filtering.
df_pneumonia, df_original = (
    pd.read_csv(listfile) for listfile in (pneumonia_file, original_file)
)

WD_original_pneumonia_df = compute_wasserstein_for_vitals(
    df1=df_pneumonia,
    df2=df_original,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
# Persist the table; the merge cell later names its column after this file's stem.
WD_original_pneumonia_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_pneumonia.csv",
    index=False,
)
WD_original_pneumonia_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.002189
1,Diastolic BP,1.051656
2,FO2,0.043339
3,GCS eye opening,0.070412
4,GCS motor response,0.116242
5,GCS total,0.431748
6,GCS verbal response,0.131793
7,Glucose,3.704413
8,Heart Rate,0.541473
9,Height,0.679926


#### Sepsis VS Pneumonia

In [8]:
# Compare the full sepsis cohort against the full pneumonia cohort.
sepsis_file = 'data/task-data/in-hospital-mortality/train_listfile_sepsis.csv'
pneumonia_file = 'data/task-data/in-hospital-mortality/train_listfile_PNEUMONIA.csv'

df_sepsis = pd.read_csv(sepsis_file)
df_pneumonia = pd.read_csv(pneumonia_file)

data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'

result_df = compute_wasserstein_for_vitals(df_sepsis, df_pneumonia, data_directory, vital_indices, vital_names)
# The merge cell names each column after the file stem and then selects
# 'WD_sepsis_pneumonia' (case-sensitive), so the file name must use that exact
# casing; the previous 'WD_Sepsis_Pneumonia.csv' did not match.
result_df.to_csv("mimic3models/in_hospital_mortality/Wasserstein_distance/WD_sepsis_pneumonia.csv", index=False)
result_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.000995
1,Diastolic BP,2.599739
2,FO2,0.066763
3,GCS eye opening,0.05602
4,GCS motor response,0.052991
5,GCS total,0.529547
6,GCS verbal response,0.077045
7,Glucose,4.24837
8,Heart Rate,2.571095
9,Height,0.959111


#### Sepsis C0 VS C1

In [13]:
# Compare the two 'y_true' classes within the sepsis cohort.
# Relies on `sepsis_file` defined in an earlier cell.
df_sepsis = pd.read_csv(sepsis_file)

df_sepsis_C1 = df_sepsis[df_sepsis['y_true'] == 1]
df_sepsis_C0 = df_sepsis[df_sepsis['y_true'] == 0]

data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'

WD_sepsis_classes_df = compute_wasserstein_for_vitals(df_sepsis_C1, df_sepsis_C0, data_directory, vital_indices, vital_names)
# The merge cell names each column after the file stem and then selects
# 'WD_sepsis_C0_C1' (case-sensitive), so the file name must use that exact
# casing; the previous 'WD_Sepsis_C0_C1.csv' did not match.
WD_sepsis_classes_df.to_csv("mimic3models/in_hospital_mortality/Wasserstein_distance/WD_sepsis_C0_C1.csv", index=False)
WD_sepsis_classes_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.00134
1,Diastolic BP,2.229154
2,FO2,0.025255
3,GCS eye opening,0.41834
4,GCS motor response,0.760695
5,GCS total,1.414239
6,GCS verbal response,0.698034
7,Glucose,3.519773
8,Heart Rate,5.967671
9,Height,0.760181


#### Pneumonia C0 VS C1

In [14]:
# Compare the two 'y_true' classes within the pneumonia cohort.
# Relies on `pneumonia_file` defined in an earlier cell.
data_directory = 'data/task-data/in-hospital-mortality/train-filled-48H'
df_pneumonia = pd.read_csv(pneumonia_file)

# C1: rows with y_true == 1, C0: rows with y_true == 0.
df_pneumonia_C1, df_pneumonia_C0 = (
    df_pneumonia[df_pneumonia['y_true'] == label] for label in (1, 0)
)

WD_pneumonia_classes_df = compute_wasserstein_for_vitals(
    df1=df_pneumonia_C1,
    df2=df_pneumonia_C0,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
WD_pneumonia_classes_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_pneumonia_C0_C1.csv",
    index=False,
)
WD_pneumonia_classes_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.001593
1,Diastolic BP,4.515473
2,FO2,0.056197
3,GCS eye opening,0.299736
4,GCS motor response,0.692505
5,GCS total,0.695868
6,GCS verbal response,0.56852
7,Glucose,8.553436
8,Heart Rate,6.123253
9,Height,0.543914


#### Sepsis C1 VS Pneumonia C1


In [15]:
# Compare the y_true == 1 subsets of the sepsis and pneumonia cohorts.
WD_sepsis_pneumonia_c1_df = compute_wasserstein_for_vitals(df_sepsis_C1, df_pneumonia_C1, data_directory, vital_indices, vital_names)
# The merge cell names each column after the file stem and then selects
# 'WD_sepsis_pneumonia_C1' (case-sensitive), so the file name must end in
# '_C1'; the previous '_c1' suffix did not match.
WD_sepsis_pneumonia_c1_df.to_csv("mimic3models/in_hospital_mortality/Wasserstein_distance/WD_sepsis_pneumonia_C1.csv", index=False)
WD_sepsis_pneumonia_c1_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.001593
1,Diastolic BP,0.617857
2,FO2,0.095149
3,GCS eye opening,0.153114
4,GCS motor response,0.098037
5,GCS total,1.140826
6,GCS verbal response,0.137839
7,Glucose,5.068734
8,Heart Rate,2.553315
9,Height,0.921028


#### Sepsis C0 VS Pneumonia C0

In [16]:
# Compare the y_true == 0 subsets of the sepsis and pneumonia cohorts.
WD_sepsis_pneumonia_c0_df = compute_wasserstein_for_vitals(df_sepsis_C0, df_pneumonia_C0, data_directory, vital_indices, vital_names)
# The merge cell names each column after the file stem and then selects
# 'WD_sepsis_pneumonia_C0' (case-sensitive), so the file name must end in
# '_C0'; the previous '_c0' suffix did not match.
WD_sepsis_pneumonia_c0_df.to_csv("mimic3models/in_hospital_mortality/Wasserstein_distance/WD_sepsis_pneumonia_C0.csv", index=False)
WD_sepsis_pneumonia_c0_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.00134
1,Diastolic BP,2.802045
2,FO2,0.063884
3,GCS eye opening,0.034509
4,GCS motor response,0.029847
5,GCS total,0.422455
6,GCS verbal response,0.085157
7,Glucose,5.199632
8,Heart Rate,2.439268
9,Height,0.967663


#### Original C1 VS Sepsis C1

In [14]:
# Compare the y_true == 1 subsets of the original and sepsis cohorts.
WD_original_sepsis_c1_df = compute_wasserstein_for_vitals(
    df1=df_original_C1,
    df2=df_sepsis_C1,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
WD_original_sepsis_c1_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_sepsis_C1.csv",
    index=False,
)
WD_original_sepsis_c1_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.005002
1,Diastolic BP,3.314613
2,FO2,0.025749
3,GCS eye opening,0.138943
4,GCS motor response,0.211652
5,GCS total,0.292176
6,GCS verbal response,0.3175
7,Glucose,6.513371
8,Heart Rate,3.766883
9,Height,0.578586


#### Original C0 VS Sepsis C0

In [15]:
# Compare the y_true == 0 subsets of the original and sepsis cohorts.
WD_original_sepsis_c0_df = compute_wasserstein_for_vitals(
    df1=df_original_C0,
    df2=df_sepsis_C0,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
WD_original_sepsis_c0_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_sepsis_C0.csv",
    index=False,
)
WD_original_sepsis_c0_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.000609
1,Diastolic BP,2.882996
2,FO2,0.023993
3,GCS eye opening,0.029169
4,GCS motor response,0.094891
5,GCS total,0.248185
6,GCS verbal response,0.141694
7,Glucose,8.552887
8,Heart Rate,1.843071
9,Height,0.414254


#### Original C1 VS Pneumonia C1


In [16]:
# Compare the y_true == 1 subsets of the original and pneumonia cohorts.
WD_original_pneumonia_c1_df = compute_wasserstein_for_vitals(
    df1=df_original_C1,
    df2=df_pneumonia_C1,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
WD_original_pneumonia_c1_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_pneumonia_C1.csv",
    index=False,
)
WD_original_pneumonia_c1_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.003409
1,Diastolic BP,2.869343
2,FO2,0.069538
3,GCS eye opening,0.292056
4,GCS motor response,0.309689
5,GCS total,1.380884
6,GCS verbal response,0.361857
7,Glucose,4.979923
8,Heart Rate,2.110405
9,Height,0.567007


#### Original C0 VS Pneumonia C0

In [17]:
# Compare the y_true == 0 subsets of the original and pneumonia cohorts.
WD_original_pneumonia_c0_df = compute_wasserstein_for_vitals(
    df1=df_original_C0,
    df2=df_pneumonia_C0,
    directory=data_directory,
    vital_indices=vital_indices,
    vital_names=vital_names,
)
WD_original_pneumonia_c0_df.to_csv(
    "mimic3models/in_hospital_mortality/Wasserstein_distance/WD_original_pneumonia_C0.csv",
    index=False,
)
WD_original_pneumonia_c0_df

Unnamed: 0,Vital,Wasserstein Distance
0,Cap rate,0.001949
1,Diastolic BP,0.961557
2,FO2,0.041123
3,GCS eye opening,0.047502
4,GCS motor response,0.107964
5,GCS total,0.261409
6,GCS verbal response,0.226851
7,Glucose,3.898696
8,Heart Rate,0.640636
9,Height,0.737559


#### Merge 

In [7]:
# Merge every per-comparison result file into one wide table keyed on 'Vital',
# with one distance column per comparison.
directory = 'mimic3models/in_hospital_mortality/Wasserstein_distance'
merged_filename = 'WD_merged.csv'
dfs = []

# sorted() makes the merge order deterministic; os.listdir order is arbitrary.
for filename in sorted(os.listdir(directory)):
    # Skip non-CSV files and the merged table written by a previous run of this
    # cell: it lives in the same directory and would otherwise be merged back in,
    # producing duplicated (_x/_y) columns.
    if not filename.endswith(".csv") or filename == merged_filename:
        continue
    file_path = os.path.join(directory, filename)
    df = pd.read_csv(file_path)
    # Name the distance column after the file stem (extension removed only at the end).
    df = df.rename(columns={"Wasserstein Distance": os.path.splitext(filename)[0]})
    dfs.append(df)

merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on='Vital', how='outer')

# Fix the column order. These names must match the result file stems exactly
# (case-sensitive), otherwise this selection raises KeyError.
merged_df = merged_df[['Vital', 'WD_original_sepsis', 'WD_original_pneumonia', 'WD_sepsis_pneumonia',
    'WD_sepsis_C0_C1', 'WD_pneumonia_C0_C1',
    'WD_sepsis_pneumonia_C1', 'WD_sepsis_pneumonia_C0',
    'WD_original_sepsis_C1', 'WD_original_sepsis_C0',
    'WD_original_pneumonia_C1', 'WD_original_pneumonia_C0']]

display(merged_df)
merged_df.to_csv(os.path.join(directory, merged_filename), index=False)


Unnamed: 0,Vital,WD_original_sepsis,WD_original_pneumonia,WD_sepsis_pneumonia,WD_sepsis_C0_C1,WD_pneumonia_C0_C1,WD_sepsis_pneumonia_C1,WD_sepsis_pneumonia_C0,WD_original_sepsis_C1,WD_original_sepsis_C0,WD_original_pneumonia_C1,WD_original_pneumonia_C0
0,Cap rate,0.001195,0.002189,0.000995,0.00134,0.001593,0.001593,0.00134,0.005002,0.000609,0.003409,0.001949
1,Diastolic BP,2.927264,1.051656,2.599739,2.229154,4.515473,0.617857,2.802045,3.314613,2.882996,2.869343,0.961557
2,FO2,0.024386,0.043339,0.066763,0.025255,0.056197,0.095149,0.063884,0.025749,0.023993,0.069538,0.041123
3,GCS eye opening,0.024153,0.070412,0.05602,0.41834,0.299736,0.153114,0.034509,0.138943,0.029169,0.292056,0.047502
4,GCS motor response,0.095532,0.116242,0.052991,0.760695,0.692505,0.098037,0.029847,0.211652,0.094891,0.309689,0.107964
5,GCS total,0.201229,0.431748,0.529547,1.414239,0.695868,1.140826,0.422455,0.292176,0.248185,1.380884,0.261409
6,GCS verbal response,0.075122,0.131793,0.077045,0.698034,0.56852,0.137839,0.085157,0.3175,0.141694,0.361857,0.226851
7,Glucose,7.463494,3.704413,4.24837,3.519773,8.553436,5.068734,5.199632,6.513371,8.552887,4.979923,3.898696
8,Heart Rate,2.067004,0.541473,2.571095,5.967671,6.123253,2.553315,2.439268,3.766883,1.843071,2.110405,0.640636
9,Height,0.416537,0.679926,0.959111,0.760181,0.543914,0.921028,0.967663,0.578586,0.414254,0.567007,0.737559


In [4]:
# Inspect the column names of the merged table.
# NOTE(review): the recorded output of this cell lists 'WD_Sepsis_C0_C1' and a
# different column order than the selection in the merge cell, and its execution
# count is lower than the merge cell's -- the output looks stale; re-run after
# the merge cell to confirm.
merged_df.columns

Index(['Vital', 'WD_original_sepsis', 'WD_original_pneumonia',
       'WD_Sepsis_C0_C1', 'WD_pneumonia_C0_C1', 'WD_sepsis_pneumonia',
       'WD_sepsis_pneumonia_C1', 'WD_sepsis_pneumonia_C0',
       'WD_original_sepsis_C1', 'WD_original_sepsis_C0',
       'WD_original_pneumonia_C1', 'WD_original_pneumonia_C0'],
      dtype='object')