In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
DATA_DIR = 'data/raw_data'

LABELS

In [None]:
# Load the mortality labels. Each `stay` value is split on '_' and its first
# piece is stored, converted to a number, as `patient_id`.
mortality = pd.read_csv('data/mortality_labels.csv')
stay_prefix = mortality['stay'].apply(lambda stay: stay.split('_')[0])
mortality['patient_id'] = pd.to_numeric(stay_prefix)

In [None]:
# Remove the `patient_id` helper column again; the next cell derives
# `subject_id` from the same `stay` prefix.
mortality = mortality.drop('patient_id', axis=1)

In [None]:
# Break every `stay` string into its '_'-separated pieces once and reuse them.
# Piece 0 becomes `subject_id`; piece 1 with its first seven characters removed
# becomes `episode` (presumably stripping an "episode" prefix -- confirm
# against the actual `stay` format).
stay_parts = mortality['stay'].apply(lambda stay: stay.split('_'))
mortality['subject_id'] = pd.to_numeric(stay_parts.apply(lambda parts: parts[0]))
mortality['episode'] = pd.to_numeric(stay_parts.apply(lambda parts: parts[1][7:]))

In [None]:
# Holder for the labels merged with visit details; stays None until the shard
# loop in the next cell assigns a result to it.
new_morality = None

In [None]:
# Attach visit details to every mortality label, one raw shard at a time.
#
# For each visits shard:
#   1. number each subject's stays chronologically (`episode` = 1, 2, ...),
#   2. write the shard back to disk with that extra column,
#   3. inner-join the labels on (subject_id, episode); labels without a
#      matching visit in the shard are dropped by the join.
#
# The per-shard results are collected in a list and concatenated once, so the
# cell gives the same result when it is re-run (it no longer appends to
# whatever `new_morality` already held) and avoids repeated re-copying.
#
# NOTE(review): this overwrites the files under DATA_DIR in place.
# NOTE(review): shards 0..6 are read here, while the event-extraction loop
# further down iterates shards 1..6 only -- confirm which range is intended.
labelled_shards = []
for i in range(0, 7):
    visits_path = os.path.join(DATA_DIR, f'visits_{i}.parquet')
    visits_df = pd.read_parquet(visits_path)
    visits_df = visits_df.sort_values(['subject_id', 'intime'])

    # The frame is already ordered by (subject_id, intime), so a running count
    # per subject is the chronological stay number. Rows without an `intime`
    # are excluded from the count and receive a missing `episode` when the
    # result is aligned back on the index.
    visits_df['episode'] = (
        visits_df[visits_df['intime'].notna()]
        .groupby('subject_id')
        .cumcount() + 1
    )
    visits_df.to_parquet(visits_path)

    labelled_shards.append(mortality.merge(visits_df, on=['subject_id', 'episode']))

new_morality = pd.concat(labelled_shards)

In [None]:
# Keep only the identifier, label and timestamp columns, in this order.
LABEL_COLUMNS = [
    'subject_id', 'hadm_id', 'stay_id', 'episode',
    'y_true', 'intime', 'outtime', 'deathtime',
]
new_morality = new_morality[LABEL_COLUMNS]

In [None]:
# Persist the enriched labels without the row index.
# NOTE(review): this is the same path the first LABELS cell reads from, so the
# original file (with its `stay` column) is replaced -- confirm that is
# intended, since it affects re-running the notebook from the top.
labels_csv_path = 'data/mortality_labels.csv'
new_morality.to_csv(labels_csv_path, index=False)

DEMOGRAPHICS

In [None]:
# Read the two demographics files (test first, then train).
test_df, train_df = map(
    pd.read_parquet,
    ('data/test_demo.parquet', 'data/train_demo.parquet'),
)

In [None]:
# Stack train on top of test; each frame keeps its own row index values.
df = pd.concat(objs=(train_df, test_df), axis=0)

In [None]:
# Write the combined demographics table to a single parquet file.
demographics_path = 'data/demographics.parquet'
df.to_parquet(demographics_path)

In [None]:
# Columns to load from each raw event table, keyed by table name.
# Tables listed with `_STAY_KEYS` include a `stay_id` column; those listed with
# `_ADMISSION_KEYS` do not.
_ADMISSION_KEYS = ('subject_id', 'hadm_id')
_STAY_KEYS = _ADMISSION_KEYS + ('stay_id',)

COLS = {
    'chartevents': _STAY_KEYS + ('charttime', 'itemid', 'valuenum'),
    'inputevents': _STAY_KEYS + ('starttime', 'itemid', 'amount'),
    'outputevents': _STAY_KEYS + ('charttime', 'itemid', 'value'),
    'procedureevents': _STAY_KEYS + ('starttime', 'itemid', 'value'),
    'labevents': _ADMISSION_KEYS + ('charttime', 'itemid', 'valuenum'),
    'pharmacy': _ADMISSION_KEYS + ('starttime', 'medication'),
    'emar': _ADMISSION_KEYS + ('charttime', 'medication'),
}


In [None]:
def read_lab(task, i):
    """Load one shard of an event table and normalise its column names.

    Reads ``data/raw_data/{task}_{i}.parquet`` restricted to ``COLS[task]``
    and returns it with a common layout:

    * a ``stay_id`` column filled with ``None`` is appended when the table
      has none;
    * ``starttime`` (or, failing that, ``charttime``) is renamed to ``time``;
    * ``medication`` is renamed to ``itemid``;
    * ``amount`` (or, failing that, ``valuenum``) is renamed to ``value``.

    Args:
        task: Table name; must be a key of ``COLS``.
        i: Shard number used in the file name.

    Returns:
        The loaded DataFrame with the renamed columns.
    """
    frame = pd.read_parquet(f'data/raw_data/{task}_{i}.parquet', columns=COLS[task])

    if 'stay_id' not in frame.columns:
        frame['stay_id'] = None

    present = set(frame.columns)
    renames = {}

    # For the time and value columns only the first matching source is used.
    for source in ('starttime', 'charttime'):
        if source in present:
            renames[source] = 'time'
            break

    if 'medication' in present:
        renames['medication'] = 'itemid'

    for source in ('amount', 'valuenum'):
        if source in present:
            renames[source] = 'value'
            break

    return frame.rename(columns=renames)


In [None]:
# Load the label table from the path the LABELS section writes to.
label_df = pd.read_csv(filepath_or_buffer='data/mortality_labels.csv')

In [None]:
# Build per-subject event timelines for the mortality task.
#
# For each raw shard the seven event tables are loaded in a common layout via
# `read_lab`. Labelled subjects are processed in chunks of CHUNK_SIZE. For each
# subject, only events strictly before the `outtime` of that subject's first
# label row are kept and ordered by time. Every chunk is written to
# `data/mortality/{shard}_{chunk}_final.parquet`.
#
# Changes compared with the previous version of this cell:
#   * the event tables are filtered once per chunk and then grouped by subject,
#     instead of scanning all seven full tables once per subject;
#   * the cutoff lookup is computed once instead of scanning `label_df` per
#     subject;
#   * per-subject frames are collected in a list and concatenated once;
#   * all subjects of a shard are chunked (previously only the first
#     10 * 1000 were processed and any remainder was silently skipped);
#   * the output directory is created if missing;
#   * the output always has a fresh 0..n-1 index, including chunks that
#     contain a single subject;
#   * one progress line per chunk replaces one printed line per subject.
#
# NOTE(review): shard 0 is not processed here although the label-building loop
# reads visits_0..visits_6 -- confirm whether starting at 1 is intentional.
EVENT_TABLES = (
    'chartevents', 'emar', 'inputevents', 'outputevents',
    'labevents', 'pharmacy', 'procedureevents',
)
CHUNK_SIZE = 1000

os.makedirs('data/mortality', exist_ok=True)

# `outtime` of the first label row of every subject. It does not depend on the
# shard, so it is computed once up front.
cutoff_by_subject = (
    label_df.drop_duplicates('subject_id')
    .set_index('subject_id')['outtime']
)

for i in range(1, 7):
    visits_df = pd.read_parquet(f'data/raw_data/visits_{i}.parquet')
    visits_df = visits_df[visits_df['subject_id'].isin(label_df['subject_id'])]
    ids = visits_df['subject_id'].unique()

    # Table order is kept fixed so rows sharing a timestamp always reach the
    # sort in the same input order.
    event_frames = [read_lab(task, i) for task in EVENT_TABLES]

    for j, start in enumerate(range(0, len(ids), CHUNK_SIZE)):
        ids_j = ids[start:start + CHUNK_SIZE]

        # One pass over each table per chunk; the result is small enough to
        # split by subject cheaply.
        chunk_events = pd.concat(
            [frame[frame['subject_id'].isin(ids_j)] for frame in event_frames]
        )
        events_by_subject = dict(iter(chunk_events.groupby('subject_id')))
        no_events = chunk_events.iloc[0:0]  # empty frame with the same columns

        timelines = []
        for subject_id in ids_j:
            label_time = cutoff_by_subject.loc[subject_id]

            df_i = events_by_subject.get(subject_id, no_events)
            df_i = df_i[df_i['time'] < label_time]
            df_i = df_i.sort_values('time')
            # Item ids are numeric in some tables and medication names in
            # others; store them uniformly as strings.
            df_i = df_i.assign(itemid=df_i['itemid'].astype(str))
            timelines.append(df_i)

        df_final = pd.concat(timelines, ignore_index=True)
        df_final.to_parquet(f'data/mortality/{i}_{j}_final.parquet')
        print(f'shard {i}, chunk {j}: {len(ids_j)} subjects, {len(df_final)} events')

DATA STATISTICS

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the label table used for the statistics below.
# NOTE(review): this path (data/mortality/...) differs from the one the LABELS
# section writes to (data/mortality_labels.csv) -- confirm which file is meant.
df = pd.read_csv(filepath_or_buffer='data/mortality/mortality_labels.csv')


In [None]:
# Compare duration distributions between the label groups.
# The three timestamp columns are parsed first. `duration` is then
# outtime - intime and `duration2` is deathtime - intime, both in minutes.
for time_col in ('intime', 'outtime', 'deathtime'):
    df[time_col] = pd.to_datetime(df[time_col])

SECONDS_PER_MINUTE = 60
df['duration'] = (df['outtime'] - df['intime']).dt.total_seconds() / SECONDS_PER_MINUTE
df['duration2'] = (df['deathtime'] - df['intime']).dt.total_seconds() / SECONDS_PER_MINUTE

# One normalised histogram per label. Label 1 is measured up to `deathtime`
# (`duration2`); every other label is measured up to `outtime` (`duration`).
fig, ax = plt.subplots(figsize=(10, 6))
for label, group in df.groupby('y_true'):
    duration_col = 'duration2' if label == 1 else 'duration'
    ax.hist(group[duration_col], bins=50, alpha=0.5, label=f'Label {label}', density=True)

ax.set_xlabel('Duration (minutes)')
ax.set_ylabel('Density')
ax.set_title('Duration Distribution by Label')
ax.legend()
ax.grid(True)
plt.show()
