In [14]:
import pandas as pd
import re

df = pd.read_csv("cleaned_labevents.csv")

# Columns you never want to collapse (keep as-is)
id_cols = [c for c in ["subject_id", "hadm_id", "stay_id", "storetime", "charttime"] if c in df.columns]

# Build groups by "base name" (strip trailing .<number>)
groups = {}
for col in df.columns:
    if col in id_cols:
        continue
    base = re.sub(r"\.\d+$", "", col)  # "Lactate.2" -> "Lactate"
    groups.setdefault(base, []).append(col)

# Collapse duplicates: base column becomes first non-null across its group
for base, cols in groups.items():
    if len(cols) <= 1:
        continue

    # Keep a stable order: base first (if present), then .1, .2...
    cols_sorted = sorted(
        cols,
        key=lambda x: (0 if x == base else 1, int(x.split(".")[-1]) if "." in x else 0)
    )

    df[base] = df[cols_sorted].bfill(axis=1).iloc[:, 0]  # first non-null across cols
    df.drop(columns=[c for c in cols_sorted if c != base], inplace=True)

# ---- Now merge true synonyms (from your second cell) ----
def merge_columns_inplace(df, cols, new_name):
    cols = [c for c in cols if c in df.columns]
    if len(cols) == 0:
        return
    df[new_name] = df[cols].bfill(axis=1).iloc[:, 0]
    df.drop(columns=cols, inplace=True)

merge_columns_inplace(df, ["WBC Count", "White Blood Cells"], "WBC")
merge_columns_inplace(df, ["C-Reactive Protein", "High-Sensitivity CRP"], "CRP")
merge_columns_inplace(df, ["Absolute Neutrophil Count", "Absolute Neutrophil"], "ANC")

# ---- Define lab_cols and check missingness ----
# Select numeric lab columns (exclude IDs and timestamps)
lab_cols = [c for c in df.select_dtypes(include=[float, int]).columns if c not in ['subject_id', 'hadm_id', 'stay_id']]

print("Final columns after collapsing/merging:")
print(df.columns.tolist())

missing_pct = df[lab_cols].isnull().mean() * 100
print("\nMissingness % for lab columns (sorted):")
print(missing_pct.sort_values())

# Optional: Display df
df


Final columns after collapsing/merging:
['subject_id', 'hadm_id', 'stay_id', 'storetime', 'Lactate', 'Bilirubin, Direct', 'Bilirubin, Total', 'Creatinine', 'D-Dimer', 'Absolute Lymphocyte Count', 'Atypical Lymphocytes', 'Bands', 'Basophils', 'Eosinophil Count', 'Eosinophils', 'INR(PT)', 'Lymphocytes', 'Lymphocytes, Percent', 'Monocyte Count', 'Monocytes', 'Neutrophils', 'Platelet Count', 'PTT', 'Absolute Basophil Count', 'Absolute Eosinophil Count', 'Absolute Monocyte Count', 'Fibrinogen', 'Immature Granulocytes', 'Absolute Other WBC', 'WBC', 'CRP', 'ANC']

Missingness % for lab columns (sorted):
Creatinine                    71.218206
WBC                           73.226238
Platelet Count                73.360107
Basophils                     75.100402
Eosinophils                   75.100402
Lymphocytes                   75.100402
Neutrophils                   75.100402
Monocytes                     75.100402
Lactate                       82.998661
INR(PT)                       83.132

Unnamed: 0,subject_id,hadm_id,stay_id,storetime,Lactate,"Bilirubin, Direct","Bilirubin, Total",Creatinine,D-Dimer,Absolute Lymphocyte Count,...,PTT,Absolute Basophil Count,Absolute Eosinophil Count,Absolute Monocyte Count,Fibrinogen,Immature Granulocytes,Absolute Other WBC,WBC,CRP,ANC
0,10000032,,33258284,2180-05-06 22:42:00,,,,,,,...,,,,,,,,5.0,,
1,10000032,,33258284,2180-05-06 23:13:00,,,,,,,...,,,,,,,,,,
2,10000032,,33258284,2180-05-06 23:14:00,,,,,,,...,30.9,,,,,,,,,
3,10000032,,33258284,2180-05-06 23:16:00,,,1.6,0.3,,,...,,,,,,,,,,
4,10000032,,38112554,2180-06-26 16:40:00,1.7,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,10040025,27996268.0,36041505,2148-01-22 16:41:00,,,,,,,...,53.7,,,,,,,,,
743,10040025,27996268.0,36041505,2148-01-22 16:54:00,,,,,,0.26,...,,0.0,0.09,0.78,,,,,,7.57
744,10040025,27996268.0,36041505,2148-01-22 17:18:00,,,,1.8,,,...,,,,,,,,,,
745,10040025,,36041505,2148-01-23 10:51:00,,,,,,,...,,,,,,,,,,


In [15]:
import numpy as np

def merge_columns_inplace(df, cols, new_name):
    """
    Merge multiple synonymous columns into one using first non-null value.
    Operates IN PLACE on the dataframe.
    """
    cols = [c for c in cols if c in df.columns]
    if len(cols) == 0:
        return

    df[new_name] = df[cols].bfill(axis=1).iloc[:, 0]
    df.drop(columns=cols, inplace=True)


# ---- merge true synonyms ----
merge_columns_inplace(df, ["WBC Count", "White Blood Cells"], "WBC")
merge_columns_inplace(df, ["C-Reactive Protein", "High-Sensitivity CRP"], "CRP")
merge_columns_inplace(df, ["Absolute Neutrophil Count", "Absolute Neutrophil"], "ANC")

# optional: quick sanity check
print("Final columns:")
print(df.columns.tolist())


Final columns:
['subject_id', 'hadm_id', 'stay_id', 'storetime', 'Lactate', 'Bilirubin, Direct', 'Bilirubin, Total', 'Creatinine', 'D-Dimer', 'Absolute Lymphocyte Count', 'Atypical Lymphocytes', 'Bands', 'Basophils', 'Eosinophil Count', 'Eosinophils', 'INR(PT)', 'Lymphocytes', 'Lymphocytes, Percent', 'Monocyte Count', 'Monocytes', 'Neutrophils', 'Platelet Count', 'PTT', 'Absolute Basophil Count', 'Absolute Eosinophil Count', 'Absolute Monocyte Count', 'Fibrinogen', 'Immature Granulocytes', 'Absolute Other WBC', 'WBC', 'CRP', 'ANC']


In [19]:
import pandas as pd

# Load the raw lab events CSV
df = pd.read_csv('/Users/ebiteclement/Documents/Beyond-Time-Zero/Sepsis Risk Data and Code/data/MIMIC-ED/ed/cleaned_labevents.csv')

# Select the relevant columns: subject_id, hadm_id, stay_id, storetime, and the specified labs
# Note: Using exact column names from the header
selected_columns = [
    'subject_id', 'hadm_id', 'stay_id', 'storetime',
    'Creatinine',                 # Creatinine
    'Platelet Count',            # Platelet Count
    'Absolute Neutrophil Count',  # ANC
    'C-Reactive Protein',         # CRP
    'INR(PT)',                    # INR
    'PTT',                        # PTT
    'Fibrinogen',                 # Fibrinogen
    'Bilirubin, Total',           # Bilirubin
    'D-Dimer'                     # D-Dimer
]

df_selected = df[selected_columns]

# Drop rows where all lab columns are NaN
lab_columns = ['Creatinine', 'Platelet Count','Absolute Neutrophil Count', 'C-Reactive Protein', 'INR(PT)', 'PTT', 'Fibrinogen', 'Bilirubin, Total', 'D-Dimer']
df_clean = df_selected.dropna(subset=lab_columns, how='all')

# Save to a new CSV file
output_path = '/Users/ebiteclement/Documents/Beyond-Time-Zero/Sepsis Risk Data and Code/data/MIMIC-ED/ed/cleaned_labevents_additional_labs.csv'
df_clean.to_csv(output_path, index=False)

# Print shape and a sample for verification
print(f"Shape of cleaned DataFrame: {df_clean.shape}")
print("Sample rows:")
print(df_clean.head())

Shape of cleaned DataFrame: (612, 13)
Sample rows:
   subject_id  hadm_id   stay_id            storetime  Creatinine  \
0    10000032      NaN  33258284  2180-05-06 22:42:00         NaN   
1    10000032      NaN  33258284  2180-05-06 23:13:00         NaN   
2    10000032      NaN  33258284  2180-05-06 23:14:00         NaN   
3    10000032      NaN  33258284  2180-05-06 23:16:00         0.3   
5    10000032      NaN  38112554  2180-06-26 16:50:00         NaN   

   Platelet Count  Absolute Neutrophil Count  C-Reactive Protein  INR(PT)  \
0            71.0                        NaN                 NaN      NaN   
1             NaN                        NaN                 NaN      1.6   
2             NaN                        NaN                 NaN      NaN   
3             NaN                        NaN                 NaN      NaN   
5           143.0                        NaN                 NaN      NaN   

    PTT  Fibrinogen  Bilirubin, Total  D-Dimer  
0   NaN         NaN   

In [8]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

# Paths (hardcoded for notebook; adjust if needed)
REPO_ROOT = Path("/Users/ebiteclement/Documents/Beyond-Time-Zero/Sepsis Risk Data and Code")
DATA_DIR = REPO_ROOT / "data" / "MIMIC-ED"
ED_DIR = DATA_DIR / "ed"

cox_path = DATA_DIR / "cox_timevarying_train.csv"  # Load the one with Creatinine and Platelet Count already added
labs_path = ED_DIR / "cleaned_labevents_additional_labs.csv"  # The new labs CSV
edstays_path = ED_DIR / "edstays.csv"
output_path = DATA_DIR / "cox_timevarying_with_all_labs_train.csv"  # New output with all labs

# Load edstays for intime
edstays = pd.read_csv(edstays_path, usecols=['stay_id', 'intime'])
edstays['intime'] = pd.to_datetime(edstays['intime'])
edstays = edstays.set_index('stay_id')['intime']

# Load labs
labs = pd.read_csv(labs_path)
labs['storetime'] = pd.to_datetime(labs['storetime'])
labs = labs.dropna(subset=['stay_id', 'storetime']).copy()

# Compute hours_since_adm
labs = labs.merge(edstays, left_on='stay_id', right_index=True, how='left')
labs['hours_since_adm'] = (labs['storetime'] - labs['intime']).dt.total_seconds() / 3600
labs = labs.drop(columns=['intime', 'storetime', 'subject_id', 'hadm_id'])  # drop unnecessary

# Select the new lab columns
lab_cols = ['Creatinine', 'Platelet Count', 'Absolute Neutrophil Count', 'C-Reactive Protein', 'INR(PT)', 'PTT', 'Fibrinogen', 'Bilirubin, Total', 'D-Dimer']

# Keep only relevant columns
labs = labs[['stay_id', 'hours_since_adm'] + lab_cols].dropna(subset=['hours_since_adm'])

# Sort labs by stay_id and time
labs = labs.sort_values(['stay_id', 'hours_since_adm'])

# Load cox data (already has Creatinine and Platelet Count)
cox_df = pd.read_csv(cox_path)
print(cox_df)

# For each lab column, merge using groupby to avoid sorting issues
for lab_col in lab_cols:
    temp_labs = labs[['stay_id', 'hours_since_adm', lab_col]].dropna(subset=[lab_col])
    
    def merge_for_group(group):
        stay_labs = temp_labs[temp_labs['stay_id'] == group.name].sort_values('hours_since_adm')
        stay_labs = stay_labs.drop(columns=['stay_id'])  # Avoid suffix on stay_id
        if lab_col in group.columns:
            group = group.drop(columns=[lab_col])  # Avoid suffix on lab_col
        return pd.merge_asof(
            group.sort_values('start'),
            stay_labs,
            left_on='start',
            right_on='hours_since_adm',
            direction='backward'
        )
    
    cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
    cox_df = cox_df.reset_index(drop=True)  # Reset index without adding columns
    
    # Rename the merged column
    cox_df = cox_df.rename(columns={lab_col: f'lab_{lab_col}'})
    # Drop the hours_since_adm column added
    if 'hours_since_adm' in cox_df.columns:
        cox_df = cox_df.drop(columns=['hours_since_adm'])

# Save
cox_df.to_csv(output_path, index=False)
print(f"Saved updated cox_timevarying to {output_path}")

        stay_id     start      stop  event  temperature  heartrate  resprate  \
0      30005196  0.000000  0.066667      0         97.8       86.0      16.0   
1      30005196  0.066667  0.150000      0         97.8       86.0      16.0   
2      30005196  0.150000  0.400000      0         97.8       85.0      29.0   
3      30005196  0.400000  0.600000      0         97.8       84.0      28.0   
4      30005196  0.600000  0.933333      0         97.9       83.0      22.0   
...         ...       ...       ...    ...          ...        ...       ...   
22709  39999835  6.533333  6.816667      0         98.0      130.0      18.0   
22710  39999835  6.816667  6.900000      0         98.0      130.0      18.0   
22711  39999835  6.900000  8.050000      0         98.0       80.0      18.0   
22712  39999835  8.050000  8.966667      0         98.5      118.0      18.0   
22713  39999835  8.966667  9.250000      0         98.5      118.0      18.0   

       o2sat    sbp   dbp  ...  arrival

  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)


Saved updated cox_timevarying to /Users/ebiteclement/Documents/Beyond-Time-Zero/Sepsis Risk Data and Code/data/MIMIC-ED/cox_timevarying_with_all_labs_train.csv


  cox_df = cox_df.groupby("stay_id", group_keys=False).apply(merge_for_group)
