In [18]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


def create_unique_splits(df, train_size=128, val_size=128, test_size=10000, random_state=42):
    """Draw train/val/test splits in which no ID is ever reused.

    The frame is shuffled reproducibly and then scanned once. A row is
    accepted only if its ``subject_id``, ``study_id`` and ``labevent_id`` have
    all not appeared in any previously accepted row, so no ID value is shared
    within a split or across splits. Accepted rows fill the train split first,
    then val, then test.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``subject_id``, ``study_id`` and
        ``labevent_id``. It is not modified.
    train_size, val_size, test_size : int
        Number of rows required in each split.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for the shuffle. Use a different
        value to draw a different split from the same data.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, each with every column of ``df``, the
        column dtypes of ``df``, and a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If ``df`` has fewer rows than the three sizes combined, or if the scan
        ends before every split reaches its requested size.
    """
    # Cheap early rejection: fewer rows than requested can never succeed.
    if len(df) < train_size + val_size + test_size:
        raise ValueError("Dataset is too small to create the requested splits.")

    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Row positions chosen for (train, val, test), in fill order.
    capacities = (train_size, val_size, test_size)
    positions = ([], [], [])

    seen_subjects = set()
    seen_studies = set()
    seen_labevents = set()

    id_triples = zip(
        shuffled['subject_id'].tolist(),
        shuffled['study_id'].tolist(),
        shuffled['labevent_id'].tolist(),
    )
    for pos, (subject_id, study_id, labevent_id) in enumerate(id_triples):
        if (subject_id in seen_subjects
                or study_id in seen_studies
                or labevent_id in seen_labevents):
            continue

        # Place the row in the first split that still has room.
        for bucket, capacity in zip(positions, capacities):
            if len(bucket) < capacity:
                bucket.append(pos)
                break
        else:
            break  # every split is already full

        seen_subjects.add(subject_id)
        seen_studies.add(study_id)
        seen_labevents.add(labevent_id)

    if any(len(bucket) < capacity for bucket, capacity in zip(positions, capacities)):
        raise ValueError("Could not create splits with unique IDs. Dataset may have overlapping IDs.")

    # Materialise each split once by position. Appending rows one at a time is
    # slow and does not preserve column dtypes (integer IDs can come out as
    # float/object), whereas positional selection keeps the dtypes of `df`.
    train_df, val_df, test_df = (
        shuffled.iloc[bucket].reset_index(drop=True) for bucket in positions
    )
    return train_df, val_df, test_df

In [21]:
# Directory holding the per-analyte source CSVs; the split files are written
# to a sub-directory beneath it.
path = "/Volumes/SanDisk SSD/physionet.org/files/mimic-iv-ecg/1.0/MIMIC-IV-ECG-Ext-Electrolytes/"

# Only the identifier columns are exported, plus a 'split' label column.
ID_COLUMNS = ['subject_id', 'study_id', 'labevent_id']
SPLIT_DIR = path + "few_shot_splits/128shots/split1/"
ANALYTES = [
    "Calcium50893",
    "Creatinine50912",
    "Magnesium50960",
    "Potassium50971",
    "Sodium50983",
]

# One identical pass per analyte: load, split with the default sizes, label, save.
split_tables = {}
for analyte in ANALYTES:
    df = pd.read_csv(path + "mimiciv_ECGv1.1_hospV2.2_" + analyte + ".csv")
    train_df, val_df, test_df = create_unique_splits(df)

    labelled = pd.concat([train_df[ID_COLUMNS], val_df[ID_COLUMNS], test_df[ID_COLUMNS]])
    # Label order must follow the concat order above: train, then val, then test.
    labelled['split'] = ['train'] * len(train_df) + ['val'] * len(val_df) + ['test'] * len(test_df)

    # Create the output directory only after the source file has been read, so
    # a missing input path cannot leave stray empty directories behind.
    os.makedirs(SPLIT_DIR, exist_ok=True)
    labelled.to_csv(SPLIT_DIR + analyte + ".csv", index=False)
    split_tables[analyte] = labelled

# Keep the per-analyte tables available under their individual names.
calcium50893_df = split_tables["Calcium50893"]
creatinine50912_df = split_tables["Creatinine50912"]
magnesium50960_df = split_tables["Magnesium50960"]
potassium50971_df = split_tables["Potassium50971"]
sodium50983_df = split_tables["Sodium50983"]