In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the source table from disk
df = pd.read_csv("train_cheXbert.csv")

# The third '/'-separated component of each Path value is taken as the patient ID
df['patient_id'] = [path.split('/')[2] for path in df['Path']]

# Splitting operates on distinct patient IDs rather than on rows, so all rows
# of a given patient end up in the same subset
unique_patients = df['patient_id'].unique()

# First cut: 70% of the patients for training, 30% held back
train_ids, temp_ids = train_test_split(
    unique_patients,
    test_size=0.3,
    random_state=42,
)

# Second cut: about one third of the held-back patients becomes test1 ...
test1_ids, temp_remaining = train_test_split(
    temp_ids,
    test_size=2/3,
    random_state=42,
)
# ... and the remaining two thirds are halved into sim1 and sim2
sim1_ids, sim2_ids = train_test_split(
    temp_remaining,
    test_size=0.5,
    random_state=42,
)

# Helper function
def get_subset(patient_ids, frame=None):
    """Return the rows belonging to the given patients, without the helper column.

    Parameters
    ----------
    patient_ids : iterable
        Patient identifiers to keep; matched against the ``patient_id`` column.
    frame : pandas.DataFrame, optional
        Table to filter. It must contain a ``patient_id`` column. When omitted,
        the notebook-level ``df`` is used, so ``get_subset(ids)`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows in their original order, with ``patient_id`` dropped.
        The input table itself is not modified.
    """
    # Fall back to the notebook global only when no table is passed explicitly
    source = df if frame is None else frame
    keep = source['patient_id'].isin(patient_ids)
    return source.loc[keep].drop(columns=['patient_id'])

# Build one DataFrame per split, in the order train, test1, sim1, sim2
train_df, test1_df, sim1_df, sim2_df = map(
    get_subset,
    (train_ids, test1_ids, sim1_ids, sim2_ids),
)

# Validation checks: the four ID groups must form a partition of the patients
split_ids = {
    "train": set(train_ids),
    "test1": set(test1_ids),
    "sim1": set(sim1_ids),
    "sim2": set(sim2_ids),
}
all_used_ids = set().union(*split_ids.values())
original_ids = set(unique_patients)

# Compare the sets themselves rather than only their sizes
assert all_used_ids == original_ids, "❌ Some patient IDs are missing!"

# An ID present in more than one split would make the per-split sizes sum to
# more than the size of their union. (Comparing the union with set(union), as
# was done before, is always true and cannot detect overlap.)
assert sum(len(ids) for ids in split_ids.values()) == len(all_used_ids), "❌ Duplicate patient IDs across splits!"

print(f"✅ Validation Passed: {len(original_ids)} patient IDs correctly split.")

# Write each split to its own CSV file, leaving out the row index
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (test1_df, "test1.csv"),
    (sim1_df, "sim1.csv"),
    (sim2_df, "sim2.csv"),
):
    split_frame.to_csv(csv_name, index=False)

print("✅ CSV splits saved successfully")


✅ Validation Passed: 64540 patient IDs correctly split.
✅ CSV splits saved successfully
