In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# --- Configuration ---
# Stale draft kept for reference only: it pointed FILE_CHECK_PATH at
# patient_data.csv. The live FILE_CHECK_PATH is assigned further down in this cell.
# FILE_CHECK_PATH = "D:/VA/coding/project/cect/patient_data.csv"
OUTPUT_SPLIT_FILE = 'D:/VA/coding/project/cect/patient_splits.csv'  # Output: one row per patient ('patient_id', 'split')
TEST_RATIO = 0.1   # Fraction of ALL patients held out as the test split
VAL_RATIO = 0.2    # Fraction of ALL patients used for validation (rescaled after the test split is removed)
RANDOM_SEED = 42   # random_state passed to both train_test_split calls
# ---------------------
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import sys

# --- Configuration ---
# NOTE: create_master_split_file() in the next cell reads these two names as
# well, so this cell has to run before that one.
FILE_CHECK_PATH = 'D:/VA/coding/project/cect/file_check.csv'       # Input: per-file check results; the 'file_name' and 'label' columns are read
PATIENT_DATA_PATH = 'D:/VA/coding/project/cect/patient_data.csv'   # Input: links each 'patient_id' to a scan via the 'ct_path' column
# ---------------------

def run_patient_split():
    """
    Build a patient-level train/val/test split and save it to OUTPUT_SPLIT_FILE.

    Steps:
      1. Load FILE_CHECK_PATH and PATIENT_DATA_PATH.
      2. Collect every 'file_name' whose 'label' is True in the file-check
         table, registering both the '.nii' and the '.nii.gz' spelling.
      3. Keep the patient-data rows whose 'ct_path' base name is in that set.
      4. Split the sorted unique 'patient_id' values into test (TEST_RATIO),
         val (VAL_RATIO) and train (the remainder), seeded with RANDOM_SEED.
      5. Write one row per patient ('patient_id', 'split') to the output CSV.

    Rows with a missing 'file_name' or a missing 'patient_id' are ignored.

    Returns:
        None. Progress is printed to stdout. A missing input file, an empty
        set of valid files, or an empty set of matching patients is reported
        on stderr and the function returns without writing anything.

    Raises:
        Only FileNotFoundError from loading is handled here; any other
        exception (from pandas or train_test_split) propagates to the caller.
    """

    # --- 1. Load both CSVs ---
    try:
        df_check = pd.read_csv(FILE_CHECK_PATH)
        df_patient = pd.read_csv(PATIENT_DATA_PATH)
    except FileNotFoundError as e:
        print(f"Error: Could not find a required file. {e}", file=sys.stderr)
        return

    # --- 2. Get all valid file names ---
    # Empty file_name cells are read by pandas as float NaN, which has no
    # .endswith(); drop them so a single blank cell cannot abort the run.
    valid_names = (
        df_check.loc[df_check['label'].eq(True), 'file_name']
        .dropna()
        .astype(str)
    )

    # Register both spellings of each name so '.nii' vs '.nii.gz' mismatches
    # between the two CSVs still match.
    valid_file_names_set = set()
    for name in valid_names:
        valid_file_names_set.add(name)
        if name.endswith('.nii.gz'):
            # Strip only the trailing '.gz'; str.replace would also rewrite
            # any earlier '.nii.gz' occurring inside the name.
            valid_file_names_set.add(name[:-len('.gz')])
        elif name.endswith('.nii'):
            valid_file_names_set.add(name + '.gz')

    if not valid_file_names_set:
        print("Error: No valid files found (label == True) in file_check.csv", file=sys.stderr)
        return

    # --- 3. Map valid files to patients ---
    # e.g., 'ct_files/P0001_ct_P.nii.gz' -> 'P0001_ct_P.nii.gz'
    # str() turns a missing ct_path into the text 'nan', which simply never matches.
    ct_base_names = df_patient['ct_path'].map(lambda p: os.path.basename(str(p)))

    # Vectorised membership test; the loaded frame is left untouched.
    valid_patient_rows = df_patient[ct_base_names.isin(valid_file_names_set)]

    # --- 4. Get unique patients from this valid list ---
    # Missing ids are dropped: NaN is not a patient, and mixing it with
    # string ids would make sorted() raise TypeError.
    # Sorting makes the input to the seeded shuffle independent of CSV row order.
    valid_patients = sorted(valid_patient_rows['patient_id'].dropna().unique())

    print(f"Found {len(valid_patients)} unique patients with at least one valid file.")
    if not valid_patients:
        print("No patients found. Check file name matching logic between CSVs.", file=sys.stderr)
        return

    # --- 5. Perform the splits on the patient list ---

    # 5a. Split into (train+val) and (test)
    train_val_patients, test_patients = train_test_split(
        valid_patients,
        test_size=TEST_RATIO,
        random_state=RANDOM_SEED,
        shuffle=True
    )

    # 5b. VAL_RATIO is a fraction of ALL patients, so rescale it to the
    #     fraction of the remaining (train+val) pool.
    val_ratio_in_train_val = VAL_RATIO / (1.0 - TEST_RATIO)

    # 5c. Split (train+val) into (train) and (val)
    train_patients, val_patients = train_test_split(
        train_val_patients,
        test_size=val_ratio_in_train_val,
        random_state=RANDOM_SEED,
        shuffle=True
    )

    print(f"Total patients: {len(valid_patients)}")
    print(f"  Train patients: {len(train_patients)}")
    print(f"  Val patients:   {len(val_patients)}")
    print(f"  Test patients:  {len(test_patients)}")

    # --- 6. Create and save the split DataFrame ---
    split_df = pd.concat(
        [
            pd.DataFrame({'patient_id': train_patients, 'split': 'train'}),
            pd.DataFrame({'patient_id': val_patients, 'split': 'val'}),
            pd.DataFrame({'patient_id': test_patients, 'split': 'test'}),
        ]
    ).reset_index(drop=True)

    split_df.to_csv(OUTPUT_SPLIT_FILE, index=False)

    print(f"\nSuccessfully saved patient splits to {OUTPUT_SPLIT_FILE}")

# Build and save the patient-level split when this code is run directly
# (not when it is imported as a module).
if __name__ == "__main__":
    run_patient_split()

Found 274 unique patients with at least one valid file.
Total patients: 274
  Train patients: 191
  Val patients:   55
  Test patients:  28

Successfully saved patient splits to D:/VA/coding/project/cect/patient_splits.csv


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
import sys

# --- Configuration ---
# NOTE: FILE_CHECK_PATH and PATIENT_DATA_PATH are NOT defined in this cell;
# create_master_split_file() reads them from the earlier cell, which must run first.
OUTPUT_FILE = 'D:/VA/coding/project/cect/file_splits.csv'  # Output: one row per valid file ('patient_id', 'file_name', 'split')
# The three values below repeat the ones used for patient_splits.csv in the
# earlier cell; presumably they must stay identical for both outputs to
# describe the same split -- TODO confirm.
TEST_RATIO = 0.1   # Fraction of ALL patients held out as the test split
VAL_RATIO = 0.2    # Fraction of ALL patients used for validation
RANDOM_SEED = 42   # random_state passed to both train_test_split calls
# ---------------------

def create_master_split_file():
    """
    Write a master CSV that maps every valid file to its train/val/test split.

    Steps:
      1. Load PATIENT_DATA_PATH and FILE_CHECK_PATH (both names come from an
         earlier cell; they are not defined alongside this function).
      2. Collect every 'file_name' whose 'label' is True in the file-check
         table, registering both the '.nii' and the '.nii.gz' spelling.
      3. Keep the patient-data rows whose 'ct_path' base name is in that set.
      4. Split the sorted unique 'patient_id' values into test (TEST_RATIO),
         val (VAL_RATIO) and train (the remainder), seeded with RANDOM_SEED.
      5. Attach each patient's split to their files and save the de-duplicated
         'patient_id', 'file_name', 'split' columns to OUTPUT_FILE.

    Rows with a missing 'file_name' (file-check table) or a missing
    'patient_id' (patient table) are ignored.

    Returns:
        None. Progress is printed to stdout. A missing input file, no matching
        files, or no usable patient ids is reported on stderr and the function
        returns without writing anything.

    Raises:
        Only FileNotFoundError from loading is handled here; any other
        exception (from pandas or train_test_split) propagates to the caller.
    """

    # --- 1. Load patient_data and file_check ---
    try:
        df_patient = pd.read_csv(PATIENT_DATA_PATH)
        df_check = pd.read_csv(FILE_CHECK_PATH)
    except FileNotFoundError as e:
        print(f"Error: Could not find a required file. {e}", file=sys.stderr)
        return

    # --- 2. Get set of all valid file names ---
    # Empty file_name cells are read by pandas as float NaN, which has no
    # .endswith(); drop them so a single blank cell cannot abort the run.
    valid_names = (
        df_check.loc[df_check['label'].eq(True), 'file_name']
        .dropna()
        .astype(str)
    )

    # Register both spellings so '.nii' vs '.nii.gz' mismatches still match.
    valid_file_names_set = set()
    for name in valid_names:
        valid_file_names_set.add(name)
        if name.endswith('.nii.gz'):
            # Strip only the trailing '.gz'; str.replace would also rewrite
            # any earlier '.nii.gz' occurring inside the name.
            valid_file_names_set.add(name[:-len('.gz')])
        elif name.endswith('.nii'):
            valid_file_names_set.add(name + '.gz')

    # --- 3. Filter patient_data for valid files ---
    # 'file_name' is the base name of ct_path; a missing ct_path becomes the
    # text 'nan' and simply never matches.
    df_named = df_patient.assign(
        file_name=df_patient['ct_path'].map(lambda p: os.path.basename(str(p)))
    )
    df_valid_files = df_named[df_named['file_name'].isin(valid_file_names_set)]

    if df_valid_files.empty:
        print("Error: No files in patient_data.csv matched a valid file in file_check.csv", file=sys.stderr)
        return

    # A file without a patient_id cannot be assigned at patient level, and a
    # NaN mixed with string ids would make sorted() raise TypeError below.
    missing_id = df_valid_files['patient_id'].isna()
    if missing_id.any():
        print(
            f"Warning: Ignoring {int(missing_id.sum())} valid file(s) with no patient_id.",
            file=sys.stderr,
        )
        df_valid_files = df_valid_files[~missing_id]

    # --- 4. Get unique patients from this valid list ---
    # Sorting makes the input to the seeded shuffle independent of CSV row order.
    valid_patients = sorted(df_valid_files['patient_id'].unique())
    print(f"Found {len(valid_patients)} unique patients with at least one valid file.")
    if not valid_patients:
        print("Error: No valid file has a patient_id; cannot build a split.", file=sys.stderr)
        return

    # --- 5. Perform the patient splits ---
    train_val_patients, test_patients = train_test_split(
        valid_patients, test_size=TEST_RATIO, random_state=RANDOM_SEED, shuffle=True
    )
    # VAL_RATIO is a fraction of ALL patients, so rescale it to the fraction
    # of the remaining (train+val) pool.
    val_ratio_in_train_val = VAL_RATIO / (1.0 - TEST_RATIO)
    train_patients, val_patients = train_test_split(
        train_val_patients, test_size=val_ratio_in_train_val, random_state=RANDOM_SEED, shuffle=True
    )

    # --- 6. Create the patient-to-split mapping ---
    split_map_df = pd.concat(
        [
            pd.DataFrame({'patient_id': train_patients, 'split': 'train'}),
            pd.DataFrame({'patient_id': val_patients, 'split': 'val'}),
            pd.DataFrame({'patient_id': test_patients, 'split': 'test'}),
        ]
    )

    # --- 7. Merge the split info with our valid file list ---
    # Every file inherits the split of its patient, so all of a patient's
    # files land in the same split.
    df_final = df_valid_files.merge(split_map_df, on='patient_id', how='left')

    # --- 8. Save the final, clean file ---
    output_columns = ['patient_id', 'file_name', 'split']
    df_final_to_save = df_final[output_columns].drop_duplicates()

    df_final_to_save.to_csv(OUTPUT_FILE, index=False)

    print(f"\nSuccessfully created master split file: {OUTPUT_FILE}")
    print("This file contains all valid files and their assigned split.")

# Build and save the per-file split table when this code is run directly
# (not when it is imported as a module).
if __name__ == "__main__":
    create_master_split_file()

Found 274 unique patients with at least one valid file.

Successfully created master split file: D:/VA/coding/project/cect/file_splits.csv
This file contains all valid files and their assigned split.
