In [55]:
import matplotlib.pyplot as plt
import monai
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import os
import re
import glob
from collections import defaultdict

Must know:
- ID 207: Add a folder named "sax" inside "cine" and move all subfolders (the ones with the weird names) into it, so the layout is consistent with the other patients.

Good to know:
- All missing IDs have a "-" in the column "Folders (y/n)". Once you have determined the folder order and the first and last folder for one of them, change its "-" to "y" and run the code again.
- We should ask about ID 187 (it has no sax folder); for now it also has a "-" and is ignored.
- Otherwise scroll down for the function calls. I structured it this way so it's easy to make into a .py file, but notebook is easier for debugging etc.
- This took a lot longer than planned, so I have not yet made Eva's code function-based and connected it here, nor saved the files as .nii, but both should hopefully be very straightforward.

In [54]:
def parse_filename(filename):
    """
    Extracts sliceloc and triggertime values from a filename.

    Each value is matched as an optionally negative decimal number. A dot that
    only introduces a file extension (e.g. '..._triggertime_47.5.dcm') is not
    treated as part of the number, so such names parse instead of failing in
    float().

    Parameters:
        filename (str): Filename containing 'sliceloc_{val}_triggertime_{val}'.

    Returns:
        tuple[float | None, float | None]: Parsed sliceloc and triggertime as floats, or (None, None) if not found.
    """
    # Optional minus sign, then either "12", "12.", "12.5" or ".5".
    number = r"-?(?:\d+\.?\d*|\.\d+)"
    match = re.search(rf"sliceloc_({number})_triggertime_({number})", filename)
    if match:
        return float(match.group(1)), float(match.group(2))
    return None, None

In [86]:
def get_relevant_files_n(df, base_path):
    """
    Selects one relevant file per slice location for each patient based on ED frame and apex–base range.

    For each patient, searches {base_path}/{ID}/cine/sax/ (recursively) for files named
    like '...sliceloc_{val}_triggertime_{val}'. Keeps only slices whose location lies
    within the apex–base range (inclusive, in either direction) and, per slice location,
    selects the file with the smallest trigger time when 'ED frame' is 0 and the file
    with the largest trigger time otherwise.

    Rows whose 'ED frame', 'apex' or 'base' cannot be read as numbers (including empty
    cells) are skipped with a message, as are patients without a sax folder or without
    any parsable file.

    Parameters:
        df (pd.DataFrame): Rows of "ED_slices_and_timepoints.csv" for patients without
            series sub-structure; must provide the columns 'ID', 'ED frame', 'apex' and 'base'.
        base_path (str): Root path containing the patient folders.

    Returns:
        dict[str, list[str]]: Mapping from patient ID to selected file paths, ordered by
            ascending slice location.
    """
    patient_files = {}

    for _, row in df.iterrows():
        # iterrows() can hand back an integer ID as a float (all-numeric frame, or an
        # ID column that contains blanks); "207.0" would never match the folder "207".
        raw_id = row["ID"]
        if isinstance(raw_id, float) and raw_id.is_integer():
            raw_id = int(raw_id)
        pid = str(raw_id).strip()

        try:
            ed_slice = int(row["ED frame"])
            apex = float(row["apex"])
            base = float(row["base"])
        except (ValueError, TypeError):
            print(f"Could not extract values for ID {pid}")
            continue

        # float() accepts NaN (an empty CSV cell); every range comparison against NaN
        # is False, which would silently produce an empty selection.
        if pd.isna(apex) or pd.isna(base):
            print(f"Could not extract values for ID {pid}")
            continue

        folder = os.path.join(base_path, pid, "cine", "sax")
        if not os.path.isdir(folder):
            print(f"Warning: folder not found for patient {pid}")
            continue

        # Sorted so that ties between files with identical sliceloc and triggertime
        # are resolved the same way on every file system.
        files = sorted(p for p in Path(folder).rglob("*") if p.is_file())

        # Group by sliceloc → list of (file, triggertime)
        sliceloc_map = defaultdict(list)
        for f in files:
            sliceloc, triggertime = parse_filename(f.name)
            if sliceloc is not None and triggertime is not None:
                sliceloc_map[sliceloc].append((f, triggertime))

        if not sliceloc_map:
            print(f"Warning: no valid files for patient {pid}")
            continue

        lower, upper = sorted([apex, base])
        # Earliest frame when 'ED frame' is 0, latest frame otherwise.
        pick = min if ed_slice == 0 else max

        selected = []
        # Walk the slice locations in ascending order so the result does not depend
        # on directory traversal order.
        for sliceloc in sorted(sliceloc_map):
            if lower <= sliceloc <= upper:
                chosen, _ = pick(sliceloc_map[sliceloc], key=lambda item: item[1])
                selected.append(str(chosen))  # Only one per sliceloc

        patient_files[pid] = selected

    return patient_files



In [84]:
def get_relevant_files_y(df, base_path):
    """
    Selects one relevant file per folder-defined slice for each patient using ED frame.

    For each patient, looks inside {base_path}/{ID}/cine/sax/series_{folder}/ for files.
    The slices are identified by subfolder names (e.g. 'series_25').
    The column 'folder order' lists all available series (in order, separated by '-'),
    while 'apex' and 'base' define the first and last folder to include.

    When both 'apex' and 'base' occur in 'folder order', every series between them in
    that list (inclusive) is used, so non-monotonic series numbers such as
    '31-25-26-27' are handled. If either one is not listed, the selection falls back
    to all listed series whose number lies in the numeric range spanned by apex and base.

    Per series, the file with the smallest trigger time is chosen when 'ED frame' is 0
    and the file with the largest trigger time otherwise.

    Parameters:
        df (pd.DataFrame): DataFrame with columns 'ID', 'ED frame', 'apex', 'base', and 'folder order'.
        base_path (str): Root path containing the patient folders.

    Returns:
        dict[str, list[str]]: Mapping from patient ID to selected file paths, in 'folder order' order.
    """

    patient_files = {}

    for _, row in df.iterrows():
        # An ID column containing blanks is read as float; "207.0" would never match
        # the folder "207".
        raw_id = row["ID"]
        if isinstance(raw_id, float) and raw_id.is_integer():
            raw_id = int(raw_id)
        pid = str(raw_id).strip()

        try:
            ed_slice = int(row["ED frame"])
            apex = int(row["apex"])
            base = int(row["base"])
            folder_order = str(row["folder order"]).strip()
        except (ValueError, TypeError):
            print(f"Could not extract values for ID {pid}")
            continue

        if not folder_order or folder_order.lower() == "nan":
            continue

        sax_root = os.path.join(base_path, pid, "cine", "sax")
        if not os.path.isdir(sax_root):
            print(f"Warning: folder not found for patient {pid}")
            continue

        # Get ordered folder list (as ints); tolerate spaces around the separators,
        # e.g. "25 - 26 - 27".
        tokens = (part.strip() for part in folder_order.split("-"))
        order = [int(tok) for tok in tokens if tok.isdigit()]

        if apex in order and base in order:
            # apex/base mark the first and last folder *in the listed order*.
            first, last = sorted([order.index(apex), order.index(base)])
            selected_series = order[first:last + 1]
        else:
            # Fallback: numeric range filter.
            lower, upper = sorted([apex, base])
            selected_series = [x for x in order if lower <= x <= upper]

        # Earliest frame when 'ED frame' is 0, latest frame otherwise.
        pick = min if ed_slice == 0 else max
        selected = []

        for series in selected_series:
            series_path = os.path.join(sax_root, f"series_{series}")
            if not os.path.isdir(series_path):
                print(f"Warning: missing folder series_{series} for patient {pid}")
                continue

            # Sorted so ties between equal trigger times resolve the same way on
            # every file system.
            triggertimes = []
            for f in sorted(glob.glob(os.path.join(series_path, "*"))):
                _, tt = parse_filename(os.path.basename(f))
                if tt is not None:
                    triggertimes.append((f, tt))

            if not triggertimes:
                print(f"Warning: no valid files in series_{series} for patient {pid}")
                continue

            selected.append(pick(triggertimes, key=lambda item: item[1])[0])

        patient_files[pid] = selected

    return patient_files


In [87]:
# This would be main in .py

# Annotation table with one row per patient.
# For the future, once we structure our folders/files better we need to (probably) adjust this import
csv_file = "ED_slices_and_timepoints.csv"

# Root folder that contains one sub-folder per patient ID.
# Set the SSCP25_DATA_DIR environment variable to point at your local copy of the
# data; the path below is only used when that variable is not set.
base_path = os.environ.get(
    "SSCP25_DATA_DIR",
    "/Users/au698484/Documents/SSCP25_data/Data and scripts SSCP25 3/CMR_image_data/new data-dicom",
)

# Read in the csv and normalise header / flag formatting
df = pd.read_csv(csv_file)
df.columns = df.columns.str.strip()
df["Folders (y/n)"] = df["Folders (y/n)"].str.strip().str.lower()

# Split on folders y/n; rows with any other flag (e.g. "-") are ignored
df_y = df[df["Folders (y/n)"] == 'y'].reset_index(drop=True)
df_n = df[df["Folders (y/n)"] == 'n'].reset_index(drop=True)

files_n = get_relevant_files_n(df_n, base_path)
files_y = get_relevant_files_y(df_y, base_path)

In [88]:
# TODO: Save these files in .nii format and connect Eva's code (MONAI/doc/visualize.ipynb) and make that part function-based as well.