In [1]:
import matplotlib.pyplot as plt
import monai
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import os
import re
import glob
from collections import defaultdict
import pydicom
import nibabel as nib
from scipy.ndimage import zoom
from monai.bundle import load_bundle_config
from huggingface_hub import hf_hub_download
import cv2 #The import-call for cv2 is "pip install opencv-python" (not cv2)

### Segmentation of DICOM files

Temporary Information for Nina during coding:

* path: MAD_OUS_sorted/MAD_x/Study01/cine/SAX/slice_y/framez.dcm where x is ID, y is the slice number and z is the frame number
* the csv will have ID (x), ED frame (z), apex and base (y_a:y_b)

**Challenges**

* sometimes other files besides slice_y in file (e.g ID 1)
* double series (see csv-file), different size (sort based on that?)
* first do normal case and then double series, sort based on y/n in "Double Series (y/n)" in csv


IT ALSO PROBABLY MAKES SENSE TO COMBINE THIS FILE WITH THE OTHER SEGMENTATION NOTEBOOK; BUT FOR DEBUGGING PURPOSES NOW EASIER!

In [4]:
def get_relevant_files_new_singleSeries(df, base_path):
    """
    Selects the ED frame DICOM file for each slice between apex and base for each patient.

    Parameters:
        df (pd.DataFrame): DataFrame with columns ['ID', 'ED frame', 'apex', 'base']
        base_path (str): Root directory containing patient folders (e.g., 'MAD_OUS_sorted')

    Returns:
        dict[str, list[str]]: Mapping from patient ID to selected file paths.
    """
    patient_files = {}

    for _, row in df.iterrows():
        try:
            pid = str(row["ID"]).strip()
            ed_frame = int(row["ED frame"])
            apex = int(row["apex"])
            base = int(row["base"])
            print(pid, ed_frame)
        except (ValueError, TypeError):
            print("There was an error in reading the csv file.")
            continue 

        lower, upper = sorted([apex, base])
        selected = []

        for slice_num in range(lower, upper + 1):
            fpath = Path(base_path) / f"MAD_{pid}" / "Study01" / "cine" / "SAX" / f"slice_{slice_num}" / f"frame{ed_frame}.dcm"
            print(slice_num)
            print(fpath)
            """
            if fpath.exists():
                selected.append(str(fpath))
            else:
                print(f"Missing: {fpath}")
            """

        patient_files[pid] = selected

    return patient_files


In [None]:
def get_relevant_files_new_doubleSeries(df, base_path):
    """
    Selects the correct DICOM frames from mixed-series folders by filtering on file size.
    Keeps only the larger-sized series and reindexes frames accordingly.

    Parameters:
        df (pd.DataFrame): DataFrame with columns ['ID', 'ED frame', 'apex', 'base']
        base_path (str): Base path to patient folders

    Returns:
        dict[str, list[str]]: Mapping from patient ID to selected file paths
    """
    patient_files = {}

    for _, row in df.iterrows():
        pid = str(row["ID"]).strip()
        try:
            ed_frame = int(row["ED frame"])
            apex = int(float(row["apex"]))
            base = int(float(row["base"]))
        except (ValueError, TypeError):
            continue

        lower, upper = sorted([apex, base])
        selected = []

        for slice_num in range(lower, upper + 1):
            slice_folder = Path(base_path) / f"MAD_{pid}" / "Study01" / "cine" / "SAX" / f"slice_{slice_num}"
            if not slice_folder.exists():
                print(f"Missing folder: {slice_folder}")
                continue

            files = sorted(slice_folder.glob("frame*.dcm"))
            if not files:
                print(f"No frames in {slice_folder}")
                continue

            # Group files by file size
            size_map = defaultdict(list)
            for f in files:
                try:
                    size = f.stat().st_size
                    size_map[size].append(f)
                except FileNotFoundError:
                    continue

            if len(size_map) == 1:
                # Only one series — normal behavior -> should not be marked y in csv
                print("There is only one file-size in the folder. Is the y in Double Series in the csv a mistake? PID:", pid)
            elif len(size_map) == 2:
                # Mixed series: keep the one with larger filesize
                largest_size = max(size_map.keys())
                correct_series = sorted(size_map[largest_size])
            else:
                print(f"Unexpected number ({len(size_map)}) of file sizes in {slice_folder}. Please check PID: {pid}")
                continue

            # Use re-indexed list to match ED frame
            if ed_frame < len(correct_series):
                selected.append(str(correct_series[ed_frame]))
            else:
                print(f"ED frame {ed_frame} out of range in {slice_folder}")

        patient_files[pid] = selected

    return patient_files


In [None]:
# This would be main in .py

# read in csv split on folders y/n
csv_file = "newDataSegmentation_Placeholder.csv" 
df = pd.read_csv(csv_file)
#display(df)

df.columns = df.columns.str.strip()
df["Double Series (y/n)"] = df["Double Series (y/n)"].str.strip().str.lower()

df_double = df[df["Double Series (y/n)"] == 'y'].reset_index(drop=True)
df_normal = df[df["Double Series (y/n)"] == 'n'].reset_index(drop=True)

display(df_normal)

# Change this based on where you store the data
base_path = "/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted"

files_normal = get_relevant_files_new_singleSeries(df_normal, base_path)
files_double = get_relevant_files_new_doubleSeries(df_double, base_path)



Unnamed: 0,ID,ED frame,apex,base,Folder order,Folders (y/n),Double Series (y/n),Slice(s) of Double Series,Notes
0,142,29,10,4,10-9-8-7-6-5-4,y,n,,
1,144,38,13,6,13-12-11-10-9-8-7-6,y,n,,Slice 7 should be ignored I think
2,145,35,12,3,12-11-10-9-8-7-6-5-4-3,y,n,,Slice 7 should be ignored I think
3,149,0,13,2,13-12-11-10-9-8-7-6-5-4-3-2,y,n,,Slice 7 should be ignored I think
4,152,0,12,5,12-11-10-9-8-7-6-5,y,n,,Slice 7 should be ignored I think
5,157,0,13,4,13-12-11-10-9-8-7-6-5-4,y,n,,Slice 7 should be ignored I think
6,158,29,10,3,10-9-8-7-6-5-4-3,y,n,,
7,160,1,11,4,11-10-9-8-7-6-5-4,y,n,,Slice 7 should be ignored I think
8,161,24,14,5,14-13-12-11-10-9-8-7-6-5,y,n,,
9,162,1,12,6,12-11-10-9-8-7-6,y,n,,Slice 7 should be ignored I think


142 29
4
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_4/frame29.dcm
5
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_5/frame29.dcm
6
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_6/frame29.dcm
7
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_7/frame29.dcm
8
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_8/frame29.dcm
9
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_9/frame29.dcm
10
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_142/Study01/cine/SAX/slice_10/frame29.dcm
144 38
6
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_144/Study01/cine/SAX/slice_6/frame38.dcm
7
/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted/MAD_144/Study01/cine/SAX/slice_7/frame38.dcm
8
/Users/au6