In [1]:
import matplotlib.pyplot as plt
import monai
import torch
import numpy as np
import pandas as pd
from pathlib import Path
import os
import re
import glob
from collections import defaultdict
import pydicom
import nibabel as nib
from scipy.ndimage import zoom
from monai.bundle import load_bundle_config
from huggingface_hub import hf_hub_download
import cv2 #The import-call for cv2 is "pip install opencv-python" (not cv2)

### Segmentation of DICOM files

Temporary Information for Nina during coding:

* path: MAD_OUS_sorted/MAD_x/Study01/cine/SAX/slice_y/framez.dcm where x is ID, y is the slice number and z is the frame number
* the csv will have ID (x), ED frame (z), apex and base (y_a:y_b)

**Challenges**

* sometimes there are other entries besides the slice_y folders in the folder (e.g. ID 1)
* double series (see csv file): the files have a different size (sort based on that?) - but there can also be an extra folder (mostly slice 7)
* first do normal case and then double series, sort based on y/n in "Double Series (y/n)" in csv

TODO: write a single function that always checks the file size instead of relying on the y/n info in the csv (which might miss cases); also identify extra folders by file size (168 KB)


**Note:** It probably also makes sense to combine this file with the other segmentation notebook, but for debugging purposes it is easier to keep them separate for now.

In [34]:
def is_excluded_size(f, pid=None):
    """
    Decide whether a file should be excluded based on its size in bytes.

    Parameters:
        f: Object exposing ``stat()`` whose result has ``st_size`` (e.g. a ``pathlib.Path``).
        pid: Patient ID used to look up a patient-specific size window. It is
            converted with ``str()`` before the lookup; any ID without an
            override (including ``None``) uses the default window.

    Returns:
        bool: True if the file size lies inside the (inclusive) exclusion
        window, or if the file does not exist; False otherwise.
    """
    # Patient-specific exclusion windows (bytes), keyed by the PID as a string.
    size_windows = {
        "4": (102000, 104000),     # PID 4 → exclude ~103 KB
        "169": (132000, 134000),   # PID 169 → exclude ~133 KB
    }

    # Fallback window covers files that show as 168 KB.
    min_bytes, max_bytes = size_windows.get(str(pid), (167000, 169000))

    try:
        n_bytes = f.stat().st_size
    except FileNotFoundError:
        # A missing file is reported as excluded.
        return True

    return min_bytes <= n_bytes <= max_bytes



In [37]:
def _frame_sort_key(path):
    """
    Sort key ordering frame files by the last number in their file stem.

    Plain lexicographic sorting puts ``frame10.dcm`` before ``frame2.dcm``, so
    list positions would not follow the frame numbers. Files without any digits
    in the stem are placed last, ordered by name.
    """
    digit_runs = re.findall(r"\d+", path.stem)
    if digit_runs:
        return (0, int(digit_runs[-1]), path.name)
    return (1, 0, path.name)


def get_relevant_files_new_singleSeries(df, base_path):
    """
    Selects the ED frame DICOM file for each slice between apex and base for each patient.

    For every slice folder the ``frame*.dcm`` files are ordered by frame number,
    files rejected by ``is_excluded_size`` are removed, and the ED frame is then
    used as a 0-based position in the remaining list.

    Parameters:
        df (pd.DataFrame): DataFrame with columns ['ID', 'ED frame', 'apex', 'base']
        base_path (str): Root directory containing patient folders (e.g., 'MAD_OUS_sorted')

    Returns:
        dict[str, list[str]]: Mapping from patient ID to selected file paths.
        Rows whose values cannot be parsed are skipped. If the same ID occurs
        in several rows, the last row wins.
    """
    patient_files = {}
    root = Path(base_path)

    for _, row in df.iterrows():
        try:
            pid = str(row["ID"]).strip()
            ed_frame = int(row["ED frame"])
            apex = int(row["apex"])
            base = int(row["base"])
        except (ValueError, TypeError):
            print("There was an error in reading the csv file.")
            continue

        # apex/base may be given in either order; iterate from the smaller to the larger.
        lower, upper = sorted([apex, base])
        selected = []
        sax_dir = root / f"MAD_{pid}" / "Study01" / "cine" / "SAX"

        for slice_num in range(lower, upper + 1):
            slice_folder = sax_dir / f"slice_{slice_num}"
            if not slice_folder.exists():
                print(f"Folder slice_{slice_num} for patient {pid} can not be found and is skipped.")
                continue

            # Order numerically so that list position follows the frame number
            # even when the file names are not zero-padded.
            files = sorted(slice_folder.glob("frame*.dcm"), key=_frame_sort_key)
            if not files:
                print(f"Folder slice_{slice_num} for patient {pid} does not contain any frame*.dcm files.")
                continue

            # Filter out files of unwanted size
            valid_files = [f for f in files if not is_excluded_size(f, pid)]

            if not valid_files:
                print(f"skipping patient {pid} and slice {slice_num} because all files are not valid (low quality added series)")
                continue  # every file had an excluded size — nothing usable in this slice

            # Use reindexed list to get ED frame
            if ed_frame < 0:
                # A negative value would otherwise silently index from the end of the list.
                print(f"ED_Frame {ed_frame} is negative and therefore invalid, Patient {pid}, Slice {slice_num}")
            elif ed_frame < len(valid_files):
                selected.append(str(valid_files[ed_frame]))
                print(f"Adding patient {pid} and slice {slice_num}")
            else:
                print(f"ED_Frame {ed_frame} number larger than number of files in folder, Patient {pid}, Slice {slice_num}")

        patient_files[pid] = selected

    return patient_files


In [38]:
# This would be main in .py

# Read the annotation table; get_relevant_files_new_singleSeries uses the
# 'ID', 'ED frame', 'apex' and 'base' columns.
csv_file = "newDataSegmentation.csv"
df = pd.read_csv(csv_file)

# Strip surrounding whitespace from the header names so the column lookups match.
df.columns = df.columns.str.strip()

display(df)

# Root folder of the sorted DICOM data. Set the MAD_OUS_DATA_DIR environment
# variable to point at your own copy; otherwise the previous hard-coded
# location is used.
base_path = os.environ.get(
    "MAD_OUS_DATA_DIR",
    "/Users/au698484/Documents/SSCP25_data/newData/MAD_OUS_sorted",
)

files = get_relevant_files_new_singleSeries(df, base_path)



Unnamed: 0,ID,ED frame,apex,base,Folder order,Folders (y/n),Double Series (y/n),Slice(s) of Double Series,Notes
0,1,29,9,3,9-8-7-6-5-4-3,y,,,Dark images
1,3,0,12,5,12-11-10-9-8-7-6-5,y,,,Slice 7 has bad quality (20 time frames)
2,4,57,10,2,10-9-8-7-6-5-4-3-2,y,,,Slice 6 has bad quality (23 time frames)
3,8,0,14,4,14-13-12-11-10-9-8-7-6-5-4,y,,,Slice 8 has bad quality (26 time frames)
4,12,0,8,3,8-7-6-5-4-3,y,,,
...,...,...,...,...,...,...,...,...,...
75,178,0,10,5,10-9-8-7-6-5,y,y,6.0,
76,180,29,11,5,11-10-9-8-7-6-5,y,n,,
77,181,29,10,6,10-9-8-7-6,y,n,,
78,182,38,11,4,11-10-9-8-7-6-5-4,y,y,6.0,


Adding patient 1 and slice 3
Adding patient 1 and slice 4
Adding patient 1 and slice 5
Adding patient 1 and slice 6
Adding patient 1 and slice 7
Adding patient 1 and slice 8
Adding patient 1 and slice 9
Adding patient 3 and slice 5
Adding patient 3 and slice 6
skipping patient 3 and slice 7 because all files are not valid (low quality added series)
Adding patient 3 and slice 8
Adding patient 3 and slice 9
Adding patient 3 and slice 10
Adding patient 3 and slice 11
Adding patient 3 and slice 12
Adding patient 4 and slice 2
Adding patient 4 and slice 3
Adding patient 4 and slice 4
Adding patient 4 and slice 5
skipping patient 4 and slice 6 because all files are not valid (low quality added series)
ED_Frame 57 number larger than number of files in folder, Patient 4, Slice 7
Adding patient 4 and slice 8
Adding patient 4 and slice 9
Adding patient 4 and slice 10
Adding patient 8 and slice 4
Adding patient 8 and slice 5
Adding patient 8 and slice 6
Adding patient 8 and slice 7
skipping patie