In [1]:
import pydicom



In [2]:
# view dcm properties

IMAGE_PATH = '../Datasets/Lung-PET-CT-Dx/manifest-1608669183333/Lung-PET-CT-Dx/Lung_Dx-A0001/04-04-2007-NA-Chest-07990/2.000000-5mm-40805/1-01.dcm'

properties = pydicom.dcmread(IMAGE_PATH)
print(properties)

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 204
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 1.3.6.1.4.1.14519.5.2.1.6655.2359.911257607240619696148712587012
(0002, 0010) Transfer Syntax UID                 UI: Implicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.3.6.1.4.1.22213.1.143
(0002, 0013) Implementation Version Name         SH: '0.5'
(0002, 0016) Source Application Entity Title     AE: 'POSDA'
-------------------------------------------------
(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL', 'HELIX']
(0008, 0012) Instance Creation Date              DA: '20070404'
(0008, 0013) Instance Creation Time              TM: '190114'
(0008, 0016) SOP Class UID 

In [3]:
import os
import pydicom
from collections import defaultdict
from tqdm import tqdm


In [4]:
# dataset path
DATASET_PATH = "../Datasets/Lung-PET-CT-Dx/manifest-1608669183333/Lung-PET-CT-Dx"

sop_counts = defaultdict(int)

# collect all DICOM files
all_dcm_files = [os.path.join(root, f)
                 for root, _, files in os.walk(DATASET_PATH)
                 for f in files if f.endswith(".dcm")]

for filepath in tqdm(all_dcm_files, desc="Scanning DICOM files"):

    try:
        dcm = pydicom.dcmread(filepath, stop_before_pixels=True)
        sop_uid = getattr(dcm.file_meta, "MediaStorageSOPClassUID", None)
        if sop_uid:
            sop_counts[sop_uid.name] += 1  # human-readable name
    except Exception as e:
        print(f"Error reading {filepath}: {e}")

#  summary
print("\n===== SOP Class UID Distribution =====\n")
for sop_name, count in sop_counts.items():
    print(f"{sop_name}: {count} files")

Scanning DICOM files: 100%|██████████| 251135/251135 [03:36<00:00, 1160.36it/s]


===== SOP Class UID Distribution =====

Positron Emission Tomography Image Storage: 45676 files
Secondary Capture Image Storage: 57002 files
CT Image Storage: 148457 files





In [5]:
img_type_counts = defaultdict(int)

# collect all DICOM files
all_dcm_files = [os.path.join(root, f)
                 for root, _, files in os.walk(DATASET_PATH)
                 for f in files if f.endswith(".dcm")]

for filepath in tqdm(all_dcm_files, desc="Scanning DICOM files"):
    try:
        dcm = pydicom.dcmread(filepath, stop_before_pixels=True)

        # extract Image Type (0008,0008)
        img_type = getattr(dcm, "ImageType", None)
        if img_type:

            # convert to tuple so it's hashable
            img_type_counts[tuple(img_type)] += 1
    except Exception as e:
        print(f"Error reading {filepath}: {e}")

# summary
print("\n===== Image Type Distribution =====\n")
for img_type, count in sorted(img_type_counts.items(), key=lambda x: -x[1]):
    print(f"{list(img_type)}: {count} files")

Scanning DICOM files: 100%|██████████| 251135/251135 [03:35<00:00, 1164.47it/s]


===== Image Type Distribution =====

['ORIGINAL', 'PRIMARY', 'AXIAL', 'CT_SOM5 SPI']: 98581 files
['DERIVED', 'SECONDARY', 'OTHER', 'CSA FUSED MPR', '', 'CSAPARALLEL', 'AXIAL', 'CT_SOM5 SPI']: 56599 files
['ORIGINAL', 'PRIMARY']: 45676 files
['ORIGINAL', 'PRIMARY', 'AXIAL']: 44829 files
['ORIGINAL', 'PRIMARY', 'AXIAL', 'HELIX']: 2157 files
['ORIGINAL', 'SECONDARY', 'AXIAL']: 2060 files
['ORIGINAL', 'PRIMARY', 'AXIAL', 'GSI QC']: 672 files
['DERIVED', 'SECONDARY', 'OTHER', 'CSA FUSED MPR TK', '', 'CSAPARALLEL', 'AXIAL', 'CT_SOM5 SPI']: 403 files
['ORIGINAL', 'PRIMARY', 'AXIAL', 'GSI MONO']: 156 files
['ORIGINAL', 'PRIMARY', 'AXIAL', 'CT_SOM5 RTD']: 1 files
['ORIGINAL', 'PRIMARY', 'LOCALIZER']: 1 files





In [6]:
# counters
slice_counts = {"PET": defaultdict(int), "CT": defaultdict(int)}
patient_counts = {"PET": defaultdict(set), "CT": defaultdict(set)}

# disease map
disease_map = {
    "A": "Adenocarcinoma",
    "B": "Small Cell Carcinoma",
    "E": "Large Cell Carcinoma",
    "G": "Squamous Cell Carcinoma"
}

# valid SOP classes
SOP_UIDS = {
    "PET": "1.2.840.10008.5.1.4.1.1.128",
    "CT": "1.2.840.10008.5.1.4.1.1.2"
}

def is_valid_image(dcm):
    
    # check if image is ORIGINAL + PRIMARY and not derived/reformatted
    if not hasattr(dcm, "ImageType"):
        return False
    
    img_type = [s.upper() for s in dcm.ImageType]
    if "ORIGINAL" not in img_type or "PRIMARY" not in img_type:
        return False
    
    # exclude reconstructions
    bad_keywords = ["DERIVED", "SECONDARY", "LOCALIZER", "QC", "MONO", "MPR"]
    if any(bad in img_type for bad in bad_keywords):
        return False
    
    return True


# walk through dataset
for root, _, files in os.walk(DATASET_PATH):
    for f in files:
        if not f.endswith(".dcm"):
            continue
        filepath = os.path.join(root, f)

        try:
            dcm = pydicom.dcmread(filepath, stop_before_pixels=True)
            modality = getattr(dcm, "Modality", "")
            sop_uid = str(getattr(dcm, "SOPClassUID", ""))

            if modality == "PT" and sop_uid == SOP_UIDS["PET"] and is_valid_image(dcm):
                patient_id = str(dcm.PatientID)
                disease_code = patient_id.split("-")[1][0]

                if disease_code in disease_map:
                    slice_counts["PET"][disease_code] += 1
                    patient_counts["PET"][disease_code].add(patient_id)

            elif modality == "CT" and sop_uid == SOP_UIDS["CT"] and is_valid_image(dcm):
                patient_id = str(dcm.PatientID)
                disease_code = patient_id.split("-")[1][0]
                
                if disease_code in disease_map:
                    slice_counts["CT"][disease_code] += 1
                    patient_counts["CT"][disease_code].add(patient_id)
        except Exception as e:
            print(f"Error reading {filepath}: {e}")

# print results
print("===== PET Distribution =====\n")
for code, disease in disease_map.items():
    num_slices = slice_counts["PET"][code]
    num_patients = len(patient_counts["PET"][code])
    print(f"{disease}: {num_slices} slices from {num_patients} patients")

print("\n===== CT Distribution =====\n")
for code, disease in disease_map.items():
    num_slices = slice_counts["CT"][code]
    num_patients = len(patient_counts["CT"][code])
    print(f"{disease}: {num_slices} slices from {num_patients} patients")

===== PET Distribution =====

Adenocarcinoma: 33753 slices from 95 patients
Small Cell Carcinoma: 2673 slices from 9 patients
Large Cell Carcinoma: 0 slices from 0 patients
Squamous Cell Carcinoma: 9250 slices from 29 patients

===== CT Distribution =====

Adenocarcinoma: 109770 slices from 251 patients
Small Cell Carcinoma: 10549 slices from 38 patients
Large Cell Carcinoma: 808 slices from 5 patients
Squamous Cell Carcinoma: 25269 slices from 61 patients


In [7]:
import shutil

In [8]:
OUTPUT_PATH = "../raw/PET"

# counters
slice_counts = defaultdict(int)
patient_counts = defaultdict(set)

# ensure disease folders exist
for code in disease_map:
    os.makedirs(os.path.join(OUTPUT_PATH, code), exist_ok=True)

# collect all DICOM files
all_dcm_files = [os.path.join(root, f)
                 for root, _, files in os.walk(DATASET_PATH)
                 for f in files if f.endswith(".dcm")]

# process files with tqdm
for filepath in tqdm(all_dcm_files, desc="Processing PET DICOM files"):
    try:
        dcm = pydicom.dcmread(filepath, stop_before_pixels=True)
        modality = getattr(dcm, "Modality", "")
        sop_uid = str(getattr(dcm, "SOPClassUID", ""))

        if modality == "PT" and sop_uid == SOP_UIDS["PET"] and is_valid_image(dcm):
            patient_id = str(dcm.PatientID)  # Lung_Dx-A0001
            parts = patient_id.split("-")

            if len(parts) < 2:
                continue

            disease_code = parts[1][0]  # A, B, E, G
            if disease_code in disease_map:

                # add dataset prefix
                full_patient_id = f"Lung-PET-CT-Dx-{parts[1]}"
                dest_dir = os.path.join(OUTPUT_PATH, disease_code, full_patient_id)
                os.makedirs(dest_dir, exist_ok=True)
                dest_file = os.path.join(dest_dir, os.path.basename(filepath))

                if not os.path.exists(dest_file):
                    shutil.copy2(filepath, dest_file)

                slice_counts[disease_code] += 1
                patient_counts[disease_code].add(full_patient_id)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")

# summary
print("\n===== PET Extraction Summary =====\n")
for code, disease in disease_map.items():
    print(f"{disease}: {slice_counts[code]} slices copied from {len(patient_counts[code])} patients")

Processing PET DICOM files: 100%|██████████| 251135/251135 [04:08<00:00, 1009.25it/s]


===== PET Extraction Summary =====

Adenocarcinoma: 33753 slices copied from 95 patients
Small Cell Carcinoma: 2673 slices copied from 9 patients
Large Cell Carcinoma: 0 slices copied from 0 patients
Squamous Cell Carcinoma: 9250 slices copied from 29 patients





In [9]:
OUTPUT_PATH = "../raw/CT"

# counters
slice_counts = defaultdict(int)
patient_counts = defaultdict(set)

# ensure disease folders exist
for code in disease_map:
    os.makedirs(os.path.join(OUTPUT_PATH, code), exist_ok=True)

# collect all dicom files
all_dcm_files = [os.path.join(root, f)
                 for root, _, files in os.walk(DATASET_PATH)
                 for f in files if f.endswith(".dcm")]

# process files
for filepath in tqdm(all_dcm_files, desc="Processing CT DICOM files"):
    try:
        dcm = pydicom.dcmread(filepath, stop_before_pixels=True)
        modality = getattr(dcm, "Modality", "")
        sop_uid = str(getattr(dcm, "SOPClassUID", ""))

        if modality == "CT" and sop_uid == SOP_UIDS["CT"] and is_valid_image(dcm):

            patient_id = str(dcm.PatientID) # Lung_Dx-A0001
            parts = patient_id.split("-")
            if len(parts) < 2:
                continue

            disease_code = parts[1][0]
            if disease_code in disease_map:
                # add dataset prefix
                full_patient_id = f"Lung-PET-CT-Dx-{parts[1]}"
                dest_dir = os.path.join(OUTPUT_PATH, disease_code, full_patient_id)
                os.makedirs(dest_dir, exist_ok=True)
                dest_file = os.path.join(dest_dir, os.path.basename(filepath))

                if not os.path.exists(dest_file):
                    shutil.copy2(filepath, dest_file)

                slice_counts[disease_code] += 1
                patient_counts[disease_code].add(full_patient_id)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")

# summary
print("\n===== CT Extraction Summary =====\n")
for code, disease in disease_map.items():
    print(f"{disease}: {slice_counts[code]} slices copied from {len(patient_counts[code])} patients")

Processing CT DICOM files: 100%|██████████| 251135/251135 [05:37<00:00, 744.59it/s] 


===== CT Extraction Summary =====

Adenocarcinoma: 109770 slices copied from 251 patients
Small Cell Carcinoma: 10549 slices copied from 38 patients
Large Cell Carcinoma: 808 slices copied from 5 patients
Squamous Cell Carcinoma: 25269 slices copied from 61 patients



