In [1]:
import os
import pydicom
import numpy as np
import matplotlib.pyplot as plt
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

# Root directory containing the DICOM folders. Set the DICOM_ROOT environment variable to
# point the notebook at another copy of the data; if it is unset or empty, the cluster
# path below is used.
dicom_root = os.environ.get('DICOM_ROOT') or '/gpfs/data/mankowskilab/HCC_Recurrence/dicom'

Matplotlib created a temporary cache directory at /tmp/matplotlib-7_qzfeov because the default path (/gpfs/home/wz1492/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [2]:
# Function to get all DICOM file paths
def get_dicom_files(root_dir):
    """Recursively collect the paths of all ``.dcm`` files under ``root_dir``.

    Sub-directories are visited in sorted order. Within each directory, files whose
    stem consists only of decimal digits come first in numeric order (``2.dcm`` before
    ``10.dcm``), followed by every other ``.dcm`` name in lexicographic order.

    Args:
        root_dir: Directory to search; it is traversed with ``os.walk``.

    Returns:
        list[str]: One path per matching file, built by joining the containing
        directory with the file name. Empty when nothing matches.
    """
    def _sort_key(name):
        stem = os.path.splitext(name)[0]
        # Tag every key with a group number so integer keys and string keys are never
        # compared with each other (that comparison raises TypeError in Python 3).
        # isdecimal() is checked on the same stem that int() parses, so int() cannot fail.
        if stem.isdecimal():
            return (0, int(stem))
        return (1, name)

    dicom_files = []
    for root, dirs, files in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a reproducible order.
        dirs.sort()
        # Filter before sorting so unrelated files cannot influence or break the ordering.
        dcm_names = [name for name in files if name.endswith('.dcm')]
        for name in sorted(dcm_names, key=_sort_key):
            dicom_files.append(os.path.join(root, name))
    return dicom_files
# Load DICOM files: walk dicom_root once and keep the resulting path list in
# `dicom_files`, which the metadata-export cell further down reads.
dicom_files = get_dicom_files(dicom_root)
print(f"Found {len(dicom_files)} DICOM files.")

Found 139322 DICOM files.


In [4]:
import csv

# Function to extract metadata from a DICOM file, excluding PixelData
def extract_metadata(file_path, tags):
    """Read the header of one DICOM file and pick out the requested attributes.

    The file is read with ``stop_before_pixels=True``, and a tag named ``"PixelData"``
    is skipped even when it is requested.

    Args:
        file_path: Path of the DICOM file to read.
        tags: Iterable of attribute names to look up on the dataset.

    Returns:
        dict | None: Mapping of each requested tag (except ``"PixelData"``) to the
        dataset attribute of that name, or ``None`` when the attribute is absent.
        Returns ``None`` instead of a dict if anything raises; the error is printed.
    """
    try:
        dataset = pydicom.dcmread(file_path, stop_before_pixels=True)
        header_values = {}
        for keyword in tags:
            if keyword == "PixelData":
                continue
            # Attributes the file does not carry are recorded as None rather than failing.
            header_values[keyword] = getattr(dataset, keyword, None)
    except Exception as e:
        # Best-effort: report the unreadable file and let the caller skip it.
        print(f"Error reading {file_path}: {e}")
        return None
    return header_values

# Function to extract metadata for all files and save to CSV
def save_metadata_to_csv(dicom_files, tags, output_csv):
    """Write one CSV row of header metadata for every readable DICOM file.

    The header row is ``tags`` as given. Files for which ``extract_metadata`` returns
    ``None`` are skipped, so the CSV can have fewer rows than ``dicom_files`` has
    entries. Keys that ``extract_metadata`` leaves out (it omits ``"PixelData"``) are
    written as empty cells by ``csv.DictWriter``.

    Args:
        dicom_files: Iterable of DICOM file paths, processed in the given order.
        tags: Sequence of attribute names; used as the CSV field names.
        output_csv: Destination path; an existing file is overwritten.
    """
    # newline='' lets the csv module control line endings. The explicit UTF-8 encoding
    # keeps non-ASCII header values from raising UnicodeEncodeError under a non-UTF-8
    # locale partway through a long run.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=tags)
        writer.writeheader()

        for file_path in tqdm(dicom_files, desc="Processing DICOM files"):
            metadata = extract_metadata(file_path, tags)
            # Only None signals a failed read; an empty dict is still a successfully
            # read file and keeps its (blank) row.
            if metadata is not None:
                writer.writerow(metadata)

# Save the metadata to a CSV file (relative path, so it is created in the current
# working directory).
# NOTE(review): `unique_tags` is not defined in any cell visible here, and the execution
# counts jump from In [2] to In [4] — confirm it is defined in an earlier cell so this
# runs on a fresh kernel.
output_csv = 'dicom_metadata.csv'
save_metadata_to_csv(dicom_files, unique_tags, output_csv)

print(f"Metadata saved to {output_csv}")


Processing DICOM files: 100%|██████████| 139322/139322 [39:53<00:00, 58.20it/s] 

Metadata saved to dicom_metadata.csv



