some dicom images are here in this directory
`/gpfs/data/mankowskilab/HCC_Recurrence/dicom`

In [40]:
import os
import pydicom
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict

# Set the base directory
base_dir = '/gpfs/data/mankowskilab/HCC_Recurrence/dicom'

def get_all_dicom_files(base_path):
    """
    Recursively find all DICOM files in the directory structure
    Returns a dictionary with patient IDs as keys and lists of DICOM file paths as values
    """
    dicom_files = defaultdict(list)
    
    for patient_dir in os.listdir(base_path):
        patient_path = os.path.join(base_path, patient_dir)
        if os.path.isdir(patient_path):
            for root, _, files in os.walk(patient_path):
                for file in files:
                    if file.endswith('.dcm'):
                        dicom_files[patient_dir].append(os.path.join(root, file))
    
    return dicom_files

def extract_dicom_metadata(dicom_path):
    """
    Extract relevant metadata from a DICOM file
    Returns a dictionary of metadata fields
    """
    try:
        dcm = pydicom.dcmread(dicom_path)
        return {
            'PatientID': getattr(dcm, 'PatientID', 'Unknown'),
            'Modality': getattr(dcm, 'Modality', 'Unknown'),
            'StudyDate': getattr(dcm, 'StudyDate', 'Unknown'),
            'SeriesDescription': getattr(dcm, 'SeriesDescription', 'Unknown'),
            'ImageType': getattr(dcm, 'ImageType', ['Unknown'])[0],
            'PixelSpacing': getattr(dcm, 'PixelSpacing', [0, 0]),
            'SliceThickness': getattr(dcm, 'SliceThickness', 0),
            'Manufacturer': getattr(dcm, 'Manufacturer', 'Unknown'),
            'ImageShape': dcm.pixel_array.shape if hasattr(dcm, 'pixel_array') else (0, 0)
        }
    except Exception as e:
        print(f"Error reading {dicom_path}: {str(e)}")
        return None

def save_plot(fig, filename):
    """
    Save matplotlib figure to file and close it to free memory
    """
    fig.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close(fig)


In [41]:

# Get all DICOM files
print("Finding all DICOM files...")
dicom_files = get_all_dicom_files(base_dir)

# Create a list to store metadata from all files
metadata_list = []

# Extract metadata from each DICOM file
print("Extracting metadata...")
for patient_id, file_list in tqdm(dicom_files.items()):
    for file_path in file_list:
        metadata = extract_dicom_metadata(file_path)
        if metadata:
            metadata['FilePath'] = file_path
            metadata_list.append(metadata)

# Convert to DataFrame for analysis
df = pd.DataFrame(metadata_list)

# Print basic statistics
print("\nBasic Dataset Statistics:")
print(f"Total number of patients: {len(dicom_files)}")
print(f"Total number of DICOM files: {len(df)}")

# Plot modality distribution
fig, ax = plt.subplots(figsize=(10, 6))
df['Modality'].value_counts().plot(kind='bar', ax=ax)
plt.title('Distribution of Image Modalities')
plt.xlabel('Modality')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
save_plot(fig, 'modality_distribution.png')

# Plot image size distribution
fig, ax = plt.subplots(figsize=(12, 6))
image_sizes = df['ImageShape'].value_counts().head(10)
plt.bar(range(len(image_sizes)), image_sizes.values)
plt.title('Top 10 Most Common Image Dimensions')
plt.xlabel('Image Dimensions')
plt.ylabel('Count')
plt.xticks(range(len(image_sizes)), image_sizes.index, rotation=45)
plt.tight_layout()
save_plot(fig, 'image_dimensions.png')

# Plot manufacturer distribution
fig, ax = plt.subplots(figsize=(10, 6))
df['Manufacturer'].value_counts().plot(kind='bar', ax=ax)
plt.title('Distribution of Scanner Manufacturers')
plt.xlabel('Manufacturer')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
save_plot(fig, 'manufacturer_distribution.png')



Finding all DICOM files...
Extracting metadata...


  6%|▋         | 7/110 [04:29<1:18:32, 45.75s/it]Exception ignored in: <function tqdm.__del__ at 0x155508b71120>
Traceback (most recent call last):
  File "/gpfs/scratch/wz1492/miniconda3/envs/reddit/lib/python3.13/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/gpfs/scratch/wz1492/miniconda3/envs/reddit/lib/python3.13/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
 74%|███████▎  | 81/110 [54:38<23:14, 48.08s/it]  

Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/791.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/500.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/564.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pyl

 75%|███████▍  | 82/110 [55:02<19:02, 40.81s/it]

Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/974.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/111.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/25064549/175.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pyl

 85%|████████▌ | 94/110 [1:02:06<10:14, 38.40s/it]

Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/500.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/564.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/127.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pyl

 86%|████████▋ | 95/110 [1:02:19<07:40, 30.69s/it]

Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/521.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/585.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pylibjpeg-libjpeg>=2.1
Error reading /gpfs/data/mankowskilab/HCC_Recurrence/dicom/35176884/64.dcm: Unable to decompress 'JPEG Lossless, Non-Hierarchical, First-Order Prediction (Process 14 [Selection Value 1])' pixel data because all plugins are missing dependencies:
	gdcm - requires gdcm>=3.0.10
	pylibjpeg - requires pylibjpeg>=2.0 and pyli

100%|██████████| 110/110 [1:11:47<00:00, 39.16s/it]



Basic Dataset Statistics:
Total number of patients: 110
Total number of DICOM files: 137525


In [42]:
def analyze_sample_image(file_path):
    """
    Analyze and save visualizations for a sample DICOM image
    """
    dcm = pydicom.dcmread(file_path)
    
    # Create figure for image and histogram
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    
    # Plot image
    ax1.imshow(dcm.pixel_array, cmap='gray')
    ax1.set_title('DICOM Image')
    ax1.axis('off')
    
    # Plot histogram
    ax2.hist(dcm.pixel_array.flatten(), bins=50)
    ax2.set_title('Pixel Value Distribution')
    ax2.set_xlabel('Pixel Value')
    ax2.set_ylabel('Frequency')
    
    plt.tight_layout()
    save_plot(fig, 'sample_image_analysis.png')
    
    # Print image properties
    print(f"\nImage Properties:")
    print(f"Shape: {dcm.pixel_array.shape}")
    print(f"Data Type: {dcm.pixel_array.dtype}")
    print(f"Min Value: {dcm.pixel_array.min()}")
    print(f"Max Value: {dcm.pixel_array.max()}")
    print(f"Mean Value: {dcm.pixel_array.mean():.2f}")

# Analyze a sample image
sample_file = dicom_files[list(dicom_files.keys())[0]][0]
print("\nAnalyzing sample image...")
analyze_sample_image(sample_file)

# Save metadata to CSV
df.to_csv('dicom_metadata.csv', index=False)
print("\nMetadata saved to 'dicom_metadata.csv'")

# Additional analyses
def analyze_slice_thickness(df):
    """
    Analyze and save visualization of slice thickness distribution
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    df['SliceThickness'].hist(bins=30, ax=ax)
    plt.title('Distribution of Slice Thickness')
    plt.xlabel('Slice Thickness (mm)')
    plt.ylabel('Frequency')
    save_plot(fig, 'slice_thickness_distribution.png')

def analyze_temporal_distribution(df):
    """
    Analyze and save visualization of temporal scan distribution
    """
    df['StudyDate'] = pd.to_datetime(df['StudyDate'], format='%Y%m%d', errors='coerce')
    fig, ax = plt.subplots(figsize=(12, 6))
    df['StudyDate'].value_counts().sort_index().plot(kind='line', ax=ax)
    plt.title('Temporal Distribution of Scans')
    plt.xlabel('Date')
    plt.ylabel('Number of Scans')
    plt.xticks(rotation=45)
    plt.tight_layout()
    save_plot(fig, 'temporal_distribution.png')

# Run additional analyses
print("\nAnalyzing slice thickness distribution...")
analyze_slice_thickness(df)

print("\nAnalyzing temporal distribution of scans...")
analyze_temporal_distribution(df)

print("\nAll analyses complete. Check the current directory for generated plots and metadata CSV.")


Analyzing sample image...

Image Properties:
Shape: (128, 128)
Data Type: uint16
Min Value: 0
Max Value: 629
Mean Value: 73.21

Metadata saved to 'dicom_metadata.csv'

Analyzing slice thickness distribution...

Analyzing temporal distribution of scans...

All analyses complete. Check the current directory for generated plots and metadata CSV.


In [43]:
def visualize_sample_images(dicom_files, num_patients=3, samples_per_patient=4):
    """
    Visualize sample images from multiple patients
    
    Args:
        dicom_files (dict): Dictionary of patient IDs and their DICOM file paths
        num_patients (int): Number of patients to sample
        samples_per_patient (int): Number of images to show per patient
    """
    # Sample patients
    patient_ids = list(dicom_files.keys())
    if len(patient_ids) > num_patients:
        patient_ids = np.random.choice(patient_ids, num_patients, replace=False)
    
    rows = len(patient_ids)
    cols = samples_per_patient
    fig, axes = plt.subplots(rows, cols, figsize=(15, 5*rows))
    
    if rows == 1:
        axes = axes.reshape(1, -1)
    
    for i, patient_id in enumerate(patient_ids):
        files = dicom_files[patient_id]
        
        # Sample files for this patient
        if len(files) > samples_per_patient:
            sample_files = np.random.choice(files, samples_per_patient, replace=False)
        else:
            sample_files = files + [files[0]] * (samples_per_patient - len(files))
        
        for j, file_path in enumerate(sample_files):
            try:
                dcm = pydicom.dcmread(file_path)
                img = dcm.pixel_array
                
                # Normalize image for display
                if img.max() > img.min():
                    img = (img - img.min()) / (img.max() - img.min())
                
                axes[i, j].imshow(img, cmap='gray')
                axes[i, j].axis('off')
                
                # Add title with basic info
                title = f"Patient {patient_id}\\n"
                if hasattr(dcm, 'SeriesDescription'):
                    title += f"{dcm.SeriesDescription[:20]}\\n"
                if hasattr(dcm, 'InstanceNumber'):
                    title += f"Instance #{dcm.InstanceNumber}"
                    
                axes[i, j].set_title(title, fontsize=8)
                
            except Exception as e:
                print(f"Error loading {file_path}: {str(e)}")
                axes[i, j].text(0.5, 0.5, 'Error loading image', 
                              ha='center', va='center')
                axes[i, j].axis('off')
    
    plt.tight_layout()
    save_plot(fig, 'sample_images_grid.png')
    print("\nSample images grid saved as 'sample_images_grid.png'")

def visualize_3d_slices(patient_id, dicom_files, num_slices=6):
    """
    Visualize different slices from a single patient's 3D volume
    
    Args:
        patient_id (str): Patient ID to visualize
        dicom_files (dict): Dictionary of patient IDs and their DICOM file paths
        num_slices (int): Number of slices to display
    """
    if patient_id not in dicom_files:
        print(f"Patient {patient_id} not found in the dataset")
        return
        
    files = sorted(dicom_files[patient_id])  # Sort to maintain slice order
    
    if len(files) < num_slices:
        print(f"Patient has fewer than {num_slices} slices. Showing all available slices.")
        num_slices = len(files)
    
    # Select evenly spaced slices
    slice_indices = np.linspace(0, len(files)-1, num_slices, dtype=int)
    
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.ravel()
    
    for i, idx in enumerate(slice_indices):
        try:
            dcm = pydicom.dcmread(files[idx])
            img = dcm.pixel_array
            
            # Normalize image for display
            if img.max() > img.min():
                img = (img - img.min()) / (img.max() - img.min())
            
            axes[i].imshow(img, cmap='gray')
            axes[i].axis('off')
            
            # Add slice information
            slice_info = f"Slice {idx+1}/{len(files)}\\n"
            if hasattr(dcm, 'SliceLocation'):
                slice_info += f"Location: {dcm.SliceLocation:.1f}mm"
            axes[i].set_title(slice_info, fontsize=8)
            
        except Exception as e:
            print(f"Error loading slice {idx}: {str(e)}")
            axes[i].text(0.5, 0.5, 'Error loading image', 
                        ha='center', va='center')
            axes[i].axis('off')
    
    plt.suptitle(f"Different Slices for Patient {patient_id}", y=1.02)
    plt.tight_layout()
    save_plot(fig, f'patient_{patient_id}_slices.png')
    print(f"\nSlice visualization saved as 'patient_{patient_id}_slices.png'")

In [44]:
# Visualize sample images from multiple patients
print("\nGenerating sample images grid...")
visualize_sample_images(dicom_files)

# Visualize slices from the first patient
first_patient = list(dicom_files.keys())[0]
print(f"\nGenerating slice visualization for patient {first_patient}...")
visualize_3d_slices(first_patient, dicom_files)


Generating sample images grid...

Sample images grid saved as 'sample_images_grid.png'

Generating slice visualization for patient 9023679...

Slice visualization saved as 'patient_9023679_slices.png'
