In [1]:
import ast
import glob
import os
import re
import shutil
import subprocess
import sys
import time
import warnings
from multiprocessing import Pool, cpu_count

import cv2
import numpy as np
import pandas as pd
import pydicom
from joblib import Parallel, delayed
from pydicom.pixel_data_handlers.util import convert_color_space
from tqdm import tqdm

In [2]:
# ==========================================
# 1. INSTALL & CONFIG
# ==========================================
# Install the DICOM decoding stack through the kernel's own interpreter
# (`sys.executable -m pip`) so the packages land in the environment this
# notebook actually imports from, and surface a failed install instead of
# silently continuing.
# NOTE(review): versions are unpinned; the recorded run shows pip replacing
# numpy after it had already been imported above. Consider pinning versions
# and running this step before the import cell.
PIP_PACKAGES = [
    'dicomsdl',
    'python-gdcm',
    'pydicom',
    'pylibjpeg',
    'pylibjpeg-libjpeg',
    'pylibjpeg-openjpeg',
]
pip_result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-q', *PIP_PACKAGES],
    check=False,
)
if pip_result.returncode != 0:
    print(f"WARNING: pip install exited with status {pip_result.returncode}; "
          "some DICOM decoders may be unavailable.")

DEBUG = False  # Set to True for a quick test, False for full dataset
GLOBAL_WIDTH = 224  # Side length (pixels) of the square output PNGs
RD = '/kaggle/input/rsna-intracranial-aneurysm-detection'  # Competition data root
OUTPUT_DIR = '/kaggle/working/cvt_png'  # Where converted PNGs are written

# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Debug Mode: {DEBUG}")
print(f"Global Width: {GLOBAL_WIDTH}")

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 62.1/62.1 kB 1.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.4/1.4 MB 18.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.3/13.3 MB 76.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.4/3.4 MB 64.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.0/2.0 MB 42.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.9/16.9 MB 73.7 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
mkl-umath 0.1.1 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-random 1.2.4 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
mkl-fft 1.3.8 requires numpy<1.27.0,>=1.26.4, but you have numpy 2.3.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.3.4 which is incompatible.
datasets 4.4.1 requires pyarrow>=21.0.0, but you have pyarrow 19.0.1 which is incompatible.
ydata-profiling 4.17.0 requires numpy<2.2,>=1.16.0, but you have numpy 2.3.4 which is incompatible.
google-colab 1.0.0 requires notebook==6.5.7, but you have notebook 6.5.4 which is incompatible.
google-colab 1.0.0 requires pandas==2.2.2, but you have pandas 2.2

Debug Mode: False
Global Width: 224


In [3]:
# ==========================================
# 2. HELPER FUNCTIONS (Sorting & Windowing)
# ==========================================
def get_windowing_params(modality):
    """Return the ``(window_center, window_width)`` preset for ``modality``.

    Modalities without a preset fall back to ``(40, 80)``, the same values
    used for 'CT'.
    """
    presets = dict(
        CT=(40, 80),
        CTA=(50, 350),
        MRA=(600, 1200),
        MRI=(40, 80),
    )
    fallback = (40, 80)
    try:
        return presets[modality]
    except KeyError:
        return fallback

def apply_dicom_windowing(img, window_center, window_width):
    """Clip ``img`` to an intensity window and rescale it to ``uint8``.

    The window runs from ``window_center - window_width // 2`` to
    ``window_center + window_width // 2``. Values are clipped to that range,
    divided by the window span and multiplied by 255 before the cast.
    """
    half_width = window_width // 2
    lower = window_center - half_width
    upper = window_center + half_width
    # The small epsilon keeps the division defined when the window has zero width.
    span = upper - lower + 1e-7
    normalized = (np.clip(img, lower, upper) - lower) / span
    return (normalized * 255).astype(np.uint8)

def extract_sort_key(path):
    """Build a sortable key for one DICOM file from its header.

    Only the header is read (``stop_before_pixels=True``). The returned tuple
    is ``(primary, secondary, path)``:

    * ``(int(InstanceNumber), 0, path)`` when InstanceNumber is present;
    * ``(inf, float(z), path)`` when only a 3-element ImagePositionPatient
      is available, ``z`` being its third component;
    * ``(inf, inf, path)`` when neither is available or the file cannot be
      read, so such files sort last.
    """
    unsortable = (float('inf'), float('inf'), path)
    try:
        ds = pydicom.dcmread(path, stop_before_pixels=True, force=True)
        instance_number = getattr(ds, 'InstanceNumber', None)
        position = getattr(ds, 'ImagePositionPatient', [None, None, None])
        z = position[2] if position and len(position) == 3 else None

        if instance_number is not None:
            return (int(instance_number), 0, path)
        if z is not None:
            return (float('inf'), float(z), path)
        return unsortable
    except Exception:
        # Best effort: an unreadable or malformed header must not abort the
        # whole sort. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        return unsortable

def sort_series(args):
    series_uid, paths = args
    # Sort paths based on InstanceNumber or Z-position
    sort_info = [extract_sort_key(p) for p in paths]
    sort_info.sort()
    return series_uid, [x[2] for x in sort_info]

In [4]:
# ==========================================
# 3. SORTING DICOMS & MAPPING
# ==========================================
print(">>> Step 1: Loading and Sorting Data...")
df_train = pd.read_csv(f'{RD}/train.csv')

# In debug mode keep only the first five series so the whole pipeline runs quickly.
if DEBUG:
    series_uids = df_train['SeriesInstanceUID'].unique()[:5]
    df_train = df_train[df_train['SeriesInstanceUID'].isin(series_uids)]
else:
    series_uids = df_train['SeriesInstanceUID'].unique()

# Map SeriesUID -> List of Paths
series_dicom_map = {
    si: glob.glob(os.path.join(RD, 'series', si, '*.dcm'))
    for si in series_uids
}

# Parallel Sort
with Pool(cpu_count()) as pool:
    sorted_results = list(tqdm(pool.imap(sort_series, series_dicom_map.items()),
                               total=len(series_dicom_map),
                               desc="Sorting Series"))

# Series -> modality lookup, built once instead of re-filtering df_train for
# every series. The first row of each series wins; series without an entry
# (or a train.csv without a Modality column) fall back to 'CT'.
if 'Modality' in df_train.columns:
    modality_by_series = (
        df_train.drop_duplicates(subset='SeriesInstanceUID')
        .set_index('SeriesInstanceUID')['Modality']
        .to_dict()
    )
else:
    modality_by_series = {}

# Generate Mapping CSV
mapping_columns = ['SeriesInstanceUID', 'SOPInstanceUID', 'dicom_filename',
                   'relative_index', 'Modality']
rows = []
for series_uid, sorted_paths in sorted_results:
    modality = modality_by_series.get(series_uid, 'CT')

    for idx, path in enumerate(sorted_paths):
        # The file stem is taken as the SOPInstanceUID.
        sop_uid = os.path.splitext(os.path.basename(path))[0]
        rows.append({
            'SeriesInstanceUID': series_uid,
            'SOPInstanceUID': sop_uid,
            'dicom_filename': path,
            'relative_index': idx,
            'Modality': modality
        })

# Explicit columns keep the frame well-formed even when no DICOM was found.
df_series_index_mapping = pd.DataFrame(rows, columns=mapping_columns)
df_series_index_mapping = df_series_index_mapping.sort_values(
    by=['SeriesInstanceUID', 'relative_index']
)
df_series_index_mapping.to_csv('series_index_mapping.csv', index=False)
print("Saved series_index_mapping.csv")


>>> Step 1: Loading and Sorting Data...


Sorting Series: 100%|██████████| 4348/4348 [52:21<00:00,  1.38it/s]  


Saved series_index_mapping.csv


In [5]:
# ==========================================
# 4. PREPARE CONVERSION LIST (Positive Only logic)
# ==========================================
print(">>> Step 2: filtering Positive Cases...")

# Every column that is not metadata or the global label is treated as a
# per-location label column.
exclude_cols = ['SeriesInstanceUID', 'PatientAge', 'PatientSex', 'Modality', 'Aneurysm Present']
location_cols = [col for col in df_train.columns if col not in exclude_cols]

outputList = []
unique_series = df_train['SeriesInstanceUID'].unique()

# Split both tables by series once, instead of boolean-masking the full
# frames again for every series. Row order inside each group is preserved.
train_rows_by_series = {
    uid: group
    for uid, group in df_train.groupby('SeriesInstanceUID', sort=False)
}
frames_by_series = {
    uid: group
    for uid, group in df_series_index_mapping.groupby('SeriesInstanceUID', sort=False)
}

for si in tqdm(unique_series, desc="Filtering Positives"):
    series_rows = train_rows_by_series.get(si)
    if series_rows is None:
        continue

    # Only the first train.csv row of the series that has any location set
    # to 1 is used; series with no such row are skipped entirely.
    is_positive = series_rows[location_cols].eq(1)
    positive_rows = is_positive[is_positive.any(axis=1)]
    if positive_rows.empty:
        continue
    first_positive = positive_rows.iloc[0]
    active_locs = [col for col in location_cols if first_positive[col]]

    # Sorted DICOMs for this series (none if no files were found on disk).
    series_frames = frames_by_series.get(si)
    if series_frames is None:
        continue
    frames = [
        (frame.relative_index, frame.dicom_filename, frame.Modality)
        for frame in series_frames.itertuples()
    ]

    # Every frame of the series is queued once per active location, under
    # OUTPUT_DIR/<location>/<series>/<index>.png.
    for loc in active_locs:
        clean_loc = loc.replace('/', '_')
        out_folder = os.path.join(OUTPUT_DIR, clean_loc, si)

        for relative_index, dicom_filename, modality in frames:
            outputList.append({
                'impath': dicom_filename,
                'dst': os.path.join(out_folder, f"{relative_index:04d}.png"),
                'modality': modality
            })

print(f"Total images to convert: {len(outputList)}")


>>> Step 2: filtering Positive Cases...


Filtering Positives: 100%|██████████| 4348/4348 [02:37<00:00, 27.54it/s]

Total images to convert: 577572





In [6]:
# ==========================================
# 5. DICOM TO PNG CONVERSION
# ==========================================
def dicom_to_png(src_path, dst_path, width, modality):
    """Convert one DICOM file into a windowed, square, 8-bit PNG.

    Args:
        src_path: Path of the DICOM file to read.
        dst_path: Path of the PNG to write; if it already exists the call is
            a no-op, so an interrupted run can be resumed.
        width: Side length of the output image (resized to width x width).
        modality: Key passed to ``get_windowing_params`` to pick the window.

    Returns:
        None. A failed conversion does not raise; it emits a warning that
        names the source file so that missing outputs can be traced.

    NOTE(review): a multi-frame DICOM would presumably yield a 3-D
    ``pixel_array``, which this 2-D resize path does not treat specially -
    confirm the inputs are single-frame.
    """
    try:
        if os.path.exists(dst_path):
            return

        ds = pydicom.dcmread(src_path, force=True)
        img = ds.pixel_array.astype(np.float32)

        # Apply the rescale only when both slope and intercept are present.
        if hasattr(ds, 'RescaleSlope') and hasattr(ds, 'RescaleIntercept'):
            img = img * float(ds.RescaleSlope) + float(ds.RescaleIntercept)

        # Apply Windowing
        wc, ww = get_windowing_params(modality)
        img = apply_dicom_windowing(img, wc, ww)

        # Resize
        img = cv2.resize(img, (width, width), interpolation=cv2.INTER_AREA)

        # Save. A bare filename has no directory part to create.
        dst_dir = os.path.dirname(dst_path)
        if dst_dir:
            os.makedirs(dst_dir, exist_ok=True)
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(dst_path, img):
            raise OSError(f"cv2.imwrite could not write {dst_path}")

    except Exception as e:
        # Stay best-effort (one bad file must not stop the batch) but make
        # the failure visible instead of dropping it silently.
        warnings.warn(f"dicom_to_png failed for {src_path}: {type(e).__name__}: {e}")

print(">>> Step 3: Converting DICOM to PNG (Parallel)...")
start_time = time.time()

# One delayed task per queued image; tqdm advances as tasks are handed to
# the worker pool.
conversion_tasks = (
    delayed(dicom_to_png)(job['impath'], job['dst'], GLOBAL_WIDTH, job['modality'])
    for job in tqdm(outputList)
)
worker_pool = Parallel(n_jobs=cpu_count())
worker_pool(conversion_tasks)

print(f"Conversion finished in {(time.time() - start_time)/60:.2f} minutes.")


>>> Step 3: Converting DICOM to PNG (Parallel)...


  warn_and_log(
  warn_and_log(
  warn_and_log(
  warn_and_log(
100%|██████████| 577572/577572 [1:35:31<00:00, 100.77it/s]


Conversion finished in 95.54 minutes.


In [7]:
# ==========================================
# 6. CREATE LOCALIZERS CSV (Relative Coords)
# ==========================================
print(">>> Step 4: Processing Localizers...")
df_localizers = pd.read_csv(f'{RD}/train_localizers.csv')

# (SeriesInstanceUID, SOPInstanceUID) -> (relative_index, dicom path)
mapping_dict = {
    (row.SeriesInstanceUID, row.SOPInstanceUID): (row.relative_index, row.dicom_filename)
    for row in df_series_index_mapping.itertuples()
}

rel_indices, rel_xs, rel_ys = [], [], []

for _, row in tqdm(df_localizers.iterrows(), total=len(df_localizers)):
    key = (row['SeriesInstanceUID'], row['SOPInstanceUID'])

    # Defaults for localizers whose slice is unknown or whose header or
    # coordinates cannot be read.
    rel_index, rel_x, rel_y = None, None, None

    if key in mapping_dict:
        rel_index, path = mapping_dict[key]

        try:
            # Only the header is needed to get the original image size.
            ds = pydicom.dcmread(path, stop_before_pixels=True)
            h, w = int(ds.Rows), int(ds.Columns)

            # The coordinates cell is parsed as a Python literal;
            # literal_eval cannot execute code, unlike eval.
            raw_coords = row['coordinates']
            coords = ast.literal_eval(raw_coords) if isinstance(raw_coords, str) else raw_coords

            # Scale original pixel coordinates into the GLOBAL_WIDTH square.
            # Both values are computed before either is stored so a failure
            # cannot leave the x and y lists with different lengths.
            scaled_x = (coords['x'] / w) * GLOBAL_WIDTH
            scaled_y = (coords['y'] / h) * GLOBAL_WIDTH
            rel_x, rel_y = scaled_x, scaled_y
        except Exception:
            rel_x, rel_y = None, None

    # Exactly one entry per localizer row in each list.
    rel_indices.append(rel_index)
    rel_xs.append(rel_x)
    rel_ys.append(rel_y)

df_localizers['relative_index'] = rel_indices
df_localizers['relative_x'] = rel_xs
df_localizers['relative_y'] = rel_ys
df_localizers.to_csv('train_localizers_with_relative.csv', index=False)
print("Saved train_localizers_with_relative.csv")

>>> Step 4: Processing Localizers...


100%|██████████| 2254/2254 [00:23<00:00, 96.96it/s] 


Saved train_localizers_with_relative.csv


In [8]:
# ==========================================
# 7. FINAL STEP: CREATE DOWNLOADABLE DATASET
# ==========================================
print(">>> Step 5: Zipping Output for Download...")

# We Zip the PNG folder and the created CSVs into one file
# -r: recursive, -q: quiet
# The archive members are relative paths, so this expects the current
# working directory to contain `cvt_png` and both CSV files.
zip_status = os.system("zip -r -q dataset.zip cvt_png series_index_mapping.csv train_localizers_with_relative.csv")

# Only announce success when zip actually succeeded.
if zip_status == 0:
    print("Done! You can now download 'dataset.zip' from the Output tab.")
else:
    print(f"zip exited with status {zip_status}; 'dataset.zip' may be missing or incomplete.")

>>> Step 5: Zipping Output for Download...
Done! You can now download 'dataset.zip' from the Output tab.
