In [1]:
import os
import re

import nibabel as nib
import numpy as np
# Note: removed scipy.ndimage.zoom - no longer resizing XY

In [2]:
# =============================================================================
# CONFIGURATION
# =============================================================================

# --- Patch extraction --------------------------------------------------------
# Cubic patch edge length, used for the X, Y and Z extent of every patch.
PATCH_SIZE = (128,) * 3
# Number of patches sampled from each volume.
PATCHES_PER_VOLUME = 20
# Volumes with fewer Z slices than this are skipped; volumes between this value
# and the patch depth are padded along Z before extraction.
MIN_Z_SLICES = 64

# --- Patch distribution (tumor-centered strategy) ----------------------------
# Share of patches centered on tumor voxels (when the volume has any).
TUMOR_RATIO = 0.7
# Share of patches centered on liver voxels.
LIVER_RATIO = 0.2
# Share of random patches. Only used for the progress banner: the extractor
# assigns whatever is left after the tumor and liver allocations.
RANDOM_RATIO = 0.1

# --- Normalization -----------------------------------------------------------
# (low, high) window the intensities are clipped to before scaling to 0-1.
CLIP_RANGE = (-200, 300)

# --- Output ------------------------------------------------------------------
# Directory that receives one compressed .npz file per processed volume.
OUTPUT_DIR = 'preprocessed_patches_v2'
# Seed passed to np.random.seed so that patch sampling is reproducible.
SEED = 42

In [3]:
# =============================================================================
# FUNCTIONS
# =============================================================================

def get_num_volumes(volume_dir='volume'):
    """Count the entries of ``volume_dir`` whose name ends with ``.nii``."""
    nii_count = 0
    for entry in os.listdir(volume_dir):
        if entry.endswith('.nii'):
            nii_count += 1
    return nii_count


def load_volume(index, volume_dir='volume', seg_dir='segmentations'):
    """Load the ``index``-th volume and its matching segmentation as arrays.

    The ``.nii`` files in ``volume_dir`` are ordered with a numeric-aware
    ("natural") sort, so that e.g. ``volume-2.nii`` comes before
    ``volume-10.nii``. A plain lexicographic sort would place ``volume-10``
    first, so that ``index`` would no longer follow the numbering in the
    file names. For zero-padded names both orderings are identical.

    Args:
        index: Position of the volume in the naturally sorted file list.
        volume_dir: Directory containing the volume ``.nii`` files.
        seg_dir: Directory containing the segmentation files. The
            segmentation file name is the volume file name with ``'volume'``
            replaced by ``'segmentation'``.

    Returns:
        ``(vol, seg)`` - the arrays returned by nibabel's ``get_fdata()`` for
        the volume and the segmentation.

    Raises:
        IndexError: If ``index`` is outside the list of ``.nii`` files.
    """
    def natural_key(name):
        # re.split with a capturing group alternates non-digit / digit runs,
        # always starting with a (possibly empty) non-digit run, so odd
        # positions are the numeric parts and compare as integers.
        parts = re.split(r'(\d+)', name)
        return [int(part) if pos % 2 else part for pos, part in enumerate(parts)]

    files = sorted(
        (f for f in os.listdir(volume_dir) if f.endswith('.nii')),
        key=natural_key,
    )
    filename = files[index]
    seg_filename = filename.replace('volume', 'segmentation')

    vol = nib.load(os.path.join(volume_dir, filename)).get_fdata()
    seg = nib.load(os.path.join(seg_dir, seg_filename)).get_fdata()
    return vol, seg


def pad_volume_z(volume, seg, target_z, pad_value=-1000):
    """Pad volume and segmentation along the Z axis (axis 2) up to ``target_z``.

    The padding is split as evenly as possible between both ends of the Z
    axis; when the amount is odd, the extra slice goes to the end.

    Args:
        volume: 3-D array whose third axis is padded.
        seg: Segmentation array padded with the same amounts as ``volume``.
        target_z: Desired size of the third axis.
        pad_value: Fill value for the padded volume slices. Defaults to
            -1000 (the original code labels this fill as air in CT).

    Returns:
        ``(volume, seg)`` unchanged (same objects) if the volume already has
        at least ``target_z`` slices, otherwise the padded copies. The
        segmentation is always padded with 0.
    """
    current_z = volume.shape[2]
    if current_z >= target_z:
        return volume, seg

    pad_total = target_z - current_z
    pad_before = pad_total // 2
    pad_after = pad_total - pad_before
    pad_width = ((0, 0), (0, 0), (pad_before, pad_after))

    # The volume is filled with ``pad_value`` (not zero); the label map gets 0.
    volume_padded = np.pad(volume, pad_width, mode='constant', constant_values=pad_value)
    seg_padded = np.pad(seg, pad_width, mode='constant', constant_values=0)

    return volume_padded, seg_padded


def _sample_centered_origins(coords, count, patch_size, vol_shape, max_offset):
    """Return ``count`` patch start corners centered, with jitter, on random rows of ``coords``."""
    origins = []
    for _ in range(count):
        # Draw order (center index, then x/y/z jitter) is kept fixed so that
        # results for a given np.random seed stay reproducible.
        center = coords[np.random.randint(len(coords))]
        jitter = [np.random.randint(-max_offset, max_offset + 1) for _ in range(3)]
        # Center the patch on the voxel, shift by the jitter, then clamp the
        # start corner so the whole patch stays inside the volume.
        origins.append(tuple(
            int(max(0, min(c - p // 2 + j, v - p)))
            for c, p, j, v in zip(center, patch_size, jitter, vol_shape)
        ))
    return origins


def extract_patches(volume, seg, patch_size, num_patches, tumor_ratio, liver_ratio):
    """
    Extract patches with tumor-centered strategy.

    Priority:
    - tumor_ratio: % of patches centered on tumor voxels (seg == 2), if any exist
    - liver_ratio: % of patches centered on liver-only voxels (seg == 1)
    - remainder: random patches

    If no tumor exists, tumor allocation goes to liver-centered patches.
    If no liver-only voxel exists, the liver allocation goes to random
    patches (previously this case failed inside ``np.random.randint`` with an
    unrelated-looking ``ValueError``).

    Sampling uses the global ``np.random`` state, so seed it beforehand for
    reproducible output.

    Args:
        volume: 3-D image array.
        seg: Label array indexed like ``volume`` (0 background, 1 liver, 2 tumor).
        patch_size: ``(px, py, pz)`` patch extent.
        num_patches: Total number of patches to extract.
        tumor_ratio: Fraction of patches centered on tumor voxels.
        liver_ratio: Fraction of patches centered on liver voxels.

    Returns:
        ``(patches, segs)`` - two arrays stacking the image patches and the
        corresponding label patches, ordered tumor, liver, random.

    Raises:
        ValueError: If the volume is smaller than the patch along any axis.
    """
    px, py, pz = patch_size
    vx, vy, vz = volume.shape

    # Check if volume is large enough for patches
    if vx < px or vy < py or vz < pz:
        raise ValueError(f"Volume {volume.shape} too small for patch {patch_size}")

    tumor_coords = np.argwhere(seg == 2)  # Tumor only
    liver_coords = np.argwhere(seg == 1)  # Liver only (not tumor)

    has_tumor = len(tumor_coords) > 0
    has_liver = len(liver_coords) > 0

    # Determine patch counts
    if has_tumor:
        num_tumor = int(num_patches * tumor_ratio)
        num_liver = int(num_patches * liver_ratio)
    else:
        # No tumor - give tumor allocation to liver
        num_tumor = 0
        num_liver = int(num_patches * (tumor_ratio + liver_ratio))

    if not has_liver:
        # Nothing to center on - the liver share falls through to random patches
        num_liver = 0

    num_random = num_patches - num_tumor - num_liver

    # Smaller offset for tumor patches to keep tumor well-centered;
    # liver patches are allowed a larger offset.
    tumor_max_offset = min(32, px // 4)
    liver_max_offset = min(64, px // 2)

    origins = []
    origins += _sample_centered_origins(
        tumor_coords, num_tumor, patch_size, volume.shape, tumor_max_offset)
    origins += _sample_centered_origins(
        liver_coords, num_liver, patch_size, volume.shape, liver_max_offset)

    # Random patches: any start corner that keeps the patch inside the volume
    for _ in range(num_random):
        x = np.random.randint(0, max(1, vx - px + 1))
        y = np.random.randint(0, max(1, vy - py + 1))
        z = np.random.randint(0, max(1, vz - pz + 1))
        origins.append((x, y, z))

    patches = [volume[x:x+px, y:y+py, z:z+pz] for x, y, z in origins]
    segs = [seg[x:x+px, y:y+py, z:z+pz] for x, y, z in origins]

    return np.array(patches), np.array(segs)


def normalize_hu(patches, clip_range):
    """Clip values to ``clip_range`` and rescale them linearly onto [0, 1]."""
    lower, upper = clip_range
    span = upper - lower
    clipped = np.clip(patches, lower, upper)
    return (clipped - lower) / span


def save_patches(patches, segs, filename, output_dir):
    """Save patches and segmentations to a compressed npz file.

    Args:
        patches: Array stored under the key ``'patches'``.
        segs: Array stored under the key ``'segmentations'``.
        filename: Name of the archive inside ``output_dir``.
        output_dir: Target directory; created if it does not exist.

    Returns:
        Path of the file that was actually written. ``np.savez_compressed``
        appends ``.npz`` when the name does not already end with it, so the
        same suffix is applied here to keep the returned path valid.
    """
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, filename)
    if not filepath.endswith('.npz'):
        filepath += '.npz'
    np.savez_compressed(filepath, patches=patches, segmentations=segs)
    return filepath

In [4]:
# =============================================================================
# MAIN PIPELINE: Tumor-centered patching, no resize, with Z-padding
# =============================================================================

def run_preprocessing():
    """Run the full patch-extraction pipeline over every volume.

    For each volume index: load volume and segmentation, skip it when it has
    fewer than ``MIN_Z_SLICES`` slices, pad it along Z up to the patch depth
    when needed, extract ``PATCHES_PER_VOLUME`` patches, normalize them to
    uint8 and save them as ``volume_{idx:03d}.npz`` in ``OUTPUT_DIR``.
    Progress and a final summary are printed.

    The "with tumor" and "Z-padded" counters only include volumes that were
    actually saved, so they are consistent with the "Processed" total even
    when a volume is skipped during patch extraction.

    Returns:
        ``(saved_files, skipped)`` - paths of the written files and the
        indices of the volumes that were skipped.
    """
    np.random.seed(SEED)
    total = get_num_volumes()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    saved_files = []
    skipped = []
    padded_count = 0
    tumor_volumes = 0

    print(f"Processing {total} volumes...")
    print(f"Patch strategy: {int(TUMOR_RATIO*100)}% tumor, {int(LIVER_RATIO*100)}% liver, {int(RANDOM_RATIO*100)}% random")
    print(f"Patches per volume: {PATCHES_PER_VOLUME}")
    print(f"Min Z slices: {MIN_Z_SLICES} (will pad if needed)\n")

    for idx in range(total):
        vol, seg = load_volume(idx)
        original_shape = vol.shape

        # Skip volumes thinner than the configured minimum; they are not padded
        if vol.shape[2] < MIN_Z_SLICES:
            skipped.append(idx)
            print(f"  [{idx+1}/{total}] volume_{idx:03d} - SKIPPED (Z={vol.shape[2]} < {MIN_Z_SLICES})")
            del vol, seg
            continue

        # Pad Z dimension if needed to reach the patch depth
        was_padded = vol.shape[2] < PATCH_SIZE[2]
        if was_padded:
            vol, seg = pad_volume_z(vol, seg, PATCH_SIZE[2])

        # Check if this volume has tumor
        has_tumor = bool(np.any(seg == 2))

        # NO RESIZE - extract patches at original XY resolution
        # Extract patches with tumor-centered strategy
        try:
            patches, patch_segs = extract_patches(
                vol, seg, PATCH_SIZE, PATCHES_PER_VOLUME,
                TUMOR_RATIO, LIVER_RATIO
            )
        except ValueError as e:
            skipped.append(idx)
            print(f"  [{idx+1}/{total}] volume_{idx:03d} - SKIPPED ({e})")
            del vol, seg
            continue

        # Count only after extraction succeeded, so skipped volumes do not
        # inflate the per-category numbers in the summary.
        if was_padded:
            padded_count += 1
        if has_tumor:
            tumor_volumes += 1

        # Normalize and convert to uint8
        patches = normalize_hu(patches, CLIP_RANGE)
        patches = (patches * 255).astype(np.uint8)
        patch_segs = patch_segs.astype(np.uint8)

        # Save all patches for this patient
        filename = f'volume_{idx:03d}.npz'
        filepath = save_patches(patches, patch_segs, filename, OUTPUT_DIR)
        saved_files.append(filepath)

        size_mb = os.path.getsize(filepath) / 1024**2
        pad_str = " [Z-padded]" if was_padded else ""
        tumor_str = " [has tumor]" if has_tumor else ""
        print(f"  [{idx+1}/{total}] {filename} ({size_mb:.1f} MB) - shape {original_shape}{pad_str}{tumor_str}")

        # Release the large arrays before loading the next volume
        del vol, seg, patches, patch_segs

    # Summary
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    print(f"Processed:      {len(saved_files)} volumes")
    print(f"  - With tumor: {tumor_volumes}")
    print(f"  - Z-padded:   {padded_count}")
    print(f"Skipped:        {len(skipped)} volumes")

    total_mb = sum(os.path.getsize(f) for f in saved_files) / 1024**2
    total_patches = len(saved_files) * PATCHES_PER_VOLUME

    print(f"\nTotal: {total_patches} patches, {total_mb:.1f} MB")
    print(f"Output: {OUTPUT_DIR}/")

    return saved_files, skipped

In [5]:
# RUN: execute the whole pipeline. `saved_files` receives the paths of the
# written .npz files, `skipped` the indices of the volumes that were skipped.
saved_files, skipped = run_preprocessing()

Processing 131 volumes...
Patch strategy: 70% tumor, 20% liver, 10% random
Patches per volume: 20
Min Z slices: 64 (will pad if needed)

  [1/131] volume_000.npz (20.0 MB) - shape (512, 512, 75) [Z-padded] [has tumor]
  [2/131] volume_001.npz (27.5 MB) - shape (512, 512, 123) [Z-padded] [has tumor]
  [3/131] volume_002.npz (31.6 MB) - shape (512, 512, 501) [has tumor]
  [4/131] volume_003.npz (32.9 MB) - shape (512, 512, 685) [has tumor]
  [5/131] volume_004.npz (33.9 MB) - shape (512, 512, 683) [has tumor]
  [6/131] volume_005.npz (32.1 MB) - shape (512, 512, 677) [has tumor]
  [7/131] volume_006.npz (28.0 MB) - shape (512, 512, 683) [has tumor]
  [8/131] volume_007.npz (28.8 MB) - shape (512, 512, 781) [has tumor]
  [9/131] volume_008.npz (29.8 MB) - shape (512, 512, 986)
  [10/131] volume_009.npz (23.7 MB) - shape (512, 512, 771)
  [11/131] volume_010.npz (29.0 MB) - shape (512, 512, 771) [has tumor]
  [12/131] volume_011.npz (29.9 MB) - shape (512, 512, 856) [has tumor]
  [13/131] 

In [6]:
# =============================================================================
# VERIFY
# =============================================================================
# Sanity-check the first saved archive: stored keys, array shapes/dtypes and
# the label distribution across its patches.
if saved_files:
    sample = saved_files[0]

    # An NpzFile keeps the archive open and re-reads an array on every key
    # access, so read each array once and close the file via the context manager.
    with np.load(sample) as data:
        sample_keys = list(data.keys())
        sample_patches = data['patches']
        seg = data['segmentations']

    print(f"Sample: {sample}")
    print(f"\nKeys: {sample_keys}")
    print(f"\npatches:       {sample_patches.shape}, dtype={sample_patches.dtype}")
    print(f"segmentations: {seg.shape}, unique={np.unique(seg)}")

    # Count voxels per class in this sample
    total_voxels = seg.size
    tumor_voxels = np.sum(seg == 2)
    liver_voxels = np.sum(seg == 1)
    bg_voxels = np.sum(seg == 0)

    print("\nClass distribution in sample:")
    print(f"  Background: {bg_voxels:,} ({100*bg_voxels/total_voxels:.1f}%)")
    print(f"  Liver:      {liver_voxels:,} ({100*liver_voxels/total_voxels:.1f}%)")
    print(f"  Tumor:      {tumor_voxels:,} ({100*tumor_voxels/total_voxels:.2f}%)")

Sample: preprocessed_patches_v2/volume_000.npz

Keys: ['patches', 'segmentations']

patches:       (20, 128, 128, 128), dtype=uint8
segmentations: (20, 128, 128, 128), unique=[0 1 2]

Class distribution in sample:
  Background: 39,407,012 (94.0%)
  Liver:      2,518,106 (6.0%)
  Tumor:      17,922 (0.04%)
