In [None]:
# Import libraries
import sys
sys.path.insert(0, '..')

import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import nibabel as nib
from collections import Counter

# Project imports
from data import CAMUSDataset, CAMUSPatient

# Configuration
# DATA_ROOT is the dataset directory handed to CAMUSDataset(root_dir=...) in the
# cells below. It is a relative path, so it resolves against the kernel's
# working directory.
DATA_ROOT = '../data/CAMUS'  # Update this path
# Apply the whitegrid style sheet to every figure created after this line.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require matplotlib >= 3.6 — confirm.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

## 1. Dataset Overview

In [None]:
# Build one dataset per split; later cells reuse these three names.
train_dataset = CAMUSDataset(root_dir=DATA_ROOT, split='train')
val_dataset = CAMUSDataset(root_dir=DATA_ROOT, split='val')
test_dataset = CAMUSDataset(root_dir=DATA_ROOT, split='test')

# (display name, dataset) pairs in the order they are reported
splits = [
    ('Training', train_dataset),
    ('Validation', val_dataset),
    ('Testing', test_dataset),
]

print("Dataset Statistics (ED/ES frames only):")
for split_name, split_data in splits:
    # Left-pad the "Name:" heading to a fixed width so the counts line up
    heading = f"{split_name}:"
    print(f"  {heading:<12}{len(split_data):,} samples ({len(split_data.patients)} patients)")

total_samples = sum(len(split_data) for _, split_data in splits)
print(f"\nTotal: {total_samples:,} samples")

In [None]:
# Same training split, but built with include_sequences=True so it can be
# compared against the ED/ES-only `train_dataset` loaded earlier.
train_full = CAMUSDataset(root_dir=DATA_ROOT, split='train', include_sequences=True)

n_with_sequences = len(train_full)

print("\nWith Half Sequences:")
print(f"  Training: {n_with_sequences:,} samples (10-20x more data!)")

# How many times larger the sequence-enabled split is than the ED/ES-only one
growth_factor = n_with_sequences / len(train_dataset)
print(f"  Ratio: {growth_factor:.1f}x more samples")

## 2. Image Visualization

In [None]:
# Use the first training patient as the running example for the notebook
patient = train_dataset.patients[0]
print(f"Patient ID: {patient.patient_id}")

# Grid layout: one row per view; for each phase a pair of columns
# (plain image, then the same image with the segmentation overlaid).
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

for row, view in enumerate(['2CH', '4CH']):
    for col, phase in enumerate(['ED', 'ES']):
        frame = patient.load_image(view, phase)
        segmentation = patient.load_segmentation(view, phase)

        plain_ax = axes[row, 2 * col]
        overlay_ax = axes[row, 2 * col + 1]

        # Plain grayscale image
        plain_ax.imshow(frame, cmap='gray')
        plain_ax.set_title(f'{view} - {phase}')
        plain_ax.axis('off')

        # Image with the semi-transparent label map on top
        overlay_ax.imshow(frame, cmap='gray')
        overlay_ax.imshow(segmentation, cmap='jet', alpha=0.5)
        overlay_ax.set_title(f'{view} - {phase} (with mask)')
        overlay_ax.axis('off')

fig.suptitle(f'Patient: {patient.patient_id}', fontsize=14)
fig.tight_layout()
plt.show()

## 3. Segmentation Masks Analysis

In [None]:
# Class labels
CLASS_NAMES = {
    0: 'Background',
    1: 'LV Endocardium',
    2: 'LV Epicardium (Myocardium)',
    3: 'Left Atrium'
}

# Colormap used to tint each foreground class; background (0) gets no overlay.
CLASS_CMAPS = {1: 'Reds', 2: 'Greens', 3: 'Blues'}

# Visualize each class separately on the 4CH end-diastolic frame
img = patient.load_image('4CH', 'ED')
mask = patient.load_segmentation('4CH', 'ED')

# One panel for the untouched image plus one panel per class
fig, axes = plt.subplots(1, len(CLASS_NAMES) + 1, figsize=(20, 4))

# Original image
axes[0].imshow(img, cmap='gray')
axes[0].set_title('Original')
axes[0].axis('off')

# Each class
for ax, (label, name) in zip(axes[1:], CLASS_NAMES.items()):
    ax.imshow(img, cmap='gray')
    if label in CLASS_CMAPS:
        in_class = mask == label
        # Hide every pixel outside the class. Drawing the dense 0/1 mask would
        # paint the colormap's low (near-white) end over the whole image at
        # alpha 0.6 and wash out the ultrasound.
        overlay = np.ma.masked_where(~in_class, in_class.astype(float))
        # Pin vmin/vmax: the visible data is constant (all 1.0), and without an
        # explicit range it would be normalised to the bottom of the colormap.
        ax.imshow(overlay, cmap=CLASS_CMAPS[label], vmin=0, vmax=1, alpha=0.6)
    ax.set_title(name)
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Class distribution: accumulate per-label pixel counts over the first
# (at most) 50 training samples.
n_sampled = min(50, len(train_dataset))
class_pixels = dict.fromkeys(range(4), 0)

for sample_idx in range(n_sampled):
    sample_mask = train_dataset[sample_idx]['mask']
    # Convert to a NumPy array when the mask object offers .numpy()
    if hasattr(sample_mask, 'numpy'):
        sample_mask = sample_mask.numpy()
    for label in class_pixels:
        class_pixels[label] += np.sum(sample_mask == label)

# Bar chart of absolute pixel counts, one bar per class in label order
labels = [CLASS_NAMES[label] for label in class_pixels]
sizes = list(class_pixels.values())

fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(labels, sizes, color=['gray', 'red', 'green', 'blue'])
ax.set_ylabel('Total Pixels')
ax.set_title('Class Distribution (sampled from training set)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Same counts expressed as a share of all counted pixels
total = sum(sizes)
print("Class Distribution:")
for label, name in CLASS_NAMES.items():
    share = class_pixels[label] / total * 100
    print(f"  {name}: {share:.2f}%")

## 4. Half Sequences Analysis

In [None]:
# Fetch the 4CH half sequence for the example patient: a stack of frames and
# a matching stack of masks (both reused by the following cells).
images, masks = patient.load_half_sequence('4CH')

seq_shape = images.shape
print(f"Half Sequence Shape: {seq_shape}")
print(f"Number of frames: {seq_shape[0]}")
print(f"Frame size: {seq_shape[1]} x {seq_shape[2]}")
print(f"\nMasks Shape: {masks.shape} (Ground truth for ALL frames!)")

In [None]:
# Visualize sequence frames: up to 8 frames spread evenly over the sequence,
# top row = image, bottom row = image with the ground-truth mask overlaid.
n_frames = images.shape[0]
n_show = min(8, n_frames)
indices = np.linspace(0, n_frames - 1, n_show, dtype=int)

# squeeze=False keeps `axes` two-dimensional even when only one frame is
# shown; otherwise a 2x1 grid comes back 1-D and axes[0, col] would fail.
fig, axes = plt.subplots(2, n_show, figsize=(16, 6), squeeze=False)

for col, idx in enumerate(indices):
    # Image
    axes[0, col].imshow(images[idx], cmap='gray')
    axes[0, col].set_title(f'Frame {idx}')

    # Mask
    axes[1, col].imshow(images[idx], cmap='gray')
    axes[1, col].imshow(masks[idx], cmap='jet', alpha=0.5)

# Strip ticks and frames by hand instead of calling axis('off'):
# axis('off') also hides axis labels, so the row labels below would not render.
for ax in axes.flat:
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)

axes[0, 0].set_ylabel('Image', fontsize=12)
axes[1, 0].set_ylabel('With GT', fontsize=12)
plt.suptitle(f'Half Sequence: {patient.patient_id} - 4CH ({n_frames} frames)', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# LV area per frame: number of mask pixels carrying label 1 (LV endocardium)
lv_areas = [np.sum(masks[frame_idx] == 1) for frame_idx in range(n_frames)]

fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(lv_areas, 'b-o', linewidth=2, markersize=6)
ax.set_xlabel('Frame')
ax.set_ylabel('LV Area (pixels)')
ax.set_title('LV Area Through Cardiac Cycle')
ax.grid(True, alpha=0.3)

# Take the largest-area frame as ED and the smallest-area frame as ES
ed_idx = np.argmax(lv_areas)
es_idx = np.argmin(lv_areas)
ax.axvline(ed_idx, color='green', linestyle='--', label=f'ED (frame {ed_idx})')
ax.axvline(es_idx, color='red', linestyle='--', label=f'ES (frame {es_idx})')
ax.legend()
fig.tight_layout()
plt.show()

# The area at the argmax/argmin index is the max/min area itself
ed_area = lv_areas[ed_idx]
es_area = lv_areas[es_idx]
print(f"Max LV area (ED): {ed_area:,} pixels at frame {ed_idx}")
print(f"Min LV area (ES): {es_area:,} pixels at frame {es_idx}")

## 5. Clinical Information

In [None]:
# Clinical metadata for the example patient, reported view by view.
print(f"Patient: {patient.patient_id}")

for view in ('2CH', '4CH'):
    print(f"\n{view} View:")
    print(f"  Image Quality: {patient.get_image_quality(view)}")
    print(f"  Ejection Fraction: {patient.get_ef(view)}%")
    # LV volumes are only reported for the 2CH view
    if view == '2CH':
        ed_vol, es_vol = patient.get_lv_volumes(view)
        print(f"  LV ED Volume: {ed_vol} ml")
        print(f"  LV ES Volume: {es_vol} ml")

In [None]:
# EF distribution across training set
EF_SAMPLE_SIZE = 100  # number of leading training patients to inspect

# Collect the 4CH ejection fraction of each sampled patient, skipping
# patients for which get_ef returns None.
ef_values = [
    ef
    for ef in (p.get_ef('4CH') for p in train_dataset.patients[:EF_SAMPLE_SIZE])
    if ef is not None
]

if not ef_values:
    # Without this guard the statistics below would produce NaNs and
    # min()/max() would raise ValueError on the empty list.
    print("No 4CH ejection fraction values available for the sampled patients.")
else:
    ef_mean = np.mean(ef_values)
    ef_std = np.std(ef_values)

    fig, ax = plt.subplots(figsize=(10, 4))
    ax.hist(ef_values, bins=20, edgecolor='black', alpha=0.7)
    ax.set_xlabel('Ejection Fraction (%)')
    ax.set_ylabel('Count')
    ax.set_title('Distribution of Ejection Fraction (Training Set Sample)')
    ax.axvline(ef_mean, color='red', linestyle='--', label=f'Mean: {ef_mean:.1f}%')
    ax.legend()
    fig.tight_layout()
    plt.show()

    print("EF Statistics:")
    print(f"  Mean: {ef_mean:.1f}%")
    print(f"  Std: {ef_std:.1f}%")
    print(f"  Range: {min(ef_values):.1f}% - {max(ef_values):.1f}%")

## 6. Image Quality Distribution

In [None]:
# Quality distribution
# Bar colour per expected quality label; the key order fixes the bar order.
QUALITY_COLORS = {'Good': 'green', 'Medium': 'orange', 'Poor': 'red'}
UNKNOWN_QUALITY_COLOR = 'gray'

quality_counts = dict.fromkeys(QUALITY_COLORS, 0)

for p in train_dataset.patients:
    for view in ['2CH', '4CH']:
        q = p.get_image_quality(view)
        if q:
            # .get() lets an unexpected label get its own bar instead of
            # raising KeyError.
            quality_counts[q] = quality_counts.get(q, 0) + 1

total = sum(quality_counts.values())
# Share of each label; reported as 0.0 when nothing was counted so the cell
# does not divide by zero.
quality_pct = {
    q: (count / total * 100 if total else 0.0)
    for q, count in quality_counts.items()
}

# Plot
fig, ax = plt.subplots(figsize=(8, 5))
bars = ax.bar(
    list(quality_counts.keys()),
    list(quality_counts.values()),
    color=[QUALITY_COLORS.get(q, UNKNOWN_QUALITY_COLOR) for q in quality_counts],
)
ax.set_ylabel('Count')
ax.set_title('Image Quality Distribution (Training Set)')

# Add percentages above the bars. bar_label pads in points, so the spacing
# does not depend on the magnitude of the counts.
ax.bar_label(
    bars,
    labels=[f'{quality_pct[q]:.1f}%' for q in quality_counts],
    padding=3,
    fontsize=12,
)

plt.tight_layout()
plt.show()

print("Quality Distribution:")
for q, count in quality_counts.items():
    print(f"  {q}: {count} ({quality_pct[q]:.1f}%)")

## 7. Data Statistics Summary

In [None]:
# Image size statistics over the first (at most) 100 training samples
heights, widths, intensities = [], [], []

for sample_idx in range(min(100, len(train_dataset))):
    image = train_dataset[sample_idx]['image']
    # Convert to a NumPy array when the image object offers .numpy()
    if hasattr(image, 'numpy'):
        image = image.numpy()
    # For a 3-D array keep only the first slice along the leading axis
    if image.ndim == 3:
        image = image[0]
    n_rows, n_cols = image.shape[0], image.shape[1]
    heights.append(n_rows)
    widths.append(n_cols)
    intensities.append(image.mean())

print("Image Statistics:")
for dim_name, dim_values in (("Height", heights), ("Width", widths)):
    print(
        f"  {dim_name}: {np.mean(dim_values):.0f} ± {np.std(dim_values):.0f} "
        f"(range: {min(dim_values)}-{max(dim_values)})"
    )
print(f"  Mean Intensity: {np.mean(intensities):.3f} ± {np.std(intensities):.3f}")

In [None]:
# Summary
# Derive every count from the datasets loaded earlier, so the summary cannot
# drift from what the Dataset Overview cell reported.
split_patients = {
    'Training': len(train_dataset.patients),
    'Validation': len(val_dataset.patients),
    'Testing': len(test_dataset.patients),
}
total_patients = sum(split_patients.values())
ed_es_samples = len(train_dataset) + len(val_dataset) + len(test_dataset)

N_VIEWS = 2  # 2CH and 4CH
# Rough frames-per-view figure behind the half-sequence estimate.
# NOTE(review): 15 is carried over from the original hard-coded estimate — confirm.
EST_FRAMES_PER_HALF_SEQUENCE = 15

print("=" * 60)
print("CAMUS DATASET SUMMARY")
print("=" * 60)
print(f"\nTotal Patients: {total_patients}")
for split_name, n_patients in split_patients.items():
    print(f"  {split_name}: {n_patients} patients")
print("\nViews: 2CH, 4CH")
print("Phases: ED (End-Diastolic), ES (End-Systolic)")
print(f"\nED/ES Samples: {ed_es_samples:,} total")
print(
    f"Half Sequence Samples: "
    f"~{total_patients * N_VIEWS * EST_FRAMES_PER_HALF_SEQUENCE:,} estimated (with GT!)"
)
print("\nClasses:")
for label, name in CLASS_NAMES.items():
    print(f"  {label}: {name}")
print("=" * 60)