# Data Exploration - HC18 Fetal Head Segmentation Dataset

This notebook explores the HC18 Grand Challenge dataset for fetal head segmentation.

**Dataset Overview:**
- Training set: 999 ultrasound images with annotations
- Test set: 335 ultrasound images with annotations
- Image size: 256×256 pixels (grayscale)
- Task: Binary segmentation of fetal head

**Sections:**
1. Import libraries and setup
2. Load and inspect dataset statistics
3. Visualize sample images and masks
4. Analyze image properties
5. Visualize data augmentations
6. Check pixel size and HC statistics

## 1. Import Libraries and Setup

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from pathlib import Path
from glob import glob

# Add project root to path
sys.path.append('..')

from src.data import HC18Dataset
from src.utils import get_transforms

# Global plotting defaults used by every figure in this notebook
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (15, 8),
    'figure.dpi': 100,
})

# Blanket-silence all Python warnings for the rest of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings from the
# libraries above - narrow the filter if something looks off.
import warnings
warnings.filterwarnings('ignore')

print("✓ Libraries imported successfully")

## 2. Dataset Statistics

In [None]:
# Dataset paths (relative to the notebook's working directory)
TRAIN_IMG_DIR = '../dataset/training_set/images'
TRAIN_MASK_DIR = '../dataset/training_set/masks'
TEST_IMG_DIR = '../dataset/test_set/images'
TEST_MASK_DIR = '../dataset/test_set/masks'

# Collect the PNG files of each split. The lists are sorted because later
# cells pair train_images[i] with train_masks[i] purely by position.
train_images = sorted(glob(os.path.join(TRAIN_IMG_DIR, '*.png')))
train_masks = sorted(glob(os.path.join(TRAIN_MASK_DIR, '*.png')))
test_images = sorted(glob(os.path.join(TEST_IMG_DIR, '*.png')))
test_masks = sorted(glob(os.path.join(TEST_MASK_DIR, '*.png')))

print("="*60)
print("HC18 DATASET STATISTICS")
print("="*60)
print(f"Training Images: {len(train_images)}")
print(f"Training Masks:  {len(train_masks)}")
print(f"Test Images:     {len(test_images)}")
print(f"Test Masks:      {len(test_masks)}")
print("="*60)

# Positional image/mask pairing is only meaningful when both lists have the
# same length, so surface any mismatch instead of letting it pass silently.
for split_name, split_images, split_masks in (
    ("Training", train_images, train_masks),
    ("Test", test_images, test_masks),
):
    if len(split_images) != len(split_masks):
        print(f"WARNING: {split_name} set has {len(split_images)} images but "
              f"{len(split_masks)} masks - index-based pairing may be misaligned")

# Load the first image/mask pair to check dimensions, dtype and value ranges
if train_images and train_masks:
    sample_img = cv2.imread(train_images[0], cv2.IMREAD_GRAYSCALE)
    sample_mask = cv2.imread(train_masks[0], cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals an unreadable file by returning None, not by raising
    if sample_img is None:
        raise FileNotFoundError(f"Could not read image: {train_images[0]}")
    if sample_mask is None:
        raise FileNotFoundError(f"Could not read mask: {train_masks[0]}")

    print(f"\nOriginal Image Shape:  {sample_img.shape}")
    print(f"Original Mask Shape:   {sample_mask.shape}")
    print(f"Image dtype:           {sample_img.dtype}")
    print(f"Mask dtype:            {sample_mask.dtype}")
    print(f"Image value range:     [{sample_img.min()}, {sample_img.max()}]")
    print(f"Mask unique values:    {np.unique(sample_mask)}")
else:
    print(f"\nNo training image/mask pair found under "
          f"{TRAIN_IMG_DIR} and {TRAIN_MASK_DIR} - check the dataset paths")

## 3. Visualize Random Samples

In [None]:
# Visualize random samples from training set.
# Each row shows: raw image | ground-truth mask | image with the mask painted red.
np.random.seed(42)  # fixed seed so the same samples are drawn on every run

# Images and masks are paired by position, so only indices valid in BOTH lists
# may be drawn. The sample count is capped for small datasets because
# np.random.choice(..., replace=False) cannot draw more items than exist.
n_pairs = min(len(train_images), len(train_masks))
n_samples = min(8, n_pairs)
if n_samples == 0:
    raise ValueError("No training image/mask pairs found - check the dataset paths")
indices = np.random.choice(n_pairs, n_samples, replace=False)

# squeeze=False keeps `axes` two-dimensional even when only one row is drawn,
# so the axes[i, j] indexing below always works.
fig, axes = plt.subplots(n_samples, 3, figsize=(12, 3*n_samples), squeeze=False)

for i, idx in enumerate(indices):
    # Load image and mask
    img = cv2.imread(train_images[idx], cv2.IMREAD_GRAYSCALE)
    mask = cv2.imread(train_masks[idx], cv2.IMREAD_GRAYSCALE)

    # cv2.imread returns None for unreadable files instead of raising
    if img is None or mask is None:
        raise FileNotFoundError(
            f"Could not read image/mask pair: {train_images[idx]} / {train_masks[idx]}"
        )

    # Overlay mask on image: every non-zero mask pixel is replaced by pure red
    overlay = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
    overlay[mask > 0] = [255, 0, 0]  # Red overlay

    # Plot
    axes[i, 0].imshow(img, cmap='gray')
    axes[i, 0].set_title(f'Image {Path(train_images[idx]).stem}')
    axes[i, 0].axis('off')

    axes[i, 1].imshow(mask, cmap='gray')
    axes[i, 1].set_title('Ground Truth Mask')
    axes[i, 1].axis('off')

    axes[i, 2].imshow(overlay)
    axes[i, 2].set_title('Overlay')
    axes[i, 2].axis('off')

plt.tight_layout()
plt.suptitle('Random Samples from Training Set', y=1.001, fontsize=16, fontweight='bold')
plt.show()

## 4. Analyze Image Properties

In [None]:
# Analyze intensity distributions and mask coverage on a random subset.
# NOTE: no seed is set in this cell, so the subset drawn depends on the global
# NumPy RNG state left behind by previously executed cells.
n_pairs = min(len(train_images), len(train_masks))
n_analyze = min(100, n_pairs)  # Analyze subset for speed; capped for small datasets
if n_analyze == 0:
    raise ValueError("No training image/mask pairs found - check the dataset paths")
sample_indices = np.random.choice(n_pairs, n_analyze, replace=False)

# Keep pixels as compact NumPy arrays. Extending a Python list with individual
# pixels would create one boxed scalar object per pixel, which is very slow
# and memory-hungry for a hundred full-size images.
intensity_chunks = []
mask_ratios = []

for idx in sample_indices:
    img = cv2.imread(train_images[idx], cv2.IMREAD_GRAYSCALE)
    mask = cv2.imread(train_masks[idx], cv2.IMREAD_GRAYSCALE)

    # cv2.imread returns None for unreadable files instead of raising
    if img is None or mask is None:
        raise FileNotFoundError(
            f"Could not read image/mask pair: {train_images[idx]} / {train_masks[idx]}"
        )

    intensity_chunks.append(img.ravel())
    # Fraction of pixels that belong to the mask (non-zero pixels)
    mask_ratios.append(np.sum(mask > 0) / mask.size)

# One flat array holding every sampled pixel value
intensities = np.concatenate(intensity_chunks)

# Plot intensity distribution (left) and mask coverage distribution (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

axes[0].hist(intensities, bins=50, color='steelblue', alpha=0.7, edgecolor='black')
axes[0].set_title('Pixel Intensity Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Pixel Intensity')
axes[0].set_ylabel('Frequency')
axes[0].grid(True, alpha=0.3)

axes[1].hist(mask_ratios, bins=30, color='coral', alpha=0.7, edgecolor='black')
axes[1].set_title('Mask Coverage Ratio Distribution', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Mask Coverage Ratio')
axes[1].set_ylabel('Frequency')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Mean mask coverage: {np.mean(mask_ratios):.4f}")
print(f"Std mask coverage:  {np.std(mask_ratios):.4f}")

## 5. Visualize Data Augmentations

In [None]:
# Show 16 independent augmentations of one training sample in a 4x4 grid
import torch

# Training-time transform pipeline from the project helpers
# NOTE(review): the .numpy() calls below assume the pipeline returns torch
# tensors - confirm against src.utils.get_transforms.
train_transforms = get_transforms(256, 256, is_train=True)

# The first training image/mask pair is the source for every panel
sample_img = cv2.imread(train_images[0], cv2.IMREAD_GRAYSCALE)
sample_mask = cv2.imread(train_masks[0], cv2.IMREAD_GRAYSCALE)

fig, axes = plt.subplots(4, 4, figsize=(16, 16))
axes = axes.flatten()

for i, ax in enumerate(axes):
    # Each call applies the pipeline again to the same source pair
    augmented = train_transforms(image=sample_img, mask=sample_mask)
    aug_img = augmented['image'].squeeze().numpy()
    aug_mask = augmented['mask'].squeeze().numpy()

    # Replicate the single channel into three, then paint mask pixels red
    overlay = np.repeat(aug_img[..., np.newaxis], 3, axis=-1)
    overlay[aug_mask > 0.5] = [1.0, 0.0, 0.0]

    ax.imshow(overlay)
    ax.set_title(f'Augmentation {i+1}')
    ax.axis('off')

plt.suptitle(
    'Data Augmentation Examples (HFlip, Rotation ±20°, Shift/Scale ±10%)',
    fontsize=16,
    fontweight='bold',
)
plt.tight_layout()
plt.show()

## 6. Pixel Size and HC Statistics

In [None]:
# Load CSV files with pixel size and HC measurements
train_csv = pd.read_csv('../dataset/training_set_pixel_size_and_HC.csv')
test_csv = pd.read_csv('../dataset/test_set_pixel_size.csv')

print("Training Set CSV:")
print(train_csv.head())
print(f"\nShape: {train_csv.shape}")
print(f"\nColumns: {train_csv.columns.tolist()}")

print("\n" + "="*60)
print("\nTest Set CSV:")
print(test_csv.head())
print(f"\nShape: {test_csv.shape}")

# Resolve the column names before plotting. The first candidate is the
# spelling this notebook originally expected.
# NOTE(review): the second candidate is believed to be the header spelling of
# the official HC18 CSV release - confirm against the column list printed above.
HC_COLUMN_CANDIDATES = ('head_circumference_mm', 'head circumference (mm)')
PIXEL_SIZE_COLUMN_CANDIDATES = ('pixel_size_mm', 'pixel size(mm)')

hc_col = next((c for c in HC_COLUMN_CANDIDATES if c in train_csv.columns), None)
px_col = next((c for c in PIXEL_SIZE_COLUMN_CANDIDATES if c in train_csv.columns), None)

# Visualize HC distribution (left) and pixel size distribution (right).
# Both columns are required; a missing column is reported instead of the
# section being skipped silently or failing part-way with a KeyError.
if hc_col is not None and px_col is not None:
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    axes[0].hist(train_csv[hc_col], bins=30,
                 color='mediumseagreen', alpha=0.7, edgecolor='black')
    axes[0].set_title('Head Circumference Distribution (Training)',
                      fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Head Circumference (mm)')
    axes[0].set_ylabel('Frequency')
    axes[0].grid(True, alpha=0.3)

    axes[1].hist(train_csv[px_col], bins=30,
                 color='mediumpurple', alpha=0.7, edgecolor='black')
    axes[1].set_title('Pixel Size Distribution (Training)',
                      fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Pixel Size (mm)')
    axes[1].set_ylabel('Frequency')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print(f"\nHC Stats: Mean={train_csv[hc_col].mean():.2f} mm, "
          f"Std={train_csv[hc_col].std():.2f} mm")
    print(f"Pixel Size Stats: Mean={train_csv[px_col].mean():.4f} mm, "
          f"Std={train_csv[px_col].std():.4f} mm")
else:
    print(f"\nSkipping HC / pixel size plots: expected one of {HC_COLUMN_CANDIDATES} "
          f"and one of {PIXEL_SIZE_COLUMN_CANDIDATES} in the training CSV, "
          f"found {train_csv.columns.tolist()}")