# Land Cover Segmentation - Data Exploration

This notebook explores the land cover dataset and visualizes class distributions.

In [None]:
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))

import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

from src.utils import load_config
from src.data import LandCoverDataset, get_transforms
from src.utils.visualization import visualize_predictions, hex_to_rgb

# Set style
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'.
# Fall back to the legacy 'seaborn' name so the notebook also runs on older
# Matplotlib installations instead of failing on an unknown style.
_seaborn_style = 'seaborn-v0_8'
plt.style.use(_seaborn_style if _seaborn_style in plt.style.available else 'seaborn')
# Must come after style.use(): the style sheet sets its own colour cycle.
sns.set_palette("husl")

## Load Configuration

In [None]:
# Read the experiment configuration; later cells rely on `config`,
# `class_names` and `class_colors` being defined here.
config = load_config('../config/unet_config.yaml')

print("Class Information:")
class_names = config.get('classes.class_names')
class_colors = config.get('classes.class_colors')

# One "<index>: <name> - <colour>" line per class; zip() stops at the shorter
# of the two lists.
class_summary = [
    f"{class_idx}: {label} - {hex_code}"
    for class_idx, (label, hex_code) in enumerate(zip(class_names, class_colors))
]
for summary_line in class_summary:
    print(summary_line)

## Dataset Loading and Exploration

In [None]:
# Note: This assumes you have data in the specified directories
# Update paths according to your data location

try:
    # Build the dataset from the configured training directories, using the
    # 'val' transform pipeline.
    transforms = get_transforms('val', config.get('data'))
    train_images_dir = config.get('paths.train_images')
    train_masks_dir = config.get('paths.train_masks')
    dataset = LandCoverDataset(
        images_dir=train_images_dir,
        masks_dir=train_masks_dir,
        transform=transforms,
    )

    dataset_size = len(dataset)
    print(f"Dataset size: {dataset_size} samples")

    if dataset_size == 0:
        print("No data found - using synthetic examples")
    else:
        # Inspect the first sample. This stays inside the try so that a
        # failure while reading it also triggers the synthetic fallback.
        sample_image, sample_mask = dataset[0]
        print(f"Image shape: {sample_image.shape}")
        print(f"Mask shape: {sample_mask.shape}")
        print(f"Unique classes in sample: {torch.unique(sample_mask)}")

except Exception as load_error:
    # Deliberate best-effort: any loading problem is reported and the notebook
    # continues; `dataset = None` tells later cells to use synthetic samples.
    print(f"Could not load dataset: {load_error}")
    print("Using synthetic examples for demonstration")
    dataset = None

## Class Distribution Visualization

In [None]:
# Expected class distribution from problem statement (percentage per class)
expected_distribution = {
    'Bareland': 1.5,
    'Rangeland': 22.9,
    'Developed space': 16.1,
    'Road': 6.7,
    'Tree': 20.2,
    'Water': 3.3,
    'Agriculture land': 13.7,
    'Building': 15.6
}

# Create visualization: bar chart on the left, pie chart on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

classes = list(expected_distribution.keys())
percentages = list(expected_distribution.values())

# Matplotlib accepts RGB colours as tuples of floats in [0, 1]; a
# comma-separated string such as "0.50, 0.20, 0.10" is rejected as an
# invalid colour, so build real tuples.
# NOTE(review): assumes hex_to_rgb returns 0-255 channel values and that
# class_colors is ordered like expected_distribution -- confirm against config.
bar_colors = [
    tuple(channel / 255.0 for channel in hex_to_rgb(hex_code)[:3])
    for hex_code in class_colors
]

# Bar plot at explicit numeric positions so the tick locations are fixed
# before the rotated labels are attached.
bar_positions = np.arange(len(classes))
bars = ax1.bar(bar_positions, percentages, color=bar_colors)
ax1.set_title('Expected Class Distribution')
ax1.set_ylabel('Percentage (%)')
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(classes, rotation=45, ha='right')

# Add percentage labels on bars (centred horizontally, slightly above the top)
for bar, pct in zip(bars, percentages):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
             f'{pct}%', ha='center', va='bottom')

# Pie chart
wedges, texts, autotexts = ax2.pie(percentages, labels=classes, autopct='%1.1f%%', startangle=90)
ax2.set_title('Class Distribution (Pie Chart)')

plt.tight_layout()
plt.show()

## Color Palette Visualization

In [None]:
# Draw one unit-square swatch per class, labelled with the class name inside
# the square and the configured colour code underneath it.
fig, ax = plt.subplots(1, 1, figsize=(12, 3))

for slot, (label, hex_code) in enumerate(zip(class_names, class_colors)):
    channels = hex_to_rgb(hex_code)
    face_color = [value / 255.0 for value in channels]
    # Light swatches get black text, dark swatches white text, split on the
    # channel sum.
    label_color = 'black' if sum(channels) >= 384 else 'white'
    center_x = slot + 0.5

    # Swatch occupying [slot, slot + 1] x [0, 1]
    ax.add_patch(
        plt.Rectangle((slot, 0), 1, 1, facecolor=face_color, edgecolor='black', linewidth=1)
    )

    # Class name centred on the swatch
    ax.text(center_x, 0.5, label, ha='center', va='center',
            rotation=45, fontsize=10, fontweight='bold',
            color=label_color)

    # Colour code just below the swatch
    ax.text(center_x, -0.2, hex_code, ha='center', va='center',
            fontsize=8, color='black')

# Frame the row of swatches, leaving room below for the colour codes.
ax.set_xlim(0, len(class_names))
ax.set_ylim(-0.3, 1.1)
ax.set_aspect('equal')
ax.axis('off')
ax.set_title('Land Cover Class Color Palette', fontsize=14, fontweight='bold', pad=20)

plt.tight_layout()
plt.show()

## Sample Data Visualization

In [None]:
# Create synthetic examples if real data is not available
def create_synthetic_example(size=(256, 256), seed=None):
    """Create a synthetic land cover example.

    Args:
        size: ``(height, width)`` of the generated sample in pixels.
        seed: Optional seed for the image noise. With the default ``None``
            the legacy global NumPy RNG is used (so ``np.random.seed`` still
            applies); passing a value makes the image reproducible.

    Returns:
        tuple: ``(image, mask)`` where ``image`` is a float32 array of shape
        ``(height, width, 3)`` filled with uniform random noise and ``mask``
        is an int64 array of shape ``(height, width)`` holding class indices.
        The mask layout is deterministic and does not depend on ``seed``.
    """
    h, w = size

    # Create synthetic image
    if seed is None:
        noise = np.random.rand(h, w, 3)
    else:
        noise = np.random.default_rng(seed).random((h, w, 3))
    image = noise.astype(np.float32)

    # Create synthetic mask with different regions. Regions are painted in
    # order, so later assignments overwrite earlier ones where they overlap;
    # pixels never painted stay 0.
    mask = np.zeros((h, w), dtype=np.int64)

    # Water (class 5) - bottom area
    mask[int(0.8*h):, :] = 5

    # Trees (class 4) - left side
    mask[:int(0.7*h), :int(0.3*w)] = 4

    # Agriculture (class 6) - center
    mask[int(0.2*h):int(0.6*h), int(0.3*w):int(0.7*w)] = 6

    # Buildings (class 7) - top right
    mask[:int(0.3*h), int(0.7*w):] = 7

    # Roads (class 3) - connecting paths, drawn last so they cut through
    # every other region
    mask[int(0.4*h):int(0.5*h), :] = 3  # horizontal road
    mask[:, int(0.6*w):int(0.65*w)] = 3  # vertical road

    return image, mask

# Collect the samples to display: up to four real ones when a non-empty
# dataset was loaded, otherwise four synthetic stand-ins.
images, masks = [], []
has_real_samples = dataset is not None and len(dataset) > 0

if not has_real_samples:
    # Synthetic fallback
    for _ in range(4):
        synthetic_image, synthetic_mask = create_synthetic_example()
        images.append(synthetic_image)
        masks.append(synthetic_mask)
else:
    # Real data
    for sample_idx in range(min(4, len(dataset))):
        image_tensor, mask_tensor = dataset[sample_idx]
        # Move the first tensor axis to the end and convert both tensors to
        # NumPy arrays for plotting.
        image_array = image_tensor.permute(1, 2, 0).numpy()
        mask_array = mask_tensor.numpy()
        images.append(image_array)
        masks.append(mask_array)

# Visualize examples: top row shows the images, bottom row the colour-coded masks
from src.utils.visualization import mask_to_rgb, create_color_map

color_map = create_color_map(class_colors)

# Size the grid from the number of samples actually collected. A real dataset
# with fewer than four items would otherwise raise IndexError on images[i].
num_samples = min(len(images), len(masks))

if num_samples > 0:
    # squeeze=False keeps `axes` two-dimensional even with a single column;
    # four samples give the same 16x8 figure as before.
    fig, axes = plt.subplots(2, num_samples, figsize=(4 * num_samples, 8), squeeze=False)

    for i in range(num_samples):
        # Original image
        axes[0, i].imshow(images[i])
        axes[0, i].set_title(f'Sample {i+1} - Original')
        axes[0, i].axis('off')

        # Segmentation mask
        mask_rgb = mask_to_rgb(masks[i], color_map)
        axes[1, i].imshow(mask_rgb)
        axes[1, i].set_title(f'Sample {i+1} - Segmentation')
        axes[1, i].axis('off')

    plt.tight_layout()
    plt.show()

## Class Analysis

In [None]:
# Analyze class statistics from the samples
def count_class_pixels(masks, num_classes):
    """Count pixels per class index over a collection of label masks.

    Args:
        masks: Iterable of integer label arrays (any shape).
        num_classes: Number of known classes; valid labels are
            ``0 .. num_classes - 1``.

    Returns:
        np.ndarray: int64 array of length ``num_classes`` whose entry ``k`` is
        the number of pixels labelled ``k``. Labels outside the valid range
        (negative or too large, e.g. an ignore index) are skipped.
    """
    totals = np.zeros(num_classes, dtype=np.int64)
    for label_map in masks:
        labels = np.asarray(label_map).ravel()
        # Drop out-of-range labels up front: bincount rejects negatives and
        # would grow the result for labels >= num_classes.
        in_range = (labels >= 0) & (labels < num_classes)
        totals += np.bincount(labels[in_range].astype(np.int64), minlength=num_classes)
    return totals


if len(masks) > 0:
    # Count pixels for each class across all samples
    pixel_counts = count_class_pixels(masks, len(class_names))
    class_counts = {class_id: int(count) for class_id, count in enumerate(pixel_counts)}
    total_pixels = int(pixel_counts.sum())

    # Calculate percentages (all zero when no valid pixel was counted)
    sample_distribution = {
        name: (class_counts[class_id] / total_pixels) * 100 if total_pixels > 0 else 0
        for class_id, name in enumerate(class_names)
    }

    # Create comparison plot: expected vs. observed share, side by side per class
    fig, ax = plt.subplots(1, 1, figsize=(12, 6))

    x_pos = np.arange(len(class_names))
    width = 0.35

    # Classes missing from either distribution are plotted as 0
    expected_vals = [expected_distribution.get(name, 0) for name in class_names]
    sample_vals = [sample_distribution.get(name, 0) for name in class_names]

    ax.bar(x_pos - width/2, expected_vals, width, label='Expected', alpha=0.8)
    ax.bar(x_pos + width/2, sample_vals, width, label='Sample Data', alpha=0.8)

    ax.set_xlabel('Land Cover Classes')
    ax.set_ylabel('Percentage (%)')
    ax.set_title('Expected vs Sample Class Distribution')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(class_names, rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.show()

    # Print statistics
    print("\nClass Distribution Comparison:")
    print(f"{'Class':<20} {'Expected %':<12} {'Sample %':<12} {'Difference':<12}")
    print("-" * 60)

    for name in class_names:
        exp_val = expected_distribution.get(name, 0)
        sample_val = sample_distribution.get(name, 0)
        diff = sample_val - exp_val
        print(f"{name:<20} {exp_val:<12.1f} {sample_val:<12.1f} {diff:<12.1f}")

## Data Insights and Recommendations

Based on the exploration above, here are key insights:

1. **Class Imbalance**: The expected class distribution is significantly imbalanced: Rangeland (22.9%) and Tree (20.2%) are the most common classes, while Bareland (1.5%) and Water (3.3%) are rare.

2. **Color Palette**: Each class has a distinct color for easy visualization and interpretation.

3. **Recommendations**:
   - Use class weights in loss functions to handle imbalance
   - Consider data augmentation for underrepresented classes
   - Monitor per-class performance metrics during training
   - Use appropriate evaluation metrics (IoU, F1-score) that account for imbalance