# BloomWatch: Data Exploration

Comprehensive exploration of plant bloom datasets for the BloomWatch project.

In [None]:
# Setup and imports
import sys
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch

# Add project root to path
# Assumes the kernel's working directory is a direct child of the project
# root, so the parent directory is taken as the root.
project_root = Path.cwd().parent
# Cells get re-run all the time in a notebook; only register the root once so
# sys.path does not accumulate duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# BloomWatch imports
from data import PlantBloomDataset, ImageProcessor
from visualization import plot_dataset_samples, create_data_explorer
from utils import get_logger

# Select the matplotlib style sheet used by every figure drawn below.
# NOTE(review): the 'seaborn-v0_8' style name is presumably only available on
# newer matplotlib releases — confirm against the environment's version.
plt.style.use('seaborn-v0_8')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
print("Setup complete!")

In [None]:
# Load dataset
# Images are read from <project_root>/data/raw and labels from
# <project_root>/data/annotations.csv.
data_dir = project_root / "data" / "raw"
annotations_file = project_root / "data" / "annotations.csv"
# Make sure the raw-data directory exists before handing it to the dataset.
data_dir.mkdir(parents=True, exist_ok=True)

image_processor = ImageProcessor(image_size=(224, 224), normalize=True)

try:
    dataset = PlantBloomDataset(
        data_dir=str(data_dir),
        annotations_file=str(annotations_file),
        transform=image_processor,
        stage='train'
    )
    print(f"Dataset loaded: {len(dataset)} samples")
    print(f"Classes: {list(dataset.STAGE_NAMES.values())}")
except Exception as e:
    # Deliberately best-effort: the notebook keeps going without a dataset.
    # Later cells decide whether to run via `'dataset' in locals()`, so remove
    # any binding left by an earlier run (or by a half-finished load above);
    # otherwise they would silently keep using a stale or broken object.
    # At cell top level globals() and locals() are the same namespace.
    globals().pop('dataset', None)
    print(f"Using dummy data: {e}")

In [None]:
# Dataset statistics
# Skipped entirely when no `dataset` name is bound in this namespace.
if 'dataset' in locals():
    stats = dataset.get_statistics()

    print("Dataset Statistics:")
    for stat_name, stat_value in stats.items():
        print(f"{stat_name}: {stat_value}")

    # Sample examination: show the first item when the dataset is non-empty
    sample_count = len(dataset)
    if sample_count > 0:
        first_sample = dataset[0]
        image, label, metadata = first_sample
        print("\nSample 0:")
        print(f"Image shape: {image.shape}")
        print(f"Label: {label} ({dataset.STAGE_NAMES[label]})")
        print(f"Metadata: {metadata}")

In [None]:
# Visualizations
# Skipped entirely when no `stats` name is bound in this namespace.
if 'stats' in locals():
    bloom_dist = stats['bloom_stage_distribution']

    # One wide figure holding two side-by-side panels
    plt.figure(figsize=(12, 5))

    # Left panel: per-stage counts as bars
    plt.subplot(1, 2, 1)
    classes = [*bloom_dist.keys()]
    counts = [*bloom_dist.values()]
    plt.bar(classes, counts, color='skyblue', alpha=0.8)
    plt.title('Bloom Stage Distribution')
    plt.xticks(rotation=45)

    # Right panel: the same counts as proportions
    plt.subplot(1, 2, 2)
    plt.pie(
        counts,
        labels=classes,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title('Stage Proportions')

    plt.tight_layout()
    plt.show()

In [None]:
# Data quality assessment
# Metrics that count problems. 'total_samples' is a size, not a problem
# count, so it is scored and displayed separately.
ISSUE_METRICS = ('missing_labels', 'duplicate_entries', 'invalid_labels')


def compute_quality_score(total_issues, total_samples):
    """Return a 0-100 score: 100 minus the percentage of samples with issues.

    Args:
        total_issues: Combined count of all reported issues.
        total_samples: Number of samples the issues are measured against.

    Returns:
        The score, clamped so it never drops below 0. An empty dataset
        (``total_samples <= 0``) scores 0.0 instead of dividing by zero,
        because there is nothing to vouch for.
    """
    if total_samples <= 0:
        return 0.0
    return max(0, 100 - (total_issues / total_samples * 100))


quality_metrics = {
    # Falls back to a placeholder size of 100 when no dataset is bound.
    'total_samples': len(dataset) if 'dataset' in locals() else 100,
    # NOTE(review): the issue counts below are hard-coded, not measured.
    'missing_labels': 0,
    'duplicate_entries': 0,
    'invalid_labels': 0
}

total_issues = sum(quality_metrics[name] for name in ISSUE_METRICS)
quality_score = compute_quality_score(total_issues, quality_metrics['total_samples'])

print("Data Quality Report:")
print(f"Quality Score: {quality_score:.1f}/100")
for metric, value in quality_metrics.items():
    if metric in ISSUE_METRICS:
        # Issue counters are healthy only when they are zero.
        status = "✓" if value == 0 else "⚠️"
    else:
        # The sample count is the opposite: zero samples is the problem.
        status = "✓" if value > 0 else "⚠️"
    print(f"{status} {metric.replace('_', ' ').title()}: {value}")

print("\nNext Steps:")
print("1. Run training experiments")
print("2. Try data augmentation")
print("3. Evaluate model performance")