# Arthropod Classification Pipeline - Quick Start Example

This notebook demonstrates the complete workflow of the arthropod classification pipeline:

1. **Image Processing**: Detection, segmentation, and extraction of specimens
2. **Classification**: Hierarchical taxonomic classification
3. **Export**: Results export to Excel and CSV

## Prerequisites

- Python 3.8+
- All dependencies installed (`pip install -r requirements.txt`)
- YOLO models trained and available in `data/models/`
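- Sample composite images in `data/raw/`
- The notebook's working directory is a folder directly under the project root (the setup cell treats the parent of the working directory as the project root)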

## Setup

In [None]:
# Import required modules
from pathlib import Path
import sys

# Add project root to path.
# The project root is taken to be the parent of the current working directory,
# so the working directory must be a folder directly under the root.
project_root = Path.cwd().parent
# Insert only when the root is not already the first entry, so that re-running
# this cell does not stack duplicate entries on sys.path.
if sys.path[:1] != [str(project_root)]:
    sys.path.insert(0, str(project_root))

# These imports rely on the sys.path entry added above, so they must stay below it.
from src.config import Config
from src.image_processing import SpecimenDetector, SpecimenSegmenter, SpecimenExtractor
from src.classification import TaxonomyHierarchy, InferenceEngine
from src.export import ExcelExporter, CSVExporter, StatisticsCalculator
from src.utils.logging_config import setup_logging, get_logger

# Setup logging
setup_logging(log_file=project_root / "logs" / "notebook.log")
logger = get_logger(__name__)

print("✓ Imports successful")

## 1. Load Configuration

In [None]:
# Read the default YAML configuration shipped with the project
config_path = project_root / "config" / "default_config.yaml"
config = Config(config_path=config_path)

# Echo the settings that matter most for this walkthrough, one per line
key_settings = (
    ("Device", 'device'),
    ("Data root", 'paths.data_root'),
    ("Detection confidence", 'image_processing.detection.confidence_threshold'),
    ("Segmentation confidence", 'image_processing.segmentation.confidence_threshold'),
)
for label, key in key_settings:
    print(f"{label}: {config.get(key)}")

## 2. Image Processing

Process a composite image to detect, segment, and extract individual specimens.

In [None]:
# Define input parameters
sample_id = "S001"
size_fraction = "1"

# Derive the composite image name from the parameters above so that changing
# `sample_id` / `size_fraction` also selects the matching input image.
# For the defaults this resolves to data/raw/composite_S001_1mm.png.
# NOTE(review): the composite_<sample>_<fraction>mm.png pattern is inferred from
# that single example file name - confirm it matches your naming scheme.
composite_image_path = (
    project_root / "data" / "raw" / f"composite_{sample_id}_{size_fraction}mm.png"
)

# Extracted specimens for this sample are written below data/processed/<sample_id>
output_dir = project_root / "data" / "processed" / sample_id

print(f"Processing: {composite_image_path}")
print(f"Sample ID: {sample_id}")
print(f"Size fraction: {size_fraction}")

### 2.1 Detection

In [None]:
# Collect the detector settings from the configuration, then build the detector
detector_options = {
    "size_fraction": size_fraction,
    "model_path": config.get('image_processing.detection.model_path'),
    "confidence_threshold": config.get('image_processing.detection.confidence_threshold'),
    "device": config.get('device'),
}
detector = SpecimenDetector(**detector_options)

# Run detection on the composite image
detections = detector.detect_specimens(composite_image_path)

print(f"✓ Detected {len(detections)} specimens")
print("\nFirst 3 detections:")
# Preview at most three detections, numbered from 1
for rank, detection in enumerate(detections[:3], start=1):
    print(f"  {rank}. BBox: {detection['bounding_box']}, Confidence: {detection['confidence']:.3f}")

### 2.2 Segmentation

In [None]:
# Pull the segmentation settings out of the configuration first
seg_model_path = config.get('image_processing.segmentation.model_path')
seg_confidence = config.get('image_processing.segmentation.confidence_threshold')
seg_dilation = config.get('image_processing.segmentation.dilation_factor')

# Build the segmenter for the chosen size fraction
segmenter = SpecimenSegmenter(
    size_fraction=size_fraction,
    model_path=seg_model_path,
    confidence_threshold=seg_confidence,
    dilation_factor=seg_dilation,
    device=config.get('device'),
)

# Segment the specimens found by the detection step
segmentations = segmenter.segment_specimens(composite_image_path, detections)

print(f"✓ Segmented {len(segmentations)} specimens")
print("\nFirst 3 segmentations:")
# Preview at most three segmentations, numbered from 1
for number, segmentation in enumerate(segmentations[:3], 1):
    print(f"  {number}. BBox: {segmentation['bounding_box']}, Confidence: {segmentation['confidence']:.3f}")

### 2.3 Extraction

In [None]:
# Build the extractor that writes into the per-sample output directory
extractor = SpecimenExtractor(output_dir=output_dir)

# Extract one result per segmentation
extraction_results = extractor.extract_specimens(
    segmentations=segmentations, sample_id=sample_id, size_fraction=size_fraction
)

print(f"✓ Extracted {len(extraction_results)} specimens to {output_dir}")

# Summarise the extraction run
stats = extractor.get_statistics(extraction_results)
print("\nStatistics:")
extraction_report = (
    ("Total specimens", f"{stats['total_specimens']}"),
    ("Avg confidence", f"{stats['avg_confidence']:.3f}"),
    ("Size range", f"{stats['min_size']}px - {stats['max_size']}px"),
)
for label, value in extraction_report:
    print(f"  {label}: {value}")

### 2.4 Visualize Sample Specimens

In [None]:
# Display first 3 extracted specimens
from PIL import Image
import matplotlib.pyplot as plt

# Sorted so the same three files are shown on every run
specimen_files = sorted(output_dir.glob("*.png"))[:3]

if not specimen_files:
    # plt.subplots() rejects a zero-column grid, so report the empty
    # directory instead of failing with an unrelated-looking ValueError.
    print(f"No extracted specimen images found in {output_dir}")
else:
    # squeeze=False always returns a 2-D array of Axes, so the single-image
    # case needs no special handling; take the only row.
    fig, axes_grid = plt.subplots(
        1, len(specimen_files), figsize=(12, 4), squeeze=False
    )
    axes = axes_grid[0]

    for ax, img_path in zip(axes, specimen_files):
        # The context manager closes the underlying file once the pixel data
        # has been handed to matplotlib.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(img_path.name)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 3. Classification

Classify extracted specimens using hierarchical taxonomic classification.

### 3.1 Load Taxonomy

In [None]:
# Build the taxonomy hierarchy from the catalogue CSV, bounded by the configured years
taxonomy_csv = project_root / "data" / "taxonomy" / "catalogue_of_life.csv"
year_from = config.get('classification.start_year')
year_to = config.get('classification.end_year')
taxonomy = TaxonomyHierarchy(csv_path=taxonomy_csv, start_year=year_from, end_year=year_to)

# Report the hierarchy size, its root, and up to five children of the root
taxon_count = len(taxonomy.hierarchy)
print(f"✓ Loaded taxonomy with {taxon_count} taxa")
print(f"  Root: {taxonomy.root_id}")
major_groups = list(taxonomy.get_children(taxonomy.root_id))[:5]
print(f"  Major groups: {major_groups}")

### 3.2 Run Inference

In [None]:
# Initialize inference engine
model_dir = project_root / config.get('paths.models')
inference_engine = InferenceEngine(
    model_dir=model_dir,
    model_set=config.get('classification.model_set'),
    taxonomy=taxonomy,
    default_threshold=config.get('classification.default_threshold'),
    device=config.get('device')
)

# Classify all specimens in output directory.
# The glob result is sorted so the classification order (and therefore the
# order of rows in the exported files) is the same on every run and matches
# the sorted listing used in the visualization cell.
# NOTE(review): every PNG in output_dir is picked up, including files left
# over from earlier runs - clear the directory first if that matters.
specimen_images = sorted(output_dir.glob("*.png"))
print(f"Classifying {len(specimen_images)} specimens...")

classification_results = inference_engine.classify_batch(
    image_paths=specimen_images,
    batch_size=config.get('classification.batch_size')
)

print(f"✓ Classified {len(classification_results)} specimens")
print(f"\nFirst 3 classifications:")
# Preview at most three results, numbered from 1
for i, result in enumerate(classification_results[:3]):
    print(f"  {i+1}. {result['image_path'].name}:")
    print(f"     Taxon: {result['predicted_taxon']}")
    print(f"     Confidence: {result['confidence']:.3f}")
    print(f"     Path: {' → '.join(result['path'])}")

## 4. Export Results

Export classification results to Excel and CSV formats.

### 4.1 Calculate Statistics

In [None]:
# Calculate comprehensive statistics
stats_calculator = StatisticsCalculator()

# Basic statistics
basic_stats = stats_calculator.calculate_basic_stats(classification_results)
print("Basic Statistics:")
print(f"  Total specimens: {basic_stats['total_specimens']}")
print(f"  Unique taxa: {basic_stats['unique_taxa']}")
print(f"  Avg confidence: {basic_stats['avg_confidence']:.3f}")

# Per-taxon statistics
per_taxon_stats = stats_calculator.calculate_per_taxon_stats(classification_results)
print(f"\nTop 5 taxa by count:")
# Rank taxa by specimen count, largest first, and keep the top five
sorted_taxa = sorted(per_taxon_stats.items(), key=lambda item: item[1]['count'], reverse=True)[:5]
# The loop variable is named `taxon_stats` rather than `stats` so it does not
# overwrite the extraction statistics held in `stats` by the extraction cell.
for taxon, taxon_stats in sorted_taxa:
    print(f"  {taxon}: {taxon_stats['count']} specimens (avg conf: {taxon_stats['avg_confidence']:.3f})")

### 4.2 Export to Excel

In [None]:
# Make sure the Excel output folder exists, then create the exporter for it
excel_output_dir = project_root / "output" / "excel"
excel_output_dir.mkdir(exist_ok=True, parents=True)
excel_exporter = ExcelExporter(output_dir=excel_output_dir)

# Write the workbook: results plus basic statistics, with the per-taxon and
# hierarchy options switched on
excel_filename = f"{sample_id}_results.xlsx"
excel_options = {"include_per_taxon": True, "include_hierarchy": True}
excel_path = excel_exporter.export(
    results=classification_results,
    filename=excel_filename,
    statistics=basic_stats,
    **excel_options,
)

print(f"✓ Excel exported to: {excel_path}")

### 4.3 Export to CSV

In [None]:
# Make sure the CSV output folder exists, then create the exporter for it
csv_output_dir = project_root / "output" / "csv"
csv_output_dir.mkdir(exist_ok=True, parents=True)
csv_exporter = CSVExporter(output_dir=csv_output_dir)

# Per-specimen results, with the path option switched on
results_filename = f"{sample_id}_results.csv"
csv_path = csv_exporter.export(
    results=classification_results, filename=results_filename, include_path=True
)

# Summary statistics go to their own file
summary_filename = f"{sample_id}_summary.csv"
summary_path = csv_exporter.export_summary(summary=basic_stats, filename=summary_filename)

# Report where each file was written
for label, exported_path in (("CSV", csv_path), ("Summary", summary_path)):
    print(f"✓ {label} exported to: {exported_path}")

### 4.4 View Results as DataFrame

In [None]:
# Read the exported results CSV back in for inspection
import pandas as pd

df = pd.read_csv(csv_path)
frame_shape = df.shape
print(f"Results DataFrame shape: {frame_shape}")
print("\nFirst 10 rows:")
# Keep the preview as the last expression so the notebook renders it as a table
df.head(10)

## 5. Summary

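If every cell above ran without errors, the complete pipeline has executed successfully. The cell below prints a recap of the run: specimen counts per stage, classification statistics, and output locations.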

In [None]:
# Assemble the recap line by line, then print it in one go
divider = "=" * 60
summary_lines = [
    divider,
    "PIPELINE SUMMARY",
    divider,
    f"Sample ID: {sample_id}",
    f"Size fraction: {size_fraction}",
    "",
    "Image Processing:",
    f"  Detections: {len(detections)}",
    f"  Segmentations: {len(segmentations)}",
    f"  Extracted: {len(extraction_results)}",
    "",
    "Classification:",
    f"  Total classified: {len(classification_results)}",
    f"  Unique taxa: {basic_stats['unique_taxa']}",
    f"  Avg confidence: {basic_stats['avg_confidence']:.3f}",
    "",
    "Output:",
    f"  Specimens: {output_dir}",
    f"  Excel: {excel_path}",
    f"  CSV: {csv_path}",
    divider,
]
print("\n".join(summary_lines))

## Next Steps

- Process additional samples by updating the input parameters in Section 2 (`composite_image_path`, `sample_id`, and `size_fraction`) and re-running the notebook
- Process multiple samples using the batch script: `scripts/02_process_images.py`
- Train your own models: `scripts/03_train_models.py`
- Optimize the detection, segmentation, and classification confidence thresholds in `config/default_config.yaml` for your data
- Explore results in Excel or load CSVs in R/Python for further analysis

For more information, see:
- `README.md` - General overview
- `docs/INSTALLATION.md` - Installation guide
- `docs/ARCHITECTURE.md` - System architecture
- Module-specific READMEs in `src/` subdirectories