# EMSN 2.0 - Vocalization Classifier Training Batch 2
## Geoptimaliseerd voor Colab Pro (A100/V100)

Train de **61 ontbrekende soorten** met maximale GPU acceleratie.

### Optimalisaties:
- **Parallel downloads** - 8 gelijktijdige audio downloads
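- **Batch size** - 32 op elke GPU (stability mode in de code; oorspronkelijk gepland: 128 op A100, 64 op V100/T4)
- **Mixed Precision (FP16)** - staat in de code uit (`USE_AMP = False`) voor stabiliteit; ingeschakeld geeft dit ~2x snellere training op A100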
- **Multi-worker DataLoader** - CPU preprocessing parallel aan GPU
- **Async spectrogram generatie** - ThreadPool voor I/O

**Geschatte tijd:** 
- A100: ~20-30 minuten
- V100: ~45-60 minuten  
- T4: ~90 minuten

In [None]:
# Check GPU en bepaal optimale instellingen
!nvidia-smi

import torch
import gc

# Clean GPU memory
torch.cuda.empty_cache()
gc.collect()

print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU: {gpu_name}")
    print(f"GPU Memory: {gpu_mem:.1f} GB")
    
    # === STABILITY SETTINGS ===
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False
    torch.backends.cudnn.benchmark = False  # Meer stabiel
    torch.backends.cudnn.deterministic = True
    print("‚úÖ CUDA stability settings toegepast")
    
    # Configuratie - conservatief voor stabiliteit
    if 'A100' in gpu_name:
        GPU_TYPE = 'A100'
        RECOMMENDED_BATCH = 32  # Klein voor stabiliteit
        print(f"\nüöÄ A100 gedetecteerd - Stability mode")
    elif 'V100' in gpu_name:
        GPU_TYPE = 'V100'
        RECOMMENDED_BATCH = 32
    elif 'L4' in gpu_name:
        GPU_TYPE = 'L4'
        RECOMMENDED_BATCH = 32
    else:
        GPU_TYPE = 'T4'
        RECOMMENDED_BATCH = 32
        print(f"\n‚úÖ T4 gedetecteerd")
    
    # MIXED PRECISION UITGESCHAKELD - voorkomt CUDA errors
    USE_AMP = False
    print("‚ö†Ô∏è Mixed Precision UITGESCHAKELD voor stabiliteit")
    
else:
    GPU_TYPE = 'CPU'
    RECOMMENDED_BATCH = 16
    USE_AMP = False
    print("‚ö†Ô∏è Geen GPU!")

In [None]:
# Install dependencies (including the async/parallel libraries); -q keeps pip quiet
!pip install librosa scikit-learn scikit-image matplotlib tqdm requests aiohttp aiofiles -q
print("‚úÖ Dependencies ge√Ønstalleerd")

In [None]:
# Lokale opslag (geen Google Drive nodig)
import os

# Use Colab-local storage; no Google Drive mount is needed.
# The DRIVE_BASE name is kept even though it points at /content.
DRIVE_BASE = '/content/EMSN-Vocalization'
MODELS_DIR = DRIVE_BASE + '/models'
AUDIO_DIR = DRIVE_BASE + '/audio'

# Create both directories up front (no error if they already exist)
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(AUDIO_DIR, exist_ok=True)

# One print call, one line per argument: same output as separate prints
print(
    f"‚úÖ Lokale opslag geconfigureerd:",
    f"   Models: {MODELS_DIR}",
    f"   Audio: {AUDIO_DIR}",
    f"\n‚ö†Ô∏è Let op: Data verdwijnt na sessie!",
    f"   Download modellen aan het eind via cel 12",
    sep="\n",
)

In [None]:
# === CONFIGURATION (derived from the GPU detected in the first cell) ===
VERSION = '2025'        # used in model and results file names
EPOCHS = 25             # training epochs per species
LEARNING_RATE = 0.001   # Adam learning rate
# NOTE(review): MIN_SAMPLES is not referenced by the visible training code,
# which uses a hard-coded minimum of 30 spectrograms — confirm which is intended.
MIN_SAMPLES = 50

# Batch size and DataLoader worker count follow the detected GPU type
BATCH_SIZE = RECOMMENDED_BATCH
NUM_WORKERS = 4 if GPU_TYPE in ['A100', 'V100', 'L4'] else 2

# Parallel download settings
MAX_CONCURRENT_DOWNLOADS = 8 if GPU_TYPE in ['A100', 'V100'] else 4
MAX_RECORDINGS_PER_TYPE = 25  # recordings requested per vocalization type

print(f"üìä Configuratie voor {GPU_TYPE}:")
print(f"   Batch size: {BATCH_SIZE}")
print(f"   DataLoader workers: {NUM_WORKERS}")
print(f"   Parallel downloads: {MAX_CONCURRENT_DOWNLOADS}")
print(f"   Mixed Precision: {USE_AMP}")
print(f"   Epochs: {EPOCHS}")

# --- Xeno-canto API key ------------------------------------------------------
# The key is a credential and must not be committed with the notebook. It is
# read from the XC_API_KEY environment variable, or failing that from a Colab
# secret of the same name. A key that was previously hard-coded here should be
# treated as leaked and revoked.
import os

XC_API_KEY = os.environ.get('XC_API_KEY', '').strip()

if not XC_API_KEY:
    try:
        from google.colab import userdata
        XC_API_KEY = (userdata.get('XC_API_KEY') or '').strip()
    except Exception:
        # Not running on Colab, secret not defined, or notebook access to the
        # secret not granted: fall through to the warning below.
        XC_API_KEY = ''

if not XC_API_KEY:
    print("\n‚ö†Ô∏è  WAARSCHUWING: Geen API key ingevuld!")
    print("   Vul je key in en run deze cel opnieuw.")
else:
    print(f"\n‚úÖ API key geconfigureerd ({len(XC_API_KEY)} karakters)")

In [None]:
# Species still missing a model, to be trained in batch 2
# Format: (Dutch name, scientific name, directory name)

MISSING_SPECIES = [
    # Priority 1 - VERY IMPORTANT
    ("Kauw", "Coloeus monedula", "kauw"),
    ("Kokmeeuw", "Chroicocephalus ridibundus", "kokmeeuw"),
    ("Nijlgans", "Alopochen aegyptiaca", "nijlgans"),
    
    # Priority 2 - Regular
    ("Bergeend", "Tadorna tadorna", "bergeend"),
    ("Blauwe Kiekendief", "Circus cyaneus", "blauwe_kiekendief"),
    ("Bonte Strandloper", "Calidris alpina", "bonte_strandloper"),
    ("Boomvalk", "Falco subbuteo", "boomvalk"),
    ("Bosrietzanger", "Acrocephalus palustris", "bosrietzanger"),
    ("Bosruiter", "Tringa glareola", "bosruiter"),
    ("Braamsluiper", "Curruca curruca", "braamsluiper"),
    ("Brilduiker", "Bucephala clangula", "brilduiker"),
    ("Drieteenstrandloper", "Calidris alba", "drieteenstrandloper"),
    ("Eider", "Somateria mollissima", "eider"),
    ("Fluiter", "Phylloscopus sibilatrix", "fluiter"),
    ("Gele Kwikstaart", "Motacilla flava", "gele_kwikstaart"),
    ("Goudplevier", "Pluvialis apricaria", "goudplevier"),
    ("Grasmus", "Curruca communis", "grasmus"),
    ("Groenpootruiter", "Tringa nebularia", "groenpootruiter"),
    ("Grote Gele Kwikstaart", "Motacilla cinerea", "grote_gele_kwikstaart"),
    ("Grote Zaagbek", "Mergus merganser", "grote_zaagbek"),
    ("IJsvogel", "Alcedo atthis", "ijsvogel"),
    ("Kanoetstrandloper", "Calidris canutus", "kanoetstrandloper"),
    ("Keep", "Fringilla montifringilla", "keep"),
    ("Kemphaan", "Calidris pugnax", "kemphaan"),
    ("Kleine Rietgans", "Anser brachyrhynchus", "kleine_rietgans"),
    ("Kleine Strandloper", "Calidris minuta", "kleine_strandloper"),
    ("Kluut", "Recurvirostra avosetta", "kluut"),
    ("Koekoek", "Cuculus canorus", "koekoek"),
    ("Mandarijneend", "Aix galericulata", "mandarijneend"),
    ("Middelste Zaagbek", "Mergus serrator", "middelste_zaagbek"),
    ("Nonnetje", "Mergellus albellus", "nonnetje"),
    ("Oeverloper", "Actitis hypoleucos", "oeverloper"),
    ("Paapje", "Saxicola rubetra", "paapje"),
    ("Pijlstaart", "Anas acuta", "pijlstaart"),
    ("Ransuil", "Asio otus", "ransuil"),
    ("Regenwulp", "Numenius phaeopus", "regenwulp"),
    ("Rietzanger", "Acrocephalus schoenobaenus", "rietzanger"),
    ("Rode Wouw", "Milvus milvus", "rode_wouw"),
    ("Roodhalsfuut", "Podiceps grisegena", "roodhalsfuut"),
    ("Rosse Grutto", "Limosa lapponica", "rosse_grutto"),
    ("Sijs", "Spinus spinus", "sijs"),
    ("Slobeend", "Spatula clypeata", "slobeend"),
    ("Smelleken", "Falco columbarius", "smelleken"),
    ("Steenloper", "Arenaria interpres", "steenloper"),
    ("Tafeleend", "Aythya ferina", "tafeleend"),
    ("Tapuit", "Oenanthe oenanthe", "tapuit"),
    ("Toendrarietgans", "Anser serrirostris", "toendrarietgans"),
    ("Velduil", "Asio flammeus", "velduil"),
    ("Watersnip", "Gallinago gallinago", "watersnip"),
    ("Witgat", "Tringa ochropus", "witgat"),
    ("Zwarte Ruiter", "Tringa erythropus", "zwarte_ruiter"),
    
    # Priority 3 - Less common (selection)
    ("Barmsijs", "Acanthis flammea", "barmsijs"),
    ("Beflijster", "Turdus torquatus", "beflijster"),
    ("Bokje", "Lymnocryptes minimus", "bokje"),
    ("Flamingo", "Phoenicopterus roseus", "flamingo"),
    ("Grauwe Kiekendief", "Circus pygargus", "grauwe_kiekendief"),
    ("Grauwe Klauwier", "Lanius collurio", "grauwe_klauwier"),
    ("Klapekster", "Lanius excubitor", "klapekster"),
    ("Kruisbek", "Loxia curvirostra", "kruisbek"),
    ("Oehoe", "Bubo bubo", "oehoe"),
    ("Snor", "Locustella luscinioides", "snor"),
]

# Report how many species are queued for training
print(f"Te trainen: {len(MISSING_SPECIES)} soorten")

In [None]:
# Xeno-canto API v3 met PARALLEL DOWNLOADS
import requests
import asyncio
import aiohttp
import aiofiles
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import json

def search_xeno_canto(scientific_name, voc_type='song', max_results=100):
    """Search the Xeno-canto API v3 for quality-A recordings of one species.

    Args:
        scientific_name: Binomial name such as "Turdus merula". The first two
            whitespace-separated words are lower-cased and used as genus and
            species; anything after them is ignored.
        voc_type: Value for the ``type:`` search tag. Values containing a
            space (e.g. "alarm call") are wrapped in double quotes.
        max_results: Maximum number of recordings to return.

    Returns:
        A list of recording dicts as delivered by the API, truncated to
        ``max_results``. No pagination is requested, so only what the API
        returns for a single request is considered. An empty list is returned
        when the name has fewer than two words, the request fails, the server
        answers with a non-200 status, or the response cannot be parsed.
    """
    parts = scientific_name.split()
    if len(parts) < 2:
        return []

    genus, species = parts[0].lower(), parts[1].lower()
    type_query = f'type:"{voc_type}"' if ' ' in voc_type else f'type:{voc_type}'
    query = f'gen:{genus} sp:{species} {type_query} q:A'

    try:
        # Let requests URL-encode the query (spaces, quotes, colons) instead
        # of interpolating it into the URL by hand.
        response = requests.get(
            'https://xeno-canto.org/api/3/recordings',
            params={'query': query, 'key': XC_API_KEY},
            timeout=30,
        )
    except requests.RequestException as e:
        print(f"  API error: {e}")
        return []

    if response.status_code == 401:
        print(f"  ‚ùå 401 - Check API key!")
        return []
    if response.status_code != 200:
        # Report rate limits / server errors so they are not mistaken for
        # "no recordings exist".
        print(f"  API error: HTTP {response.status_code}")
        return []

    try:
        # `or []` also covers a present-but-null "recordings" field.
        recordings = response.json().get('recordings') or []
    except (ValueError, AttributeError) as e:
        # Body was not JSON, or not a JSON object.
        print(f"  API error: {e}")
        return []

    return recordings[:max_results]

def download_single(args):
    """Download one recording to disk (worker for ThreadPoolExecutor).

    Args:
        args: A ``(recording, output_dir)`` tuple. ``recording`` is a dict
            with an ``'id'`` entry and an optional ``'file'`` URL;
            ``output_dir`` is a ``pathlib.Path`` of an existing directory.

    Returns:
        The path ``output_dir / "XC<id>.mp3"`` when the file already exists or
        was downloaded successfully, otherwise ``None`` (no URL, non-200
        response, network error or write error).
    """
    recording, output_dir = args
    xc_id = recording['id']
    file_url = recording.get('file', '')

    if not file_url:
        return None

    # Normalise protocol-relative and site-relative URLs
    if file_url.startswith('//'):
        file_url = 'https:' + file_url
    elif not file_url.startswith('http'):
        file_url = 'https://xeno-canto.org' + file_url

    output_path = output_dir / f"XC{xc_id}.mp3"

    if output_path.exists():
        return output_path

    # Write to a temporary name and rename afterwards, so an interrupted
    # download never leaves a truncated file that the exists() check above
    # would accept as complete on the next run.
    tmp_path = output_dir / f"XC{xc_id}.mp3.part"
    try:
        response = requests.get(file_url, timeout=60)
        if response.status_code != 200:
            return None
        tmp_path.write_bytes(response.content)
        tmp_path.replace(output_path)
        return output_path
    except (requests.RequestException, OSError):
        # Best-effort download: a failed recording is skipped, not fatal.
        try:
            tmp_path.unlink()
        except OSError:
            pass  # nothing was written, or it cannot be removed
        return None

def download_recordings_parallel(recordings, output_dir, max_workers=8):
    """Fetch several recordings concurrently with a thread pool.

    The output directory is created if needed. Every recording is handed to
    ``download_single``; paths of the downloads that succeeded are returned
    in completion order.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    jobs = [(rec, target_dir) for rec in recordings]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Map each future to the id of the recording it downloads
        pending = {}
        for job in jobs:
            fut = pool.submit(download_single, job)
            pending[fut] = job[0]['id']

        outcomes = (fut.result() for fut in as_completed(pending))
        return [path for path in outcomes if path]

# Smoke test for the API
def test_api():
    """Send one sample query to Xeno-canto API v3 and report the outcome.

    Returns True when the server answers with HTTP 200, otherwise False
    (including when the request raises).
    """
    print("Testing Xeno-canto API v3...")
    sample_query = 'gen:turdus sp:merula type:song q:A'
    endpoint = f'https://xeno-canto.org/api/3/recordings?query={sample_query}&key={XC_API_KEY}'
    try:
        reply = requests.get(endpoint, timeout=10)
        succeeded = reply.status_code == 200
        if succeeded:
            count = reply.json().get('numRecordings', 0)
            print(f"‚úÖ API werkt! ({count} Merel opnames gevonden)")
        else:
            print(f"‚ùå API error: {reply.status_code}")
        return succeeded
    except Exception as err:
        print(f"‚ùå Error: {err}")
        return False

# Only hit the API when a key has been configured
if not XC_API_KEY:
    print("‚ö†Ô∏è Vul eerst je API key in!")
else:
    test_api()

print("\n‚úÖ Parallel download functies geladen")

In [None]:
# Spectrogram generatie met PARALLEL PROCESSING
import librosa
import numpy as np
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from functools import partial

SAMPLE_RATE = 48000      # Hz; audio is resampled to this rate on load
N_MELS = 128             # number of mel bands in each spectrogram
N_FFT = 2048             # FFT window size passed to melspectrogram
HOP_LENGTH = 512         # hop length (samples) passed to melspectrogram
FMIN = 500               # lowest frequency passed to the mel filterbank
FMAX = 8000              # highest frequency passed to the mel filterbank
SEGMENT_DURATION = 3.0   # seconds of audio per spectrogram segment

def process_single_audio(audio_path, max_segments=5, target_shape=(128, 128)):
    """Convert one audio file into normalised mel-spectrogram segments.

    The file is loaded as mono at ``SAMPLE_RATE`` and cut into consecutive
    windows of ``SEGMENT_DURATION`` seconds. A window shorter than half a
    segment is skipped; a window between half and a full segment is
    zero-padded. Each window becomes a dB-scaled mel spectrogram that is
    min-max normalised to [0, 1] and resized to ``target_shape`` if needed.

    Args:
        audio_path: Path (or path-like) of the audio file.
        max_segments: Maximum number of spectrograms to produce per file.
        target_shape: ``(height, width)`` of each returned spectrogram.

    Returns:
        A list of 2-D arrays of shape ``target_shape``; empty when the file
        cannot be loaded or is too short.
    """
    try:
        audio, _ = librosa.load(str(audio_path), sr=SAMPLE_RATE, mono=True)
    except Exception:
        # Deliberate best-effort: an unreadable or corrupt file contributes no
        # data. `Exception` rather than a bare except, so KeyboardInterrupt
        # and SystemExit still propagate.
        return []

    target_shape = tuple(target_shape)
    segment_samples = int(SEGMENT_DURATION * SAMPLE_RATE)
    spectrograms = []

    for start in range(0, len(audio), segment_samples):
        if len(spectrograms) >= max_segments:
            break

        segment = audio[start:start + segment_samples]
        if len(segment) < segment_samples // 2:
            continue

        if len(segment) < segment_samples:
            # Zero-pad the tail so every segment has the same length
            segment = np.pad(segment, (0, segment_samples - len(segment)))

        mel_spec = librosa.feature.melspectrogram(
            y=segment, sr=SAMPLE_RATE,
            n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH,
            fmin=FMIN, fmax=FMAX
        )

        mel_db = librosa.power_to_db(mel_spec, ref=np.max)
        # The epsilon keeps a constant (e.g. silent) segment from dividing by zero
        mel_norm = (mel_db - mel_db.min()) / (mel_db.max() - mel_db.min() + 1e-8)

        if mel_norm.shape != target_shape:
            # Imported lazily so scikit-image is only needed when resizing
            from skimage.transform import resize
            mel_norm = resize(mel_norm, target_shape, anti_aliasing=True)

        spectrograms.append(mel_norm)

    return spectrograms

def process_audio_files_parallel(audio_paths, max_segments=3, max_workers=4):
    """Turn several audio files into spectrograms using a thread pool.

    Each path is passed to ``process_single_audio`` with the given
    ``max_segments``; the per-file lists are concatenated in input order.
    """
    worker = partial(process_single_audio, max_segments=max_segments)

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        per_file = list(pool.map(worker, audio_paths))

    # Flatten the list of per-file lists into one list of spectrograms
    return [spec for file_specs in per_file for spec in file_specs]

print("‚úÖ Parallel spectrogram functies geladen")

In [None]:
# CNN Model met Mixed Precision Training (Stabiele versie)
import torch
import torch.nn as nn

# Prefer the unified torch.amp API (avoids deprecation warnings). Some torch
# builds expose torch.amp.autocast but not torch.amp.GradScaler, so checking
# for autocast alone is not enough: attempt the import and fall back to the
# CUDA-specific module when either name is missing.
try:
    from torch.amp import autocast, GradScaler
    autocast_device = 'cuda'  # torch.amp.autocast takes an explicit device_type
except ImportError:
    from torch.cuda.amp import autocast, GradScaler
    autocast_device = None    # torch.cuda.amp.autocast takes no device_type

class VocalizationCNN(nn.Module):
    """Small CNN that classifies single-channel spectrogram images.

    Three identical stages (3x3 conv, batch norm, ReLU, 2x2 max-pool,
    2-D dropout) with 32, 64 and 128 channels feed a two-layer dense head.
    Layer order inside ``features`` and ``classifier`` defines the
    ``state_dict`` keys used by saved checkpoints.
    """

    def __init__(self, input_shape=(128, 128), num_classes=3):
        super().__init__()

        conv_layers = []
        in_channels = 1
        for out_channels in (32, 64, 128):
            conv_layers.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(),
                nn.MaxPool2d(2),
                nn.Dropout2d(0.25),
            ])
            in_channels = out_channels
        self.features = nn.Sequential(*conv_layers)

        # Three 2x2 poolings shrink each spatial dimension by a factor of 8
        pooled_h = input_shape[0] // 8
        pooled_w = input_shape[1] // 8
        flatten_size = 128 * pooled_h * pooled_w

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flatten_size, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch shaped (N, 1, H, W)."""
        return self.classifier(self.features(x))

# Train on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Gradient scaler is only created when mixed precision is enabled
scaler = None
if USE_AMP:
    scaler = GradScaler()

print(f"‚úÖ Model klaar voor {device}")
if USE_AMP:
    print(f"‚úÖ Mixed Precision (FP16) actief")

In [None]:
# GEOPTIMALISEERDE Training Pipeline (met label remapping fix)
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import time

def train_species_optimized(dutch_name, scientific_name, dirname):
    """Download audio, build spectrograms and train one CNN for a species.

    For each vocalization type (song, call, alarm call) the function searches
    Xeno-canto, downloads the recordings below
    ``{DRIVE_BASE}/audio/{dirname}/<type>`` and converts them to spectrograms.
    A ``VocalizationCNN`` is then trained on the classes that yielded data,
    and the weights with the best validation accuracy are saved to
    ``{DRIVE_BASE}/models/{dirname}_cnn_{VERSION}.pt``.

    Args:
        dutch_name: Dutch species name (used for logging and stored in the
            checkpoint).
        scientific_name: Binomial name used for the Xeno-canto query.
        dirname: Directory / file-name stem for this species.

    Returns:
        ``(accuracy, status)``. ``accuracy`` is the best validation accuracy
        on success, otherwise ``None``. ``status`` is ``'success'``,
        ``'insufficient_data'``, ``'single_class'``, ``'training_failed'`` or
        a string starting with ``'cuda_error: '``.

    Raises:
        RuntimeError: Runtime errors during training whose message does not
            mention CUDA are re-raised.
    """
    print(f"\n{'='*60}")
    print(f"üê¶ {dutch_name} ({scientific_name})")
    print(f"{'='*60}")

    start_time = time.time()
    audio_dir = Path(f'{DRIVE_BASE}/audio/{dirname}')

    X_all, y_all = [], []
    voc_types = [('song', 0), ('call', 1), ('alarm call', 2)]

    # PHASE 1: download and convert the audio, one vocalization type at a time
    for voc_type, label in voc_types:
        print(f"  üì• {voc_type}...", end=' ')
        recordings = search_xeno_canto(scientific_name, voc_type, max_results=MAX_RECORDINGS_PER_TYPE)

        if not recordings:
            print("0 gevonden")
            continue

        type_dir = audio_dir / voc_type.replace(' ', '_')

        audio_files = download_recordings_parallel(
            recordings[:MAX_RECORDINGS_PER_TYPE],
            type_dir,
            max_workers=MAX_CONCURRENT_DOWNLOADS
        )
        print(f"{len(audio_files)} gedownload", end=' ')

        if audio_files:
            specs = process_audio_files_parallel(audio_files, max_segments=3, max_workers=NUM_WORKERS)
            X_all.extend(specs)
            y_all.extend([label] * len(specs))
            print(f"‚Üí {len(specs)} specs")
        else:
            print()

    # Require a minimum amount of data before training
    if len(X_all) < 30:
        print(f"  ‚ö†Ô∏è Te weinig data ({len(X_all)}), overslaan")
        return None, 'insufficient_data'

    X = np.array(X_all)
    y = np.array(y_all)

    all_class_names = ['song', 'call', 'alarm']

    # The stratified split below needs at least two samples per class and
    # raises ValueError otherwise, so classes with a single spectrogram are
    # dropped instead of failing the whole species.
    present, present_counts = np.unique(y, return_counts=True)
    too_small = present[present_counts < 2]
    if len(too_small) > 0:
        dropped = [all_class_names[int(label)] for label in too_small]
        print(f"  Klasse(n) met minder dan 2 specs genegeerd: {dropped}")
        keep = ~np.isin(y, too_small)
        X, y = X[keep], y[keep]

    # Remap labels to 0..num_classes-1: when e.g. only call (1) and alarm (2)
    # are present, the model must see labels 0 and 1.
    unique_labels = np.unique(y)
    num_classes = len(unique_labels)

    if num_classes < 2:
        print(f"  ‚ö†Ô∏è Slechts 1 klasse, overslaan")
        return None, 'single_class'

    # Plain Python ints as keys: the map is stored in the checkpoint and
    # should not require numpy scalar types to be unpickled.
    label_map = {int(old_label): new_label for new_label, old_label in enumerate(unique_labels)}
    y_remapped = np.array([label_map[int(label)] for label in y])

    # Class names of this particular model, in remapped label order
    class_names = [all_class_names[int(label)] for label in unique_labels]

    _, counts = np.unique(y_remapped, return_counts=True)
    class_dist = {name: int(count) for name, count in zip(class_names, counts)}
    print(f"  üìä Data: {len(X)} specs, klassen: {class_dist}")

    # Train/validation split.
    # NOTE(review): the split is per spectrogram, so segments of one recording
    # can end up in both sets; the reported accuracy may be optimistic.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_remapped, test_size=0.2, random_state=42, stratify=y_remapped
    )

    train_dataset = TensorDataset(
        torch.FloatTensor(X_train).unsqueeze(1),
        torch.LongTensor(y_train)
    )
    val_dataset = TensorDataset(
        torch.FloatTensor(X_val).unsqueeze(1),
        torch.LongTensor(y_val)
    )

    # Pinned memory only helps host-to-GPU copies; skip it on CPU runs
    pin_memory = device.type == 'cuda'
    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=pin_memory
    )
    val_loader = DataLoader(
        val_dataset, batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS, pin_memory=pin_memory
    )

    # Model sized for the classes actually present
    model = VocalizationCNN(num_classes=num_classes).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

    best_acc = 0
    best_state = None

    try:
        for epoch in range(EPOCHS):
            model.train()

            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device, non_blocking=True)
                y_batch = y_batch.to(device, non_blocking=True)

                optimizer.zero_grad()

                if USE_AMP and scaler is not None:
                    # The unified API needs device_type; the legacy one takes none
                    amp_context = (
                        autocast(device_type=autocast_device)
                        if autocast_device else autocast()
                    )
                    with amp_context:
                        outputs = model(X_batch)
                        loss = criterion(outputs, y_batch)
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)
                    loss.backward()
                    optimizer.step()

            # Validate
            model.eval()
            val_correct = 0
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch = X_batch.to(device, non_blocking=True)
                    y_batch = y_batch.to(device, non_blocking=True)
                    outputs = model(X_batch)
                    val_correct += (outputs.argmax(1) == y_batch).sum().item()

            val_acc = val_correct / len(y_val)
            if val_acc > best_acc:
                best_acc = val_acc
                # Keep a CPU copy so later epochs cannot overwrite it
                best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    except RuntimeError as e:
        if 'CUDA' in str(e):
            print(f"  ‚ö†Ô∏è CUDA error, cleanup...")
            torch.cuda.empty_cache()
            gc.collect()
            if best_state is None:
                return None, f'cuda_error: {str(e)[:30]}'
            # Otherwise fall through and save the best state reached so far
        else:
            raise

    if best_state is None:
        print(f"  ‚ö†Ô∏è Training mislukt")
        return None, 'training_failed'

    # Save the weights together with the metadata needed for inference
    model_path = Path(f'{DRIVE_BASE}/models/{dirname}_cnn_{VERSION}.pt')
    torch.save({
        'model_state_dict': best_state,
        'num_classes': num_classes,
        'class_names': class_names,
        'label_map': label_map,
        'accuracy': best_acc,
        'species_name': dutch_name,
        'scientific_name': scientific_name,
        'version': VERSION,
        'class_distribution': class_dist
    }, model_path)

    # Release GPU memory after every species
    del model, train_loader, val_loader
    torch.cuda.empty_cache()
    gc.collect()

    elapsed = time.time() - start_time
    print(f"  ‚úÖ {model_path.name} | Acc: {best_acc:.1%} | {elapsed:.0f}s")

    return best_acc, 'success'

print("‚úÖ Training pipeline met label remapping geladen")

In [None]:
# üöÄ TRAIN ALLE SOORTEN (Geoptimaliseerd)
from datetime import datetime
import pandas as pd

results = []
start_all = time.time()

print(f"{'='*60}")
print(f"üöÄ EMSN Vocalization Training - Batch 2")
print(f"{'='*60}")
print(f"Start: {datetime.now().strftime('%H:%M:%S')}")
print(f"Soorten: {len(MISSING_SPECIES)}")
print(f"GPU: {GPU_TYPE} ({torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'})")
print(f"Batch size: {BATCH_SIZE} | Workers: {NUM_WORKERS} | AMP: {USE_AMP}")
print(f"{'='*60}")

successful = 0
failed = 0

for i, (dutch, scientific, dirname) in enumerate(MISSING_SPECIES):
    progress = f"[{i+1}/{len(MISSING_SPECIES)}]"

    # Train one species; any exception is turned into an error status so the
    # remaining species still get processed.
    try:
        acc, status = train_species_optimized(dutch, scientific, dirname)
    except Exception as e:
        print(f"  ‚ùå Error: {str(e)[:60]}")
        acc, status = None, f'error: {str(e)[:40]}'

    results.append({
        'species': dutch,
        'scientific': scientific,
        'accuracy': acc,
        'status': status
    })

    if status == 'success':
        successful += 1
    else:
        failed += 1

    # Every 10 species: write a checkpoint CSV and print an ETA
    if (i + 1) % 10 == 0:
        pd.DataFrame(results).to_csv(
            f'{DRIVE_BASE}/training_batch2_checkpoint.csv', index=False
        )
        elapsed = time.time() - start_all
        remaining = (elapsed / (i + 1)) * (len(MISSING_SPECIES) - i - 1)
        print(f"\n  üíæ Checkpoint | ‚úÖ {successful} | ‚ùå {failed} | ETA: {remaining/60:.0f}min\n")

# Final summary
elapsed_all = time.time() - start_all
print(f"\n{'='*60}")
print(f"üèÅ TRAINING VOLTOOID!")
print(f"{'='*60}")
print(f"Tijd: {elapsed_all/60:.1f} minuten")
print(f"Succesvol: {successful}/{len(MISSING_SPECIES)}")
print(f"Mislukt: {failed}/{len(MISSING_SPECIES)}")

In [None]:
# üìä Resultaten Samenvatting
import pandas as pd

# Persist the full results table, then summarise it
df = pd.DataFrame(results)
df.to_csv(f'{DRIVE_BASE}/training_results_batch2_{VERSION}.csv', index=False)

successful_df = df.loc[df['status'] == 'success']

print(f"\n{'='*60}")
print(f"üìä RESULTATEN BATCH 2")
print(f"{'='*60}")
print(f"Getraind: {len(successful_df)}/{len(df)} soorten")

if len(successful_df) > 0:
    accuracies = successful_df['accuracy']
    print(f"\nAccuracy statistieken:")
    print(f"  Gemiddeld: {accuracies.mean():.1%}")
    print(f"  Minimum:   {accuracies.min():.1%}")
    print(f"  Maximum:   {accuracies.max():.1%}")

    print(f"\nüèÜ Top 10 beste modellen:")
    top10 = successful_df.nlargest(10, 'accuracy')[['species', 'accuracy']]
    for species, accuracy in zip(top10['species'], top10['accuracy']):
        print(f"  {accuracy:.1%} - {species}")

# Everything that did not end in 'success', with its status text
failed_df = df.loc[df['status'] != 'success']
if len(failed_df) > 0:
    print(f"\n‚ö†Ô∏è Mislukte soorten ({len(failed_df)}):")
    for species, status in zip(failed_df['species'], failed_df['status']):
        print(f"  {species}: {status}")

In [None]:
# üì• DOWNLOAD MODELLEN (belangrijk - data verdwijnt na sessie!)
from pathlib import Path
from google.colab import files
import shutil

# Collect every saved model checkpoint (*.pt) from the models directory
models_dir = Path(f'{DRIVE_BASE}/models')
models = sorted(models_dir.glob('*.pt'))

print(f"{'='*60}")
print(f"üìÅ GETRAINDE MODELLEN")
print(f"{'='*60}")
print(f"Totaal: {len(models)} modellen")

if models:
    # Compute the total size in MB
    total_size = sum(m.stat().st_size for m in models) / 1e6
    print(f"Grootte: {total_size:.1f} MB")
    
    # Create the ZIP archive of the whole models directory
    print(f"\nüì¶ ZIP bestand maken...")
    zip_path = '/content/emsn_models_batch2.zip'
    shutil.make_archive('/content/emsn_models_batch2', 'zip', models_dir)
    zip_size = Path(zip_path).stat().st_size / 1e6
    print(f"‚úÖ {zip_path} ({zip_size:.1f} MB)")
    
    print(f"\n{'='*60}")
    print(f"üì• DOWNLOAD OPTIES")
    print(f"{'='*60}")
    print("""
OPTIE 1: Download ZIP (klik op link hieronder)
""")
    # Trigger the browser download of the ZIP through Colab
    files.download(zip_path)
    
    print("""
OPTIE 2: Kopieer naar Pi via terminal
   - Open nieuwe terminal in Colab
   - Run: scp /content/emsn_models_batch2.zip ronny@192.168.1.178:~/
   
OPTIE 3: Upload naar Google Drive (als je ruimte hebt vrijgemaakt)
   from google.colab import drive
   drive.mount('/content/drive')
   !cp /content/emsn_models_batch2.zip /content/drive/MyDrive/
""")
else:
    print("‚ö†Ô∏è Geen modellen gevonden. Run eerst de training.")