# EMSN Nestkast Species Training - Januari 2026

**Probleem:** Het AI-model herkent een slapende koolmees niet (92.7% 'leeg' i.p.v. 'koolmees')

**Oorzaak:** Model getraind met slechts 112 samples

**Oplossing:** Hertrainen met 814+ samples:
- 275 koolmees screenshots (midden nestkast dec/jan)
- 539 leeg screenshots (voor+achter nestkast dec/jan)

## Data staat klaar op Google Drive!
Gewoon alle cellen runnen - geen upload nodig.

In [None]:
# GPU check: report the installed PyTorch version and whether CUDA is usable
import torch

print(f"PyTorch: {torch.__version__}")
_cuda_ok = torch.cuda.is_available()
print(f"CUDA: {_cuda_ok}")
if _cuda_ok:
    # Device index 0 is the first visible CUDA device
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 1. Mount Google Drive & Load Data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# The training data is already on Drive, one sub-folder per class
import os
DATA_DIR = '/content/drive/MyDrive/EMSN/nestbox_training'

print("\n=== Training Data ===")
for entry in os.listdir(DATA_DIR):
    entry_path = os.path.join(DATA_DIR, entry)
    if not os.path.isdir(entry_path):
        # Only directories are class folders; loose files are ignored
        continue
    n_jpg = sum(1 for name in os.listdir(entry_path) if name.endswith('.jpg'))
    print(f"{entry}: {n_jpg} afbeeldingen")

In [None]:
# Config
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import transforms, models
from PIL import Image
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

# Hyperparameters
INPUT_SIZE = 224       # images are resized to INPUT_SIZE x INPUT_SIZE pixels
BATCH_SIZE = 32        # samples per DataLoader batch
EPOCHS = 30            # upper bound on the number of training epochs
LEARNING_RATE = 1e-3   # initial learning rate handed to the optimizer

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Device: {device}")

## 2. Dataset & Preprocessing

In [None]:
class NestboxDataset(Dataset):
    """Image-folder dataset in which every sub-directory of ``data_dir`` is a class.

    Class names are the sorted sub-directory names; a class's integer label is
    its position in that sorted list. Only files matching ``*.jpg`` directly
    inside a class directory are used.

    Args:
        data_dir: Root directory containing one sub-directory per class.
        transform: Optional callable applied to each RGB PIL image in
            ``__getitem__``.
    """

    def __init__(self, data_dir, transform=None):
        self.data_dir = Path(data_dir)
        self.transform = transform
        self.samples = []
        self.classes = sorted([d.name for d in self.data_dir.iterdir() if d.is_dir()])
        self.class_to_idx = {c: i for i, c in enumerate(self.classes)}

        per_class_count = {}
        for cls_name in self.classes:
            label = self.class_to_idx[cls_name]
            # glob() yields files in arbitrary filesystem order; sorting keeps
            # sample indices (and any index-based split) stable between runs.
            img_paths = sorted((self.data_dir / cls_name).glob('*.jpg'))
            self.samples.extend((img_path, label) for img_path in img_paths)
            per_class_count[cls_name] = len(img_paths)

        print(f"Classes: {self.classes}")
        print(f"Total: {len(self.samples)} samples")
        for cls in self.classes:
            print(f"  {cls}: {per_class_count[cls]}")

    def __len__(self):
        """Return the number of (image path, label) samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(image, label)``.

        The image is opened, converted to RGB and, when a transform was given,
        passed through it. ``label`` is the integer class index.
        """
        img_path, label = self.samples[idx]
        # The context manager releases the file handle as soon as the pixels
        # have been decoded into the converted copy.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)
        return image, label

# Transforms
import copy
from torch.utils.data import Subset

# Training: resize plus light augmentation (flip, rotation, brightness/contrast
# jitter), then tensor conversion and ImageNet channel normalisation.
train_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Validation: deterministic resize + normalisation only, no augmentation.
val_transform = transforms.Compose([
    transforms.Resize((INPUT_SIZE, INPUT_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# Load dataset
full_dataset = NestboxDataset(DATA_DIR, transform=train_transform)
classes = full_dataset.classes

# A shallow copy shares the same sample list but carries the validation
# transform. Without it the validation subset would inherit the random
# training augmentations and validation metrics would be noisy.
val_view = copy.copy(full_dataset)
val_view.transform = val_transform

# Split 80/20 with a fixed seed so the same split is drawn on every run
SPLIT_SEED = 42
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, _val_split = random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
# Re-point the validation indices at the un-augmented view of the same samples
val_dataset = Subset(val_view, _val_split.indices)

print(f"\nTrain: {len(train_dataset)}, Val: {len(val_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

In [None]:
# Preview the first eight training samples in a 2x4 grid
fig, axes = plt.subplots(2, 4, figsize=(12, 6))
norm_std = np.array([0.229, 0.224, 0.225])
norm_mean = np.array([0.485, 0.456, 0.406])
for i, (img, label) in enumerate(train_dataset):
    if i >= 8:
        break
    panel = axes.flat[i]
    # Move the channel axis last for imshow, then undo the Normalize step
    pixels = img.permute(1, 2, 0).numpy() * norm_std + norm_mean
    panel.imshow(np.clip(pixels, 0, 1))
    panel.set_title(classes[label])
    panel.axis('off')
plt.tight_layout()
plt.show()

## 3. Model Training

In [None]:
def create_model(num_classes):
    """Build an ImageNet-pretrained MobileNetV2 with a new classification head.

    All but the last 20 parameter tensors of the pretrained network are frozen.
    The slice is taken before the head is swapped, so it is counted on the
    original torchvision model.
    NOTE(review): that tail presumably includes the original classifier's own
    tensors, which are discarded below - confirm how many backbone tensors are
    meant to stay trainable.

    Args:
        num_classes: Number of output logits of the new head.

    Returns:
        The modified model. It is not moved to any device here.
    """
    net = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)

    # Freeze everything except the trailing 20 parameter tensors
    pretrained_params = list(net.parameters())
    for frozen in pretrained_params[:-20]:
        frozen.requires_grad = False

    # Replace the stock classifier with a small two-layer head
    head_in = net.classifier[1].in_features
    head_layers = [
        nn.Dropout(p=0.3),
        nn.Linear(head_in, 128),
        nn.ReLU(),
        nn.Dropout(p=0.2),
        nn.Linear(128, num_classes),
    ]
    net.classifier = nn.Sequential(*head_layers)
    return net

# Instantiate the network on the selected device, plus loss, optimizer and LR schedule
model = create_model(len(classes))
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Multiplies the learning rate by 0.5 when the value passed to step() plateaus (patience=3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)

print(f"Model: MobileNetV2, Classes: {classes}")

In [None]:
def train_epoch(model, loader, criterion, optimizer):
    """Run one optimisation pass over ``loader`` and report loss and accuracy.

    Args:
        model: Network to train; switched to train mode here.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable. Assumed to return the mean loss of the batch
            (the default ``reduction`` of ``nn.CrossEntropyLoss``).
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(mean_loss, accuracy_percent)`` where the loss is averaged over
        samples, not over batches.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.train()
    loss_sum, correct, total = 0.0, 0, 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        # Weight each batch mean by its size so a smaller final batch does not
        # count as much as a full one in the epoch average.
        loss_sum += loss.item() * batch_size
        _, predicted = outputs.max(1)
        total += batch_size
        correct += predicted.eq(labels).sum().item()
    if total == 0:
        raise ValueError("train_epoch: loader produced no samples")
    return loss_sum / total, 100. * correct / total

def validate(model, loader, criterion):
    """Evaluate ``model`` on ``loader`` without updating any weights.

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable of ``(images, labels)`` batches.
        criterion: Loss callable. Assumed to return the mean loss of the batch
            (the default ``reduction`` of ``nn.CrossEntropyLoss``).

    Returns:
        ``(mean_loss, accuracy_percent)`` where the loss is averaged over
        samples, not over batches.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    model.eval()
    loss_sum, correct, total = 0.0, 0, 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            # Weight each batch mean by its size so a smaller final batch does
            # not count as much as a full one in the reported loss.
            loss_sum += loss.item() * batch_size
            _, predicted = outputs.max(1)
            total += batch_size
            correct += predicted.eq(labels).sum().item()
    if total == 0:
        raise ValueError("validate: loader produced no samples")
    return loss_sum / total, 100. * correct / total

In [None]:
# Training!
# Runs for at most EPOCHS epochs. A checkpoint is written whenever validation
# accuracy beats the best value so far; training stops early once PATIENCE
# consecutive epochs have passed without such an improvement.
PATIENCE = 7
best_val_acc = 0
patience_counter = 0
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

# The model is saved on Drive
MODEL_SAVE_PATH = '/content/drive/MyDrive/EMSN/nestbox_training/nestbox_species_model.pt'

print(f"Training {EPOCHS} epochs...")
print("=" * 70)

for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc = validate(model, val_loader, criterion)
    # The LR schedule follows the validation loss
    scheduler.step(val_loss)

    epoch_metrics = (
        ('train_loss', train_loss),
        ('train_acc', train_acc),
        ('val_loss', val_loss),
        ('val_acc', val_acc),
    )
    for metric_name, metric_value in epoch_metrics:
        history[metric_name].append(metric_value)

    improved = val_acc > best_val_acc
    if not improved:
        patience_counter += 1
        marker = ''
    else:
        best_val_acc = val_acc
        patience_counter = 0
        # Weights plus the metadata stored alongside them in the checkpoint
        checkpoint_payload = {
            'model_state_dict': model.state_dict(),
            'classes': classes,
            'num_classes': len(classes),
            'architecture': 'mobilenet_v2',
            'input_size': INPUT_SIZE,
            'best_val_acc': best_val_acc,
            'train_samples': len(train_dataset),
            'val_samples': len(val_dataset),
            'epochs': epoch + 1,
            'trained_at': str(np.datetime64('now')),
            'model_type': 'species_detector',
            'supports_day_night': True
        }
        torch.save(checkpoint_payload, MODEL_SAVE_PATH)
        marker = ' *BEST* (saved to Drive)'

    print(f"Epoch {epoch+1:2d}/{EPOCHS}: Train Loss={train_loss:.4f} Acc={train_acc:.1f}% | Val Loss={val_loss:.4f} Acc={val_acc:.1f}%{marker}")

    if patience_counter >= PATIENCE:
        print(f"\nEarly stopping at epoch {epoch+1}")
        break

print("=" * 70)
print(f"Best validation accuracy: {best_val_acc:.1f}%")
print(f"Model saved to: {MODEL_SAVE_PATH}")

In [None]:
# Plot loss and accuracy curves for the train and validation splits side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (ax1, 'loss', 'Loss', 'Loss'),
    (ax2, 'acc', 'Accuracy %', 'Accuracy'),
)
for panel, metric, y_label, panel_title in panels:
    panel.plot(history[f'train_{metric}'], label='Train')
    panel.plot(history[f'val_{metric}'], label='Val')
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.legend()
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()

## 4. Evaluatie

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Load the best checkpoint written by the training loop. map_location places
# the tensors on the active device even if that differs from the one they were
# saved from.
checkpoint = torch.load(MODEL_SAVE_PATH, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Collect predictions and ground-truth labels over the whole validation set
all_preds, all_labels = [], []
with torch.no_grad():
    for images, labels in val_loader:
        outputs = model(images.to(device))
        _, predicted = outputs.max(1)
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(labels.numpy())

# Pass the full label set explicitly so the matrix axes and the report rows
# always line up with `classes`, even if a class is absent from the validation
# split or is never predicted.
label_ids = list(range(len(classes)))

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=label_ids)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted'); plt.ylabel('Actual'); plt.title('Confusion Matrix')
plt.show()

print("\nClassification Report:")
print(classification_report(
    all_labels,
    all_preds,
    labels=label_ids,
    target_names=classes,
    zero_division=0,  # report 0 instead of warning when a class has no samples/predictions
))

## 5. Model Info & Installatie

In [None]:
# Model info: print the checkpoint metadata, file size and install steps
import os

banner = "=" * 50
print(banner)
print("MODEL INFO")
print(banner)
for meta_key, meta_value in checkpoint.items():
    if meta_key == 'model_state_dict':
        # The weights themselves are not printed, only the metadata
        continue
    print(f"{meta_key}: {meta_value}")
size_mb = os.path.getsize(MODEL_SAVE_PATH) / 1024**2
print(f"\nFile: {size_mb:.1f} MB")
print(f"Location: {MODEL_SAVE_PATH}")

print("\n" + banner)
print("INSTALLATIE OP PI")
print(banner)
print("""
Model staat op Google Drive. Op de Pi uitvoeren:

1. Sync model naar Pi:
   rclone copy gdrive:EMSN/nestbox_training/nestbox_species_model.pt /tmp/

2. Kopieer naar NAS:
   sudo cp /tmp/nestbox_species_model.pt /mnt/nas-birdnet-archive/nestbox/models/

3. Test:
   /home/ronny/emsn2/venv/bin/python3 \\
     /home/ronny/emsn2/scripts/nestbox/nestbox_occupancy_detector.py \\
     --all --verbose

4. Check Grafana dashboard
""")