Im Notebook exploration.ipynb wird das CV Model trainiert.

Es speichert das trainierte Modell als 'resnet34-stage-final' und 'export_resnet34.pkl'.

In [None]:
# train_fastai.py
# Training eines Bildklassifikations-Modells mit fastai
# Train-Bilder: data/images/train, Validierungs-Bilder: data/images/val

from pathlib import Path
import sys
import subprocess

# fastai installieren falls nicht vorhanden
try:
    from fastai.vision.all import *
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fastai"])
    from fastai.vision.all import *

import torch
import csv

def main():
    path = Path("/workspaces/PS9-Boston-Dynamic-Mobile-CV-Testing-Systems") / "data" / "images"
    print("Using images path:", path)
    train_folder = path/"train"
    valid_folder = path/"val"

    if not train_folder.exists() or not valid_folder.exists():
        raise FileNotFoundError(f"Benötigte Ordner nicht gefunden: {train_folder} oder {valid_folder}")
    
    # Device Auswahl: GPU wenn verfügbar
    try:
        dev = defaults.device
    except Exception:
        dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Verfügbare Device(s):", dev)
    # DataLoaders: Ordnerstruktur erwartet pro Klasse einen Unterordner
    
    train_csv = path/'train'/'train_labels.csv'
    valid_csv = path/'val'/'val_labels.csv'

    def read_labels(csv_path, subfolder):
        items = []
        labels = {}
        if not csv_path.exists():
            return items, labels
        with csv_path.open('r', newline='') as f:
            reader = csv.reader(f, delimiter=';')
            for row in reader:
                if not row:
                    continue
                name = row[0].strip()
                label = row[1].strip() if len(row) > 1 else ''
                if not name:
                    continue
                p = Path(name)
                # normalize: relative names -> under images/<subfolder>
                if not p.is_absolute():
                    if p.parent == Path('.'):
                        p = path / subfolder / p.name
                    else:
                        p = path / p
                p = p.resolve()
                items.append(p)
                labels[p] = label
        return items, labels

    train_items, train_labels = read_labels(train_csv, 'train')
    valid_items, valid_labels = read_labels(valid_csv, 'val')

    # Diagnose / Existenzprüfung
    def _check_items(items, kind):
        missing = [p for p in items if not p.exists()]
        print(f"{kind}: parsed {len(items)} entries, missing {len(missing)}")
        if missing:
            print("Fehlende Beispiel-Dateien:", missing[:5])
        return missing
    missing_train = _check_items(train_items, "train_csv")
    missing_valid = _check_items(valid_items, "valid_csv")
    if missing_train or missing_valid:
        raise FileNotFoundError("Einige in CSV gelistete Bilddateien wurden nicht gefunden (siehe Ausgabe).")

    # Wenn CSVs erfolgreich eingelesen wurden: DataBlock mit diesen Items verwenden
    if train_items and valid_items:
        all_items = train_items + valid_items
        labels_map = {**train_labels, **valid_labels}
        valid_set = set(valid_items)

        dblock = DataBlock(
            blocks=(ImageBlock, CategoryBlock),
            get_items=lambda _=None: all_items,
            get_y=lambda o: labels_map[o],
            splitter=FuncSplitter(lambda o: o in valid_set),
            item_tfms=Resize(460),
            batch_tfms=aug_transforms(size=224)
        )
        dls = dblock.dataloaders(path, bs=64)
    else:
        # fallback wird weiter unten behandelt
        pass

    if train_items and valid_items:
        all_items = train_items + valid_items
        labels_map = {**train_labels, **valid_labels}
        valid_set = set(valid_items)

        dblock = DataBlock(
            blocks=(ImageBlock, CategoryBlock),
            get_items=lambda p: all_items,
            get_y=lambda o: labels_map[o],
            splitter=FuncSplitter(lambda o: o in valid_set),
            item_tfms=Resize(460),
            batch_tfms=aug_transforms(size=224)
        )
        dls = dblock.dataloaders(path, bs=64)
    else:
        # Fallback: vorhandene Ordnerstruktur verwenden
        # Robustes get_image_files: versuche fastai-Import, sonst Fallback via Path.rglob
        try:
            from fastai.vision.all import get_image_files
        except Exception:
            def get_image_files(p):
                exts = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tif', '.tiff'}
                return [f for f in Path(p).rglob('*') if f.suffix.lower() in exts]

        # Diagnose: welche Bilddateien werden gefunden?
        train_imgs = list(get_image_files(train_folder))
        valid_imgs = list(get_image_files(valid_folder))
        print(f"Found {len(train_imgs)} train images, {len(valid_imgs)} valid images")
        if len(train_imgs) == 0 or len(valid_imgs) == 0:
            print("WARN: train/val folders contain no images (check extensions / nested structure).")

        # Robustes DataBlock: erwartet Struktur repo/data/images/{train,val}/{class}/*.jpg
        # Labels werden aus dem parent-Ordner (Klassenname) genommen; Split nach Grandparent (train/val).
        dblock = DataBlock(
            blocks=(ImageBlock, CategoryBlock),
            get_items=get_image_files,
            get_y=lambda o: o.parent.name,                 # Klasse ist Ordner direkt über dem Bild
            splitter=GrandparentSplitter(train_name='train', valid_name='val'),
            item_tfms=Resize(460),
            batch_tfms=aug_transforms(size=224)
        )
        dls = dblock.dataloaders(path, bs=64)

    print("=== DATASET / DATALOADER DIAGNOSTICS ===")
    print("dls.vocab:", getattr(dls, "vocab", None))
    print("n_train:", len(dls.train_ds), "n_valid:", len(dls.valid_ds))
    from collections import Counter
    def sample_label_counts(ds, n=500):
        cnt = Counter()
        for i in range(min(len(ds), n)):
            _, y = ds[i]   # returns (x, y)
            if hasattr(y, "item"):
                cnt[int(y.item())] += 1
            else:
                cnt[str(y)] += 1
        return cnt
    print("train label counts (sample):", sample_label_counts(dls.train_ds))
    print("valid label counts (sample):", sample_label_counts(dls.valid_ds))

    # Predictions distribution on validation set
    try:
        preds, targs = learn.get_preds(dl=dls.valid)
        pred_classes = preds.argmax(dim=1)
        print("valid pred dist (top classes):", Counter(pred_classes.numpy()).most_common()[:10])
        print("valid targ dist (top classes):", Counter(targs.numpy()).most_common()[:10])
    except Exception as e:
        print("Could not run get_preds():", e)
    print("=== end diagnostics ===")


    # Model: ResNet34 als Startpunkt
    learn = cnn_learner(dls, resnet34, metrics=[accuracy])

    # Optional: learning rate Suche (auskommentieren, wenn Probleme)
    try:
        lr_suggestion = learn.lr_find(suggest_funcs=(valley,))
        print("LR-Find suggestion:", lr_suggestion)
    except Exception:
        # Falls lr_find Probleme macht (Headless/Plot), verwenden wir Default
        print("lr_find fehlgeschlagen oder nicht möglich; Verwende Standard lr 3e-3")
        lr_suggestion = 3e-3

    # Erste Trainingsphase
    learn.fine_tune(5, base_lr=3e-3)

    # Feinabstimmung: unfreeze und niedrigeres LR für alle Schichten
    learn.unfreeze()
    learn.fit_one_cycle(3, lr_max=slice(1e-6, 1e-4))

    # Modell speichern / exportieren für Inferenz
    learn.save("resnet34-stage-final")
    learn.export("export_resnet34.pkl")  # kann mit load_learner wieder geladen werden

    print("Training abgeschlossen. Modell gespeichert als 'resnet34-stage-final' und 'export_resnet34.pkl'.")

if __name__ == "__main__":
    main()

Using images path: /workspaces/PS9-Boston-Dynamic-Mobile-CV-Testing-Systems/data/images
Verfügbare Device(s): cpu
train_csv: parsed 35 entries, missing 0
valid_csv: parsed 14 entries, missing 0
=== DATASET / DATALOADER DIAGNOSTICS ===
dls.vocab: ['analog', 'digital_ac', 'digital_temp']
n_train: 35 n_valid: 14
train label counts (sample): Counter({0: 17, 1: 17, 2: 1})
valid label counts (sample): Counter({1: 8, 0: 6})
Could not run get_preds(): cannot access local variable 'learn' where it is not associated with a value
=== end diagnostics ===


  warn("`cnn_learner` has been renamed to `vision_learner` -- please update your code")


lr_find fehlgeschlagen oder nicht möglich; Verwende Standard lr 3e-3


epoch,train_loss,valid_loss,accuracy,time
0,,1.401462,0.571429,00:02


  warn("Your generator is empty.")


epoch,train_loss,valid_loss,accuracy,time
0,,1.401462,0.571429,00:02
1,,1.401462,0.571429,00:02
2,,1.401462,0.571429,00:02
3,,1.401462,0.571429,00:02
4,,1.401462,0.571429,00:01


epoch,train_loss,valid_loss,accuracy,time
0,,1.401462,0.571429,00:02
1,,1.401462,0.571429,00:02
2,,1.401462,0.571429,00:02


Training abgeschlossen. Modell gespeichert als 'resnet34-stage-final' und 'export_resnet34.pkl'.


In [None]:
# ...existing code...
from collections import defaultdict, Counter
import random
import math
from fastai.vision.all import *

def build_dls_from_items(items, labels_map, valid_set, bs=32):
    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=lambda _=None: items,
        get_y=lambda o: labels_map[str(o)],
        splitter=FuncSplitter(lambda o: o in valid_set),
        item_tfms=Resize(460),
        batch_tfms=aug_transforms(size=224)
    )
    return dblock.dataloaders(path, bs=bs)

def sample_preserve_dist(all_items, labels_map, frac, min_per_class=1):
    by_label = defaultdict(list)
    for it in all_items:
        by_label[labels_map[str(it)]].append(it)
    sampled = []
    for lab, its in by_label.items():
        k = max(min_per_class, math.ceil(len(its) * frac))
        k = min(k, len(its))
        sampled.extend(random.sample(its, k))
    return sampled

# Konfiguration: passe fractions/epochs bei Bedarf an
fractions = [0.1, 0.25, 0.5, 1.0]
results = {}
random.seed(42)

# Benötigt: all_items, labels_map_str, valid_set müssen bereits existieren (aus vorheriger CSV-Logik)
for f in fractions:
    print(f"== fraction {f} ==")
    items_sub = sample_preserve_dist(all_items, labels_map_str, f)
    # valid_set bleibt unverändert (oder man könnte valid nur aus valid_items nehmen)
    dls_sub = build_dls_from_items(items_sub + list(valid_items), labels_map_str, set(valid_items), bs=32)
    print("n_train_sub:", len(dls_sub.train_ds), "n_valid:", len(dls_sub.valid_ds), "vocab:", dls_sub.vocab)
    try:
        learn_sub = cnn_learner(dls_sub, resnet18, metrics=[accuracy])
        # kurzes Training (schnell testen)
        learn_sub.fine_tune(2, base_lr=3e-3)
        val = learn_sub.validate()
        acc = float(val[1]) if len(val) > 1 else None
        print("val acc:", acc)
        results[f] = acc
    except Exception as e:
        print("Fehler beim Training:", e)
        results[f] = None

print("=== Lernkurve (fraction -> val_acc) ===")
for f,a in results.items():
    print(f"{f}: {a}")
# ...existing code...