In [14]:
# VOLLSTÄNDIGE Label-Analyse aller Mask-Dateien
import glob
import rasterio
import numpy as np
import os
from collections import Counter
from tqdm import tqdm

# Korrekte Pfade
MASK_DIR = '/Users/tim.strauss/PycharmProjects/furnlDetection/brick_furnace_detection/Brick_Data_Train/Mask'


def complete_label_analysis():
    """Vollständige Analyse ALLER Label in ALLEN Mask-Dateien"""

    print("=" * 60)
    print("COMPLETE DATASET LABEL ANALYSIS")
    print("=" * 60)

    # Alle Mask-Dateien finden
    mask_files = glob.glob(os.path.join(MASK_DIR, "*.tif"))
    print(f"Found {len(mask_files)} mask files")

    if len(mask_files) == 0:
        print(f"ERROR: No .tif files found in {MASK_DIR}")
        print("Please check the path and make sure mask files exist.")
        return None, None, None

    # Label-Zähler initialisieren
    label_counts = Counter()
    total_pixels = 0
    files_with_errors = []

    print("Analyzing ALL mask files...")

    # Analysiere ALLE Dateien (nicht nur 50!)
    for i, mask_path in enumerate(tqdm(mask_files, desc="Processing masks")):
        try:
            with rasterio.open(mask_path) as src:
                mask = src.read(1)
                unique_labels, counts = np.unique(mask, return_counts=True)

                for label, count in zip(unique_labels, counts):
                    label_counts[int(label)] += int(count)
                    total_pixels += int(count)

        except Exception as e:
            files_with_errors.append((mask_path, str(e)))
            continue

    # Ergebnisse anzeigen
    print("\n" + "=" * 60)
    print("COMPLETE LABEL DISTRIBUTION")
    print("=" * 60)

    print(f"Successfully processed: {len(mask_files) - len(files_with_errors)}/{len(mask_files)} files")
    print(f"Files with errors: {len(files_with_errors)}")
    print(f"Total pixels analyzed: {total_pixels:,}")

    # Sortiere Labels nach Häufigkeit
    sorted_labels = sorted(label_counts.items(), key=lambda x: x[1], reverse=True)

    print(f"\nUnique labels found: {len(sorted_labels)}")

    if sorted_labels:
        print(f"Label range: {min(label_counts.keys())} to {max(label_counts.keys())}")

        print("\nLabel distribution:")
        print("-" * 50)
        for label, count in sorted_labels:
            percentage = (count / total_pixels) * 100
            print(f"Label {label:3d}: {count:>12,} pixels ({percentage:7.4f}%)")

        # Zusätzliche Analysen
        print("\n" + "=" * 60)
        print("CRITICAL CHECKS")
        print("=" * 60)

        all_labels = set(label_counts.keys())
        min_label = min(all_labels)
        max_label = max(all_labels)

        print(f"Min label: {min_label}")
        print(f"Max label: {max_label}")
        print(f"All labels present: {sorted(all_labels)}")

        # Prüfe auf fehlende Labels im Bereich
        expected_range = set(range(min_label, max_label + 1))
        missing_labels = expected_range - all_labels

        if missing_labels:
            print(f"⚠️  Missing labels in range: {sorted(missing_labels)}")
        else:
            print("✅ All labels in range are present (continuous)")

    # Zeige Fehler-Dateien
    if files_with_errors:
        print(f"\nFiles with errors ({len(files_with_errors)}):")
        for filepath, error in files_with_errors[:10]:  # Erste 10
            print(f"  {os.path.basename(filepath)}: {error}")
        if len(files_with_errors) > 10:
            print(f"  ... and {len(files_with_errors) - 10} more")

    return label_counts, total_pixels, sorted_labels


# Führe vollständige Analyse aus
label_counts, total_pixels, sorted_labels = complete_label_analysis()

COMPLETE DATASET LABEL ANALYSIS
Found 7471 mask files
Analyzing ALL mask files...


Processing masks: 100%|██████████| 7471/7471 [00:09<00:00, 782.18it/s]


COMPLETE LABEL DISTRIBUTION
Successfully processed: 7471/7471 files
Files with errors: 0
Total pixels analyzed: 489,619,456

Unique labels found: 9
Label range: 0 to 8

Label distribution:
--------------------------------------------------
Label   0:  489,273,929 pixels (99.9294%)
Label   2:      166,043 pixels ( 0.0339%)
Label   1:      110,363 pixels ( 0.0225%)
Label   3:       24,800 pixels ( 0.0051%)
Label   8:       19,491 pixels ( 0.0040%)
Label   5:       12,314 pixels ( 0.0025%)
Label   4:        7,167 pixels ( 0.0015%)
Label   6:        3,204 pixels ( 0.0007%)
Label   7:        2,145 pixels ( 0.0004%)

CRITICAL CHECKS
Min label: 0
Max label: 8
All labels present: [0, 1, 2, 3, 4, 5, 6, 7, 8]
✅ All labels in range are present (continuous)





In [15]:
# Spezielle Suche nach seltenen Labels 6 und 7
def search_rare_labels():
    """Suche gezielt nach Labels 6 und 7"""

    print("\n" + "=" * 60)
    print("SEARCHING FOR RARE LABELS 6 AND 7")
    print("=" * 60)

    mask_files = glob.glob(os.path.join(MASK_DIR, "*.tif"))
    found_6 = []
    found_7 = []
    found_other_rare = {}

    print(f"Searching through {len(mask_files)} files...")

    for i, mask_path in enumerate(tqdm(mask_files, desc="Searching for rare labels")):
        try:
            with rasterio.open(mask_path) as src:
                mask = src.read(1)
                unique_labels = np.unique(mask)

                # Suche nach Label 6
                if 6 in unique_labels:
                    count_6 = np.sum(mask == 6)
                    found_6.append((mask_path, count_6))

                # Suche nach Label 7
                if 7 in unique_labels:
                    count_7 = np.sum(mask == 7)
                    found_7.append((mask_path, count_7))

                # Suche nach Labels > 9 (unexpected)
                for label in unique_labels:
                    if label > 9:
                        if label not in found_other_rare:
                            found_other_rare[label] = []
                        found_other_rare[label].append((mask_path, np.sum(mask == label)))

        except Exception as e:
            print(f"Error processing {mask_path}: {e}")
            continue

    # Ergebnisse
    print(f"\n🔍 SEARCH RESULTS:")
    print(f"Found label 6 in {len(found_6)} files")
    print(f"Found label 7 in {len(found_7)} files")

    if found_6:
        print(f"\nFiles with label 6:")
        for filepath, count in found_6[:5]:  # Erste 5
            print(f"  {os.path.basename(filepath)}: {count} pixels")
        if len(found_6) > 5:
            print(f"  ... and {len(found_6) - 5} more files")

    if found_7:
        print(f"\nFiles with label 7:")
        for filepath, count in found_7[:5]:  # Erste 5
            print(f"  {os.path.basename(filepath)}: {count} pixels")
        if len(found_7) > 5:
            print(f"  ... and {len(found_7) - 5} more files")

    # Andere seltene Labels
    if found_other_rare:
        print(f"\n⚠️  Found unexpected labels > 9:")
        for rare_label, files in found_other_rare.items():
            print(f"  Label {rare_label}: {len(files)} files")
            for filepath, count in files[:3]:  # Erste 3
                print(f"    {os.path.basename(filepath)}: {count} pixels")

    return found_6, found_7, found_other_rare


# Führe gezielte Suche aus
found_6, found_7, found_other_rare = search_rare_labels()


SEARCHING FOR RARE LABELS 6 AND 7
Searching through 7471 files...


Searching for rare labels: 100%|██████████| 7471/7471 [00:08<00:00, 874.16it/s]


🔍 SEARCH RESULTS:
Found label 6 in 130 files
Found label 7 in 61 files

Files with label 6:
  Image7105.tif: 29 pixels
  Image2451.tif: 24 pixels
  Image5868.tif: 14 pixels
  Image2444.tif: 19 pixels
  Image5303.tif: 33 pixels
  ... and 125 more files

Files with label 7:
  Image5504.tif: 41 pixels
  Image6743.tif: 27 pixels
  Image247.tif: 46 pixels
  Image6781.tif: 45 pixels
  Image6756.tif: 45 pixels
  ... and 56 more files





In [16]:
# Validierung der Label-Konversion
def validate_label_conversion():
    """Teste die Label-Konversion wie im Training-Code"""

    print("\n" + "=" * 60)
    print("LABEL CONVERSION VALIDATION")
    print("=" * 60)

    if label_counts is None:
        print("ERROR: Run complete analysis first!")
        return None, None

    # Zeige verfügbare Labels
    available_labels = sorted(label_counts.keys())
    print(f"Available original labels: {available_labels}")

    # Test verschiedene Konversion-Strategien
    print("\nTesting conversion strategies:")

    # Strategie 1: 1-9 → 0-8, 0 → -1 (wie im aktuellen Code)
    print("\n1️⃣  Current strategy: 0→-1, 1-9→0-8")
    converted_labels = []
    for original in available_labels:
        if original == 0:
            converted = -1  # Background → ignore
        else:
            converted = original - 1  # 1-9 → 0-8
        converted_labels.append(converted)
        print(f"   Original {original} → Converted {converted}")

    valid_converted = [x for x in converted_labels if x != -1]
    if valid_converted:
        print(f"   Valid range after conversion: {min(valid_converted)} to {max(valid_converted)}")
        print(f"   Number of classes: {max(valid_converted) + 1}")

    # Strategie 2: Kontinuierliche Mapping
    print("\n2️⃣  Continuous mapping strategy:")
    non_zero_labels = [x for x in available_labels if x != 0]
    print(f"   Non-background labels: {non_zero_labels}")

    continuous_mapping = {0: -1}  # Background → ignore
    for i, original in enumerate(sorted(non_zero_labels)):
        continuous_mapping[original] = i

    print("   Mapping:")
    for original, converted in continuous_mapping.items():
        print(f"   Original {original} → Converted {converted}")

    num_classes = len(non_zero_labels)
    print(f"   Number of classes needed: {num_classes}")

    return available_labels, continuous_mapping


# Führe Validierung aus
available_labels, continuous_mapping = validate_label_conversion()


LABEL CONVERSION VALIDATION
Available original labels: [0, 1, 2, 3, 4, 5, 6, 7, 8]

Testing conversion strategies:

1️⃣  Current strategy: 0→-1, 1-9→0-8
   Original 0 → Converted -1
   Original 1 → Converted 0
   Original 2 → Converted 1
   Original 3 → Converted 2
   Original 4 → Converted 3
   Original 5 → Converted 4
   Original 6 → Converted 5
   Original 7 → Converted 6
   Original 8 → Converted 7
   Valid range after conversion: 0 to 7
   Number of classes: 8

2️⃣  Continuous mapping strategy:
   Non-background labels: [1, 2, 3, 4, 5, 6, 7, 8]
   Mapping:
   Original 0 → Converted -1
   Original 1 → Converted 0
   Original 2 → Converted 1
   Original 3 → Converted 2
   Original 4 → Converted 3
   Original 5 → Converted 4
   Original 6 → Converted 5
   Original 7 → Converted 6
   Original 8 → Converted 7
   Number of classes needed: 8


In [17]:
# Zusammenfassung und Empfehlungen
def generate_recommendations():
    """Generiere Empfehlungen basierend auf der Analyse"""

    print("\n" + "=" * 60)
    print("RECOMMENDATIONS FOR TRAINING CODE")
    print("=" * 60)

    if label_counts is None:
        print("ERROR: Run complete analysis first!")
        return

    available_labels = sorted(label_counts.keys())
    non_background_labels = [x for x in available_labels if x != 0]

    print(f"📊 Dataset Summary:")
    print(f"   - Total unique labels: {len(available_labels)}")
    print(f"   - Background label: 0")
    print(f"   - Foreground labels: {non_background_labels}")
    print(f"   - Required NUM_CLASSES: {len(non_background_labels)}")

    # Prüfe ob Labels 6,7 existieren
    has_label_6 = 6 in available_labels
    has_label_7 = 7 in available_labels

    print(f"\n🔍 Critical findings:")
    print(f"   - Label 6 exists: {has_label_6}")
    print(f"   - Label 7 exists: {has_label_7}")

    if has_label_6 and has_label_7:
        print("   ✅ All 9 classes (0-8) are present!")
        print("   → Your original NUM_CLASSES = 9 was CORRECT")
        print("   → The training log showing only 7 labels was due to analyzing only 50 files")
    elif not has_label_6 and not has_label_7:
        print("   ❌ Labels 6 and 7 are completely missing!")
        print(f"   → Actual NUM_CLASSES should be {len(non_background_labels)}")
    else:
        print(f"   ⚠️  Only one of labels 6,7 exists: 6={has_label_6}, 7={has_label_7}")

    # Empfohlene Konfiguration
    print(f"\n⚙️  Recommended configuration:")
    print(f"```python")
    print(f"NUM_CLASSES = {len(non_background_labels)}  # Classes: {non_background_labels}")
    print(f"AVAILABLE_LABELS = {available_labels}")

    # Label-Mapping
    print(f"\n# Label conversion mapping:")
    mapping = {0: -1}  # Background
    for i, label in enumerate(sorted(non_background_labels)):
        mapping[label] = i

    for original, converted in mapping.items():
        print(f"# {original} → {converted}")

    print(f"\nLABEL_MAPPING = {mapping}")

    # Gewichte basierend auf Häufigkeit
    print(f"\n# Class weights (based on frequency):")
    weights = []
    for label in sorted(non_background_labels):
        frequency = label_counts[label] / total_pixels
        # Inversely proportional weight (rare classes get higher weights)
        weight = min(20.0, max(1.0, 0.001 / frequency))
        weights.append(round(weight, 1))
        print(f"# Class {mapping[label]} (original {label}): {frequency:.6f} → weight {weight:.1f}")

    print(f"class_weights = torch.tensor({weights})")
    print(f"```")


# Generiere Empfehlungen
generate_recommendations()


RECOMMENDATIONS FOR TRAINING CODE
📊 Dataset Summary:
   - Total unique labels: 9
   - Background label: 0
   - Foreground labels: [1, 2, 3, 4, 5, 6, 7, 8]
   - Required NUM_CLASSES: 8

🔍 Critical findings:
   - Label 6 exists: True
   - Label 7 exists: True
   ✅ All 9 classes (0-8) are present!
   → Your original NUM_CLASSES = 9 was CORRECT
   → The training log showing only 7 labels was due to analyzing only 50 files

⚙️  Recommended configuration:
```python
NUM_CLASSES = 8  # Classes: [1, 2, 3, 4, 5, 6, 7, 8]
AVAILABLE_LABELS = [0, 1, 2, 3, 4, 5, 6, 7, 8]

# Label conversion mapping:
# 0 → -1
# 1 → 0
# 2 → 1
# 3 → 2
# 4 → 3
# 5 → 4
# 6 → 5
# 7 → 6
# 8 → 7

LABEL_MAPPING = {0: -1, 1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7}

# Class weights (based on frequency):
# Class 0 (original 1): 0.000225 → weight 4.4
# Class 1 (original 2): 0.000339 → weight 2.9
# Class 2 (original 3): 0.000051 → weight 19.7
# Class 3 (original 4): 0.000015 → weight 20.0
# Class 4 (original 5): 0.000025 → 