# Auto-Labeler Training - Single Model (All PPE Classes)

**Project:** Safety Compliance Monitoring (SCM)

**Date:** January 2026

**Objective:** Train ONE YOLOv8 model for auto-labeling purposes
- ALL 17 SH17 PPE classes in single model
- Used for generating pre-annotations
- Speeds up supplement dataset annotation
- Future-proof: Can use additional classes later if needed

**Note:** This is the AUTO-LABELING model, not the final deployment models!

**Classes (17 total):**
- Core 9: hardhat, vest, shoes, gloves, glasses, facemask, faceguard, earmuffs, safetysuit
- Additional 8: boots, safety-harness, goggles, respirator, apron, welding-mask, etc. (the authoritative class list is the one printed from `data.yaml` in Section 3)

## 1. Environment Setup

In [None]:
# Install dependencies
!pip install ultralytics roboflow opencv-python matplotlib seaborn pandas

# Verify installation
import ultralytics
print(f"Ultralytics version: {ultralytics.__version__}")


from ultralytics import YOLO
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

In [None]:
# Project folder layout: everything lives under ~/optense/scm
import os
from pathlib import Path

base_dir = Path.home().joinpath('optense', 'scm')
models_dir = base_dir.joinpath('models')
datasets_dir = base_dir.joinpath('datasets')
results_dir = base_dir.joinpath('training_results', 'auto_labeler')

# Create any missing folder (including parents); existing ones are left as-is.
for required_dir in (models_dir, datasets_dir, results_dir):
    required_dir.mkdir(parents=True, exist_ok=True)

print(f"‚úì Base directory: {base_dir}")
print(f"‚úì Results will be saved to: {results_dir}")

## 2. Download SH17 Dataset from Roboflow

In [None]:
import os

from roboflow import Roboflow

# ============================================
# CONFIGURATION - UPDATE THESE VALUES
# ============================================

# The API key is a secret and must not be committed with the notebook.
# Export it before starting Jupyter, e.g.:
#     export ROBOFLOW_API_KEY="<your key>"
# Keys are listed at: https://app.roboflow.com/settings/api
# NOTE(review): a key was previously hard-coded in this cell; treat it as
# leaked and rotate it.
ROBOFLOW_API_KEY = os.environ.get("ROBOFLOW_API_KEY", "")
WORKSPACE_NAME = "optense"
# NOTE(review): the project slug says "person-detector" while this notebook
# describes the SH17 PPE dataset - confirm this is the intended project.
PROJECT_NAME = "scm-person-detector"
VERSION_NUMBER = 3

# ============================================

# Fail early with an actionable message instead of an opaque API error.
if not ROBOFLOW_API_KEY:
    raise RuntimeError(
        "ROBOFLOW_API_KEY is not set. Export it in the environment that "
        "launches this notebook (see https://app.roboflow.com/settings/api)."
    )

rf = Roboflow(api_key=ROBOFLOW_API_KEY)
print("‚úì Roboflow initialized")

In [None]:
# Fetch the configured dataset version as a YOLOv8-format export
print("Downloading SH17 dataset (all 17 PPE classes)...")

download_target = datasets_dir / "sh17_full"
project = rf.workspace(WORKSPACE_NAME).project(PROJECT_NAME)
dataset = project.version(VERSION_NUMBER).download("yolov8", location=str(download_target))

print(f"‚úì Dataset downloaded to: {dataset.location}")

# data.yaml is looked up in the download location; later cells read it.
data_yaml = Path(dataset.location).joinpath("data.yaml")
print(f"‚úì data.yaml found: {data_yaml.exists()}")

## 3. Dataset Inspection

In [None]:
import yaml
from collections import Counter
import matplotlib.pyplot as plt

# Glob patterns counted as dataset images.
IMAGE_PATTERNS = ('*.jpg', '*.jpeg', '*.png')


def _resolve_split_dir(root, entry):
    """Return the directory a data.yaml split entry points to.

    The entry is resolved relative to ``root`` (the folder holding data.yaml).
    If that location does not exist and the entry starts with ``..``, the
    leading component is dropped and the entry is resolved inside ``root``
    instead, so exports that write e.g. ``../train/images`` are still found.
    """
    entry = Path(entry)
    candidate = root / entry
    if not candidate.exists() and entry.parts[:1] == ('..',):
        candidate = root.joinpath(*entry.parts[1:])
    return candidate


def _labels_entry_for(images_entry):
    """Return ``images_entry`` with its LAST 'images' component replaced by 'labels'."""
    parts = list(Path(images_entry).parts)
    for idx in range(len(parts) - 1, -1, -1):
        if parts[idx] == 'images':
            parts[idx] = 'labels'
            break
    return Path(*parts)


def _list_images(folder):
    """Return every file in ``folder`` matching one of IMAGE_PATTERNS."""
    return [path for pattern in IMAGE_PATTERNS for path in folder.glob(pattern)]


# Load data.yaml
with open(data_yaml, 'r', encoding='utf-8') as f:
    data = yaml.safe_load(f)

print("="*60)
print("SH17 DATASET - AUTO-LABELER TRAINING")
print("="*60)
print(f"Number of classes: {data['nc']}")
print(f"Class names: {data['names']}")

# Count images
dataset_root = Path(data_yaml).parent
train_images = _list_images(_resolve_split_dir(dataset_root, data['train']))
val_images = _list_images(_resolve_split_dir(dataset_root, data['val']))

print(f"\nDataset size:")
print(f"  Train images: {len(train_images)}")
print(f"  Val images: {len(val_images)}")
print(f"  Total: {len(train_images) + len(val_images)}")

# Count annotations per class
train_labels_dir = _resolve_split_dir(dataset_root, _labels_entry_for(data['train']))
class_counts = Counter()

for label_file in train_labels_dir.glob('*.txt'):
    with open(label_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines carry no annotation.
                continue
            class_counts[int(fields[0])] += 1

print(f"\nClass distribution:")
for class_id, count in sorted(class_counts.items()):
    try:
        class_name = data['names'][class_id]
    except (KeyError, IndexError):
        # A label references an id that data.yaml does not name; keep it visible.
        class_name = f"class_{class_id}"
    print(f"  {class_name:15s}: {count:5d} annotations")

total_annotations = sum(class_counts.values())
print(f"\nTotal annotations: {total_annotations}")

# Calculate imbalance ratio (only defined when at least one label was found)
if class_counts:
    max_count = max(class_counts.values())
    min_count = min(class_counts.values())
    print(f"Imbalance ratio: {max_count/min_count:.1f}:1")
else:
    print("Imbalance ratio: n/a (no annotations found)")

In [None]:
# Bar chart of annotation counts per class, colour-coded by volume
from matplotlib.patches import Patch


def _volume_color(annotation_count):
    """Map an annotation count to the bar colour of its volume band."""
    if annotation_count < 500:
        return '#ff6b6b'
    if annotation_count < 1500:
        return '#4ecdc4'
    return '#95e1d3'


# Bars are ordered by class id so the x-axis is stable between runs.
ordered_class_ids = sorted(class_counts)
class_names = [data['names'][class_id] for class_id in ordered_class_ids]
counts = [class_counts[class_id] for class_id in ordered_class_ids]
colors = [_volume_color(count) for count in counts]

plt.figure(figsize=(14, 6))
plt.bar(class_names, counts, color=colors)
plt.xlabel('Class', fontsize=12)
plt.ylabel('Number of Annotations', fontsize=12)
plt.title('SH17 Dataset - Class Distribution', fontsize=14)
plt.xticks(rotation=45, ha='right')

# Legend explaining the three colour bands
legend_elements = [
    Patch(facecolor=band_color, label=band_label)
    for band_color, band_label in (
        ('#ff6b6b', 'Low (<500)'),
        ('#4ecdc4', 'Medium (500-1500)'),
        ('#95e1d3', 'High (>1500)'),
    )
]
plt.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

## 4. Training Configuration

In [None]:
# Hyperparameters for the auto-labeler training run

EPOCHS = 100  # Can reduce to 50 if time is critical
BATCH_SIZE = 16  # Adjust based on GPU memory
IMAGE_SIZE = 640
PATIENCE = 20

# Run-level settings: data, schedule, hardware and output location.
_run_settings = dict(
    data=str(data_yaml),
    epochs=EPOCHS,
    batch=BATCH_SIZE,
    imgsz=IMAGE_SIZE,
    patience=PATIENCE,
    save=True,
    save_period=10,
    device=0 if torch.cuda.is_available() else 'cpu',  # first GPU when available
    workers=8,
    project=str(results_dir),
    name='yolov8n_auto_labeler',
    exist_ok=True,
    pretrained=True,
    optimizer='AdamW',
    verbose=True,
    seed=42,
    cos_lr=True,
    close_mosaic=10,
    resume=False,
    amp=True,
)

# Learning rates
# NOTE(review): lr0=0.01 is combined with optimizer='AdamW' here; check the
# Ultralytics guidance on lr0 for Adam-family optimizers before long runs.
_learning_rates = dict(
    lr0=0.01,
    lrf=0.01,
    momentum=0.937,
    weight_decay=0.0005,
    warmup_epochs=3.0,
    warmup_momentum=0.8,
    warmup_bias_lr=0.1,
)

# Loss weights (adjust these for class imbalance)
_loss_weights = dict(
    box=7.5,  # Box loss weight
    cls=0.5,  # Classification loss weight
    dfl=1.5,  # DFL loss weight
)

# Augmentation (moderate-aggressive for 17 classes)
_augmentation = dict(
    hsv_h=0.018,
    hsv_s=0.7,
    hsv_v=0.4,
    degrees=12.0,
    translate=0.15,
    scale=0.6,
    shear=0.0,
    perspective=0.0,
    flipud=0.0,
    fliplr=0.5,
    mosaic=1.0,
    mixup=0.2,
    copy_paste=0.15,
)

# Validation
_validation = dict(
    plots=True,
    val=True,
)

# Merge the groups in order into the single kwargs dict passed to model.train().
training_config = {
    **_run_settings,
    **_learning_rates,
    **_loss_weights,
    **_augmentation,
    **_validation,
}

print("‚úì Training configuration ready")
print(f"  Device: {training_config['device']}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Epochs: {EPOCHS}")
print(f"  Image size: {IMAGE_SIZE}√ó{IMAGE_SIZE}")
# NOTE(review): nothing in this configuration enables class weighting or
# resampling; confirm the claim printed below before relying on it.
print("\nNote: YOLOv8 handles class imbalance automatically during training")

## 5. Train Auto-Labeler Model

In [None]:
# Announce the run, then train from the pretrained yolov8n checkpoint.
_rule = "=" * 60
_intro_lines = (
    _rule,
    "TRAINING AUTO-LABELER MODEL",
    _rule,
    "Purpose: Generate pre-annotations for supplement dataset",
    "Classes: All 17 PPE types (including future-use classes)",
    "\nStarting training...",
    "Expected duration: 2-4 hours",
    "You can let this run overnight!",
    "",
)
for _line in _intro_lines:
    print(_line)

# Initialize model
model = YOLO('yolov8n.pt')

# Train with the settings assembled in the configuration cell
results = model.train(**training_config)

for _line in ("\n" + _rule, "‚úì TRAINING COMPLETE!", _rule):
    print(_line)

## 6. Evaluate Model

In [None]:
# Validate the trained model and report overall and per-class metrics.
print("Evaluating auto-labeler on validation set...\n")
metrics = model.val()

print("="*60)
print("AUTO-LABELER MODEL RESULTS")
print("="*60)
print(f"mAP50: {metrics.box.map50:.4f}")
print(f"mAP50-95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")

# Per-class AP at IoU 0.5.
# `metrics.box.maps` holds per-class mAP50-95, so it must not be printed under
# an "mAP50" heading. `ap50` is aligned with `ap_class_index`, which lists the
# class ids the values belong to.
_ap50_by_class = {
    int(class_idx): float(class_ap50)
    for class_idx, class_ap50 in zip(metrics.box.ap_class_index, metrics.box.ap50)
}

# data.yaml may store names as a list (index = class id) or as an id->name dict.
_names = data['names']
_named_classes = _names.items() if isinstance(_names, dict) else enumerate(_names)

print("\nPer-class mAP50:")
for class_id, class_name in _named_classes:
    class_map = _ap50_by_class.get(class_id)
    if class_map is None:
        # No AP was computed for this class id during validation.
        print(f"  - {class_name:15s}: n/a (no validation instances)")
        continue
    status = "‚úì" if class_map > 0.6 else "‚ö†Ô∏è" if class_map > 0.4 else "‚ùå"
    print(f"  {status} {class_name:15s}: {class_map:.4f}")

print("\n" + "="*60)
print("NOTE: This model is for AUTO-LABELING, not deployment")
print("Lower accuracy is acceptable - human will verify!")
print("="*60)

## 7. Visualize Training Results

In [None]:
from PIL import Image

# Find the most recently modified run directory.
# Only directories are considered: a stray file in results_dir (log, OS
# metadata, ...) must not be mistaken for the latest run.
run_dirs = sorted(
    (entry for entry in results_dir.glob('*') if entry.is_dir()),
    key=lambda entry: entry.stat().st_mtime,
    reverse=True,
)
if run_dirs:
    latest_run = run_dirs[0]
    print(f"Loading results from: {latest_run}\n")

    # Display key plots
    plot_files = ['results.png', 'confusion_matrix.png', 'val_batch0_pred.jpg']

    fig, axes = plt.subplots(1, 3, figsize=(20, 6))
    fig.suptitle('Auto-Labeler Training Results', fontsize=16)

    for ax, plot_file in zip(axes, plot_files):
        plot_path = latest_run / plot_file
        if plot_path.exists():
            # The context manager closes the file handle once the pixels
            # have been handed to matplotlib.
            with Image.open(plot_path) as img:
                ax.imshow(img)
            title = plot_file.replace('.png', '').replace('.jpg', '').replace('_', ' ').title()
            ax.set_title(title)
        else:
            # Keep the grid layout and say which plot is missing.
            ax.text(0.5, 0.5, f'{plot_file} not found',
                    ha='center', va='center')
        ax.axis('off')

    plt.tight_layout()
    plt.show()
else:
    print("No training results found")

## 8. Test on Sample Images

In [None]:
import cv2
import random

# Get random validation images (at most 6, fewer if the split is smaller)
test_images = random.sample(val_images, min(6, len(val_images)))

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Auto-Labeler Predictions on Validation Set', fontsize=16)
axes = axes.flatten()

for ax, img_path in zip(axes, test_images):
    # Run prediction. A dedicated name is used so the training `results`
    # object from the training cell is not overwritten.
    predictions = model.predict(source=str(img_path), conf=0.25, verbose=False)
    prediction = predictions[0]

    # Convert the rendered image from BGR to RGB channel order for matplotlib.
    annotated = cv2.cvtColor(prediction.plot(), cv2.COLOR_BGR2RGB)

    ax.imshow(annotated)
    ax.axis('off')
    ax.set_title(f"{Path(img_path).name}\n{len(prediction.boxes)} detections")

# With fewer than 6 images, blank the unused cells instead of showing
# empty framed axes.
for ax in axes[len(test_images):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

## 9. Export Model for Auto-Labeling

In [None]:
# Find best weights
import shutil

# Several runs may exist under results_dir; take the most recently written
# best.pt rather than whichever one the filesystem happens to list first.
best_weights = max(
    results_dir.glob('**/weights/best.pt'),
    key=lambda weights_path: weights_path.stat().st_mtime,
    default=None,
)
if best_weights is None:
    raise FileNotFoundError(
        f"No weights/best.pt found under {results_dir}; "
        "run the training cell before exporting."
    )
print(f"Best weights: {best_weights}")

# Copy to organized location
auto_labeler_dir = models_dir / 'auto_labeler'
auto_labeler_dir.mkdir(parents=True, exist_ok=True)

auto_labeler_model = auto_labeler_dir / 'yolov8n_auto_labeler.pt'
shutil.copy(best_weights, auto_labeler_model)

print(f"\n‚úì Auto-labeler model saved to: {auto_labeler_model}")
print("\n" + "="*60)
print("READY FOR AUTO-LABELING!")
print("="*60)
print("\nNext steps:")
print("1. Use this model to pre-annotate supplement images")
print("2. Human reviews and corrects annotations")
print("3. Merge corrected data with SH17")
print("4. Train specialized Model A and Model B for deployment")
print("")
print(f"Auto-labeler ready at: {auto_labeler_model}")

## 10. Create Auto-Labeling Script

In [None]:
# Save auto-labeling script for easy use
#
# The script is generated from an f-string template: single braces are filled
# in now (model path), doubled braces become literal braces in the script.

auto_label_script = base_dir / 'tools' / 'auto_label.py'
auto_label_script.parent.mkdir(parents=True, exist_ok=True)

# The model path is embedded with repr() so backslashes or quotes in the path
# still produce a valid Python string literal in the generated file.
script_content = f'''#!/usr/bin/env python3
"""
Auto-Labeler Script
Uses trained model to generate pre-annotations for manual review.

Usage:
    python auto_label.py --input unlabeled_images/ --output pre_annotated/
"""

import argparse
from pathlib import Path
from ultralytics import YOLO
import shutil

# Model path
MODEL_PATH = {str(auto_labeler_model)!r}

# File suffixes treated as images (compared case-insensitively)
IMAGE_SUFFIXES = {{".jpg", ".jpeg", ".png"}}

def auto_label(input_dir, output_dir, conf_threshold=0.25):
    """Auto-label images using trained model.

    Images with at least one detection are copied to output_dir/images and
    their boxes are written in YOLO format to output_dir/labels. Images
    without detections are counted but not copied.
    """
    
    print(f"Loading model: {{MODEL_PATH}}")
    model = YOLO(MODEL_PATH)
    
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    
    (output_dir / 'images').mkdir(parents=True, exist_ok=True)
    (output_dir / 'labels').mkdir(parents=True, exist_ok=True)
    
    # Get all images (sorted so runs are processed in a stable order)
    image_files = sorted(
        path for path in input_dir.iterdir()
        if path.is_file() and path.suffix.lower() in IMAGE_SUFFIXES
    )
    
    print(f"Found {{len(image_files)}} images to process")
    
    stats = {{'processed': 0, 'with_detections': 0, 'total_detections': 0}}
    
    for i, img_path in enumerate(image_files, 1):
        results = model.predict(source=str(img_path), conf=conf_threshold, verbose=False)
        result = results[0]
        
        if result.boxes is not None and len(result.boxes) > 0:
            stats['with_detections'] += 1
            stats['total_detections'] += len(result.boxes)
            
            # Copy image
            shutil.copy(img_path, output_dir / 'images' / img_path.name)
            
            # Save annotations in YOLO format
            label_file = output_dir / 'labels' / f"{{img_path.stem}}.txt"
            
            with open(label_file, 'w') as f:
                for box in result.boxes:
                    # Get box in YOLO format
                    class_id = int(box.cls[0])
                    x_center, y_center, width, height = box.xywhn[0].tolist()
                    f.write(f"{{class_id}} {{x_center:.6f}} {{y_center:.6f}} {{width:.6f}} {{height:.6f}}\\n")
        
        stats['processed'] += 1
        if i % 50 == 0:
            print(f"  Processed: {{i}}/{{len(image_files)}}")
    
    print("\\n" + "="*60)
    print("AUTO-LABELING COMPLETE")
    print("="*60)
    print(f"Total images: {{stats['processed']}}")
    print(f"Images with detections: {{stats['with_detections']}}")
    print(f"Total detections: {{stats['total_detections']}}")
    if stats['with_detections'] > 0:
        print(f"Avg detections/image: {{stats['total_detections']/stats['with_detections']:.1f}}")
    print(f"\\nOutput: {{output_dir}}")
    print("\\n‚ö†Ô∏è  IMPORTANT: Review and verify all annotations!")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Auto-label PPE images')
    parser.add_argument('--input', required=True, help='Input directory with unlabeled images')
    parser.add_argument('--output', required=True, help='Output directory for annotated images')
    parser.add_argument('--conf', type=float, default=0.25, help='Confidence threshold')
    
    args = parser.parse_args()
    auto_label(args.input, args.output, args.conf)
'''

# The template contains non-ASCII characters, so the encoding is pinned
# instead of relying on the platform default.
with open(auto_label_script, 'w', encoding='utf-8') as f:
    f.write(script_content)

# Make executable
import stat
auto_label_script.chmod(auto_label_script.stat().st_mode | stat.S_IEXEC)

print(f"‚úì Auto-labeling script saved to: {auto_label_script}")
print("\nUsage:")
print(f"  python {auto_label_script} --input unlabeled/ --output annotated/")

## Summary

In [None]:
# Final recap: headline metrics, produced artifacts and the follow-up workflow.
_summary_rule = "=" * 70

_header_section = [
    _summary_rule,
    "AUTO-LABELER TRAINING COMPLETE",
    _summary_rule,
]
print("\n".join(_header_section))

_performance_section = [
    "\nüìä MODEL PERFORMANCE:",
    f"   mAP50: {metrics.box.map50:.4f}",
    f"   mAP50-95: {metrics.box.map:.4f}",
    f"   Classes: {', '.join(data['names'])}",
]
print("\n".join(_performance_section))

_files_section = [
    "\nüìÅ OUTPUT FILES:",
    f"   Model: {auto_labeler_model}",
    f"   Script: {auto_label_script}",
    f"   Results: {results_dir}",
]
print("\n".join(_files_section))

# Each numbered step is followed by an empty line, except the last one.
_next_steps_section = [
    "\n‚úÖ NEXT STEPS:",
    "   1. Run auto-labeling on supplement images:",
    f"      python {auto_label_script} --input supplements/ --output pre_annotated/",
    "",
    "   2. Review and correct annotations in Roboflow/CVAT",
    "",
    "   3. Merge corrected data with SH17",
    "",
    "   4. Train final Model A (common PPE) and Model B (rare PPE)",
    _summary_rule,
]
print("\n".join(_next_steps_section))