# Train YOLOv8 on Combined Datasets (High Accuracy)

This notebook merges two datasets to create a larger, more robust training set:
1.  **`parizadkarimi/helmet-violations`** (Base dataset)
2.  **`aneesarom/rider-with-helmet-without-helmet-number-plate`** (Extra data)

The script below automatically aligns their class labels (Helmet, No Helmet, Plate).

### ⚠️ CRITICAL STEP ⚠️
**Before running this notebook, you MUST accept the rules for BOTH datasets on Kaggle (otherwise the downloads may be refused):**
1.  [parizadkarimi/helmet-violations](https://www.kaggle.com/datasets/parizadkarimi/helmet-violations)
2.  [aneesarom/rider-with-helmet-without-helmet-number-plate](https://www.kaggle.com/datasets/aneesarom/rider-with-helmet-without-helmet-number-plate)

### Steps
1.  **Run All Cells**.
2.  **Upload `kaggle.json`** when the upload prompt appears.
3.  The notebook then handles downloading, merging, and training.

In [None]:
!pip install ultralytics kaggle pyyaml

In [None]:
import os
from google.colab import files

# 1. Upload kaggle.json
# The credentials are installed at ~/.kaggle/kaggle.json; the same path is used for the
# "already configured" check so the check and the install can never disagree.
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')

if os.path.exists(kaggle_json_path):
    print("Kaggle API already configured.")
else:
    # Only prompt when an upload is actually needed.
    print("Please upload your 'kaggle.json' file:")
    uploaded = files.upload()
    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))

    # Browsers rename repeated downloads (e.g. "kaggle (1).json"), so accept a single
    # uploaded file under any name instead of requiring the exact name.
    if 'kaggle.json' in uploaded:
        token_name = 'kaggle.json'
    elif len(uploaded) == 1:
        token_name = next(iter(uploaded))
    else:
        raise FileNotFoundError(
            "Expected a single 'kaggle.json' upload but received: "
            f"{sorted(uploaded) or 'nothing'}"
        )

    # Write the uploaded bytes straight to the credentials path with owner-only permissions.
    os.makedirs(kaggle_dir, exist_ok=True)
    with open(kaggle_json_path, 'wb') as token_file:
        token_file.write(uploaded[token_name])
    os.chmod(kaggle_json_path, 0o600)

    # Do not leave a copy of the API token lying in the working directory.
    if os.path.exists(token_name):
        os.remove(token_name)

    print("Kaggle API configured!")

In [None]:
# 2. Download and Merge Datasets
import yaml
import glob
import shutil
from pathlib import Path
import numpy as np

def download_dataset(slug, folder_name):
    """Download a Kaggle dataset archive and unpack it into ``folder_name``.

    Args:
        slug: Kaggle dataset identifier in ``owner/dataset`` form.
        folder_name: Directory the archive is extracted into.

    Raises:
        FileNotFoundError: If the ``kaggle`` executable cannot be started.
        RuntimeError: If the ``kaggle`` command exits with a non-zero status, or
            the expected ``<dataset>.zip`` is missing afterwards.
    """
    import os
    import subprocess
    import zipfile

    print(f"Downloading {slug}...")
    # Run the CLI through subprocess (not a `!` shell escape) so that a failed
    # download surfaces as an exception the caller can catch and report.
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', slug],
        capture_output=True,
        text=True,
    )
    if result.stdout.strip():
        print(result.stdout.strip())
    if result.returncode != 0:
        details = (result.stderr.strip() or result.stdout.strip() or "no output")
        raise RuntimeError(
            f"kaggle exited with status {result.returncode}: {details}"
        )

    zip_name = slug.split('/')[-1] + ".zip"
    if not os.path.exists(zip_name):
        raise RuntimeError(f"Expected archive '{zip_name}' was not downloaded")

    # zipfile overwrites existing files without prompting, so re-running is safe.
    with zipfile.ZipFile(zip_name) as archive:
        archive.extractall(folder_name)
    os.remove(zip_name)

# Kaggle sources to merge, each paired with the local folder it is unpacked into
datasets = [
    dict(slug='parizadkarimi/helmet-violations', folder='ds1'),
    dict(slug='aneesarom/rider-with-helmet-without-helmet-number-plate', folder='ds2'),
]

# Fetch every source in turn; an exception from one download is reported
# and the loop continues with the next source.
for ds in datasets:
    try:
        download_dataset(slug=ds['slug'], folder_name=ds['folder'])
    except Exception as e:
        print(f"Error downloading {ds['slug']}: {e}")

# Merge Logic
print("Merging Datasets...")

# Root folder of the merged dataset
final_dataset_path = 'final_dataset'
# Unified label scheme; the list index is the class id (0: Helmet, 1: No Helmet, 2: Plate)
target_classes = ['Helmet', 'No Helmet', 'Plate']

# Pre-create <split>/images and <split>/labels for each split of the merged dataset
for split in ('train', 'valid', 'test'):
    for subdir in ('images', 'labels'):
        os.makedirs(f"{final_dataset_path}/{split}/{subdir}", exist_ok=True)

def match_class(name):
    """Map a source-dataset class name onto a merged class id.

    Args:
        name: Class name from a dataset's ``data.yaml`` (coerced to ``str``).

    Returns:
        1 for a "no helmet" class, 0 for a helmet class, 2 for a plate class,
        and -1 for anything else (e.g. "Rider"), which the merge ignores.

    A negation is only recognised as its own word ("no", "non", "not") or
    directly in front of "helmet" ("nohelmet"); names that merely contain the
    letters "no" (e.g. "Unknown") are not treated as No Helmet.
    """
    import re

    # Split camelCase ("NoHelmet" -> "No Helmet") before lower-casing so the
    # word-boundary check below also sees names written without separators.
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', str(name)).lower()

    negated = re.search(r'(?<![a-z])no[nt]?(?=helmet|[^a-z]|$)', text)
    if negated or 'without' in text or 'head' in text:
        return 1 # No Helmet
    if 'helmet' in text:
        return 0 # Helmet
    if 'plate' in text or 'license' in text or 'number' in text:
        return 2 # Plate
    return -1 # Ignore (e.g. Rider)

def _class_id_to_name(data_config):
    """Return a ``{class_id: class_name}`` dict from a parsed data.yaml.

    ``names`` may be a list (the index is the class id) or a dict keyed by
    class id; a missing or empty ``names`` entry yields an empty dict.
    """
    names = (data_config or {}).get('names') or []
    if isinstance(names, dict):
        return {int(idx): name for idx, name in names.items()}
    return dict(enumerate(names))


def _label_path_for(img_path):
    """Return the label file for an image stored as ``<split>/images/<file>``.

    Only the image's parent directory is swapped for ``labels``; other parts
    of the path that happen to contain the word "images" are left untouched.
    """
    images_dir, img_file = os.path.split(img_path)
    split_dir = os.path.dirname(images_dir)
    stem = os.path.splitext(img_file)[0]
    return os.path.join(split_dir, 'labels', stem + '.txt')


def _remap_label_lines(lines, id_map):
    """Return label rows with their class id rewritten through ``id_map``.

    Rows with fewer than five fields and rows whose class id is not a key of
    ``id_map`` are dropped.
    """
    remapped = []
    for line in lines:
        parts = line.strip().split()
        if len(parts) < 5:
            continue
        cls = int(parts[0])
        if cls in id_map:
            remapped.append(f"{id_map[cls]} " + " ".join(parts[1:]))
    return remapped


for ds in datasets:
    # Find data.yaml
    yaml_files = glob.glob(f"{ds['folder']}/**/data.yaml", recursive=True)
    if not yaml_files:
        print(f"Skipping {ds['folder']}: No data.yaml found")
        continue

    with open(yaml_files[0], 'r') as f:
        data_config = yaml.safe_load(f)

    source_names = _class_id_to_name(data_config)
    print(f"Dataset {ds['folder']} classes: {list(source_names.values())}")

    # Build ID map: source class id -> merged class id (unmatched classes are left out)
    id_map = {}
    for old_id, name in source_names.items():
        new_id = match_class(str(name))
        if new_id != -1:
            id_map[old_id] = new_id

    # Copy and Remap
    # The split folders are located with a recursive search, so they may sit at any depth.
    for split in ['train', 'valid', 'test']:
        # Find images
        split_images = glob.glob(f"{ds['folder']}/**/{split}/images/*.*", recursive=True)
        # Some datasets use 'val' instead of 'valid'
        if not split_images and split == 'valid':
            split_images = glob.glob(f"{ds['folder']}/**/val/images/*.*", recursive=True)

        print(f"Processing {len(split_images)} images for {ds['folder']} - {split}")

        for img_path in split_images:
            lbl_path = _label_path_for(img_path)
            if not os.path.exists(lbl_path):
                continue

            with open(lbl_path, 'r') as f:
                new_lines = _remap_label_lines(f, id_map)

            # Images left without any kept annotation are not copied.
            if not new_lines:
                continue

            # Prefix with the source folder so files from different datasets cannot collide.
            basename = f"{ds['folder']}_{os.path.basename(lbl_path)}"
            with open(f"{final_dataset_path}/{split}/labels/{basename}", 'w') as f:
                f.write("\n".join(new_lines))

            img_basename = f"{ds['folder']}_{os.path.basename(img_path)}"
            shutil.copy(img_path, f"{final_dataset_path}/{split}/images/{img_basename}")

# Create final data.yaml
# Resolve the dataset root from the current working directory instead of assuming
# it is /content, so the paths stay valid wherever the notebook runs.
final_root = os.path.abspath(final_dataset_path)
final_yaml = {
    'train': os.path.join(final_root, 'train', 'images'),
    'val': os.path.join(final_root, 'valid', 'images'),
    # Derived from the class list so the count cannot drift from the names.
    'nc': len(target_classes),
    'names': target_classes
}

with open(f"{final_dataset_path}/data.yaml", 'w') as f:
    yaml.dump(final_yaml, f)

print(f"Merge Complete! Final dataset created at '{final_dataset_path}'.")

In [None]:
# 4. Train Model
from ultralytics import YOLO

# Start from the pretrained YOLOv8 medium checkpoint
model = YOLO('yolov8m.pt')

# Fine-tune on the merged dataset described by the generated data.yaml
model.train(
    data=f"{final_dataset_path}/data.yaml",
    epochs=50,
    imgsz=640,
    project='runs/detect',
    name='train',
)


In [None]:
# 5. Export
from google.colab import files
import glob

import os

# A repeated training run may be written to a suffixed folder (train2, train3, ...)
# rather than 'train', so gather every candidate checkpoint and keep the most
# recently written one instead of assuming a fixed path.
weights_files = sorted(
    glob.glob('runs/**/train*/weights/best.pt', recursive=True),
    key=os.path.getmtime,
)

if weights_files:
    best_weights = weights_files[-1]
    print(f"Found model at: {best_weights}")
    files.download(best_weights)
else:
    print("Model not found.")