# Train YOLOv8 Large (ULTRA Accuracy) on 3 Combined Datasets

This is the **Corrected Training Script** with your specific datasets.
1.  **`pkdarabi/helmet`** (Base)
2.  **`aneesarom/rider-with-helmet-without-helmet-number-plate`**
3.  **`rhamzanul/smart-helmet-detection-using-yolo-v8`**

**Model**: Uses **YOLOv8l** (Large).

### ⚠️ CRITICAL STEP ⚠️
**You MUST open each of the 3 dataset pages on Kaggle and accept its rules before running this notebook:**
1.  [pkdarabi/helmet](https://www.kaggle.com/datasets/pkdarabi/helmet)
2.  [aneesarom/rider-with-helmet-without-helmet-number-plate](https://www.kaggle.com/datasets/aneesarom/rider-with-helmet-without-helmet-number-plate)
3.  [rhamzanul/smart-helmet-detection-using-yolo-v8](https://www.kaggle.com/datasets/rhamzanul/smart-helmet-detection-using-yolo-v8)

### Steps
1.  **Factory Reset**: Runtime -> Disconnect and Delete Runtime.
2.  **Upload `kaggle.json`** when prompted.
3.  **Run All**.

In [None]:
# Install the packages the later cells rely on: ultralytics (YOLO training),
# the kaggle CLI (dataset download) and pyyaml (reading/writing data.yaml).
!pip install ultralytics kaggle pyyaml

In [None]:
import os
from google.colab import files
import shutil

# 0. CLEANUP
# Remove the outputs of earlier runs so stale images/labels are not merged
# into the new dataset.
print("Cleaning up old runs...")
dirs_to_clean = ['ds1', 'ds2', 'ds3', 'final_dataset', 'dataset', 'runs']
for d in dirs_to_clean:
    if os.path.exists(d):
        shutil.rmtree(d)

# 1. Upload kaggle.json
# The token is checked and installed at the same location (~/.kaggle), so the
# "already configured" test agrees with where the file is actually placed.
kaggle_dir = os.path.expanduser('~/.kaggle')
kaggle_token = os.path.join(kaggle_dir, 'kaggle.json')

freshly_uploaded = False
if not os.path.exists(kaggle_token) and not os.path.exists('kaggle.json'):
    # Only prompt when an upload is really required.
    print("Please upload your 'kaggle.json' file:")
    uploaded = files.upload()
    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))

    if not os.path.exists('kaggle.json'):
        # NOTE(review): assumes files.upload() stores each file in the working
        # directory under its dict key (e.g. a renamed "kaggle (1).json").
        json_uploads = [fn for fn in uploaded if fn.lower().endswith('.json')]
        if len(json_uploads) != 1:
            # Fail here with a clear message instead of reporting success and
            # letting every later download fail for an unclear reason.
            raise FileNotFoundError(
                "Expected an uploaded 'kaggle.json'; got: {}".format(list(uploaded)))
        os.rename(json_uploads[0], 'kaggle.json')
    freshly_uploaded = True

# A kaggle.json sitting in the working directory always (re)installs the token,
# whether it was just uploaded or left there by the user.
if os.path.exists('kaggle.json'):
    os.makedirs(kaggle_dir, exist_ok=True)
    shutil.move('kaggle.json', kaggle_token)
    os.chmod(kaggle_token, 0o600)  # owner read/write only

if freshly_uploaded:
    print("Kaggle API configured!")
else:
    print("Kaggle API already configured.")

In [None]:
# 2. Download and Merge 3 Datasets
import yaml
import glob
import shutil
from pathlib import Path

def download_dataset(slug, folder_name):
    """Download a Kaggle dataset archive and extract it into ``folder_name``.

    Args:
        slug: Kaggle dataset identifier of the form ``owner/dataset-name``.
        folder_name: Directory the archive contents are extracted into.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` command exits with a
            non-zero status.
        FileNotFoundError: If the ``kaggle`` executable or the expected zip
            file is missing.
        zipfile.BadZipFile: If the downloaded file is not a valid archive.
    """
    # Local imports keep this cell self-contained.
    import subprocess
    import zipfile

    print(f"Downloading {slug}...")
    # Run the CLI through subprocess rather than a shell escape so a failed
    # download raises and can be handled by the caller's try/except.
    result = subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', slug],
        capture_output=True,
        text=True,
    )
    if result.stdout:
        print(result.stdout, end='')
    if result.stderr:
        print(result.stderr, end='')
    result.check_returncode()

    # The archive is expected in the working directory, named after the last
    # component of the slug.
    zip_name = slug.split('/')[-1] + ".zip"
    try:
        with zipfile.ZipFile(zip_name) as archive:
            archive.extractall(folder_name)
    finally:
        # Remove the archive even when extraction fails.
        if os.path.exists(zip_name):
            os.remove(zip_name)

# YOUR CORRECTED LIST
# Each entry pairs a Kaggle dataset slug with the local folder it is unpacked into.
datasets = [
    dict(slug='pkdarabi/helmet', folder='ds1'),
    dict(slug='aneesarom/rider-with-helmet-without-helmet-number-plate', folder='ds2'),
    dict(slug='rhamzanul/smart-helmet-detection-using-yolo-v8', folder='ds3'),
]

# Fetch every dataset; a failure is reported and leaves an empty folder behind
# so the remaining datasets are still processed.
for entry in datasets:
    slug = entry['slug']
    folder = entry['folder']
    try:
        download_dataset(slug, folder)
        print(f"Successfully downloaded {slug}")
    except Exception as e:
        print(f"Error downloading {slug}: {e}")
        os.makedirs(folder, exist_ok=True)

# Merge Logic
print("Merging 3 Datasets...")

# Unified label space of the merged dataset; the list index is the class id
# (0: Helmet, 1: No Helmet, 2: Plate).
final_dataset_path = 'final_dataset'
target_classes = ['Helmet', 'No Helmet', 'Plate']

# Create <final_dataset>/<split>/{images,labels} for every split up front.
for split in ('train', 'valid', 'test'):
    for kind in ('images', 'labels'):
        os.makedirs(os.path.join(final_dataset_path, split, kind), exist_ok=True)

def match_class(name):
    """Map a source dataset class name onto a merged class id.

    Matching is case-insensitive and substring-based, checked in this order:

    * contains ``no``, ``without`` or ``head``      -> 1 (No Helmet)
    * contains ``helmet``                           -> 0 (Helmet)
    * contains ``plate``, ``license`` or ``number`` -> 2 (Plate)
    * anything else                                 -> -1 (class is dropped)

    Because ``no`` is matched as a plain substring, a name such as
    ``unknown`` also resolves to 1.
    """
    lowered = name.lower()

    if any(marker in lowered for marker in ('no', 'without', 'head')):
        return 1  # No Helmet

    # Names containing 'no' have already returned above, so a plain
    # 'helmet' test is sufficient here.
    if 'helmet' in lowered:
        return 0  # Helmet

    if any(marker in lowered for marker in ('plate', 'license', 'number')):
        return 2  # Plate

    return -1  # Ignore others

# Copy every labelled image of every dataset into final_dataset, rewriting the
# class ids to the unified label space and dropping classes that do not map.
for ds in datasets:
    yaml_files = glob.glob(f"{ds['folder']}/**/data.yaml", recursive=True)
    if not yaml_files:
        print(f"Skipping {ds['folder']}: No data.yaml found")
        continue

    with open(yaml_files[0], 'r') as f:
        # An empty data.yaml parses to None; treat it as "no classes".
        data_config = yaml.safe_load(f) or {}

    names = data_config.get('names', [])
    print(f"Dataset {ds['folder']} classes: {names}")

    # 'names' may be a list (index == class id) or an {id: name} mapping;
    # normalise both to (id, name) pairs so ids are never taken from the
    # enumeration order of a dict.
    if isinstance(names, dict):
        named_ids = list(names.items())
    else:
        named_ids = list(enumerate(names or []))

    # Source class id -> merged class id; unmapped classes are left out.
    id_map = {}
    for src_id, name in named_ids:
        new_id = match_class(str(name))
        if new_id != -1:
            id_map[int(src_id)] = new_id

    for split in ['train', 'valid', 'test']:
        split_images = glob.glob(f"{ds['folder']}/**/{split}/images/*.*", recursive=True)
        if not split_images and split == 'valid':
            # Some datasets name the validation split 'val'.
            split_images = glob.glob(f"{ds['folder']}/**/val/images/*.*", recursive=True)

        print(f"Processing {len(split_images)} images for {ds['folder']} - {split}")

        for img_path in split_images:
            # Derive the label path from the sibling 'labels' directory.
            # Only the directory that directly contains the image is swapped,
            # so "images" appearing in a file name or a parent folder is
            # left untouched.
            img_dir, img_file = os.path.split(img_path)
            lbl_path = os.path.join(
                os.path.dirname(img_dir),
                'labels',
                os.path.splitext(img_file)[0] + ".txt",
            )
            if not os.path.exists(lbl_path):
                continue

            new_lines = []
            with open(lbl_path, 'r') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) < 5:
                        continue
                    try:
                        cls = int(parts[0])
                    except ValueError:
                        # Skip a malformed row instead of aborting the merge.
                        continue
                    if cls in id_map:
                        new_lines.append(f"{id_map[cls]} " + " ".join(parts[1:]))

            # Images whose annotations were all dropped are not copied.
            if new_lines:
                # Prefix with the dataset folder to avoid name clashes
                # between datasets.
                basename = f"{ds['folder']}_{os.path.basename(lbl_path)}"
                with open(f"{final_dataset_path}/{split}/labels/{basename}", 'w') as f:
                    f.write("\n".join(new_lines))
                img_basename = f"{ds['folder']}_{img_file}"
                shutil.copy(img_path, f"{final_dataset_path}/{split}/images/{img_basename}")

# Create final data.yaml
# Resolve absolute paths from the current working directory rather than
# hard-coding '/content', so the config stays valid wherever the notebook runs
# (the result is unchanged when the working directory is /content).
dataset_root = os.path.abspath(final_dataset_path)
final_yaml = {
    'train': os.path.join(dataset_root, 'train', 'images'),
    'val': os.path.join(dataset_root, 'valid', 'images'),
    # Derived from the class list so the two can never disagree.
    'nc': len(target_classes),
    'names': target_classes
}

with open(f"{final_dataset_path}/data.yaml", 'w') as f:
    yaml.dump(final_yaml, f)

print("Merge Complete! Mega-Dataset created.")

In [None]:
# 4. Train Model
from ultralytics import YOLO

# Load Large model for MAXIMUM Accuracy
model = YOLO('yolov8l.pt')

# Single source of truth for the epoch count: the log line below previously
# announced 200 epochs while the call trained for 50.
EPOCHS = 50

# Train
print("Starting training on merged dataset (Large Model)...")
print(f"Training for {EPOCHS} Epochs for Absolute Best Results...")
model.train(data=f"{final_dataset_path}/data.yaml", epochs=EPOCHS, imgsz=640, project='runs/detect', name='train')


In [None]:
# 5. Export
from google.colab import files
import glob

import os

# Prefer the run directory requested at training time. If it is absent, fall
# back to any best.pt below 'runs', newest first.
# NOTE(review): the fallback assumes the trainer may have written to a
# different run directory (e.g. a suffixed name) — confirm against the
# installed ultralytics version.
expected_weights = 'runs/detect/train/weights/best.pt'
if os.path.exists(expected_weights):
    weights_files = [expected_weights]
else:
    # '**' is required for recursive=True to have any effect.
    weights_files = sorted(
        glob.glob('runs/**/weights/best.pt', recursive=True),
        key=os.path.getmtime,
        reverse=True,
    )

if weights_files:
    print(f"Found model at: {weights_files[0]}")
    print("Downloading... This is a larger file (~80-100MB), please wait.")
    files.download(weights_files[0])
else:
    print("Model not found. Please checks 'runs/detect/train' folder manually.")