Kaggle URL: [Yolov8s Train Augmentation](https://www.kaggle.com/code/phatle1578/yolov8s-train-augmentation)

In [None]:
!pip uninstall -y ray ray[default] ray[tune] >/dev/null 2>&1 || true
!pip -q install ultralytics==8.3.27 opencv-python==4.10.0.84 tqdm==4.67.1 torch==2.1.2 torchvision==0.16.2 open_clip_torch==2.24.0
# Fix for the TensorBoard / protobuf version conflict: pin compatible versions.
!pip install -q protobuf==3.20.3 tensorboard==2.14.0

import os
import json
import random
import shutil
import yaml
from ultralytics import YOLO
from PIL import Image
from pathlib import Path
import cv2
from tqdm.auto import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# ====== CONFIG ======
# Input locations: one sub-folder per video under TRAIN_ROOT, plus a single
# JSON annotation file covering all videos.
TRAIN_ROOT = Path("/kaggle/input/dataset-dl-project/observing/train/samples")
ANN_PATH   = Path("/kaggle/input/dataset-dl-project/observing/train/annotations/annotations.json")

# Output directories
WORK_ROOT      = Path("/kaggle/working")
YOLO_ROOT      = WORK_ROOT / "yolo_data"

# Train/val split is done per video: VAL_RATIO is the fraction of videos
# assigned to validation, RANDOM_SEED makes the shuffle reproducible.
VAL_RATIO = 0.2
RANDOM_SEED = 42

# Zoom-in augmentation: when enabled, extra crops around annotated boxes are
# exported alongside the full frames.
ZOOM_IN_ENABLED = True
ZOOM_SCALE = 1.2    # crop window = bbox enlarged 1.2x around its center (passed to expand_bbox as `scale`)
MAX_ZOOM_PER_FRAME = 3  # upper bound on the number of zoom crops saved per frame

# Load annotations.json; it is consumed below as a sequence of per-video records.
with open(ANN_PATH, "r") as f:
    ann_data = json.load(f)

# Every video id, in annotation order.
video_ids = [item["video_id"] for item in ann_data]

# Split by video id (not by frame): shuffle a copy with a fixed seed, then take
# the first VAL_RATIO share as validation and the rest as training.
random.seed(RANDOM_SEED)
video_ids_shuffled = list(video_ids)
random.shuffle(video_ids_shuffled)

num_val = int(VAL_RATIO * len(video_ids_shuffled))
val_ids, train_ids = (
    set(video_ids_shuffled[:num_val]),
    set(video_ids_shuffled[num_val:]),
)

print(f"S·ªë video train: {len(train_ids)}, val: {len(val_ids)}")

# Lookup table: video_id -> "train" or "val".
split_map = {
    vid: ("val" if vid in val_ids else "train")
    for vid in video_ids
}

def build_bboxes_per_frame(video_item):
    """Group every bounding box of one video by the frame it belongs to.

    Args:
        video_item: One record from ``ann_data``. Its ``"annotations"`` entry
            is a list of tracks, each holding a ``"bboxes"`` list whose items
            carry a ``"frame"`` key.

    Returns:
        A plain dict mapping frame id -> list of bbox dicts, with boxes kept
        in the order they are encountered across tracks.
    """
    grouped = {}
    for track in video_item["annotations"]:
        for box in track["bboxes"]:
            frame_id = box["frame"]
            if frame_id in grouped:
                grouped[frame_id].append(box)
            else:
                grouped[frame_id] = [box]
    return grouped

# Create the YOLO directory layout: images/{train,val} and labels/{train,val}.
for split in ["train", "val"]:
    (YOLO_ROOT / "images" / split).mkdir(parents=True, exist_ok=True)
    (YOLO_ROOT / "labels" / split).mkdir(parents=True, exist_ok=True)

# Siamese output directory (one folder per object, holding templates + patches).
# SIAMESE_ROOT was referenced here without ever being assigned, so a fresh run
# stopped with NameError; define it next to the other outputs before using it.
# NOTE(review): the folder name is an assumption — adjust if another path is expected.
SIAMESE_ROOT = WORK_ROOT / "siamese_data"
SIAMESE_ROOT.mkdir(parents=True, exist_ok=True)

def bbox_to_yolo_line(bbox, img_w, img_h, class_id=0):
    """Format one pixel-space box as a YOLO label line.

    Args:
        bbox: Mapping with corner coordinates ``x1``, ``y1``, ``x2``, ``y2``
            in pixels.
        img_w: Image width in pixels.
        img_h: Image height in pixels.
        class_id: Class index written at the start of the line.

    Returns:
        ``"<class_id> <xc> <yc> <w> <h>\\n"`` where center and size are
        divided by the image size and printed with six decimals.
    """
    # Clip the corners to the image first: an annotation that sticks out of
    # the frame would otherwise yield values outside [0, 1], which the YOLO
    # label format does not allow. In-bounds boxes are unaffected.
    x1 = min(max(bbox["x1"], 0), img_w)
    y1 = min(max(bbox["y1"], 0), img_h)
    x2 = min(max(bbox["x2"], 0), img_w)
    y2 = min(max(bbox["y2"], 0), img_h)

    # Center and size, normalized by the image dimensions.
    xc = (x1 + x2) / 2.0 / img_w
    yc = (y1 + y2) / 2.0 / img_h
    bw = (x2 - x1) / img_w
    bh = (y2 - y1) / img_h

    return f"{class_id} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n"

def expand_bbox(x1, y1, x2, y2, img_w, img_h, scale=1.8):
    """Grow a box around its center by ``scale`` and clamp it to the image.

    The enlarged corners are truncated to integers, then limited to
    ``[0, img_w - 1]`` horizontally and ``[0, img_h - 1]`` vertically.

    Returns:
        ``(nx1, ny1, nx2, ny2)`` as integers, or ``None`` when the clamped
        box has no positive width or height.
    """
    center_x = (x1 + x2) / 2.0
    center_y = (y1 + y2) / 2.0

    # Half extents of the scaled box.
    half_w = (x2 - x1) * scale / 2.0
    half_h = (y2 - y1) * scale / 2.0

    left = max(0, int(center_x - half_w))
    top = max(0, int(center_y - half_h))
    right = min(img_w - 1, int(center_x + half_w))
    bottom = min(img_h - 1, int(center_y + half_h))

    if right > left and bottom > top:
        return left, top, right, bottom
    return None  # degenerate box

def _write_zoom_crops(frame, frame_bboxes, base_name, img_out_dir, label_out_dir):
    """Save zoomed-in crops around the boxes of one frame, each with its own label file.

    At most ``MAX_ZOOM_PER_FRAME`` crops are written. Each crop is the box
    enlarged by ``ZOOM_SCALE`` (via ``expand_bbox``) and is saved as
    ``<base_name>_z<bbox index>.jpg`` with a matching one-line ``.txt`` label.
    """
    img_h, img_w = frame.shape[:2]
    zoom_count = 0

    for bbox_idx, bbox in enumerate(frame_bboxes):
        if zoom_count >= MAX_ZOOM_PER_FRAME:
            break

        x1, y1 = bbox["x1"], bbox["y1"]
        x2, y2 = bbox["x2"], bbox["y2"]

        expanded = expand_bbox(x1, y1, x2, y2, img_w, img_h, scale=ZOOM_SCALE)
        if expanded is None:
            continue

        ex1, ey1, ex2, ey2 = expanded
        crop = frame[ey1:ey2, ex1:ex2]
        ch, cw = crop.shape[:2]

        # Re-express the original box in crop coordinates, clamped to the crop.
        crop_bbox = {
            "x1": max(0, x1 - ex1),
            "y1": max(0, y1 - ey1),
            "x2": min(cw - 1, x2 - ex1),
            "y2": min(ch - 1, y2 - ey1),
        }
        if crop_bbox["x2"] <= crop_bbox["x1"] or crop_bbox["y2"] <= crop_bbox["y1"]:
            continue

        # Only the box that produced the crop is labelled in it.
        zoom_name = f"{base_name}_z{bbox_idx}"
        cv2.imwrite(str(img_out_dir / f"{zoom_name}.jpg"), crop)
        with open(label_out_dir / f"{zoom_name}.txt", "w") as fz:
            fz.write(bbox_to_yolo_line(crop_bbox, cw, ch, class_id=0))

        zoom_count += 1


def process_one_video(video_item):
    """Export the annotated frames of one video as YOLO images and labels.

    Reads ``TRAIN_ROOT/<video_id>/drone_video.mp4`` frame by frame. For every
    frame index that has at least one annotated box, the frame is written to
    ``YOLO_ROOT/images/<split>/<video_id>_<frame:06d>.jpg`` and its boxes to
    the matching file under ``labels/<split>``. When ``ZOOM_IN_ENABLED`` is
    set, zoomed-in crops around the boxes are exported as additional samples.

    Args:
        video_item: One record from ``ann_data``; its ``"video_id"`` must be
            a key of ``split_map``.

    Returns:
        None. Prints a warning and returns early if the video cannot be opened.
    """
    video_id = video_item["video_id"]
    split = split_map[video_id]   # 'train' or 'val'

    print(f"Processing video {video_id} ({split})")

    # frame index -> list of bbox dicts; only these frames are exported.
    bboxes_per_frame = build_bboxes_per_frame(video_item)

    video_path = TRAIN_ROOT / video_id / "drone_video.mp4"
    img_out_dir = YOLO_ROOT / "images" / split
    label_out_dir = YOLO_ROOT / "labels" / split

    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        print(f"  >> WARNING: cannot open video {video_path}")
        cap.release()
        return

    pbar = tqdm(
        total=int(cap.get(cv2.CAP_PROP_FRAME_COUNT)),
        desc=f"{video_id}",
        leave=False,
        dynamic_ncols=True,
    )

    # Number of annotated frames still to export; once it reaches zero there
    # is nothing left to write, so decoding the rest of the video is skipped.
    remaining = len(bboxes_per_frame)
    frame_idx = 0

    try:
        while remaining > 0:
            ret, frame = cap.read()
            if not ret:
                break

            frame_bboxes = bboxes_per_frame.get(frame_idx)
            if frame_bboxes is not None:
                remaining -= 1
                h, w = frame.shape[:2]

                # Base file name, unique per (video, frame).
                base_name = f"{video_id}_{frame_idx:06d}"

                # 1) Full frame image for YOLO.
                cv2.imwrite(str(img_out_dir / f"{base_name}.jpg"), frame)

                # 2) Label file: one line per box, all with class 0.
                label_lines = [
                    bbox_to_yolo_line(bbox, w, h, class_id=0)
                    for bbox in frame_bboxes
                ]
                with open(label_out_dir / f"{base_name}.txt", "w") as f:
                    f.writelines(label_lines)

                # 3) Optional zoom-in crops.
                if ZOOM_IN_ENABLED:
                    _write_zoom_crops(
                        frame, frame_bboxes, base_name, img_out_dir, label_out_dir
                    )

            frame_idx += 1
            pbar.update(1)
    finally:
        # Release the progress bar and the capture handle even if a write fails.
        pbar.close()
        cap.release()

In [None]:
YOLO_ROOT = Path("/kaggle/working/yolo_data")  # adjust if the dataset lives elsewhere

# process_one_video() is defined in the previous cell but not invoked there,
# so without this step the training below would find no images. The build is
# skipped when training images are already present.
if not any((YOLO_ROOT / "images" / "train").glob("*.jpg")):
    for video_item in ann_data:
        process_one_video(video_item)

# Dataset description consumed by Ultralytics.
data_yaml = {
    "path": str(YOLO_ROOT),   # dataset root containing images/ and labels/
    "train": "images/train",  # relative to "path"
    "val":   "images/val",
    "nc": 1,
    "names": ["object"]
}

with open("/kaggle/working/data.yaml", "w") as f:
    yaml.dump(data_yaml, f, sort_keys=False)

model = YOLO("yolov8s.pt")   # COCO-pretrained weights

model.train(
    data="/kaggle/working/data.yaml",
    epochs=20,
    imgsz=640,
    batch=16,
    workers=4,
    project="/kaggle/working/yolo_train",
    name="yolov8s_object",
    # augmentation settings (tunable)
    augment=True,
    fliplr=0.5,
    scale=0.5,
    degrees=10.0,
    shear=2.0,
    mosaic=0.5,
    close_mosaic=5,
    hsv_h=0.02,   # slight hue shift
    hsv_s=0.8,    # stronger saturation jitter (0.7-0.9)
    hsv_v=0.5,
)