## Download dataset and preprocessing

In [29]:
import torch
import torchvision
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image # Pillow for PIL Image handling
import numpy as np
import os
import copy
import cv2 # Import OpenCV

# --- Configuration ---
DATA_DIR = './data_voc'
YEAR = '2012'
DOWNLOAD = True

TARGET_CLASSES = ['person'] 
IMAGE_SIZE = (448, 448) 

# --- Custom Transform to Filter Annotations and Resize Images with Bounding Boxes ---
class FilterAndResizeVOCClasses:
    def __init__(self, target_classes, image_size, apply_clahe=True): # Added apply_clahe parameter
        self.target_classes = target_classes
        self.image_size = image_size
        self.apply_clahe = apply_clahe

        # Define CLAHE object (only if apply_clahe is True)
        if self.apply_clahe:
            # Create a CLAHE object (clipLimit: contrast limiting threshold, tileGridSize: size of grid for histogram equalization)
            self.clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)) # Common parameters

        self.image_transform_pre_clahe = transforms.Compose([
            transforms.Resize(image_size) # Resize first
        ])
        self.image_transform_post_clahe = transforms.Compose([
            transforms.ToTensor() # Convert to Tensor after CLAHE (if applied)
        ])

    def __call__(self, img, target):
        original_width = int(target['annotation']['size']['width'])
        original_height = int(target['annotation']['size']['height'])

        # 1. Apply initial image transform (Resize)
        img = self.image_transform_pre_clahe(img) # img is still a PIL Image here

        # 2. Apply CLAHE (if enabled)
        if self.apply_clahe:
            # Convert PIL Image to OpenCV format (numpy array)
            img_np = np.array(img)
            # CLAHE usually works on grayscale or L*a*b* L-channel
            # Convert to grayscale for CLAHE
            if len(img_np.shape) == 3 and img_np.shape[2] == 3: # Check if it's an RGB image
                img_gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
            else: # Already grayscale or single channel
                img_gray = img_np

            # Apply CLAHE
            clahe_img = self.clahe.apply(img_gray)
            
            # If original was RGB, convert back to RGB (by stacking grayscale or converting from L channel)
            if len(img_np.shape) == 3 and img_np.shape[2] == 3:
                 # Stack the CLAHE-enhanced grayscale image to form an RGB image for consistent input to ToTensor
                clahe_img_rgb = cv2.cvtColor(clahe_img, cv2.COLOR_GRAY2RGB)
            else:
                clahe_img_rgb = clahe_img # Already single channel or expected to be

            # Convert back to PIL Image for further torchvision transforms
            img = Image.fromarray(clahe_img_rgb)

        # 3. Apply final image transform (ToTensor)
        img = self.image_transform_post_clahe(img)

        # --- Bounding box filtering and scaling (same as before) ---
        filtered_target = copy.deepcopy(target)
        objects = filtered_target['annotation']['object']
        if not isinstance(objects, list):
            objects = [objects]
        filtered_objects = []
        for obj in objects:
            if obj['name'] in self.target_classes:
                bndbox = obj['bndbox']
                xmin = float(bndbox['xmin'])
                ymin = float(bndbox['ymin'])
                xmax = float(bndbox['xmax'])
                ymax = float(bndbox['ymax'])
                scale_x = self.image_size[1] / original_width
                scale_y = self.image_size[0] / original_height
                obj['bndbox']['xmin'] = int(xmin * scale_x)
                obj['bndbox']['ymin'] = int(ymin * scale_y)
                obj['bndbox']['xmax'] = int(xmax * scale_x)
                obj['bndbox']['ymax'] = int(ymax * scale_y)
                obj['bndbox']['xmin'] = max(0, obj['bndbox']['xmin'])
                obj['bndbox']['ymin'] = max(0, obj['bndbox']['ymin'])
                obj['bndbox']['xmax'] = min(self.image_size[1] - 1, obj['bndbox']['xmax'])
                obj['bndbox']['ymax'] = min(self.image_size[0] - 1, obj['bndbox']['ymax'])
                obj['bndbox']['xmax'] = max(obj['bndbox']['xmin'], obj['bndbox']['xmax'])
                obj['bndbox']['ymax'] = max(obj['bndbox']['ymin'], obj['bndbox']['ymax'])
                filtered_objects.append(obj)
        filtered_target['annotation']['object'] = filtered_objects

        return img, filtered_target

# Instantiate our custom transform with CLAHE enabled
filter_and_resize_transform = FilterAndResizeVOCClasses(TARGET_CLASSES, IMAGE_SIZE, apply_clahe=True)

# --- Define a custom collate_fn for DataLoader (same as before) ---
def custom_collate_fn(batch):
    images = torch.stack([item[0] for item in batch], 0)
    targets = [item[1] for item in batch]
    return images, targets


# --- 1. Import and Dataset Splitting (same as before) ---
print(f"Loading PASCAL VOC {YEAR} training dataset (filtering for {TARGET_CLASSES}, resizing to {IMAGE_SIZE}, and applying CLAHE)...")
train_dataset = VOCDetection(
    root=DATA_DIR,
    year=YEAR,
    image_set='train',
    download=DOWNLOAD,
    transforms=filter_and_resize_transform
)
print(f"Training dataset loaded: {len(train_dataset)} samples")

print(f"\nLoading PASCAL VOC {YEAR} validation dataset (filtering for {TARGET_CLASSES}, resizing to {IMAGE_SIZE}, and applying CLAHE)...")
val_dataset = VOCDetection(
    root=DATA_DIR,
    year=YEAR,
    image_set='val',
    download=DOWNLOAD,
    transforms=filter_and_resize_transform
)
print(f"Validation dataset loaded: {len(val_dataset)} samples")

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, num_workers=0, collate_fn=custom_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=4, shuffle=False, num_workers=0, collate_fn=custom_collate_fn)


# --- 2. Display Example Images (same as before) ---
print("\nDisplaying example images with CLAHE applied and filtered annotations...")

def to_numpy_image(tensor_image):
    return tensor_image.permute(1, 2, 0).numpy().clip(0, 1)

dataiter = iter(val_loader)
images, targets = next(dataiter)

plt.figure(figsize=(15, 10))

for i in range(len(images)):
    ax = plt.subplot(1, len(images), i + 1)
    img_tensor = images[i]
    img_np = to_numpy_image(img_tensor)
    plt.imshow(img_np)
    plt.axis('off')

    img_target = targets[i]
    objects = img_target['annotation']['object']
    if not isinstance(objects, list):
        objects = [objects]

    if not objects:
        plt.title(f"Image {i+1}\n(No {TARGET_CLASSES} objects found)")
        continue

    for obj in objects:
        label = obj['name']
        bndbox = obj['bndbox']
        xmin = int(bndbox['xmin'])
        ymin = int(bndbox['ymin'])
        xmax = int(bndbox['xmax'])
        ymax = int(bndbox['ymax'])

        rect = patches.Rectangle(
            (xmin, ymin),
            xmax - xmin,
            ymax - ymin,
            linewidth=2,
            edgecolor='g',
            facecolor='none'
        )
        ax.add_patch(rect)

        plt.text(
            xmin, ymin - 5,
            label,
            color='g',
            fontsize=10,
            weight='bold',
            bbox=dict(facecolor='lightgreen', alpha=0.5, edgecolor='none', pad=0)
        )
    plt.title(f"Image {i+1}")

plt.tight_layout()
plt.show()

print("\nCLAHE applied. Images are now uniformly resized, and bounding boxes are scaled accordingly.")

Loading PASCAL VOC 2012 training dataset (filtering for ['person'], resizing to (448, 448), and applying CLAHE)...
Training dataset loaded: 5717 samples

Loading PASCAL VOC 2012 validation dataset (filtering for ['person'], resizing to (448, 448), and applying CLAHE)...
Validation dataset loaded: 5823 samples

Displaying example images with CLAHE applied and filtered annotations...


<Figure size 1500x1000 with 4 Axes>


CLAHE applied. Images are now uniformly resized, and bounding boxes are scaled accordingly.


In [32]:
# Part 2: Displaying Example Images

# --- 2. Display Example Images ---
print("\n--- Part 2: Displaying Example Images with Filtered and Resized Annotations ---")

# Helper function to convert tensor image back to numpy for plotting
def to_numpy_image(tensor_image):
    # Permute dimensions from (C, H, W) to (H, W, C) for matplotlib
    # Clamp values to [0, 1] in case of any floating point issues after transformations
    return tensor_image.permute(1, 2, 0).numpy().clip(0, 1)

# Get a batch of images and targets from the validation loader
# This line should now work correctly, thanks to the custom collate_fn
dataiter = iter(val_loader)
images, targets = next(dataiter)

plt.figure(figsize=(15, 10))

for i in range(len(images)):
    ax = plt.subplot(1, len(images), i + 1)
    img_tensor = images[i]
    img_np = to_numpy_image(img_tensor)
    plt.imshow(img_np)
    plt.axis('off')

    # Get annotations for the current image. targets is now a list of dicts.
    img_target = targets[i]
    
    # The 'object' key will now only contain 'person' objects,
    # and their bounding box coordinates will be scaled to IMAGE_SIZE.
    objects = img_target['annotation']['object']
    if not isinstance(objects, list): # Handle case where there's only one object (after filtering)
        objects = [objects]

    if not objects: # If no 'person' objects are left after filtering for this image
        plt.title(f"Image {i+1}\n(No {TARGET_CLASSES} objects found)")
        continue

    for obj in objects:
        label = obj['name']
        bndbox = obj['bndbox']
        xmin = int(bndbox['xmin'])
        ymin = int(bndbox['ymin'])
        xmax = int(bndbox['xmax'])
        ymax = int(bndbox['ymax'])

        # Create a Rectangle patch for the bounding box
        rect = patches.Rectangle(
            (xmin, ymin),             # (x,y) lower-left corner
            xmax - xmin,              # width
            ymax - ymin,              # height
            linewidth=2,
            edgecolor='g',            # Green box for person
            facecolor='none'
        )
        ax.add_patch(rect) # Add the patch to the plot

        # Add label text above the bounding box
        plt.text(
            xmin, ymin - 5,           # Position for text
            label,
            color='g',                # Green text
            fontsize=10,
            weight='bold',
            bbox=dict(facecolor='lightgreen', alpha=0.5, edgecolor='none', pad=0) # Light green background for text
        )
    plt.title(f"Image {i+1}")

plt.tight_layout() # Adjust subplot params for a tight layout
plt.show() # Display the plot

# print("\n--- Part 2: Displaying Example Images Complete ---")


--- Part 2: Displaying Example Images with Filtered and Resized Annotations ---


<Figure size 1500x1000 with 4 Axes>

# YOLOV8

In [33]:
import os
import xml.etree.ElementTree as ET
from shutil import copyfile

# --- Configuration for PASCAL VOC and YOLO Conversion ---
VOC_ROOT_DIR = './data_voc/VOCdevkit/VOC2012' # Path to your downloaded VOC2012 data
OUTPUT_YOLO_DIR = './yolov8_pascal_person_data' # Where your YOLO formatted data will go

TARGET_CLASSES = ['person'] # Only convert annotations for this class
# Map class names to YOLO-compatible integer IDs.
# Since we only have 'person', its ID will be 0.
CLASS_NAME_TO_ID = {name: i for i, name in enumerate(TARGET_CLASSES)}
# Example if you had more: {'person': 0, 'car': 1, 'aeroplane': 2}

def convert_bbox_to_yolo(size, box):
    """
    Converts [xmin, ymin, xmax, ymax] to [x_center, y_center, width, height]
    all normalized by image dimensions.
    """
    dw = 1. / size[0] # 1 / width
    dh = 1. / size[1] # 1 / height
    x = (box[0] + box[1]) / 2.0 # (xmin + xmax) / 2
    y = (box[2] + box[3]) / 2.0 # (ymin + ymax) / 2
    w = box[1] - box[0]        # xmax - xmin
    h = box[3] - box[2]        # ymax - ymin
    x = x * dw
    w = w * dw
    y = y * dh
    h = h * dh
    return (x, y, w, h)

def process_image_set(image_set_name, output_base_dir):
    """
    Processes a given image set (e.g., 'train', 'val') from VOC to YOLO format.
    """
    print(f"Processing image set: {image_set_name}...")

    # Define paths for VOC
    images_dir = os.path.join(VOC_ROOT_DIR, 'JPEGImages')
    annotations_dir = os.path.join(VOC_ROOT_DIR, 'Annotations')
    image_set_path = os.path.join(VOC_ROOT_DIR, 'ImageSets', 'Main', f'{image_set_name}.txt')

    # Define output paths for YOLO format
    output_images_dir = os.path.join(output_base_dir, 'images', image_set_name)
    output_labels_dir = os.path.join(output_base_dir, 'labels', image_set_name)
    
    os.makedirs(output_images_dir, exist_ok=True)
    os.makedirs(output_labels_dir, exist_ok=True)

    with open(image_set_path, 'r') as f:
        image_ids = [line.strip() for line in f.readlines()]

    for img_id in image_ids:
        xml_path = os.path.join(annotations_dir, f'{img_id}.xml')
        img_path = os.path.join(images_dir, f'{img_id}.jpg')
        label_output_path = os.path.join(output_labels_dir, f'{img_id}.txt')
        img_output_path = os.path.join(output_images_dir, f'{img_id}.jpg')

        try:
            tree = ET.parse(xml_path)
            root = tree.getroot()

            size = root.find('size')
            img_width = int(size.find('width').text)
            img_height = int(size.find('height').text)

            yolo_annotations = []
            for obj in root.findall('object'):
                obj_name = obj.find('name').text
                if obj_name in TARGET_CLASSES:
                    class_id = CLASS_NAME_TO_ID[obj_name]
                    
                    bndbox = obj.find('bndbox')
                    xmin = float(bndbox.find('xmin').text)
                    ymin = float(bndbox.find('ymin').text)
                    xmax = float(bndbox.find('xmax').text)
                    ymax = float(bndbox.find('ymax').text)

                    # VOC bbox are 1-indexed, so convert to 0-indexed for width/height calculation if needed
                    # (xmin, ymin, xmax, ymax) as floats
                    bbox_voc = (xmin, xmax, ymin, ymax) # (xmin, xmax, ymin, ymax) for convert_bbox_to_yolo
                    yolo_bbox = convert_bbox_to_yolo((img_width, img_height), bbox_voc)
                    
                    yolo_annotations.append(f"{class_id} {yolo_bbox[0]:.6f} {yolo_bbox[1]:.6f} {yolo_bbox[2]:.6f} {yolo_bbox[3]:.6f}")
            
            # Only create label file and copy image if there are target objects found
            if yolo_annotations:
                with open(label_output_path, 'w') as f:
                    f.write('\n'.join(yolo_annotations))
                copyfile(img_path, img_output_path)
            # else: # Optional: if you want to explicitly exclude images with no target objects
            #     print(f"Skipping {img_id}: no target objects found.")

        except FileNotFoundError:
            print(f"Warning: XML or image file not found for {img_id}. Skipping.")
        except Exception as e:
            print(f"Error processing {img_id}: {e}. Skipping.")

# --- Run the conversion ---
print("Starting PASCAL VOC to YOLO format conversion...")
# PASCAL VOC has 'train', 'val', and 'trainval' image sets.
# We'll convert 'train' and 'val' for YOLO training and validation.
process_image_set('train', OUTPUT_YOLO_DIR)
process_image_set('val', OUTPUT_YOLO_DIR)
print("Conversion complete.")

# --- Verify the directory structure ---
print(f"\nVerify your data structure in {OUTPUT_YOLO_DIR}:")
print(f"  {OUTPUT_YOLO_DIR}/images/train/")
print(f"  {OUTPUT_YOLO_DIR}/images/val/")
print(f"  {OUTPUT_YOLO_DIR}/labels/train/")
print(f"  {OUTPUT_YOLO_DIR}/labels/val/")
print(f"  (Each image in 'images' should have a corresponding .txt file in 'labels')")

Starting PASCAL VOC to YOLO format conversion...
Processing image set: train...
Processing image set: val...
Conversion complete.

Verify your data structure in ./yolov8_pascal_person_data:
  ./yolov8_pascal_person_data/images/train/
  ./yolov8_pascal_person_data/images/val/
  ./yolov8_pascal_person_data/labels/train/
  ./yolov8_pascal_person_data/labels/val/
  (Each image in 'images' should have a corresponding .txt file in 'labels')


In [34]:
from ultralytics import YOLO

print("Loading YOLOv8n model...")
model = YOLO('yolov8n.pt')

print("Starting YOLOv8 model training...")
results = model.train(
    data='D:\\homework\\DS223\\final-project-ZhangZwaa-1\\yolov8_pascal_person_data\\a.yaml',
    epochs=30,                 # Number of training epochs (adjust as needed)
    imgsz=IMAGE_SIZE[0],       # Image size for training (e.g., 448 for 448x448)
    batch=-1,                  # AutoBatch (automatically determines batch size)
    name='yolov8n_person_voc'  # Name for your training run results folder
)

print("Training complete.")

Loading YOLOv8n model...
Starting YOLOv8 model training...
Ultralytics 8.3.151  Python-3.12.10 torch-2.7.0+cu126 CUDA:0 (NVIDIA GeForce RTX 4080 SUPER, 16376MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=-1, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=D:\homework\DS223\final-project-ZhangZwaa-1\yolov8_pascal_person_data\a.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=30, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=448, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolov8n.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=yolov8n_perso

[34m[1mtrain: [0mScanning D:\homework\DS223\final-project-ZhangZwaa-1\yolov8_pascal_person_data\labels\train.cache... 2142 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2142/2142 [00:00<?, ?it/s]

[34m[1mAutoBatch: [0mComputing optimal batch size for imgsz=448 at 60.0% CUDA memory utilization.
[34m[1mAutoBatch: [0mCUDA:0 (NVIDIA GeForce RTX 4080 SUPER) 15.99G total, 0.46G reserved, 0.07G allocated, 15.46G free
      Params      GFLOPs  GPU_mem (GB)  forward (ms) backward (ms)                   input                  output





     3011043       4.015         0.294         31.34          48.6        (1, 3, 448, 448)                    list
     3011043        8.03         0.382         17.72         32.54        (2, 3, 448, 448)                    list
     3011043       16.06         0.518         54.21         86.45        (4, 3, 448, 448)                    list
     3011043       32.12         0.797         15.86         11.12        (8, 3, 448, 448)                    list
     3011043       64.24         1.315         24.72         14.14       (16, 3, 448, 448)                    list
[34m[1mAutoBatch: [0mUsing batch-size 129 for CUDA:0 9.49G/15.99G (59%) 
[34m[1mtrain: [0mFast image access  (ping: 0.10.0 ms, read: 29.010.3 MB/s, size: 94.1 KB)


[34m[1mtrain: [0mScanning D:\homework\DS223\final-project-ZhangZwaa-1\yolov8_pascal_person_data\labels\train.cache... 2142 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2142/2142 [00:00<?, ?it/s]


[34m[1mval: [0mFast image access  (ping: 0.10.0 ms, read: 43.917.9 MB/s, size: 134.7 KB)


[34m[1mval: [0mScanning D:\homework\DS223\final-project-ZhangZwaa-1\yolov8_pascal_person_data\labels\val.cache... 2232 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2232/2232 [00:00<?, ?it/s]


Plotting labels to d:\homework\runs\detect\yolov8n_person_voc\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 57 weight(decay=0.0), 64 weight(decay=0.0010078125), 63 bias(decay=0.0)
Image sizes 448 train, 448 val
Using 8 dataloader workers
Logging results to [1md:\homework\runs\detect\yolov8n_person_voc[0m
Starting training for 30 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/30      7.64G      1.101      2.214      1.199        359        448: 100%|██████████| 17/17 [00:04<00:00,  3.75it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.74it/s]


                   all       2232       5110      0.867      0.462      0.595      0.385

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/30       7.6G        1.1       1.29      1.206        386        448: 100%|██████████| 17/17 [00:03<00:00,  4.66it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.59it/s]


                   all       2232       5110      0.763       0.48      0.577      0.375

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/30      7.55G      1.143      1.255      1.226        357        448: 100%|██████████| 17/17 [00:03<00:00,  4.46it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:07<00:00,  1.25it/s]


                   all       2232       5110      0.602      0.277      0.374      0.173

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/30      7.58G      1.195      1.286      1.269        370        448: 100%|██████████| 17/17 [00:03<00:00,  4.42it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.51it/s]


                   all       2232       5110      0.492      0.366      0.358      0.183

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/30      7.63G      1.233      1.284      1.284        363        448: 100%|██████████| 17/17 [00:03<00:00,  4.76it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.71it/s]


                   all       2232       5110      0.619       0.23      0.315      0.132

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/30      7.47G      1.276      1.309      1.312        342        448: 100%|██████████| 17/17 [00:03<00:00,  4.72it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.67it/s]

                   all       2232       5110      0.263      0.251      0.165      0.058






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/30      7.55G      1.268       1.31      1.313        394        448: 100%|██████████| 17/17 [00:03<00:00,  4.78it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.79it/s]


                   all       2232       5110       0.37      0.365        0.3      0.132

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/30      7.69G      1.249       1.23      1.304        342        448: 100%|██████████| 17/17 [00:03<00:00,  4.79it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.89it/s]

                   all       2232       5110      0.528      0.368      0.384      0.192






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/30       7.6G      1.224      1.205      1.299        403        448: 100%|██████████| 17/17 [00:03<00:00,  4.87it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.92it/s]

                   all       2232       5110       0.56      0.459      0.472      0.238






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/30      7.57G      1.205      1.183      1.283        359        448: 100%|██████████| 17/17 [00:03<00:00,  5.02it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.92it/s]

                   all       2232       5110      0.678      0.531      0.581      0.316






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      11/30      7.57G      1.197      1.146      1.272        349        448: 100%|██████████| 17/17 [00:03<00:00,  4.93it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.92it/s]

                   all       2232       5110      0.693      0.569      0.642      0.366






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      12/30      7.59G      1.174      1.117      1.258        299        448: 100%|██████████| 17/17 [00:03<00:00,  4.93it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.85it/s]


                   all       2232       5110       0.67        0.6      0.644      0.359

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      13/30      7.56G      1.152      1.081      1.246        385        448: 100%|██████████| 17/17 [00:03<00:00,  5.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.93it/s]

                   all       2232       5110      0.764      0.617      0.708      0.423






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      14/30      7.46G      1.145      1.056       1.24        366        448: 100%|██████████| 17/17 [00:03<00:00,  4.93it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.96it/s]

                   all       2232       5110      0.749      0.619      0.695      0.403






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      15/30      7.57G      1.124      1.037      1.238        372        448: 100%|██████████| 17/17 [00:03<00:00,  4.90it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.86it/s]

                   all       2232       5110      0.759      0.588      0.675      0.409






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      16/30      7.45G      1.092      1.025      1.214        314        448: 100%|██████████| 17/17 [00:03<00:00,  4.91it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.90it/s]

                   all       2232       5110      0.717      0.554      0.641      0.389






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      17/30      7.57G      1.115      1.011      1.216        347        448: 100%|██████████| 17/17 [00:03<00:00,  5.00it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.95it/s]

                   all       2232       5110      0.772      0.609      0.699      0.424






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      18/30      7.46G      1.071     0.9751        1.2        353        448: 100%|██████████| 17/17 [00:03<00:00,  4.98it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.98it/s]

                   all       2232       5110      0.755       0.57      0.656      0.396






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      19/30      7.56G      1.049     0.9536      1.186        422        448: 100%|██████████| 17/17 [00:03<00:00,  5.05it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.97it/s]

                   all       2232       5110      0.798      0.656       0.74      0.467






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      20/30      7.56G      1.025     0.9314      1.179        342        448: 100%|██████████| 17/17 [00:03<00:00,  5.01it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.96it/s]

                   all       2232       5110      0.832      0.665      0.773      0.501





Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      21/30      7.52G      1.014     0.9183      1.151        153        448: 100%|██████████| 17/17 [00:04<00:00,  4.23it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.88it/s]

                   all       2232       5110      0.783      0.635      0.731      0.471






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      22/30      7.46G      1.003     0.8537       1.14        188        448: 100%|██████████| 17/17 [00:03<00:00,  5.08it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.91it/s]

                   all       2232       5110      0.774      0.652      0.737      0.472






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      23/30      7.53G     0.9822     0.8033       1.12        212        448: 100%|██████████| 17/17 [00:03<00:00,  5.13it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.83it/s]

                   all       2232       5110      0.827      0.669      0.775       0.51






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      24/30      7.53G     0.9565     0.7607      1.107        156        448: 100%|██████████| 17/17 [00:03<00:00,  4.98it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.84it/s]

                   all       2232       5110      0.812       0.68      0.775      0.497






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      25/30      7.52G     0.9325     0.7368      1.088        173        448: 100%|██████████| 17/17 [00:03<00:00,  5.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.83it/s]

                   all       2232       5110      0.823      0.708      0.798      0.519






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      26/30      7.45G      0.914       0.72      1.078        189        448: 100%|██████████| 17/17 [00:03<00:00,  5.10it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.90it/s]

                   all       2232       5110      0.836      0.692      0.789      0.523






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      27/30      7.44G     0.8963     0.7011      1.073        157        448: 100%|██████████| 17/17 [00:03<00:00,  5.10it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.91it/s]

                   all       2232       5110       0.85      0.692        0.8       0.53






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      28/30      7.52G     0.8717     0.6648      1.065        187        448: 100%|██████████| 17/17 [00:03<00:00,  5.09it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.94it/s]

                   all       2232       5110      0.838      0.705      0.804      0.542






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      29/30      7.53G     0.8553     0.6574      1.056        159        448: 100%|██████████| 17/17 [00:03<00:00,  5.12it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.93it/s]

                   all       2232       5110      0.838      0.717      0.809      0.546






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      30/30      7.55G     0.8394     0.6394      1.035        165        448: 100%|██████████| 17/17 [00:03<00:00,  5.12it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:04<00:00,  1.95it/s]

                   all       2232       5110      0.838      0.722      0.813      0.551






30 epochs completed in 0.077 hours.
Optimizer stripped from d:\homework\runs\detect\yolov8n_person_voc\weights\last.pt, 6.2MB
Optimizer stripped from d:\homework\runs\detect\yolov8n_person_voc\weights\best.pt, 6.2MB

Validating d:\homework\runs\detect\yolov8n_person_voc\weights\best.pt...
Ultralytics 8.3.151  Python-3.12.10 torch-2.7.0+cu126 CUDA:0 (NVIDIA GeForce RTX 4080 SUPER, 16376MiB)
Model summary (fused): 72 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 9/9 [00:05<00:00,  1.53it/s]


                   all       2232       5110      0.842      0.719      0.813      0.551
Speed: 0.0ms preprocess, 0.2ms inference, 0.0ms loss, 0.7ms postprocess per image
Results saved to [1md:\homework\runs\detect\yolov8n_person_voc[0m
Training complete.


## Validation

In [None]:
from ultralytics import YOLO

# Load your best trained model (usually saved as 'runs/detect/your_run_name/weights/best.pt')
# Replace 'path/to/your/best.pt' with the actual path
# The 'results' object from model.train() might also contain the path
try:
    # If training was successful, results is a list of Results objects.
    # Use the first result's save_dir to get the run directory.
    # trained_model_path = os.path.join(results[0].save_dir, 'weights', 'best.pt')
    trained_model_path = "D:\\homework\\runs\\detect\\yolov8n_person_voc\\weights\\best.pt" 
    model = YOLO(trained_model_path)
    print(f"Loaded trained model from: {trained_model_path}")
except:
    # If you're running this part separately or restarted your kernel,
    # manually specify the path to your trained model.
    print("Loading trained model from a default path (if training was run previously)...")
    model = YOLO('runs/detect/yolov8n_person_voc/weights/best.pt') # Adjust if your run name differs

# Validate the model on the validation set
print("\nValidating the trained model...")
metrics = model.val(data='D:\\homework\\DS223\\final-project-ZhangZwaa-1\\yolov8_pascal_person_data\\a.yaml') # Use the same dataset config

Loaded trained model from: D:\homework\runs\detect\yolov8n_person_voc\weights\best.pt

Validating the trained model...
Ultralytics 8.3.151  Python-3.12.10 torch-2.7.0+cu126 CUDA:0 (NVIDIA GeForce RTX 4080 SUPER, 16376MiB)
Model summary (fused): 72 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.00.0 ms, read: 1474.8611.9 MB/s, size: 112.2 KB)


[34m[1mval: [0mScanning D:\homework\DS223\final-project-ZhangZwaa-1\yolov8_pascal_person_data\labels\val.cache... 2232 images, 0 backgrounds, 0 corrupt: 100%|██████████| 2232/2232 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 140/140 [00:05<00:00, 24.82it/s]


                   all       2232       5110      0.846      0.719      0.814      0.551
Speed: 0.0ms preprocess, 0.3ms inference, 0.0ms loss, 0.5ms postprocess per image
Results saved to [1md:\homework\runs\detect\val11[0m


In [14]:
# Print main detection metrics (mAP50, mAP50-95, precision, recall)
print(f"mAP50: {metrics.box.map50:.4f}")
print(f"mAP50-95: {metrics.box.map:.4f}")
print(f"Precision: {metrics.box.mp:.4f}")
print(f"Recall: {metrics.box.mr:.4f}")

# Make predictions on an image (e.g., a sample from your validation set)
# Replace 'path/to/your/image.jpg' with an actual image path you want to test
print("\nMaking a prediction on a sample image...")
# You might need to copy an image from your val set, e.g.,
# from yolov8_pascal_person_data/images/val/
sample_image_path = os.path.join(OUTPUT_YOLO_DIR, 'images', 'val', '2008_000028.jpg') # Example from VOC
if os.path.exists(sample_image_path):
    predict_results = model.predict(source=sample_image_path, save=True, conf=0.7) # conf is confidence threshold
    print(f"Prediction results saved to: {predict_results[0].save_dir}")
    print(f"Detected objects: {predict_results[0].boxes}")
else:
    print(f"Sample image not found at {sample_image_path}. Please provide a valid path to an image for prediction.")

mAP50: 0.8042
mAP50-95: 0.5421
Precision: 0.8363
Recall: 0.7119

Making a prediction on a sample image...
Sample image not found at ./yolov8_pascal_person_data\images\val\2008_000028.jpg. Please provide a valid path to an image for prediction.


In [36]:
import os
from ultralytics import YOLO

# --- Configuration ---
# Path to your trained YOLOv8 model weights.
# Ensure this path is absolutely correct.
TRAINED_MODEL_PATH = 'D:\\homework\\runs\\detect\\yolov8n_person_voc\\weights\\best.pt' 

# Path to the folder containing the images you want to recognize.
# The model will process all image files (jpg, png, etc.) within this directory.
IMAGE_FOLDER_PATH = 'D:/homework/DS223/final-project-ZhangZwaa-1/testdata' 

# --- Load the trained YOLOv8 model ---
try:
    model = YOLO(TRAINED_MODEL_PATH)
    print(f"Successfully loaded model from: {TRAINED_MODEL_PATH}")
except Exception as e:
    print(f"Error loading model from {TRAINED_MODEL_PATH}: {e}")
    print("Please ensure TRAINED_MODEL_PATH is correct and the model file exists.")
    exit() # Exit the script if the model cannot be loaded

# --- Perform inference on all images in the specified folder ---
# This single call to model.predict() will efficiently process all images
# within the IMAGE_FOLDER_PATH.
print(f"\nStarting object recognition on images in: {IMAGE_FOLDER_PATH}")
print("Detected images with bounding boxes will be saved to a subfolder within 'runs/detect/'.")

results = model.predict(
    source=IMAGE_FOLDER_PATH,  # The input source is the folder containing images
    save=True,                 # Save the results (images with detections) to disk
    conf=0.7,                  # Confidence threshold: only show detections with confidence >= 0.7
    iou=0.3,                   # IoU threshold for Non-Maximum Suppression (NMS)
                               # Lower this value to reduce overlapping duplicate boxes.
    show=False,                # Set to True to display results in pop-up windows (requires OpenCV).
                               # Set to False to just save them.
    name='inference_results_folder' # Name for the results subfolder in 'runs/detect/'
)

# --- Process and print textual results for each image ---
print("\nRecognition complete. Processing detailed results:")
for i, result in enumerate(results):
    # 'result' object contains predictions for one image
    original_image_path = result.path # Path to the original image
    detected_boxes = result.boxes     # Access the Boxes object for bounding box outputs

    print(f"\n--- Image {i+1}: {os.path.basename(original_image_path)} ---")
    
    if len(detected_boxes) == 0:
        print("  No objects detected.")
    else:
        for j, box in enumerate(detected_boxes):
            class_id = int(box.cls[0])       # Class ID (e.g., 0 for 'person')
            confidence = float(box.conf[0])  # Confidence score (e.g., 0.95)
            # Bounding box coordinates in [x1, y1, x2, y2] format
            # x1, y1 are top-left; x2, y2 are bottom-right
            xyxy_coords = box.xyxy[0].tolist() 
            
            # Get the class name from the model's loaded names
            class_name = model.names[class_id] 
            
            print(f"  Object {j+1}:")
            print(f"    Class: {class_name} (ID: {class_id})")
            print(f"    Confidence: {confidence:.2f}")
            print(f"    Bounding Box (xyxy): [{xyxy_coords[0]:.0f}, {xyxy_coords[1]:.0f}, {xyxy_coords[2]:.0f}, {xyxy_coords[3]:.0f}]")

# --- Information about where results are saved ---
if results:
    # results[0].save_dir contains the path to the folder where processed images are saved
    print(f"\nResult images with detections saved to: {results[0].save_dir}")
else:
    print("\nNo results were generated. Check if the IMAGE_FOLDER_PATH contains valid images.")

Successfully loaded model from: D:\homework\runs\detect\yolov8n_person_voc\weights\best.pt

Starting object recognition on images in: D:/homework/DS223/final-project-ZhangZwaa-1/testdata
Detected images with bounding boxes will be saved to a subfolder within 'runs/detect/'.

image 1/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\DSC00009.JPG: 320x448 1 person, 3.5ms
image 2/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\DSC01527.png: 320x448 (no detections), 3.1ms
image 3/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\DSC01704.png: 448x320 (no detections), 3.4ms
image 4/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\DSC01751.png: 448x320 2 persons, 3.1ms
image 5/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\DSC01767.png: 448x320 1 person, 4.8ms
image 6/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\IMG_4132.JPG: 448x320 1 person, 4.4ms
image 7/10 D:\homework\DS223\final-project-ZhangZwaa-1\testdata\IMG_5420.JPG: 352x448 5 persons, 