### 1. Importing Required Libraries for YOLO Model and for Training

In [2]:
!pip install ultralytics torch torchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [1]:
from ultralytics import YOLO
import torch

### 2. Randomly selecting dataset samples from the huge dataset

In [4]:
import os
import random
import shutil

def sample_dataset(source_dir, target_root_dir, sample_size=100):
    """
    Scans the source directory for ALL subdirectories (classes),
    selects a random subset of images from each, and copies them to a target directory.
    """

    # Supported image extensions
    valid_extensions = {".jpg", ".jpeg"}

    # Create the target root directory if it doesn't exist
    if not os.path.exists(target_root_dir):
        os.makedirs(target_root_dir)
        print(f"Created target root directory: {target_root_dir}")

    # AUTOMATICALLY get all subdirectories in the source folder
    classes_to_process = [d for d in os.listdir(source_dir)
                          if os.path.isdir(os.path.join(source_dir, d))]

    print(f"Found {len(classes_to_process)} classes. Processing all of them...\n")

    total_copied = 0

    for class_name in classes_to_process:
        class_path = os.path.join(source_dir, class_name)

        # List all valid image files
        images = [f for f in os.listdir(class_path)
                  if os.path.splitext(f)[1].lower() in valid_extensions]

        # Determine sample count (take all if less than sample_size)
        num_to_sample = min(len(images), sample_size)

        if num_to_sample == 0:
            print(f"Warning: No images found in '{class_name}'. Skipping.")
            continue

        # Randomly select images
        selected_images = random.sample(images, num_to_sample)

        # Define new folder name (e.g., "sampled-apple")
        new_class_name = f"sampled-{class_name}"
        target_class_path = os.path.join(target_root_dir, new_class_name)

        # Create class directory
        if not os.path.exists(target_class_path):
            os.makedirs(target_class_path)

        # Copy images
        for image in selected_images:
            src_file = os.path.join(class_path, image)
            dst_file = os.path.join(target_class_path, image)
            shutil.copy2(src_file, dst_file)

        print(f"{class_name}: Copied {num_to_sample} images -> '{new_class_name}'")
        total_copied += num_to_sample

    print("-" * 40)
    print(f"Process completed. Total of {total_copied} images copied to '{target_root_dir}'.")

# --- CONFIGURATION & RUN ---

# Set your folder paths here
SOURCE_FOLDER = "unlabeled-data"
TARGET_FOLDER = "sampled-unlabeled-data"
SAMPLES_PER_CLASS = 100

# Run directly
sample_dataset(SOURCE_FOLDER, TARGET_FOLDER, sample_size=SAMPLES_PER_CLASS)

Created target root directory: sampled-unlabeled-data
Found 12 classes. Processing all of them...

apple: Copied 100 images -> 'sampled-apple'
bell pepper: Copied 100 images -> 'sampled-bell pepper'
strawberry: Copied 100 images -> 'sampled-strawberry'
avocado: Copied 100 images -> 'sampled-avocado'
kiwi: Copied 100 images -> 'sampled-kiwi'
lemon: Copied 100 images -> 'sampled-lemon'
eggplant: Copied 100 images -> 'sampled-eggplant'
banana: Copied 100 images -> 'sampled-banana'
zucchini: Copied 100 images -> 'sampled-zucchini'
pineapple: Copied 100 images -> 'sampled-pineapple'
tomato: Copied 100 images -> 'sampled-tomato'
orange: Copied 100 images -> 'sampled-orange'
----------------------------------------
Process completed. Total of 1200 images copied to 'sampled-unlabeled-data'.
