## SEP742 Project Code for:
### Data pre-processing and dataset customization for Yolov5n, Yolov5s and Yolov8n training
### Training and exporting (to ONNX) Yolov5 models
### Model size and parameter investigation for different models

In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import time

## Dataset background fusion - create a large dataset by compositing the target detection samples onto random background images

### Class that fuses sign samples onto backgrounds. Adjust SIGN_SCALE, MIN_SIGN_SIZE and MAX_SIGN_SIZE as needed

In [None]:
class Config:
    # Central place for the constants used by the background-fusion pipeline.

    # --- Model ---------------------------------------------------------
    NUM_CLASSES = 3  # number of traffic-sign classes

    # --- Execution -----------------------------------------------------
    BATCH_SIZE = 8

    # --- Camera / scene ------------------------------------------------
    # (width, height) of the deployment camera; background crops use this size.
    PI_RESOLUTION = (640, 480)
    # Scale factors applied to every sign image before fusion.
    # A shorter alternative that was used earlier: [0.5, 0.75, 1.0, 1.5]
    SIGN_SCALE = [
        0.33, 0.5, 0.75,
        1.0, 1.25, 1.5,
        2.0, 2.5, 3.0,
    ]

    # --- Sign size limits, (width, height) -----------------------------
    # Scaled signs outside these bounds are skipped during fusion.
    MIN_SIGN_SIZE = (14, 14)
    MAX_SIGN_SIZE = (480, 480)

    # --- Data paths ----------------------------------------------------
    CSV_PATH = "signs.csv"                                   # sign annotation CSV
    SYNTHETIC_BACKGROUNDS_DIR = "backgrounds/"               # background images (optional)
    SYNTHETIC_OUTPUT_DIR = "results/"                        # where fused images go
    SYNTHETIC_ANNOTATION_FILE = "synthetic_annotations.csv"  # annotations of fused images
    
class TrafficSignDataset(Dataset):
    """Dataset for traffic sign detection, handling isolated sign images.

    Besides the usual ``Dataset`` protocol, the class can fuse the sign images
    onto background images, either in bulk on disk (``process_images_4``) or
    on the fly per sample (``__getitem__`` when backgrounds are available).
    """

    def __init__(self, csv_file, transform=None, synthetic_backgrounds_dir=None):
        """
        Args:
            csv_file: Path to CSV file with annotations
            transform: Optional transform to apply to images
            synthetic_backgrounds_dir: Optional directory with background images to create synthetic data
        """
        self.annotations = pd.read_csv(csv_file)
        self.transform = transform
        self.backgrounds_dir = synthetic_backgrounds_dir

        # Get unique image paths (as each image might contain multiple signs)
        self.unique_images = self.annotations['Path'].unique()

        # Optional in-memory cache of decoded backgrounds. It is left empty
        # here: only the file paths are collected, and the pixels are read on
        # demand so that very large background sets do not have to fit in RAM.
        self.background_images = []
        self.bg_files = []
        if self.backgrounds_dir and os.path.exists(self.backgrounds_dir):
            self.bg_files = [os.path.join(self.backgrounds_dir, f) for f in os.listdir(self.backgrounds_dir)
                       if f.endswith(('.jpg', '.jpeg', '.png'))]

        print(f"Loaded {len(self.unique_images)} sign images and verified {len(self.bg_files)} background images")

    def __len__(self):
        """Return the number of distinct sign image paths in the annotation CSV."""
        return len(self.unique_images)

    def process_images_4(self, output_dir):
        """
        Fuse sign images onto background crops and write the results to disk.

        Every sign image is pre-scaled by each factor in ``Config.SIGN_SCALE``
        (scaled versions outside MIN/MAX_SIGN_SIZE are dropped). Each background
        is cut into non-overlapping ``Config.PI_RESOLUTION`` tiles, and every
        tile receives exactly ONE scaled sign, taken round-robin from the
        pre-scaled list.

        - Background images are loaded on demand and released afterwards
        - Backgrounds smaller than the target resolution are resized up to it
        - Output name format: bgname_crop_i_j_signname_scaleX.jpg
        - Existing output files are skipped; new annotations are appended to
          the annotation CSV found in ``output_dir``

        Args:
            output_dir: Directory receiving the fused images and the
                ``Config.SYNTHETIC_ANNOTATION_FILE`` annotation CSV.
        """
        os.makedirs(output_dir, exist_ok=True)
        csv_path = os.path.join(output_dir, Config.SYNTHETIC_ANNOTATION_FILE)

        # Load existing annotations if file exists
        existing_annotations = []
        if os.path.exists(csv_path):
            existing_annotations = pd.read_csv(csv_path).to_dict('records')

        # 1. Pre-scale all unique images according to SIGN_SCALE
        scaled_sign_images = []
        scaled_sign_index = 0
        for img_path in self.unique_images:
            sign_img = cv2.imread(img_path)
            if sign_img is None:
                continue

            sign_img = cv2.cvtColor(sign_img, cv2.COLOR_BGR2RGB)
            img_annotations = self.annotations[self.annotations['Path'] == img_path]
            if len(img_annotations) == 0:
                continue

            # Only the first annotation row supplies the class of the sign image
            class_id = img_annotations.iloc[0]['ClassId']
            base_name = os.path.splitext(os.path.basename(img_path))[0]

            for scale_idx, scale in enumerate(Config.SIGN_SCALE):
                h, w = sign_img.shape[:2]
                new_h, new_w = int(h * scale), int(w * scale)

                # MIN/MAX_SIGN_SIZE are (width, height) limits
                if (new_h < Config.MIN_SIGN_SIZE[1] or new_w < Config.MIN_SIGN_SIZE[0] or
                    new_h > Config.MAX_SIGN_SIZE[1] or new_w > Config.MAX_SIGN_SIZE[0]):
                    continue

                scaled_img = cv2.resize(sign_img, (new_w, new_h))
                scaled_sign_images.append({
                    'img': scaled_img,
                    'class_id': class_id,
                    'scale_idx': scale_idx,
                    'base_name': base_name
                })

        if not scaled_sign_images:
            print("No valid scaled sign images created!")
            return
        scaled_sign_len = len(scaled_sign_images)

        if not self.bg_files:
            # Without backgrounds nothing can be generated; say so explicitly
            # instead of reporting that all combinations already exist.
            print("No background images available!")
            return

        new_annotations = []

        # Process each background file (loaded on-demand)
        for bg_file in self.bg_files:
            try:
                # Load background only when needed
                bg_img = cv2.imread(bg_file)
                if bg_img is None:
                    continue

                bg_img = cv2.cvtColor(bg_img, cv2.COLOR_BGR2RGB)
                bg_name = os.path.splitext(os.path.basename(bg_file))[0]
                target_w, target_h = Config.PI_RESOLUTION

                # Handle background sizing
                bg_h, bg_w = bg_img.shape[:2]

                num_w_crops = bg_w // target_w
                num_h_crops = bg_h // target_h

                # A background narrower/shorter than the target is stretched
                # along that axis so that exactly one tile fits.
                if num_w_crops == 0:
                    bg_img = cv2.resize(bg_img, (Config.PI_RESOLUTION[0], bg_h))
                    bg_h, bg_w = bg_img.shape[:2]
                    num_w_crops = bg_w // target_w

                if num_h_crops == 0:
                    bg_img = cv2.resize(bg_img, (bg_w, Config.PI_RESOLUTION[1]))
                    bg_h, bg_w = bg_img.shape[:2]
                    num_h_crops = bg_h // target_h

                # Generate all crops
                for i in range(num_w_crops):
                    for j in range(num_h_crops):
                        x1 = i * target_w
                        y1 = j * target_h
                        x2 = min(x1 + target_w, bg_w)  # Handle edge cases
                        y2 = min(y1 + target_h, bg_h)
                        crop = bg_img[y1:y2, x1:x2]

                        # Ensure crop matches target size (resize if necessary)
                        if crop.shape[0] != target_h or crop.shape[1] != target_w:
                            crop = cv2.resize(crop, Config.PI_RESOLUTION)

                        # The round-robin index advances even for skipped
                        # outputs so that re-runs pair tiles with the same signs.
                        sign_data = scaled_sign_images[scaled_sign_index]
                        scaled_sign_index = (scaled_sign_index + 1) % scaled_sign_len

                        # Filename format: bgname_crop_i_j_signname_scaleX.jpg
                        output_name = (f"{bg_name}_crop_{i}_{j}_"
                                        f"{sign_data['base_name']}_scale{sign_data['scale_idx']}.jpg")
                        output_path = os.path.join(output_dir, output_name)

                        if os.path.exists(output_path):
                            continue

                        composite_img, bbox = self.place_sign_on_background(
                            sign_data['img'], crop
                        )

                        # cv2.imwrite reports failure through its return value;
                        # do not record an annotation for a file that is missing.
                        written = cv2.imwrite(output_path, cv2.cvtColor(composite_img, cv2.COLOR_RGB2BGR))
                        if not written:
                            print(f"Warning: Could not write image to {output_path}")
                            continue

                        new_annotations.append({
                            "Path": output_path,
                            "ClassId": sign_data['class_id'],
                            "Roi.X1": bbox[0],
                            "Roi.Y1": bbox[1],
                            "Roi.X2": bbox[2],
                            "Roi.Y2": bbox[3],
                            "Width": composite_img.shape[1],
                            "Height": composite_img.shape[0],
                            "SourceSign": sign_data['base_name'],
                            "ScaleFactor": Config.SIGN_SCALE[sign_data['scale_idx']],
                            "BackgroundSource": bg_file,
                            "CropPosition": f"{i}_{j}"
                        })

                # Release memory after processing this background
                del bg_img

            except Exception as e:
                # Best effort: one bad background must not abort the whole run
                print(f"Error processing {bg_file}: {str(e)}")

        # Save annotations
        if new_annotations:
            all_annotations = existing_annotations + new_annotations
            pd.DataFrame(all_annotations).to_csv(csv_path, index=False)
            print(f"Generated {len(new_annotations)} new images in {output_dir}")
        else:
            print("No new images generated (all combinations exist)")

    def get_random_background(self, target_size=(640, 480)):
        """Get a random background image and resize it to target_size.

        Lookup order: the in-memory ``background_images`` cache if it has been
        filled, otherwise a random file from ``bg_files`` read on demand (and
        converted to RGB), otherwise a plain gray image.

        Args:
            target_size: (width, height) of the returned image.

        Returns:
            An image array of shape (height, width, 3).
        """
        if self.background_images:
            bg_idx = np.random.randint(0, len(self.background_images))
            background = self.background_images[bg_idx].copy()
            return cv2.resize(background, target_size)

        if self.bg_files:
            bg_path = self.bg_files[np.random.randint(0, len(self.bg_files))]
            background = cv2.imread(bg_path)
            if background is not None:
                background = cv2.cvtColor(background, cv2.COLOR_BGR2RGB)
                return cv2.resize(background, target_size)
            print(f"Warning: Could not read background at {bg_path}")

        # Plain gray background if no background image is available/readable
        return np.full((target_size[1], target_size[0], 3), 127, dtype=np.uint8)

    def place_sign_on_background(self, sign_img, background_img):
        """Place sign image on a random position on the background.

        Args:
            sign_img: RGB sign image (both callers in this class pass RGB).
            background_img: RGB background image; it is not modified.

        Returns:
            (composite, bbox): a copy of the background with the sign pasted
            in, and the sign's box as [x1, y1, x2, y2] in pixel coordinates.
        """
        # Get dimensions
        sign_h, sign_w = sign_img.shape[:2]
        bg_h, bg_w = background_img.shape[:2]

        # Ensure sign is not larger than background (keeps a 10 px margin;
        # the aspect ratio is not preserved when only one side is too large)
        if sign_h >= bg_h or sign_w >= bg_w:
            sign_img = cv2.resize(sign_img, (min(sign_w, bg_w-10), min(sign_h, bg_h-10)))
            sign_h, sign_w = sign_img.shape[:2]

        # Find random position for the sign
        x_pos = np.random.randint(0, bg_w - sign_w)
        y_pos = np.random.randint(0, bg_h - sign_h)

        # Build the paste mask: pixels brighter than 5 belong to the sign,
        # near-black pixels are treated as transparent. The image is RGB here,
        # so the RGB->gray conversion is the matching one.
        gray = cv2.cvtColor(sign_img, cv2.COLOR_RGB2GRAY)
        _, mask = cv2.threshold(gray, 5, 255, cv2.THRESH_BINARY)

        # Region of the background that the sign will cover
        roi = background_img[y_pos:y_pos+sign_h, x_pos:x_pos+sign_w]

        # Sign pixels where the mask is set, background pixels elsewhere
        masked_sign = cv2.bitwise_and(sign_img, sign_img, mask=mask)
        masked_bg = cv2.bitwise_and(roi, roi, mask=cv2.bitwise_not(mask))
        result_roi = cv2.add(masked_sign, masked_bg)

        # Insert back into a copy of the background
        result = background_img.copy()
        result[y_pos:y_pos+sign_h, x_pos:x_pos+sign_w] = result_roi

        # Return the composite image and the bounding box
        return result, [x_pos, y_pos, x_pos+sign_w, y_pos+sign_h]

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the sign image at position ``idx``.

        When a backgrounds directory with usable images was given, the sign is
        pasted onto a random background of ``Config.PI_RESOLUTION`` and the
        target holds that single box. Otherwise the original image and its
        annotated boxes are returned. ``target`` is a dict with "boxes"
        (float32, [x1, y1, x2, y2]) and "labels" (int64).
        """
        img_path = self.unique_images[idx]

        # Read sign image
        sign_img = cv2.imread(img_path)
        if sign_img is None:
            print(f"Warning: Could not read image at {img_path}")
            # Return a placeholder with no detections
            placeholder = np.zeros((480, 640, 3), dtype=np.uint8)
            image_tensor = torch.from_numpy(placeholder.transpose((2, 0, 1))).float() / 255.0
            return image_tensor, {"boxes": torch.zeros((0, 4)), "labels": torch.zeros(0, dtype=torch.int64)}

        sign_img = cv2.cvtColor(sign_img, cv2.COLOR_BGR2RGB)

        # Get annotations for this image
        img_annotations = self.annotations[self.annotations['Path'] == img_path]

        # Decide if we need to create a synthetic image or use the original.
        # Backgrounds are read lazily, so the file list counts as "available".
        if self.backgrounds_dir and (len(self.background_images) > 0 or len(self.bg_files) > 0):
            # Create synthetic training example by placing sign on random background
            # Choose target size based on deployment target (Raspberry Pi resolution)
            target_size = Config.PI_RESOLUTION
            background = self.get_random_background(target_size)

            # Place sign on background and get the new bounding box
            composite_img, bbox = self.place_sign_on_background(sign_img, background)

            # Get class label from annotation
            class_id = img_annotations.iloc[0]['ClassId']

            # Create boxes and labels
            boxes = [bbox]  # [x1, y1, x2, y2]
            labels = [class_id]

            # Use the composite image for training
            image = composite_img
        else:
            # Use original image and annotations (for standalone sign images)
            image = sign_img

            # Get bounding boxes and labels
            boxes = []
            labels = []

            for _, row in img_annotations.iterrows():
                # Extract coordinates (x1, y1, x2, y2)
                x1 = row['Roi.X1']
                y1 = row['Roi.Y1']
                x2 = row['Roi.X2']
                y2 = row['Roi.Y2']

                # For isolated signs, the box might be the entire image
                if x1 == 0 and y1 == 0 and x2 == 0 and y2 == 0:
                    x2 = row['Width']
                    y2 = row['Height']

                boxes.append([x1, y1, x2, y2])
                labels.append(row['ClassId'])

        # Convert to torch tensors; reshape keeps the (N, 4) layout even for N == 0
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.as_tensor(labels, dtype=torch.int64)

        # Prepare target dict (required by PyTorch detection models)
        target = {}
        target["boxes"] = boxes
        target["labels"] = labels

        # Apply transforms if any
        if self.transform:
            image = self.transform(image)
        else:
            image = torch.from_numpy(image.transpose((2, 0, 1))).float() / 255.0

        return image, target

### Run the fusion method of the class; adjust the paths/file names below to match your own data

In [None]:
# Fuse the traffic-sign samples listed in signs.csv onto the COCO train2017
# backgrounds; the composites and their annotation CSV go to result_train/.
dataset = TrafficSignDataset("signs.csv", synthetic_backgrounds_dir="train2017/")
dataset.process_images_4("result_train")

Loaded 78 sign images and verified 118287 background images
Generated 118287 new images in result_train


In [None]:
# Same fusion for the validation split: signs from signs.csv are placed onto
# the COCO val2017 backgrounds and written to result_val/.
dataset = TrafficSignDataset("signs.csv", synthetic_backgrounds_dir="val2017/")
dataset.process_images_4("result_val")

### The merge generates one annotation file for all created samples.
### Convert that single annotation file to YOLO format (one text label file per image)

In [6]:
import pandas as pd
import os
import cv2

# Convert the fused TRAIN annotations (one CSV) into YOLO label files:
# one .txt per image holding "class x_center y_center width height", with
# all four box values normalised by the image width/height.
# df = pd.read_csv("train_data/synthetic_annotations.csv")
df = pd.read_csv("D:/_small files/742 proj DL data/result_train/synthetic_annotations.csv")

# label_dir = "coco/labels/train"
label_dir = "D:/_small files/742 proj DL data/yolov5/coco/labels/train"
os.makedirs(label_dir, exist_ok=True)  # open() below does not create folders

# The fusion step stores each composite's size in the CSV, so the image only
# has to be decoded when those columns are missing.
has_size_columns = {'Width', 'Height'}.issubset(df.columns)

for _, row in df.iterrows():
    img_path = row['Path']

    if has_size_columns:
        w, h = row['Width'], row['Height']
    else:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None
            print(f"Warning: Could not read image at {img_path}, skipping")
            continue
        h, w = img.shape[:2]

    # Convert CSV bbox (x1,y1,x2,y2) to YOLO format (x_center,y_center,width,height)
    x_center = ((row['Roi.X1'] + row['Roi.X2']) / 2) / w
    y_center = ((row['Roi.Y1'] + row['Roi.Y2']) / 2) / h
    bbox_w = (row['Roi.X2'] - row['Roi.X1']) / w
    bbox_h = (row['Roi.Y2'] - row['Roi.Y1']) / h

    # Save to .txt named after the image; the class index must be an integer
    label_path = os.path.join(label_dir, os.path.splitext(os.path.basename(img_path))[0] + ".txt")
    with open(label_path, 'w') as f:
        f.write(f"{int(row['ClassId'])} {x_center} {y_center} {bbox_w} {bbox_h}\n")

In [5]:
import pandas as pd
import os
import cv2

# Convert the fused VALIDATION annotations (one CSV) into YOLO label files:
# one .txt per image holding "class x_center y_center width height", with
# all four box values normalised by the image width/height.
# df = pd.read_csv("train_data/synthetic_annotations.csv")
df = pd.read_csv("D:/_small files/742 proj DL data/result_val/synthetic_annotations.csv")

# label_dir = "coco/labels/val"
label_dir = "D:/_small files/742 proj DL data/yolov5/coco/labels/val"
os.makedirs(label_dir, exist_ok=True)  # open() below does not create folders

# The fusion step stores each composite's size in the CSV, so the image only
# has to be decoded when those columns are missing.
has_size_columns = {'Width', 'Height'}.issubset(df.columns)

for _, row in df.iterrows():
    img_path = row['Path']

    if has_size_columns:
        w, h = row['Width'], row['Height']
    else:
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals an unreadable/missing file by returning None
            print(f"Warning: Could not read image at {img_path}, skipping")
            continue
        h, w = img.shape[:2]

    # Convert CSV bbox (x1,y1,x2,y2) to YOLO format (x_center,y_center,width,height)
    x_center = ((row['Roi.X1'] + row['Roi.X2']) / 2) / w
    y_center = ((row['Roi.Y1'] + row['Roi.Y2']) / 2) / h
    bbox_w = (row['Roi.X2'] - row['Roi.X1']) / w
    bbox_h = (row['Roi.Y2'] - row['Roi.Y1']) / h

    # Save to .txt named after the image; the class index must be an integer
    label_path = os.path.join(label_dir, os.path.splitext(os.path.basename(img_path))[0] + ".txt")
    with open(label_path, 'w') as f:
        f.write(f"{int(row['ClassId'])} {x_center} {y_center} {bbox_w} {bbox_h}\n")

### BASH command to train Yolov5 models (older version of Yolov5)

In [None]:

# Train for 30 epochs at image size 640 with batch size 32, starting from the
# yolov5n.pt weights, using the yolov5n_coco.yaml dataset file and the
# models/yolov5n.yaml architecture (presumably run from the YOLOv5 repo root).
python train.py --img 640 --rect --batch 32 --epochs 30 --data yolov5n_coco.yaml --weights yolov5n.pt --cfg models/yolov5n.yaml

### BASH command to export Yolov5 models (older version of Yolov5) to ONNX format

In [None]:
# Export the best checkpoint of training run exp14 to ONNX, with the
# --simplify and --dynamic options enabled.
python export.py --weights runs/train/exp14/weights/best.pt --include onnx --simplify --dynamic

## Model size and parameter investigation for different models

In [15]:
from ultralytics import YOLO
from torchinfo import summary

# Load the YOLOv8s checkpoint and print a per-layer summary (output shapes and
# parameter counts) for a single 3x640x640 input.
model = YOLO("yolov8s.pt")  # Downloads YOLOv8s if it is not available locally
summary(model.model, input_size=(1, 3, 640, 640))  # Summarize the underlying detection model

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


  5%|▍         | 1.00M/21.5M [00:00<00:15, 1.39MB/s]

 Download failure, retrying 1/3 https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt...





Layer (type:depth-idx)                             Output Shape              Param #
DetectionModel                                     [1, 84, 8400]             --
├─Sequential: 1-1                                  --                        --
│    └─Conv: 2-1                                   [1, 32, 320, 320]         --
│    │    └─Conv2d: 3-1                            [1, 32, 320, 320]         (864)
│    │    └─BatchNorm2d: 3-2                       [1, 32, 320, 320]         (64)
│    └─Detect: 2-96                                --                        (recursive)
│    │    └─ModuleList: 3-118                      --                        (recursive)
│    └─Conv: 2-3                                   [1, 64, 160, 160]         --
│    │    └─Conv2d: 3-4                            [1, 64, 160, 160]         (18,432)
│    │    └─BatchNorm2d: 3-5                       [1, 64, 160, 160]         (128)
│    └─Detect: 2-96                                --                        (recur

In [5]:
from ultralytics import YOLO
from torchinfo import summary

# Load the YOLOv8n checkpoint and print a per-layer summary (output shapes and
# parameter counts) for a single 3x640x640 input.
model = YOLO("yolov8n.pt")  # Downloads YOLOv8n if it is not available locally
summary(model.model, input_size=(1, 3, 640, 640))  # Summarize the underlying detection model

Layer (type:depth-idx)                             Output Shape              Param #
DetectionModel                                     [1, 84, 8400]             --
├─Sequential: 1-1                                  --                        --
│    └─Conv: 2-1                                   [1, 16, 320, 320]         --
│    │    └─Conv2d: 3-1                            [1, 16, 320, 320]         (432)
│    │    └─BatchNorm2d: 3-2                       [1, 16, 320, 320]         (32)
│    └─Detect: 2-96                                --                        (recursive)
│    │    └─ModuleList: 3-118                      --                        (recursive)
│    └─Conv: 2-3                                   [1, 32, 160, 160]         --
│    │    └─Conv2d: 3-4                            [1, 32, 160, 160]         (4,608)
│    │    └─BatchNorm2d: 3-5                       [1, 32, 160, 160]         (64)
│    └─Detect: 2-96                                --                        (recursi

In [14]:
from torchinfo import summary
import torch

# Fetch the pretrained YOLOv5s model through torch.hub from the
# ultralytics/yolov5 repository.
model = torch.hub.load('ultralytics/yolov5', 'yolov5s', pretrained=True)
# Report output shapes and parameter counts for a single 3x640x640 input.
summary(model, input_size=(1, 3, 640, 640))

Using cache found in C:\Users\Meshery/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-14 Python-3.11.1 torch-2.6.0+cpu CPU

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5s.pt to yolov5s.pt...
100%|██████████| 14.1M/14.1M [00:09<00:00, 1.61MB/s]

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


Layer (type:depth-idx)                                  Output Shape              Param #
AutoShape                                               [1, 25200, 85]            --
├─DetectMultiBackend: 1-1                               [1, 25200, 85]            --
│    └─DetectionModel: 2-1                              [1, 25200, 85]            --
│    │    └─Sequential: 3-1                             --                        (7,225,885)
Total params: 7,225,885
Trainable params: 0
Non-trainable params: 7,225,885
Total mult-adds (Units.GIGABYTES): 8.24
Input size (MB): 4.92
Forward/backward pass size (MB): 206.37
Params size (MB): 28.90
Estimated Total Size (MB): 240.19

In [4]:
from torchinfo import summary
import torch

# Fetch the pretrained YOLOv5n model through torch.hub from the
# ultralytics/yolov5 repository.
model = torch.hub.load('ultralytics/yolov5', 'yolov5n', pretrained=True)
# Report output shapes and parameter counts for a single 3x640x640 input.
summary(model, input_size=(1, 3, 640, 640))

Using cache found in C:\Users\Meshery/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-14 Python-3.11.1 torch-2.6.0+cpu CPU

Fusing layers... 
YOLOv5n summary: 213 layers, 1867405 parameters, 0 gradients, 4.5 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


Layer (type:depth-idx)                                  Output Shape              Param #
AutoShape                                               [1, 25200, 85]            --
├─DetectMultiBackend: 1-1                               [1, 25200, 85]            --
│    └─DetectionModel: 2-1                              [1, 25200, 85]            --
│    │    └─Sequential: 3-1                             --                        (1,867,405)
Total params: 1,867,405
Trainable params: 0
Non-trainable params: 1,867,405
Total mult-adds (Units.GIGABYTES): 2.25
Input size (MB): 4.92
Forward/backward pass size (MB): 111.75
Params size (MB): 7.47
Estimated Total Size (MB): 124.14

In [2]:
import os
import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import time
from torchinfo import summary

# Build Faster R-CNN with a MobileNetV3-Large FPN backbone and COCO-pretrained
# weights, then print a per-layer summary for one 3x480x640 input.
#
# `pretrained=True` / `pretrained_backbone=True` are deprecated since
# torchvision 0.13. The `weights=` enum below is what the legacy flags map to;
# a separate backbone weight is not needed because the detection checkpoint
# already contains the backbone parameters.
model = fasterrcnn_mobilenet_v3_large_fpn(
        weights=torchvision.models.detection.FasterRCNN_MobileNet_V3_Large_FPN_Weights.COCO_V1,
        min_size=320,
        max_size=640,
        # box_score_thresh=Config.CONFIDENCE_THRESHOLD,
        # box_nms_thresh=Config.NMS_THRESHOLD,
        rpn_pre_nms_top_n_test=300,
        rpn_post_nms_top_n_test=100,
    )

summary(model, 
    input_size=(1, 3, 480, 640),
    col_names=["input_size", "output_size", "num_params", "kernel_size"],
    verbose=1)



Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape
FasterRCNN                                              [1, 3, 480, 640]          [0, 4]                    --                        --
├─GeneralizedRCNNTransform: 1-1                         [1, 3, 480, 640]          [1, 3, 320, 448]          --                        --
├─BackboneWithFPN: 1-2                                  [1, 3, 320, 448]          [1, 256, 5, 7]            --                        --
│    └─IntermediateLayerGetter: 2-1                     [1, 3, 320, 448]          [1, 960, 10, 14]          --                        --
│    │    └─Conv2dNormActivation: 3-1                   [1, 3, 320, 448]          [1, 16, 160, 224]         (432)                     --
│    │    └─InvertedResidual: 3-2                       [1, 16, 160, 224]         [1, 16, 160, 224]         (400)                     --
│    │    └─InvertedResidual: 3

Layer (type:depth-idx)                                  Input Shape               Output Shape              Param #                   Kernel Shape
FasterRCNN                                              [1, 3, 480, 640]          [0, 4]                    --                        --
├─GeneralizedRCNNTransform: 1-1                         [1, 3, 480, 640]          [1, 3, 320, 448]          --                        --
├─BackboneWithFPN: 1-2                                  [1, 3, 320, 448]          [1, 256, 5, 7]            --                        --
│    └─IntermediateLayerGetter: 2-1                     [1, 3, 320, 448]          [1, 960, 10, 14]          --                        --
│    │    └─Conv2dNormActivation: 3-1                   [1, 3, 320, 448]          [1, 16, 160, 224]         (432)                     --
│    │    └─InvertedResidual: 3-2                       [1, 16, 160, 224]         [1, 16, 160, 224]         (400)                     --
│    │    └─InvertedResidual: 3