In [5]:
# setup chunk

## for deep learning architecture and evaluation
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchsummary import summary
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

## for plotting
from PIL import Image
import matplotlib.pyplot as plt

## for preprocessing
import os
import numpy as np
import random
import math
import pandas as pd

## choose the compute device: the GPU when torch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [12]:
# Open Images locations used by this cell: the image root and the label output root
# (both are joined with a 'train' / 'validation' subfolder further down in this cell)
# and the CSV holding the person bounding boxes.
# NOTE(review): `image_folder` is re-bound to a different directory in a later cell,
# so this cell's behaviour depends on the order in which cells are executed.
image_folder = '../../02_data/Google_OpenImages/images'
label_folder = '../../02_data/Google_OpenImages/labels'
annotation_file = '../../02_data/Google_OpenImages/filtered_person_annotations.csv'

# Annotation table; the code below reads its ImageID, XMin, YMin, XMax and YMax columns.
df = pd.read_csv(annotation_file)

# Convert bounding boxes to YOLO format
def convert_bbox_to_yolo_format(xmin, ymin, xmax, ymax):
    """Re-express a corner-based box as (center_x, center_y, width, height).

    The result is in the same units as the inputs; nothing is rescaled here.
    """
    box_width, box_height = xmax - xmin, ymax - ymin
    mid_x, mid_y = (xmin + xmax) / 2, (ymin + ymax) / 2
    return mid_x, mid_y, box_width, box_height

# Process images in train and validation folders: write one YOLO label file
# (<image_id>.txt) per image, holding one "class cx cy w h" line per bounding box.
box_columns = ['XMin', 'YMin', 'XMax', 'YMax']

# Group the boxes by image once up front; filtering df inside the loop would rescan
# every annotation row for every image file.
boxes_by_image = {
    image_id: rows[box_columns].to_numpy().tolist()
    for image_id, rows in df.groupby('ImageID')
}

for split in ['train', 'validation']:
    split_image_folder = os.path.join(image_folder, split)
    split_label_folder = os.path.join(label_folder, split)
    # open(..., 'w') does not create missing directories
    os.makedirs(split_label_folder, exist_ok=True)

    # Process each image in the split folder
    for image_file in os.listdir(split_image_folder):
        if not image_file.lower().endswith(('.jpg', '.png')):  # Adjust if needed
            continue

        image_id = os.path.splitext(image_file)[0]
        # image_id no longer carries an extension, so '.txt' has to be appended
        # (replacing '.jpg' in it would be a no-op and leave the file without a suffix)
        label_file = os.path.join(split_label_folder, image_id + '.txt')

        # An image without any annotation still gets an (empty) label file
        with open(label_file, 'w') as f:
            for xmin, ymin, xmax, ymax in boxes_by_image.get(image_id, []):
                class_id = 0  # Assuming single class (person)
                center_x, center_y, w, h = convert_bbox_to_yolo_format(xmin, ymin, xmax, ymax)
                f.write(f"{class_id} {center_x} {center_y} {w} {h}\n")

print("Annotation files have been created.")

0001b46b0b82ee29
0002ae796e1f8eb5
0002a1b8cc4b8f92
0000a1b2fba255e9
00006b13c052138f
0000f8aef032941e
00010d873e81c61e
0001c626b9afb50c
00013d077c604328
0002a3c01c926a49
0001fa6ab562fd2a
0000bcb094764718
000002b66c9c498e
00010bf498b64bab
00005e7429a94ad4
0000fcb8ed0ea243
00011aec5d7324f4
0001b2b3b13cfbe4
0002b83c86da3294
0001e27f4b156f49
00004b19ca2c952f
0002a06d31985d69
0000ce19115ae401
00010d5d0cd3e273
0000aa810854dc2e
0002ab17a812c0d6
0000c4f95a9d5a54
0002c799b0cd7412
0001e595b536c9ec
00006bdb1eb5cd74
00003e2837c7b728
00001bcc92282a38
00009cadede2ed69
0000f8604a4e2cfe
0000dde1f8ec7be1
0001c43f78cd23e1
0000c33c6f4b8518
0000a4e648c5897f
0001a1f45ad8e824
0000f53faa4d14c3
0000a90019e380dc
0000b3e5921ab7ff
00008d167563158c
0001d48938a45d49
0000d59fa570d973
00010f041a2a6fa5
0000bdfa52154160
0000eb5027281f2a
0000f509689e349c
0000b4b26ef88376
000101475b6bc944
000033469fb48bc1
00019bc020b24b32
000045257f66b9e2
0002347a67b7a730
00026a2701e143f5
0001386327595826
0002337b77943386
0001143bfa4f6a

In [None]:
from ultralytics import YOLO

# Instantiate a YOLOv8-nano model from its architecture YAML (presumably with untrained
# weights, since no checkpoint file is given — confirm against the ultralytics docs).
# NOTE(review): `model` is not referenced by any later cell visible here, and this import
# lives outside the setup cell — confirm whether this cell is still needed.
model = YOLO("yolov8n.yaml")  # build a new model from YAML



In [6]:
# read the bounding-box annotation table
person_annotations = pd.read_csv('../../02_data/Google_OpenImages/filtered_person_annotations.csv')

# directory that holds the images
image_folder = '../../02_data/Google_OpenImages/person_images_small/'

# number of annotation rows (bounding boxes) per ImageID, as an (ImageID, Count) frame
image_counts = (
    person_annotations
    .groupby('ImageID')
    .size()
    .reset_index(name='Count')
)

# ImageID -> box count lookup, in the same order as the rows of image_counts
image_counts_dict = {
    image_id: box_count
    for image_id, box_count in zip(image_counts['ImageID'], image_counts['Count'])
}

In [None]:
# the first 50 ImageIDs (in dict order) form the training split, the following 50 the validation split
image_ids = [*image_counts_dict]
train_keys, val_keys = image_ids[:50], image_ids[50:100]

# per-split ImageID -> box count lookups
train_dict = dict((key, image_counts_dict[key]) for key in train_keys)
val_dict = dict((key, image_counts_dict[key]) for key in val_keys)

In [None]:
class YOLODataset(torch.utils.data.Dataset):
    """Detection dataset backed by an image folder and a comma-separated box file.

    Every non-blank line of ``annotation_file`` must read
    ``image_id,xmin,ymin,xmax,ymax``. ``image_id`` is joined onto ``image_folder`` to
    locate the image, so it has to be the image's file name inside that folder. The
    coordinates are divided by the image width/height in ``__getitem__``, i.e. they
    are treated as being expressed in pixels of the original image.

    Args:
        image_folder: Directory containing the images.
        annotation_file: Path of the box file described above.
        transform: Optional callable applied to the PIL image. The boxes are NOT
            passed through it, so a transform that moves image content (flip,
            rotation, crop, ...) leaves the returned boxes out of sync.

    Raises:
        ValueError: If a non-blank line does not have exactly five comma-separated
            fields, or a coordinate cannot be parsed as a float.
    """

    def __init__(self, image_folder, annotation_file, transform=None):
        self.image_folder = image_folder
        self.annotation_file = annotation_file
        self.transform = transform

        # image_id -> list of [xmin, ymin, xmax, ymax]; insertion order defines sample order
        self.annotations = {}
        with open(annotation_file, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                stripped = line.strip()
                if not stripped:
                    # a blank line would otherwise register a phantom sample with id ''
                    continue

                fields = stripped.split(',')
                if len(fields) != 5:
                    # fail here with the offending line instead of later when unpacking
                    raise ValueError(
                        f"{annotation_file}, line {line_number}: expected "
                        f"'image_id,xmin,ymin,xmax,ymax', got {stripped!r}"
                    )

                image_id = fields[0]
                bbox = [float(x) for x in fields[1:]]
                self.annotations.setdefault(image_id, []).append(bbox)

        self.image_files = [os.path.join(image_folder, img) for img in self.annotations]

    def __len__(self):
        """Return the number of distinct images listed in the annotation file."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(image, boxes)`` for sample ``idx``.

        ``image`` is the RGB image after ``transform`` (the PIL image itself when no
        transform is set). ``boxes`` is a float32 tensor of shape ``(num_boxes, 5)``
        whose rows are ``[class_id, center_x, center_y, width, height]``, with the
        four geometry values divided by the original image width/height.
        """
        image_path = self.image_files[idx]
        image_id = os.path.basename(image_path)

        # Load image; the context manager closes the underlying file once the pixel
        # data has been copied by convert()
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        width, height = image.size

        class_id = 0  # Assuming single class for people
        boxes = [
            [
                class_id,
                (xmin + xmax) / 2 / width,
                (ymin + ymax) / 2 / height,
                (xmax - xmin) / width,
                (ymax - ymin) / height,
            ]
            for xmin, ymin, xmax, ymax in self.annotations[image_id]
        ]
        # reshape keeps the (N, 5) layout even when an image has no boxes
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 5)

        if self.transform:
            image = self.transform(image)

        return image, boxes


In [None]:
# create transformation objects
#
# Training uses photometric augmentation only. YOLODataset applies the transform to the
# image alone and returns the boxes unchanged, so flips, rotations or affine warps would
# move the people in the picture while their box coordinates stayed where they were.
train_transform = transforms.Compose([
    transforms.Resize((128, 128)), # rescale to uniform size; boxes are stored relative to width/height, so they remain valid
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2), # colour-only augmentation, geometry untouched
    transforms.ToTensor(), # transform to tensor, brings pixels to range (0, 1)
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)) # normalize to range (-1, 1) for all three channels
])

# validation: same preprocessing, no augmentation
val_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
])

# Create training and validation datasets
#
# YOLODataset opens its second argument as a text file with one
# `image_file,xmin,ymin,xmax,ymax` row per box and divides the coordinates by the image
# size, so it cannot be handed the {ImageID: box count} dicts directly. The boxes of each
# split are therefore written to such a file first.
def write_split_annotations(split_counts, out_path):
    """Write the boxes of one split to `out_path` in the format YOLODataset parses.

    split_counts: mapping (or other iterable) whose keys are the ImageIDs of the split.
    out_path: text file to (over)write; returned unchanged for convenience.

    Raises FileNotFoundError when none of the split's images exists in `image_folder`.
    """
    # ImageID -> file name on disk, so every row names a file the dataset can open
    files_by_id = {os.path.splitext(name)[0]: name for name in os.listdir(image_folder)}
    split_ids = [image_id for image_id in split_counts if image_id in files_by_id]
    if not split_ids:
        raise FileNotFoundError(f"none of the requested images was found in {image_folder}")

    split_rows = person_annotations[person_annotations['ImageID'].isin(split_ids)]
    with open(out_path, 'w') as out_file:
        for image_id, image_rows in split_rows.groupby('ImageID'):
            file_name = files_by_id[image_id]
            with Image.open(os.path.join(image_folder, file_name)) as img:
                width, height = img.size
            # NOTE(review): assumes XMin/YMin/XMax/YMax are fractions of the image size
            # (the label-export cell writes them out as YOLO values without rescaling);
            # they are scaled to pixels here because YOLODataset divides by width/height.
            # Drop the scaling if the CSV already holds pixel coordinates.
            box_values = image_rows[['XMin', 'YMin', 'XMax', 'YMax']].itertuples(index=False, name=None)
            for xmin, ymin, xmax, ymax in box_values:
                out_file.write(f"{file_name},{xmin * width},{ymin * height},{xmax * width},{ymax * height}\n")
    return out_path


def collate_detection_batch(batch):
    """Stack the images of a batch; keep the per-image box tensors in a list.

    The number of boxes differs between images, so the default collate function
    (which stacks every field) cannot batch them.
    """
    images, boxes = zip(*batch)
    return torch.stack(images), list(boxes)


train_annotation_file = write_split_annotations(
    train_dict, '../../02_data/Google_OpenImages/person_images_small_train_boxes.csv')
val_annotation_file = write_split_annotations(
    val_dict, '../../02_data/Google_OpenImages/person_images_small_val_boxes.csv')

train_dataset = YOLODataset(image_folder, train_annotation_file, transform=train_transform)
val_dataset = YOLODataset(image_folder, val_annotation_file, transform=val_transform)

# Create DataLoaders (each batch is: image tensor of shape (B, C, H, W), list of B box tensors)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_detection_batch)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_detection_batch)