<a href="https://colab.research.google.com/github/Tuevu110405/AIO_Module_7/blob/feature%2Ftraining/detection_basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download the latest dataset version; the returned value is printed below as the path to the dataset files
data_dir = kagglehub.dataset_download("andrewmvd/dog-and-cat-detection")
print("Path to dataset files:" , data_dir)

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import xml.etree.ElementTree as ET

from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torchvision.models.resnet import ResNet18_Weights


In [None]:
class ImageDataset(Dataset):
    """Cat/dog classification dataset restricted to single-object images.

    Every file in ``image_dir`` is paired with the XML file of the same base
    name in ``annotations_dir``. Only images whose annotation lists exactly
    one ``<object>`` are kept. Items are ``(image, label)`` where ``label`` is
    0 for "cat", 1 for "dog" and -1 for any other object name.

    Args:
        annotations_dir: Directory containing the ``.xml`` annotation files.
        image_dir: Directory containing the image files.
        transforms: Optional callable applied to the loaded RGB PIL image.
        transform: Alias of ``transforms`` (used only when ``transforms`` is
            ``None``); the notebook's call sites use this spelling.
    """

    def __init__(self, annotations_dir, image_dir, transforms=None, transform=None):
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transforms = transforms if transforms is not None else transform
        self.image_files = self.filter_images_with_multiple_objects()

    def _annotation_path(self, img_name):
        """Return the path of the XML annotation that belongs to ``img_name``."""
        annotation_name = os.path.splitext(img_name)[0] + ".xml"
        return os.path.join(self.annotations_dir, annotation_name)

    def filter_images_with_multiple_objects(self):
        """Return the image file names whose annotation holds exactly one object.

        Images with several objects are reported and dropped. Images without
        any annotated object (including a missing XML file) are dropped as
        well, because ``__getitem__`` could not produce a usable label for them.
        """
        valid_image_files = []
        for img_name in os.listdir(self.image_dir):
            if not os.path.isfile(os.path.join(self.image_dir, img_name)):
                continue

            num_objects = self.count_objects_in_annotation(self._annotation_path(img_name))
            if num_objects == 1:
                valid_image_files.append(img_name)
            elif num_objects > 1:
                print(
                    f"Image {img_name} has multiple objects and will be excluded from the dataset"
                )
            else:
                print(
                    f"Image {img_name} has no annotated object and will be excluded from the dataset"
                )

        return valid_image_files

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``<object>`` entries, or 0 if the file is missing."""
        try:
            root = ET.parse(annotation_path).getroot()
        except FileNotFoundError:
            return 0
        return len(root.findall("object"))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, label)``."""
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert("RGB")

        # Parse annotation file
        label = self.parse_annotation(self._annotation_path(img_name))

        if self.transforms:
            image = self.transforms(image)

        return image, label

    def parse_annotation(self, annotation_path):
        """Return the numeric label of the first object in the annotation file.

        Returns:
            0 for "cat", 1 for "dog", -1 for any other name or when the file
            contains no ``<object>`` element.
        """
        root = ET.parse(annotation_path).getroot()

        first_object = root.find("object")
        label = first_object.find("name").text if first_object is not None else None

        return 0 if label == "cat" else 1 if label == "dog" else -1

Phân tích và chuẩn bị dữ liệu (data analysis and preparation)

In [None]:
# Annotation and image folders inside the downloaded dataset
annotations_dir = os.path.join(data_dir, "annotations")
image_dir = os.path.join(data_dir, "images")

# One-column dataframe of image file names; it exists only so the names can be split
image_files = [
    entry
    for entry in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, entry))
]
df = pd.DataFrame({"image_name": image_files})

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Bound to ``transform`` (singular) so the torchvision ``transforms`` module is
# not overwritten and this cell can be re-run.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])

# The pipeline is passed positionally: the constructor's third parameter is
# named ``transforms``, so the keyword ``transform=`` would raise a TypeError.
train_dataset = ImageDataset(annotations_dir, image_dir, transform)
val_dataset = ImageDataset(annotations_dir, image_dir, transform)

# Filter datasets based on train_df and val_df (sets give O(1) membership tests)
train_names = set(train_df['image_name'])
val_names = set(val_df['image_name'])
train_dataset.image_files = [f for f in train_dataset.image_files if f in train_names]
val_dataset.image_files = [f for f in val_dataset.image_files if f in val_names]

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ResNet-18 with the default weights; its final layer is replaced by a 2-output linear layer
model = models.resnet18(weights=ResNet18_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 2)
model = model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(model)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 167MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [None]:
#Training Loop
num_epochs = 10
for epoch in range(num_epochs):
    # --- training pass ---
    model.train()
    for data, targets in train_loader:
        data = data.to(device)
        targets = targets.to(device)

        scores = model(data)
        loss = criterion(scores, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # --- validation pass: accuracy over the whole validation loader ---
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, targets in val_loader:
            data = data.to(device)
            targets = targets.to(device)

            predictions = model(data).argmax(dim=1)
            total += targets.size(0)
            # .item() keeps the counter a plain Python int instead of a device tensor
            correct += (predictions == targets).sum().item()

    # Guard against an empty validation loader
    accuracy = 100 * correct / total if total else 0.0
    print(f"Epoch {epoch+1}/{num_epochs}, Accuracy: {accuracy:.2f}%")

classification + bounding box regression

In [None]:
import kagglehub
# Download the latest dataset version; the returned value is printed below as the path to the dataset files
data_dir = kagglehub.dataset_download("andrewmvd/dog-and-cat-detection")
print("Path to dataset files:" , data_dir)

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import xml.etree.ElementTree as ET

from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torchvision.models.resnet import ResNet18_Weights


In [None]:
class ImageDataset(Dataset):
    """Cat/dog dataset yielding a class label and a normalised bounding box.

    Every file in ``image_dir`` is paired with the XML file of the same base
    name in ``annotations_dir``. Only images whose annotation lists exactly
    one ``<object>`` are kept. Items are ``(image, label, bbox)`` where
    ``label`` is 0 for "cat", 1 for "dog", -1 otherwise, and ``bbox`` is a
    float32 tensor ``[xmin, ymin, xmax, ymax]`` divided by the image width
    and height recorded in the annotation.

    Args:
        annotations_dir: Directory containing the ``.xml`` annotation files.
        image_dir: Directory containing the image files.
        transforms: Optional callable applied to the loaded RGB PIL image.
        transform: Alias of ``transforms`` (used only when ``transforms`` is
            ``None``); the notebook's call sites use this spelling.
    """

    def __init__(self, annotations_dir, image_dir, transforms=None, transform=None):
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transforms = transforms if transforms is not None else transform
        self.image_files = self.filter_images_with_multiple_objects()

    def _annotation_path(self, img_name):
        """Return the path of the XML annotation that belongs to ``img_name``."""
        annotation_name = os.path.splitext(img_name)[0] + ".xml"
        return os.path.join(self.annotations_dir, annotation_name)

    def filter_images_with_multiple_objects(self):
        """Return the image file names whose annotation holds exactly one object.

        Images with several objects are reported and dropped. Images without
        any annotated object (including a missing XML file) are dropped as
        well, because no bounding box could be produced for them.
        """
        valid_image_files = []
        for img_name in os.listdir(self.image_dir):
            if not os.path.isfile(os.path.join(self.image_dir, img_name)):
                continue

            num_objects = self.count_objects_in_annotation(self._annotation_path(img_name))
            if num_objects == 1:
                valid_image_files.append(img_name)
            elif num_objects > 1:
                print(
                    f"Image {img_name} has multiple objects and will be excluded from the dataset"
                )
            else:
                print(
                    f"Image {img_name} has no annotated object and will be excluded from the dataset"
                )

        return valid_image_files

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``<object>`` entries, or 0 if the file is missing."""
        try:
            root = ET.parse(annotation_path).getroot()
        except FileNotFoundError:
            return 0
        return len(root.findall("object"))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load image ``idx`` as RGB and return ``(image, label, bbox)``."""
        img_name = self.image_files[idx]
        img_path = os.path.join(self.image_dir, img_name)
        image = Image.open(img_path).convert("RGB")

        # Parse annotation file
        label, bbox = self.parse_annotation(self._annotation_path(img_name))

        if self.transforms:
            image = self.transforms(image)

        return image, label, bbox

    def parse_annotation(self, annotation_path):
        """Return ``(label_num, bbox)`` for the first object in the annotation.

        The box corners are divided by the ``size/width`` and ``size/height``
        values stored in the same XML file.

        Raises:
            ValueError: If the annotation contains no ``<object>`` element.
        """
        root = ET.parse(annotation_path).getroot()

        image_width = int(root.find("size/width").text)
        image_height = int(root.find("size/height").text)

        first_object = root.find("object")
        if first_object is None:
            raise ValueError(f"No <object> entry found in {annotation_path}")

        label = first_object.find("name").text
        xmin = int(first_object.find("bndbox/xmin").text)
        ymin = int(first_object.find("bndbox/ymin").text)
        xmax = int(first_object.find("bndbox/xmax").text)
        ymax = int(first_object.find("bndbox/ymax").text)

        bbox = [
            xmin / image_width,
            ymin / image_height,
            xmax / image_width,
            ymax / image_height,
        ]

        label_num = 0 if label == "cat" else 1 if label == "dog" else -1

        return label_num, torch.tensor(bbox, dtype=torch.float32)


In [None]:
# Annotation and image folders inside the downloaded dataset
annotations_dir = os.path.join(data_dir, "annotations")
image_dir = os.path.join(data_dir, "images")

# One-column dataframe of image file names, used only to split the names
image_files = [
    entry
    for entry in os.listdir(image_dir)
    if os.path.isfile(os.path.join(image_dir, entry))
]
df = pd.DataFrame({"image_name": image_files})

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Bound to ``transform`` (singular) so the torchvision ``transforms`` module is
# not overwritten and this cell can be re-run.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])

# The pipeline is passed positionally: the constructor's third parameter is
# named ``transforms``, so the keyword ``transform=`` would raise a TypeError.
train_dataset = ImageDataset(annotations_dir, image_dir, transform)
val_dataset = ImageDataset(annotations_dir, image_dir, transform)

# Keep only the files assigned to each split (sets give O(1) membership tests)
train_names = set(train_df['image_name'])
val_names = set(val_df['image_name'])
train_dataset.image_files = [f for f in train_dataset.image_files if f in train_names]
val_dataset.image_files = [f for f in val_dataset.image_files if f in val_names]

#Dataloaders
train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)

In [None]:
# Model with two heads
class TwoHeadedModel(nn.Module):
    """ResNet-18 feature extractor feeding a classification and a box head.

    Args:
        num_classes: Number of outputs of the classification head.
    """

    def __init__(self, num_classes = 2):
        super().__init__()
        backbone = models.resnet18(weights=ResNet18_Weights.DEFAULT)
        self.num_ftrs = backbone.fc.in_features

        # Drop the original fully connected layer so the backbone emits features
        backbone.fc = nn.Identity()
        self.base_model = backbone
        self.classifier = nn.Linear(self.num_ftrs, num_classes)
        self.regressor = nn.Linear(self.num_ftrs, 4)

    def forward(self, x):
        """Return ``(class_logits, bbox_coords)``; the box output passes through a sigmoid."""
        features = self.base_model(x)
        return self.classifier(features), torch.sigmoid(self.regressor(features))

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-headed network moved to the selected device
model = TwoHeadedModel().to(device)

# One loss per head (classification / box regression) and a single optimizer
criterion_class = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
#Training loop
num_epochs = 10
for epoch in range(num_epochs):
    # --- training pass: the two head losses are summed with equal weight ---
    model.train()
    for data, targets, bboxes in train_loader:
        data = data.to(device)
        targets = targets.to(device)
        bboxes = bboxes.to(device)

        scores, pred_bboxes = model(data)
        loss_class = criterion_class(scores, targets)
        loss_bbox = criterion_bbox(pred_bboxes, bboxes)
        loss = loss_class + loss_bbox

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    #Validation
    model.eval()
    correct = 0
    total = 0
    total_loss_bbox = 0.0
    with torch.no_grad():
        for data, targets, bboxes in val_loader:
            data = data.to(device)
            targets = targets.to(device)
            bboxes = bboxes.to(device)

            scores, pred_bboxes = model(data)
            predictions = scores.argmax(dim=1)
            # .item() keeps the counter a plain Python int instead of a device tensor
            correct += (predictions == targets).sum().item()
            total += targets.size(0)

            # Weight the batch-mean loss by batch size so the final value is a per-sample mean
            total_loss_bbox += criterion_bbox(pred_bboxes, bboxes).item() * data.size(0)

    # Guard against an empty validation loader
    accuracy = 100 * correct / total if total else 0.0
    avg_loss_bbox = total_loss_bbox / total if total else 0.0

    # The reported "Validation Loss" is the bounding-box MSE only
    print(f"Epoch {epoch+1}/{num_epochs}, Accuracy: {accuracy:.2f}%, Validation Loss: {avg_loss_bbox:.4f}")




Multi-object classification and bounding box regression

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import matplotlib.patches as patches
import xml.etree.ElementTree as ET
import tqdm.notebook as tqdm

from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torchvision.models.resnet import ResNet18_Weights, ResNet50_Weights


In [None]:
class MyDataset(Dataset):
    """Two-object dataset built by pasting two single-object images side by side.

    Only images whose annotation lists exactly one ``<object>`` are kept.
    ``__getitem__`` pairs the requested image with a randomly chosen second
    image, pastes them next to each other and returns the merged image with
    a list of two ``{"bbox": [...], "label": int}`` dicts. Boxes are
    ``[xmin, ymin, xmax, ymax]`` expressed as fractions of the merged canvas.

    Args:
        annotations_dir: Directory containing the ``.xml`` annotation files.
        image_dir: Directory containing the image files.
        transforms: Optional callable applied to the merged PIL image.
        transform: Alias of ``transforms`` (used only when ``transforms`` is
            ``None``).
    """

    def __init__(self, annotations_dir, image_dir, transforms=None, transform=None):
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transforms = transforms if transforms is not None else transform
        self.image_files = self.filter_images_with_multiple_objects()

    def _annotation_path(self, img_name):
        """Return the path of the XML annotation that belongs to ``img_name``."""
        annotation_name = os.path.splitext(img_name)[0] + ".xml"
        return os.path.join(self.annotations_dir, annotation_name)

    @staticmethod
    def _to_merged_bbox(bbox, img_w, img_h, x_offset, merged_w, merged_h):
        """Convert a box normalised to its own image into merged-canvas fractions.

        ``x_offset`` is the horizontal pixel position at which the image was
        pasted; images are pasted at the top, so no vertical offset is needed.
        """
        xmin, ymin, xmax, ymax = bbox
        return [
            (xmin * img_w + x_offset) / merged_w,
            (ymin * img_h) / merged_h,
            (xmax * img_w + x_offset) / merged_w,
            (ymax * img_h) / merged_h,
        ]

    def filter_images_with_multiple_objects(self):
        """Return the image file names whose annotation holds exactly one object."""
        return [
            img_name
            for img_name in os.listdir(self.image_dir)
            if os.path.isfile(os.path.join(self.image_dir, img_name))
            and self.count_objects_in_annotation(self._annotation_path(img_name)) == 1
        ]

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``<object>`` entries, or 0 if the file is missing."""
        try:
            root = ET.parse(annotation_path).getroot()
        except FileNotFoundError:
            return 0
        return len(root.findall("object"))

    def parse_annotation(self, annotation_path):
        """Return ``(label_num, bbox)`` for the first object in the annotation.

        ``label_num`` is 0 for "cat", 1 for "dog", -1 otherwise. ``bbox`` is a
        float32 tensor ``[xmin, ymin, xmax, ymax]`` divided by the
        ``size/width`` and ``size/height`` values stored in the XML file.

        Raises:
            ValueError: If the annotation contains no ``<object>`` element.
        """
        root = ET.parse(annotation_path).getroot()

        image_width = int(root.find("size/width").text)
        image_height = int(root.find("size/height").text)

        first_object = root.find("object")
        if first_object is None:
            raise ValueError(f"No <object> entry found in {annotation_path}")

        label = first_object.find("name").text
        xmin = int(first_object.find("bndbox/xmin").text)
        ymin = int(first_object.find("bndbox/ymin").text)
        xmax = int(first_object.find("bndbox/xmax").text)
        ymax = int(first_object.find("bndbox/ymax").text)

        bbox = [
            xmin / image_width,
            ymin / image_height,
            xmax / image_width,
            ymax / image_height,
        ]

        label_num = 0 if label == "cat" else 1 if label == "dog" else -1
        return label_num, torch.tensor(bbox, dtype=torch.float32)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(merged_image, merged_annotations)`` for image ``idx`` plus a random partner."""
        import random  # local import: the notebook's import cell does not provide it

        img1_file = self.image_files[idx]
        img2_file = self.image_files[random.randint(0, len(self.image_files) - 1)]

        img1_label, img1_bbox = self.parse_annotation(self._annotation_path(img1_file))
        img2_label, img2_bbox = self.parse_annotation(self._annotation_path(img2_file))

        img1 = Image.open(os.path.join(self.image_dir, img1_file)).convert("RGB")
        img2 = Image.open(os.path.join(self.image_dir, img2_file)).convert("RGB")

        # Canvas wide enough for both images and as tall as the taller one
        merged_w = img1.width + img2.width
        merged_h = max(img1.height, img2.height)
        merged_image = Image.new("RGB", (merged_w, merged_h))
        merged_image.paste(img1, (0, 0))
        merged_image.paste(img2, (img1.width, 0))

        # Both boxes are re-expressed relative to the merged canvas; the second
        # image is shifted right by the width of the first one.
        merged_annotations = [
            {
                "bbox": self._to_merged_bbox(
                    img1_bbox.tolist(), img1.width, img1.height, 0, merged_w, merged_h
                ),
                "label": img1_label,
            },
            {
                "bbox": self._to_merged_bbox(
                    img2_bbox.tolist(), img2.width, img2.height, img1.width, merged_w, merged_h
                ),
                "label": img2_label,
            },
        ]

        if self.transforms:
            merged_image = self.transforms(merged_image)

        return merged_image, merged_annotations



In [None]:
# Data directory
annotations_dir = os.path.join(data_dir, "annotations")
image_dir = os.path.join(data_dir, "images")

# One-column dataframe of image file names, used only to split the names
image_files = [f for f in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, f))]
df = pd.DataFrame({'image_name' : image_files})

# 80/20 train/validation split with a fixed seed
train_df, val_df = train_test_split(df, test_size = 0.2, random_state = 42)

In [None]:
# Resize to 224x224, convert to a tensor and normalise each channel with the given mean/std
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])
])

# The pipeline is passed positionally: the constructor's third parameter is
# named ``transforms``, so the keyword ``transform=`` would raise a TypeError.
train_dataset = ImageDataset(annotations_dir, image_dir, transform)
val_dataset = ImageDataset(annotations_dir, image_dir, transform)

In [None]:
# Keep only the files assigned to each split (sets give O(1) membership tests)
train_names = set(train_df['image_name'])
val_names = set(val_df['image_name'])
train_dataset.image_files = [f for f in train_dataset.image_files if f in train_names]
val_dataset.image_files = [f for f in val_dataset.image_files if f in val_names]

train_loader = DataLoader(train_dataset, batch_size = 32, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 32, shuffle = False)



In [None]:
class TwoHeadModel(nn.Module):
    """ResNet-18 feature extractor feeding a classification and a box head.

    Args:
        num_classes: Number of outputs of the classification head.
    """

    def __init__(self, num_classes = 2):
        super(TwoHeadModel, self).__init__()
        self.base_model = models.resnet18(weights = ResNet18_Weights.DEFAULT)
        self.num_ftrs = self.base_model.fc.in_features

        # Drop the original fully connected layer so the backbone emits features
        self.base_model.fc = nn.Identity()
        self.classifier = nn.Linear(self.num_ftrs, num_classes)
        self.regressor = nn.Linear(self.num_ftrs, 4)

    def forward(self, x):
        """Return ``(class_logits, bbox_coords)``; the box output passes through a sigmoid."""
        x = self.base_model(x)
        class_logits = self.classifier(x)
        bbox_coords = torch.sigmoid(self.regressor(x))
        return class_logits, bbox_coords

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-headed network, moved to the selected device in place
model = TwoHeadModel()
model.to(device)

# One loss per head (classification / box regression) and a single optimizer
criterion_class = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
num_epochs = 10
for epoch in range(num_epochs):
    # --- training pass: the two head losses are summed with equal weight ---
    model.train()
    for data, targets, bboxes in train_loader:
        data = data.to(device)
        targets = targets.to(device)
        bboxes = bboxes.to(device)

        scores, pred_bboxes = model(data)
        loss_class = criterion_class(scores, targets)
        loss_bbox = criterion_bbox(pred_bboxes, bboxes)
        loss = loss_class + loss_bbox

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # --- validation pass ---
    model.eval()
    correct = 0
    total = 0
    total_loss_bbox = 0.0
    total_samples = 0
    with torch.no_grad():
        for data, targets, bboxes in val_loader:
            data = data.to(device)
            targets = targets.to(device)
            bboxes = bboxes.to(device)

            scores, pred_bboxes = model(data)
            predictions = scores.argmax(dim=1)
            # .item() keeps the counter a plain Python int instead of a device tensor
            correct += (predictions == targets).sum().item()
            total += targets.size(0)

            # Weight the batch-mean loss by batch size so the final value is a per-sample mean
            total_loss_bbox += criterion_bbox(pred_bboxes, bboxes).item() * data.size(0)
            total_samples += data.size(0)

    # Guard against an empty validation loader
    accuracy = 100 * correct / total if total else 0.0
    avg_loss_bbox = total_loss_bbox / total_samples if total_samples else 0.0

    # The reported "Validation Loss" is the bounding-box MSE only
    print(f"Epoch {epoch+1}/{num_epochs}, Accuracy: {accuracy:.2f}%, Validation Loss: {avg_loss_bbox:.4f}")
    print(f'AVG. bbox: {avg_loss_bbox:.4f}')

classification(> 2 objects) + bounding box regression

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torch.nn.functional as F
import matplotlib.patches as patches
import xml.etree.ElementTree as ET
import tqdm.notebook as tqdm

from PIL import Image
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from torchvision.models.resnet import ResNet18_Weights, ResNet50_Weights


In [None]:
class MyDataset(Dataset):
    """Two-object dataset built by pasting two single-object images side by side.

    Only images whose annotation lists exactly one ``<object>`` are kept.
    ``__getitem__`` pairs the requested image with a randomly chosen second
    image, pastes them next to each other and returns the merged image with
    a ``(2, 5)`` float tensor whose rows are
    ``[xmin, ymin, xmax, ymax, label]``. Box values are fractions of the
    merged canvas; ``label`` is 0 for "cat", 1 for "dog", -1 otherwise.

    Args:
        annotations_dir: Directory containing the ``.xml`` annotation files.
        image_dir: Directory containing the image files.
        transforms: Optional callable applied to the merged PIL image; when
            omitted the image is converted with ``ToTensor``.
        transform: Alias of ``transforms`` (used only when ``transforms`` is
            ``None``); the notebook's call site uses this spelling.
    """

    def __init__(self, annotations_dir, image_dir, transforms=None, transform=None):
        self.annotations_dir = annotations_dir
        self.image_dir = image_dir
        self.transforms = transforms if transforms is not None else transform
        self.image_files = self.filter_images_with_multiple_objects()

    def _annotation_path(self, img_name):
        """Return the path of the XML annotation that belongs to ``img_name``."""
        annotation_name = os.path.splitext(img_name)[0] + ".xml"
        return os.path.join(self.annotations_dir, annotation_name)

    @staticmethod
    def _to_merged_bbox(bbox, img_w, img_h, x_offset, merged_w, merged_h):
        """Convert a box normalised to its own image into merged-canvas fractions.

        ``x_offset`` is the horizontal pixel position at which the image was
        pasted; images are pasted at the top, so no vertical offset is needed.
        """
        xmin, ymin, xmax, ymax = bbox
        return [
            (xmin * img_w + x_offset) / merged_w,
            (ymin * img_h) / merged_h,
            (xmax * img_w + x_offset) / merged_w,
            (ymax * img_h) / merged_h,
        ]

    def filter_images_with_multiple_objects(self):
        """Return the image file names whose annotation holds exactly one object."""
        return [
            img_name
            for img_name in os.listdir(self.image_dir)
            if os.path.isfile(os.path.join(self.image_dir, img_name))
            and self.count_objects_in_annotation(self._annotation_path(img_name)) == 1
        ]

    def count_objects_in_annotation(self, annotation_path):
        """Return the number of ``<object>`` entries, or 0 if the file is missing."""
        try:
            root = ET.parse(annotation_path).getroot()
        except FileNotFoundError:
            return 0
        return len(root.findall("object"))

    def parse_annotation(self, annotation_path):
        """Return ``(label_num, bbox)`` for the first object in the annotation.

        ``bbox`` is a float32 tensor ``[xmin, ymin, xmax, ymax]`` divided by
        the ``size/width`` and ``size/height`` values stored in the XML file.

        Raises:
            ValueError: If the annotation contains no ``<object>`` element.
        """
        root = ET.parse(annotation_path).getroot()

        image_width = int(root.find('size/width').text)
        image_height = int(root.find('size/height').text)

        first_object = root.find("object")
        if first_object is None:
            raise ValueError(f"No <object> entry found in {annotation_path}")

        label = first_object.find("name").text
        xmin = int(first_object.find('bndbox/xmin').text)
        ymin = int(first_object.find('bndbox/ymin').text)
        xmax = int(first_object.find('bndbox/xmax').text)
        ymax = int(first_object.find('bndbox/ymax').text)

        bbox = [
            xmin / image_width,
            ymin / image_height,
            xmax / image_width,
            ymax / image_height,
        ]

        label_num = 0 if label == "cat" else 1 if label == "dog" else -1
        return label_num, torch.tensor(bbox, dtype=torch.float32)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return ``(merged_image, annotations)`` for image ``idx`` plus a random partner."""
        import random  # local import: the notebook's import cell does not provide it

        img1_file = self.image_files[idx]
        img2_file = self.image_files[random.randint(0, len(self.image_files) - 1)]

        img1_label, img1_bbox = self.parse_annotation(self._annotation_path(img1_file))
        img2_label, img2_bbox = self.parse_annotation(self._annotation_path(img2_file))

        img1 = Image.open(os.path.join(self.image_dir, img1_file)).convert("RGB")
        img2 = Image.open(os.path.join(self.image_dir, img2_file)).convert("RGB")

        # Canvas wide enough for both images and as tall as the taller one
        merged_w = img1.width + img2.width
        merged_h = max(img1.height, img2.height)
        merged_image = Image.new("RGB", (merged_w, merged_h))
        merged_image.paste(img1, (0, 0))
        merged_image.paste(img2, (img1.width, 0))

        # Both boxes are re-expressed relative to the merged canvas; the second
        # image is shifted right by the width of the first one.
        merged_annotations = [
            {
                "bbox": self._to_merged_bbox(
                    img1_bbox.tolist(), img1.width, img1.height, 0, merged_w, merged_h
                ),
                "label": img1_label,
            },
            {
                "bbox": self._to_merged_bbox(
                    img2_bbox.tolist(), img2.width, img2.height, img1.width, merged_w, merged_h
                ),
                "label": img2_label,
            },
        ]

        if self.transforms:
            merged_image = self.transforms(merged_image)
        else:
            merged_image = transforms.ToTensor()(merged_image)

        # One row per object: four box values followed by the class label
        annotations = torch.tensor(
            [ann["bbox"] + [float(ann["label"])] for ann in merged_annotations],
            dtype=torch.float32,
        )

        return merged_image, annotations


In [None]:
#Data directory
annotations_dir = os.path.join(data_dir, "annotations")
image_dir = os.path.join(data_dir, "images")

#Define transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Passed positionally: the constructor's third parameter is named ``transforms``,
# so the keyword ``transform=`` would raise a TypeError.
dataset = MyDataset(annotations_dir, image_dir, transform)

# Split the sample indices rather than the dataset object: handing the dataset
# itself to train_test_split makes scikit-learn index (i.e. load and merge)
# every sample up front and keep the results in memory as plain lists.
train_indices, val_indices = train_test_split(
    list(range(len(dataset))), test_size = 0.2, random_state = 42
)
train_dataset = torch.utils.data.Subset(dataset, train_indices)
val_dataset = torch.utils.data.Subset(dataset, val_indices)

train_loader = DataLoader(train_dataset, batch_size = 8, shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = 8, shuffle = False)

In [None]:
class SimpleYOLO(nn.Module):
    """ResNet-50 backbone with one linear head predicting a 2x2 grid of cells.

    The head emits ``2 * 2 * (4 + num_classes)`` values per image: for each
    grid cell, four box values followed by ``num_classes`` class scores.

    Args:
        num_classes: Number of class scores predicted per grid cell.
    """

    def __init__(self, num_classes):
        super(SimpleYOLO, self).__init__()
        self.num_classes = num_classes

        # Keep everything except the last two children of the ResNet
        # (its pooling and fully connected layers).
        resnet = models.resnet50(weights = ResNet50_Weights.DEFAULT)
        self.backbone = nn.Sequential(*list(resnet.children())[:-2])

        self.fcs = nn.Linear(
            2048, 2 * 2 * (4 + self.num_classes)
        )

    def forward(self, x):
        """Return a ``(batch, 2 * 2 * (4 + num_classes))`` tensor of raw predictions."""
        features = self.backbone(x)
        features = F.adaptive_avg_pool2d(features, (1, 1))
        features = features.view(features.size(0), -1)
        output = self.fcs(features)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 2
# Must match the label encoding produced by the dataset's parse_annotation
# (cat -> 0, dog -> 1); the previous mapping had the two classes swapped.
class_to_idx = {'cat' : 0, 'dog' : 1}

model = SimpleYOLO(num_classes = num_classes).to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.001)

In [None]:
def calculate_loss(output, targets, device, num_classes):
    """Compute the 2x2-grid detection loss for one batch.

    For every target object the grid cell containing the box centre is made
    responsible for it. The loss per object is the sum of:
      1. cross-entropy between that cell's class scores and the object label,
      2. MSE between that cell's four box values and the target box,
      3. MSE pulling the box values of the other three cells towards zero.

    Args:
        output: Model output of shape ``(batch, 2 * 2 * (4 + num_classes))``.
        targets: Per-image sequences of rows ``[xmin, ymin, xmax, ymax, label]``.
            Assumes box values lie in ``[0, 1]`` relative to the whole image —
            confirm against the dataset that feeds this function.
        device: Device on which ``output`` lives.
        num_classes: Number of class scores per grid cell.

    Returns:
        The summed loss over all objects divided by the batch size.
    """
    mse_loss = nn.MSELoss()
    ce_loss = nn.CrossEntropyLoss()

    grid_size = 2
    batch_size = output.shape[0]
    total_loss = 0

    output = output.view(batch_size, grid_size, grid_size, 4 + num_classes)
    zero_bbox = torch.zeros(4, device = device)

    for i in range(batch_size):
        for target in targets[i]:
            bbox_center_x = (target[0] + target[2]) / 2
            bbox_center_y = (target[1] + target[3]) / 2

            # Clamp so a centre lying exactly on the right/bottom edge (1.0)
            # still maps to the last cell instead of indexing out of range.
            grid_x = min(max(int(bbox_center_x * grid_size), 0), grid_size - 1)
            grid_y = min(max(int(bbox_center_y * grid_size), 0), grid_size - 1)

            # 1 Classification loss for the responsible grid cell. The label is
            # stored as a float inside the target row, so cast it to an index.
            label = torch.tensor([int(target[4])], device = device)
            class_logits = output[i, grid_y, grid_x, 4:].unsqueeze(0)
            classification_loss = ce_loss(class_logits, label)

            # 2 Regression loss for the responsible grid cell
            bbox_target = target[:4].to(device)
            regression_loss = mse_loss(output[i, grid_y, grid_x, :4], bbox_target)

            # 3 No-object loss (for the other grid cells)
            no_obj_loss = 0
            for other_grid_y in range(grid_size):
                for other_grid_x in range(grid_size):
                    if other_grid_x != grid_x or other_grid_y != grid_y:
                        no_obj_loss += mse_loss(output[i, other_grid_y, other_grid_x, :4], zero_bbox)

            total_loss += classification_loss + regression_loss + no_obj_loss

    return total_loss / batch_size

def evaluate_model(model, data_loader, device, num_classes):
    """Evaluate the model on ``data_loader``.

    For every target object, the class predicted by the grid cell containing
    the object's box centre is compared with the object's label.

    Args:
        model: Network returning ``(batch, 2 * 2 * (4 + num_classes))`` outputs.
        data_loader: Iterable of ``(images, targets)`` batches.
        device: Device on which the model runs.
        num_classes: Number of class scores per grid cell.

    Returns:
        ``(val_loss, val_accuracy)``: the mean of the per-batch losses and the
        fraction of objects whose class was predicted correctly.
    """
    model.eval()
    grid_size = 2
    running_loss = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for images, targets in tqdm.tqdm(data_loader, desc= 'Validation', leave = False):
            images = images.to(device)
            output = model(images)
            loss = calculate_loss(output, targets, device, num_classes)
            running_loss += loss.item()

            output = output.view(images.shape[0], grid_size, grid_size, 4 + num_classes)

            for batch_idx in range(images.shape[0]):
                for target in targets[batch_idx]:
                    bbox_center_x = (target[0] + target[2]) / 2
                    bbox_center_y = (target[1] + target[3]) / 2

                    # Clamp so a centre of exactly 1.0 maps to the last cell
                    grid_x = min(max(int(bbox_center_x * grid_size), 0), grid_size - 1)
                    grid_y = min(max(int(bbox_center_y * grid_size), 0), grid_size - 1)

                    # Class scores occupy the slots after the four box values
                    prediction = output[batch_idx, grid_y, grid_x, 4:].argmax().item()
                    all_predictions.append(prediction)
                    all_targets.append(target[4].item())
    val_loss = running_loss / len(data_loader)

    all_predictions = torch.tensor(all_predictions, device = device)
    all_targets = torch.tensor(all_targets, device = device)

    val_accuracy = (all_predictions == all_targets).float().mean().item()
    return val_loss, val_accuracy






In [None]:
def train_model(model, train_loader, val_loader, optimizer, num_epochs, device, num_classes):
    """Train ``model`` and save the weights with the best validation accuracy.

    After every epoch the model is evaluated on ``val_loader``; whenever the
    validation accuracy improves, the state dict is written to
    ``best_model.pth``.

    Args:
        model: Network to train.
        train_loader: Iterable of ``(images, targets)`` training batches.
        val_loader: Iterable of ``(images, targets)`` validation batches.
        optimizer: Optimizer updating the model parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device on which the model runs.
        num_classes: Number of class scores per grid cell.

    Returns:
        ``(train_losses, val_losses, train_accuracies, val_accuracies)`` with
        one entry per epoch. ``train_accuracies`` is never filled and is
        returned empty.
    """
    best_val_accuracy = 0
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in tqdm.tqdm(range(num_epochs), desc = "Epochs"):
        model.train()
        running_loss = 0

        for images, targets in tqdm.tqdm(train_loader, desc = 'Batches', leave = False):
            images = images.to(device)
            optimizer.zero_grad()
            output = model(images)

            total_loss = calculate_loss(output, targets, device, num_classes)
            total_loss.backward()
            optimizer.step()
            running_loss += total_loss.item()

        # Mean batch loss, recorded once per epoch (after the batch loop)
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)

        val_loss, val_accuracy = evaluate_model(model, val_loader, device, num_classes)
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)

        print(
            f"Epoch [{epoch+1}/{num_epochs}], "
            f"Train Loss: {epoch_loss:.4f}, "
            f"Val Loss: {val_loss:.4f}, "
            f"Val Accuracy: {val_accuracy:.4f}"
            )

        # Checkpoint whenever validation accuracy improves
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), 'best_model.pth')

    return train_losses, val_losses, train_accuracies, val_accuracies





In [None]:
def inference(model, image_path, transform, device, class_to_idx, threshold = 0.5):
    """
    Run the 2x2-grid detector on a single image and plot the confident boxes.

    The image is resized to 448x448, passed through ``model``, and the output
    is viewed as ``(1, 2, 2, 4 + len(class_to_idx))``: four box values followed
    by the class scores for each grid cell. A cell is drawn when the softmax
    probability of its top class exceeds ``threshold``.

    Args:
        model: Trained network; switched to eval mode here.
        image_path (str): Path of the image to load.
        transform (callable): Converts the resized PIL image into a tensor.
        device: Device the input tensor is moved to.
        class_to_idx: Class lookup. Either a ``name -> index`` mapping (it is
            inverted to label predictions) or anything that can be indexed
            directly with the predicted class index.
        threshold (float): Minimum class probability for a box to be drawn.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    model.eval()

    # Load eagerly inside the context manager so the file handle is released.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    resized_image = image.resize((448, 448))
    resized_width, resized_height = resized_image.size
    cell_width = resized_width / 2
    cell_height = resized_height / 2

    # Reverse lookup for the usual case where class_to_idx maps name -> index.
    if isinstance(class_to_idx, dict):
        idx_to_class = {idx: name for name, idx in class_to_idx.items()}
    else:
        idx_to_class = {}

    transformed_image = transform(resized_image).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(transformed_image)
        output = output.view(1, 2, 2, 4 + len(class_to_idx))

    fig, ax = plt.subplots(1)
    ax.axis("off")
    ax.imshow(resized_image)

    for grid_y in range(2):
        for grid_x in range(2):
            cell = output[0, grid_y, grid_x]
            class_scores = cell[4:]
            class_pred = class_scores.argmax().item()
            confidence = torch.softmax(class_scores, dim = 0)[class_pred].item()

            if confidence <= threshold:
                continue

            # Box values are relative to the cell; shift by the cell origin to
            # get pixel coordinates in the resized image.
            bbox = cell[:4].tolist()
            x_min = bbox[0] * cell_width + grid_x * cell_width
            y_min = bbox[1] * cell_height + grid_y * cell_height
            x_max = bbox[2] * cell_width + grid_x * cell_width
            y_max = bbox[3] * cell_height + grid_y * cell_height

            # Direct indexing is kept for lookups that already map index ->
            # name; a name -> index dict raises KeyError and uses the inverse.
            try:
                label = class_to_idx[class_pred]
            except (KeyError, IndexError, TypeError):
                label = idx_to_class.get(class_pred, class_pred)

            rect = patches.Rectangle(
                (x_min, y_min),
                x_max - x_min,
                y_max - y_min,
                linewidth = 1,
                edgecolor = 'r',
                facecolor = 'none'
            )
            ax.add_patch(rect)
            ax.text(
                x_min,
                y_min,
                f"{label}: {confidence:.2f}",
                color = 'white',
                fontsize = 12,
                bbox = dict(facecolor = 'red', alpha = 0.5)
            )
    plt.show()

# Restore the weights saved as 'best_model.pth'. map_location lets a checkpoint
# written on a GPU be loaded when only the CPU is available.
model.load_state_dict(torch.load('best_model.pth', map_location=device))

# NOTE(review): this is a relative path; an absolute "/mnt/c/..." path may have
# been intended - confirm before running.
image_path = "mnt/c/Study/0D Project/good_1.jpg"
inference(model, image_path, transform, device, class_to_idx, threshold = 0.5)




Yolov1

In [None]:
class CustomVOCDataset(torchvision.datasets.VOCDetection):
    """
    Pascal VOC detection dataset that yields YOLOv1-style training targets.

    The constructor is inherited unchanged from ``VOCDetection`` (``root``,
    ``year``, ``image_set``, ``download`` ...). Call ``init_config_yolo`` once
    after construction to supply the class mapping and grid configuration.

    Each item is ``(image, label_matrix)`` where ``label_matrix`` has shape
    ``(S, S, C + 5 * B)``: channels ``0..C-1`` hold the one-hot class, channel
    ``C`` the objectness flag and channels ``C+1..C+4`` the box as
    ``[x_cell, y_cell, width_cell, height_cell]``. The remaining channels
    stay zero.
    """

    # Defaults so the attributes exist before init_config_yolo is called.
    S = 7
    B = 2
    C = 20
    class_mapping = None
    custom_transforms = None

    def init_config_yolo(self, class_mapping, S = 7, B = 2, C = 20, custom_transforms = None):
        """
        Configure the YOLO target encoding.

        Args:
            class_mapping (dict): Class name -> integer class id.
            S (int): Grid size; the image is split into ``S x S`` cells.
            B (int): Number of boxes per cell reserved in the label matrix.
            C (int): Number of classes.
            custom_transforms (callable, optional): Pipeline called with
                ``image``, ``bboxes`` and ``labels`` keyword arguments and
                returning a dict with the same keys.
        """
        self.S = S
        self.B = B
        self.C = C
        self.class_mapping = class_mapping
        self.custom_transforms = custom_transforms

    def __getitem__(self, index):
        """Return ``(image, label_matrix)`` for the sample at ``index``."""
        if self.class_mapping is None:
            raise RuntimeError(
                "Call init_config_yolo(class_mapping=...) before indexing the dataset."
            )

        image, target = super(CustomVOCDataset, self).__getitem__(index)
        img_width, img_height = image.size
        # Array form is what the transform pipeline and torch.as_tensor consume.
        image = np.array(image)

        # Rows are [x_center, y_center, width, height, class_id].
        boxes = convert_to_yolo_format(target, img_width, img_height, self.class_mapping)
        just_boxes = boxes[:, :4]
        labels = boxes[:, 4]

        if self.custom_transforms:
            sample = self.custom_transforms(
                image = image,
                bboxes = just_boxes.tolist(),
                labels = labels.tolist(),
            )
            image = sample["image"]
            # Use the transformed boxes; flips and resizes move them.
            just_boxes = sample["bboxes"]
            labels = sample["labels"]

        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        just_boxes = torch.as_tensor(
            np.asarray(just_boxes), dtype = torch.float32
        ).reshape(-1, 4)
        labels = torch.as_tensor(np.asarray(labels), dtype = torch.float32)
        image = torch.as_tensor(image, dtype = torch.float32)

        for box, label in zip(just_boxes, labels):
            x, y, width, height = box.tolist()
            class_label = int(label.item())

            # Row i / column j of the responsible cell. A centre sitting exactly
            # on the right or bottom edge is clamped into the last cell.
            i = min(int(self.S * y), self.S - 1)
            j = min(int(self.S * x), self.S - 1)
            x_cell, y_cell = self.S * x - j, self.S * y - i
            width_cell, height_cell = (
                width * self.S,
                height * self.S
            )

            # Only one object per cell: the first box to claim a cell wins.
            if label_matrix[i, j, self.C] == 0:
                label_matrix[i, j, self.C] = 1
                box_coordinates = torch.tensor(
                    [x_cell, y_cell, width_cell, height_cell]
                )
                label_matrix[i, j, self.C + 1:self.C + 5] = box_coordinates
                label_matrix[i, j, class_label] = 1

        return image, label_matrix

In [None]:
def convert_to_yolo_format(target, img_width, img_height, class_mapping):
    """
    Convert annotation data from VOC format to YOLO format.

    Parameters:
        target (dict): Annotation data from VOCDetection dataset.
        img_width (int): Width of the original image.
        img_height (int): Height of the original image.
        class_mapping (dict): Mapping from class names to integer IDs.

    Returns:
        torch.Tensor: Tensor of shape [N, 5] for N bounding boxes,
            each with [x_center, y_center, width, height, class_id].
            Coordinates are normalised by the image size. Class names
            missing from ``class_mapping`` get id 0. With no annotations
            the result has shape [0, 5].
    """

    annotations = target['annotation']['object']

    # A single object is stored as a bare dict rather than a list of one.
    if not isinstance(annotations, list):
        annotations = [annotations]

    boxes = []
    for anno in annotations:
        bndbox = anno['bndbox']
        # Normalise pixel corners by the image dimensions passed in.
        xmin = int(bndbox['xmin']) / img_width
        xmax = int(bndbox['xmax']) / img_width
        ymin = int(bndbox['ymin']) / img_height
        ymax = int(bndbox['ymax']) / img_height

        x_center = (xmin + xmax) / 2
        y_center = (ymin + ymax) / 2
        width = xmax - xmin
        height = ymax - ymin

        class_id = class_mapping.get(anno['name'], 0)

        boxes.append([x_center, y_center, width, height, class_id])

    if not boxes:
        # Keep the result 2-D so callers can slice columns unconditionally.
        return torch.zeros((0, 5))

    return torch.tensor(boxes)

In [None]:
import torch

def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    """
    Compute the Intersection over Union (IoU) of paired bounding boxes.

    Args:
        boxes_preds (torch.Tensor): Predicted boxes, last dimension of size 4.
        boxes_labels (torch.Tensor): Ground-truth boxes, same layout.
        box_format (str): "midpoint" for (x, y, w, h) or "corners" for
            (x1, y1, x2, y2).

    Returns:
        torch.Tensor: IoU per box pair; the last dimension has size 1.

    Raises:
        ValueError: If ``box_format`` is neither "midpoint" nor "corners".
    """
    if box_format not in ("midpoint", "corners"):
        raise ValueError("Invalid box_format. Choose 'midpoint' or 'corners'.")

    def _as_corners(boxes):
        # Width-1 slices (k:k+1) keep the trailing dimension for broadcasting.
        c0 = boxes[..., 0:1]
        c1 = boxes[..., 1:2]
        c2 = boxes[..., 2:3]
        c3 = boxes[..., 3:4]
        if box_format == "midpoint":
            return c0 - c2 / 2, c1 - c3 / 2, c0 + c2 / 2, c1 + c3 / 2
        return c0, c1, c2, c3

    pred_x1, pred_y1, pred_x2, pred_y2 = _as_corners(boxes_preds)
    true_x1, true_y1, true_x2, true_y2 = _as_corners(boxes_labels)

    # Overlap rectangle; a negative extent means no overlap and is clamped to 0.
    overlap_w = (torch.min(pred_x2, true_x2) - torch.max(pred_x1, true_x1)).clamp(0)
    overlap_h = (torch.min(pred_y2, true_y2) - torch.max(pred_y1, true_y1)).clamp(0)
    intersection = overlap_w * overlap_h

    pred_area = abs((pred_x2 - pred_x1) * (pred_y2 - pred_y1))
    true_area = abs((true_x2 - true_x1) * (true_y2 - true_y1))

    # The epsilon keeps the division defined when both boxes have zero area.
    union = pred_area + true_area - intersection + 1e-6
    return intersection / union

In [None]:
import torch

def non_max_suppression(bboxes, iou_threshold, threshold, box_format="corners"):
    """
    Apply per-class Non-Maximum Suppression to a list of boxes.

    Args:
        bboxes (list): Boxes given as
                      [class_pred, prob_score, x1, y1, x2, y2].
        iou_threshold (float): A box is dropped when it shares a class with
                              an already kept box and their IoU is not
                              below this value.
        threshold (float): Boxes whose score is not above this value are
                           discarded up front.
        box_format (str): "midpoint" or "corners"; forwarded to
                          intersection_over_union.

    Returns:
        list: Surviving boxes, ordered by descending score.
    """

    assert type(bboxes) is list

    # Drop low-score boxes, then rank the rest with the best first.
    candidates = sorted(
        (box for box in bboxes if box[1] > threshold),
        key=lambda box: box[1],
        reverse=True,
    )

    kept = []

    while candidates:
        best, remaining = candidates[0], candidates[1:]

        survivors = []
        for other in remaining:
            # Boxes of a different class never suppress each other.
            if other[0] != best[0]:
                survivors.append(other)
                continue

            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format
            )
            if overlap < iou_threshold:
                survivors.append(other)

        candidates = survivors
        kept.append(best)

    return kept

In [None]:
import torch
from collections import Counter

def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5,
                           box_format="midpoint", num_classes=20):
    """
    Calculate the mean average precision (mAP).

    Args:
        pred_boxes (list): A list containing predicted bounding boxes with each
                          box defined as [train_idx, class_pred, prob_score,
                          x1, y1, x2, y2].
        true_boxes (list): Similar to pred_boxes but containing information
                          about true boxes.
        iou_threshold (float): IoU threshold, where predicted boxes are
                              considered correct.
        box_format (str): "midpoint" or "corners" used to specify the format
                          of the boxes.
        num_classes (int): Number of classes.

    Returns:
        torch.Tensor: Scalar mAP over the classes that have at least one
        ground-truth box. A zero tensor is returned when no class has any
        ground truth, so callers can always use ``.item()``.
    """

    average_precisions = []
    epsilon = 1e-6

    for c in range(num_classes):
        ground_truths = [gt for gt in true_boxes if gt[1] == c]
        total_true_bboxes = len(ground_truths)

        # Classes without ground truth do not contribute to the mean.
        if total_true_bboxes == 0:
            continue

        # Group ground truths by image once instead of rescanning the full
        # list for every detection.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)

        # One flag per ground-truth box: 1 once a detection has claimed it.
        matched = {
            image_idx: torch.zeros(len(image_gts))
            for image_idx, image_gts in gts_by_image.items()
        }

        detections = sorted(
            (det for det in pred_boxes if det[1] == c),
            key=lambda det: det[2],
            reverse=True,
        )
        TP = torch.zeros((len(detections)))
        FP = torch.zeros((len(detections)))

        for detection_idx, detection in enumerate(detections):
            best_iou = 0
            best_gt_idx = None

            for idx, gt in enumerate(gts_by_image.get(detection[0], [])):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            # True positive only if it overlaps enough with a ground truth
            # that no higher-scoring detection has already claimed.
            if (
                best_gt_idx is not None
                and best_iou > iou_threshold
                and matched[detection[0]][best_gt_idx] == 0
            ):
                TP[detection_idx] = 1
                matched[detection[0]][best_gt_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the (recall=0, precision=1) point so the integration starts
        # at the left edge of the precision-recall curve.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))
        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        return torch.tensor(0.0)

    return sum(average_precisions) / len(average_precisions)

In [None]:
# Layer specification consumed by Yolov1._create_conv_layers, in order:
#   tuple (kernel_size, out_channels, stride, padding) -> one CNNBlock
#   "M"                                                -> 2x2 max-pool, stride 2
#   list [conv_a, conv_b, repeats]                     -> the pair conv_a, conv_b
#                                                         repeated `repeats` times
# This must be an ordered list: a set cannot hold the nested lists and would
# not preserve layer order.
# NOTE(review): the YOLOv1 paper uses 256 and 512 filters for the 1x1 convs in
# the two repeated groups; the values here are kept as written - confirm.
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 512, 1, 1), 4],
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 1024, 1, 0), (3, 1024, 1, 1), 2],
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]

class CNNBlock(nn.Module):
    """Convolution followed by batch normalisation and LeakyReLU(0.1)."""

    def __init__(self, in_channels, out_channels, **kwargs):
        """
        Args:
            in_channels (int): Channels of the input feature map.
            out_channels (int): Channels produced by the convolution.
            **kwargs: Forwarded to ``nn.Conv2d`` (kernel_size, stride,
                padding, ...).
        """
        super().__init__()
        # The convolution is created without a bias term.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply conv -> batch norm -> leaky ReLU to ``x``."""
        convolved = self.conv(x)
        normalised = self.batchnorm(convolved)
        return self.leakyrelu(normalised)


class Yolov1(nn.Module):
    """
    YOLOv1-style detector: a convolutional backbone built from
    ``architecture_config`` followed by a two-layer fully connected head.
    """

    def __init__(self, in_channels = 3, **kwargs):
        """
        Args:
            in_channels (int): Channels of the input image.
            **kwargs: ``split_size``, ``num_boxes`` and ``num_classes``,
                forwarded to ``_create_fcs``.
        """
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Return the flat prediction vector for each image in the batch."""
        features = self.darknet(x)
        flat = torch.flatten(features, start_dim = 1)
        return self.fcs(flat)

    def _create_conv_layers(self, architecture):
        """Translate the layer specification into an ``nn.Sequential``."""

        def make_block(channels_in, spec):
            # spec is (kernel_size, out_channels, stride, padding).
            return CNNBlock(
                channels_in,
                spec[1],
                kernel_size = spec[0],
                stride = spec[2],
                padding = spec[3],
            )

        modules = []
        channels = self.in_channels

        for entry in architecture:
            entry_type = type(entry)

            if entry_type is tuple:
                # Single convolution block.
                modules.append(make_block(channels, entry))
                channels = entry[1]
            elif entry_type is str:
                # Any string entry stands for a 2x2 max-pool.
                modules.append(nn.MaxPool2d(kernel_size = (2, 2), stride = (2, 2)))
            elif entry_type is list:
                # [first_conv, second_conv, repeats]: the pair is repeated.
                first_spec, second_spec, repeats = entry[0], entry[1], entry[2]
                for _ in range(repeats):
                    modules.append(make_block(channels, first_spec))
                    modules.append(make_block(first_spec[1], second_spec))
                    channels = second_spec[1]

        return nn.Sequential(*modules)

    def _create_fcs(self, split_size, num_boxes, num_classes ):
        """
        Build the head mapping ``1024 * S * S`` features to
        ``S * S * (C + B * 5)`` outputs.
        """
        grid, boxes, classes = split_size, num_boxes, num_classes
        hidden_units = 496
        output_units = grid * grid * (classes + boxes * 5)

        head = [
            nn.Flatten(),
            nn.Linear(1024 * grid * grid, hidden_units),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(hidden_units, output_units),
        ]
        return nn.Sequential(*head)

In [None]:
class YoloLoss(nn.Module):
    """
    Calculate the loss for the YOLO (v1) model.

    Per grid cell the last dimension is read as: ``C`` class scores, then
    ``[confidence, x, y, w, h]`` for the first box and the same five values
    for the second box. Only the first two predicted boxes are used, and the
    target carries a single box per cell in the first box slot.
    """

    def __init__(self, S=7, B=2, C=20):
        """
        Args:
            S (int): Grid size.
            B (int): Boxes predicted per cell (sets the reshape width).
            C (int): Number of classes.
        """
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C
        # Weights of the no-object confidence term and the coordinate term.
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """
        Args:
            predictions (torch.Tensor): Model output, reshaped here to
                ``(-1, S, S, C + B * 5)``.
            target (torch.Tensor): Label tensor indexed with the same
                channel layout.

        Returns:
            torch.Tensor: Scalar sum of the weighted loss terms.
        """
        # Channel positions derived from C instead of hard-coded for C = 20.
        C = self.C
        conf1 = slice(C, C + 1)
        box1 = slice(C + 1, C + 5)
        conf2 = slice(C + 5, C + 6)
        box2 = slice(C + 6, C + 10)

        # Reshape predictions for easier indexing
        predictions = predictions.reshape(-1, self.S, self.S, self.C + self.B * 5)

        # IoU of each predicted box with the single target box of the cell
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox is 0 when the first box has the higher IoU, 1 for the second
        _, bestbox = torch.max(ious, dim=0)
        # 1 for cells that contain an object, 0 otherwise
        exists_box = target[..., C].unsqueeze(3)

        # ======================== #
        # FOR BOX COORDINATES #
        # ======================== #

        # Keep only the responsible box, and only in cells with an object
        box_predictions = exists_box * (
            bestbox * predictions[..., box2]
            + (1 - bestbox) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Square root of width and height. Predictions may be negative, so the
        # root is taken on the magnitude and the sign is restored afterwards.
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(
            torch.abs(box_predictions[..., 2:4] + 1e-6)
        )
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # ==================== #
        # FOR OBJECT LOSS #
        # ==================== #

        # Confidence score of the box with the highest IoU
        pred_box = (
            bestbox * predictions[..., conf2]
            + (1 - bestbox) * predictions[..., conf1]
        )

        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # ======================= #
        # FOR NO OBJECT LOSS #
        # ======================= #

        # Both predicted confidences are penalised in cells without an object
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        # ======================= #
        # FOR CLASS LOSS #
        # ======================= #

        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        # Calculate the final loss
        loss = (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )

        return loss

In [None]:
# Seed PyTorch's random number generator with a fixed value.
seed = 123
torch.manual_seed(seed)

# Hyperparameters
LEARNING_RATE = 2e-5  # learning rate passed to optim.Adam in train()
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when one is available
BATCH_SIZE = 16  # batch size of every DataLoader built in train()
EPOCHS = 300  # number of iterations of the training loop in train()
NUM_WORKERS = 2  # forwarded to DataLoader(num_workers=...)
PIN_MEMORY = True  # forwarded to DataLoader(pin_memory=...)
LOAD_MODEL = False  # when True, train() restores LOAD_MODEL_FILE before training
LOAD_MODEL_FILE = "yolov1.pth.tar"  # checkpoint path used for both loading and saving



In [None]:
# Image side lengths, in pixels, for the resize step of the transform pipelines.
WIDTH = 448
HEIGHT = 448

def get_train_transforms():
    """
    Build the augmentation pipeline for training samples.

    The pipeline applies one of a colour jitter or a brightness/contrast
    change, an occasional grayscale conversion, random horizontal and vertical
    flips, a resize to ``HEIGHT`` x ``WIDTH`` and a conversion to a tensor.
    Bounding boxes are declared in 'yolo' format and class ids travel in the
    ``labels`` field.

    Returns:
        A.Compose: Callable taking ``image``, ``bboxes`` and ``labels``.
    """
    # NOTE(review): the 0.2 shift limits look very small if the library reads
    # them in integer units (e.g. hue degrees) - confirm the intended strength.
    colour_jitter = A.OneOf(
        [
            A.HueSaturationValue(
                hue_shift_limit=0.2,
                sat_shift_limit=0.2,
                val_shift_limit=0.2,
                p=0.9
            ),
            A.RandomBrightnessContrast(
                brightness_limit=0.2,
                contrast_limit=0.2,
                p=0.9
            ),
        ],
        p=0.9
    )

    return A.Compose(
        [
            colour_jitter,
            A.ToGray(p=0.01),
            A.HorizontalFlip(p=0.2),
            A.VerticalFlip(p=0.2),
            # height comes from HEIGHT (not WIDTH) so the two constants can
            # be changed independently.
            A.Resize(height=HEIGHT, width=WIDTH, p=1),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format='yolo',
            min_area=0,
            min_visibility=0,
            label_fields=['labels']
        )
    )

def get_valid_transforms():
    """
    Build the deterministic pipeline for validation and test samples.

    Only a resize to ``HEIGHT`` x ``WIDTH`` and a conversion to a tensor are
    applied. Bounding boxes are declared in 'yolo' format and class ids
    travel in the ``labels`` field.

    Returns:
        A.Compose: Callable taking ``image``, ``bboxes`` and ``labels``.
    """
    return A.Compose(
        [
            # height comes from HEIGHT (not WIDTH) so the two constants can
            # be changed independently.
            A.Resize(height=HEIGHT, width=WIDTH, p=1.0),
            ToTensorV2(p=1.0),
        ],
        p=1.0,
        bbox_params=A.BboxParams(
            format='yolo',
            min_area=0,
            min_visibility=0,
            label_fields=['labels']
        )
    )

In [None]:
# Class name -> integer id. Ids follow the order in which names are listed.
class_mapping = {
    name: class_id
    for class_id, name in enumerate(
        (
            'aeroplane',
            'bicycle',
            'bird',
            'boat',
            'bottle',
            'bus',
            'car',
            'cat',
            'chair',
            'cow',
            'diningtable',
            'dog',
            'horse',
            'motorbike',
            'person',
            'pottedplant',
            'sheep',
            'sofa',
            'train',
            'tvmonitor',
        )
    )
}

In [None]:
def train_fn(train_loader, model, optimizer, loss_fn, epoch):
    """
    Trains the model for one epoch.

    Args:
        train_loader (DataLoader): DataLoader for training data.
        model (nn.Module): The model to train.
        optimizer (Optimizer): The optimizer used for training.
        loss_fn (callable): The loss function.
        epoch (int): The current epoch number (used for logging only).

    Returns:
        float: Average mAP for the epoch (mean of the per-batch mAP values).
    """

    model.train()
    epoch_losses = []
    epoch_maps = []
    total_batches = len(train_loader)
    # Log about five times per epoch. The floor of 1 avoids a modulo by zero
    # when the loader has fewer than five batches.
    display_interval = max(1, total_batches // 5)

    for batch_idx, (x, y) in enumerate(train_loader):
        x, y = x.to(DEVICE), y.to(DEVICE)
        out = model(x)
        loss = loss_fn(out, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # mAP is measured on the same batch the model was just trained on.
        pred_boxes, true_boxes = get_bboxes_training(out, y, iou_threshold=0.5, threshold=0.4)
        mAP = mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint")

        epoch_losses.append(loss.item())
        epoch_maps.append(mAP.item())

        if batch_idx % display_interval == 0 or batch_idx == total_batches - 1:
            print(f"Epoch: {epoch:3d} \t Iter: {batch_idx:3d}/{total_batches:3d} \t Loss: {loss.item():.10f} \t mAP: {mAP.item():.10f}")

    avg_loss = sum(epoch_losses) / len(epoch_losses)
    avg_mAP = sum(epoch_maps) / len(epoch_maps)
    print(colored(f"Train \t loss: {avg_loss:.10f} \t mAP: {avg_mAP:.10f}", 'green'))

    return avg_mAP

def test_fn(test_loader, model, loss_fn, epoch):
    """
    Run one evaluation pass over ``test_loader`` without gradient tracking.

    Args:
        test_loader (DataLoader): Batches of ``(images, targets)`` to evaluate.
        model (nn.Module): Model under evaluation.
        loss_fn (callable): Loss applied to ``(predictions, targets)``.
        epoch (int): Current epoch number (not used in the body).

    Returns:
        float: Mean of the per-batch mAP values.
    """

    model.eval()
    batch_losses, batch_maps = [], []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(DEVICE)
            labels = labels.to(DEVICE)

            predictions = model(inputs)
            batch_loss = loss_fn(predictions, labels)

            pred_boxes, true_boxes = get_bboxes_training(
                predictions, labels, iou_threshold=0.5, threshold=0.4
            )
            batch_map = mean_average_precision(
                pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint"
            )

            batch_losses.append(batch_loss.item())
            batch_maps.append(batch_map.item())

    avg_loss = sum(batch_losses) / len(batch_losses)
    avg_mAP = sum(batch_maps) / len(batch_maps)
    print(colored(f"Test \t loss: {avg_loss:.10f} \t mAP: {avg_mAP:.10f}", 'yellow'))

    # Put the model back into training mode for the caller.
    model.train()
    return avg_mAP

In [None]:
from torch.utils.data import DataLoader, SubsetRandomSampler

def train():
    """
    Trains the YOLOv1 model.

    Builds the model, optimizer and loss, prepares the Pascal VOC 2012
    datasets (the 'val' image set is split 15% / 85% into validation and test
    subsets), then runs ``EPOCHS`` epochs. A checkpoint is written to
    ``LOAD_MODEL_FILE`` whenever the validation mAP improves, and the best
    mAP values are printed at the end.
    """

    # Initialize model, optimizer, loss
    model = Yolov1(split_size=7, num_boxes=2, num_classes=20).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_fn = YoloLoss()

    # Load checkpoint if necessary. map_location lets a checkpoint saved on a
    # GPU be restored on a CPU-only machine.
    if LOAD_MODEL:
        load_checkpoint(
            torch.load(LOAD_MODEL_FILE, map_location=DEVICE), model, optimizer
        )

    # Create datasets
    train_dataset = CustomVOCDataset(
        root='./data',
        year='2012',
        image_set='train',
        download=True,
    )
    train_dataset.init_config_yolo(class_mapping=class_mapping,
                                   custom_transforms=get_train_transforms())

    testval_dataset = CustomVOCDataset(
        root='./data',
        year='2012',
        image_set='val',
        download=True,
    )
    testval_dataset.init_config_yolo(class_mapping=class_mapping,
                                     custom_transforms=get_valid_transforms())

    # Split testval dataset into validation and test sets
    dataset_size = len(testval_dataset)
    val_size = int(0.15 * dataset_size)
    val_indices = list(range(val_size))
    test_indices = list(range(val_size, dataset_size))

    # Create samplers
    val_sampler = SubsetRandomSampler(val_indices)
    test_sampler = SubsetRandomSampler(test_indices)

    # Create DataLoaders
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        drop_last=True,
    )

    val_loader = DataLoader(
        dataset=testval_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        sampler=val_sampler,
        drop_last=False,
    )

    test_loader = DataLoader(
        dataset=testval_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        sampler=test_sampler,
        drop_last=False,
    )

    best_mAP_train = 0
    best_mAP_val = 0
    best_mAP_test = 0

    # Training loop
    for epoch in range(EPOCHS):
        train_mAP = train_fn(train_loader, model, optimizer, loss_fn, epoch)
        val_mAP = test_fn(val_loader, model, loss_fn, epoch)
        test_mAP = test_fn(test_loader, model, loss_fn, epoch)

        # Update best mAP values
        best_mAP_train = max(best_mAP_train, train_mAP)
        best_mAP_test = max(best_mAP_test, test_mAP)

        # Save checkpoint when validation mAP improves. The comparison must
        # run before best_mAP_val is updated, otherwise it can never be true.
        if val_mAP > best_mAP_val:
            best_mAP_val = val_mAP
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename=LOAD_MODEL_FILE)

    print(colored(f" Best Train mAP: {best_mAP_train:.10f}", 'green'))
    print(colored(f" Best Val mAP: {best_mAP_val:.10f}", 'blue'))
    print(colored(f" Best Test mAP: {best_mAP_test:.10f}", 'yellow'))