## Setting up CUDA environment

In [1]:
import torch
import torch.nn as nn
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Currently using {device}!")

Currently using cuda!


## Get the data ready

This code will unpack every annotation from every image and create .txt files to match the path to the images for dataset building

In [2]:
import os
def process_class_train(class_name, root_dir):
    input_file = os.path.join(root_dir, class_name, f"{class_name}_boxes.txt")
    output_dir = os.path.join(root_dir, class_name, "labels")
    os.makedirs(output_dir, exist_ok=True) # Create the folder for the labels
    with open(input_file, "r") as f:
        for line in f:
            parts = line.strip().split()
            image_name = parts[0].replace('.JPEG', '')
            label = ' '.join(parts[1:])  

            with open(os.path.join(output_dir, f"{image_name}.txt"), "w") as out_f:
                out_f.write(label + "\n")

num_classes = 0
root = os.path.join("tiny-imagenet-200", "train")
for class_name in os.listdir(root):
    process_class_train(class_name, root)
    num_classes += 1

In [16]:
import os
def process_test(root_dir):
    input_file = os.path.join(root_dir, "val_annotations.txt")
    output_dir = os.path.join(root_dir, "labels")
    os.makedirs(output_dir, exist_ok=True) # Create the folder for the labels
    with open(input_file, "r") as f:
        for line in f:
            parts = line.strip().split()
            image_name = parts[0].replace('.JPEG', '')
            label = ' '.join(parts[1:])  

            with open(os.path.join(output_dir, f"{image_name}.txt"), "w") as out_f:
                out_f.write(label + "\n")

root_dir = os.path.join("tiny-imagenet-200", "val")
process_test(root_dir)

In [2]:
import os
num_classes = 200
root_dir = os.path.join("tiny-imagenet-200", "train")
class_names = sorted(os.listdir(root_dir))

 Now we are ready to build our Dataset object so it can be passed into DataLoader

In [3]:
import os
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T
from glob import glob
import random

class YOLODataset(Dataset):
    def __init__(self, root_dir, class_names, mode, transform=None):
        """
        Args:
            root_dir (str): Path to data/train
            transform (callable, optional): Image transformations

        Output:
            image (float tensor (C, H, W)): Tensor corresponding to Image.
            boxes (float tensor (batch_size, num_objects, 5)) - class, x_mid, y_mid, width, height 
        """
        self.transform = transform
        self.samples = []
        self.mode = mode

        self.class_names = class_names
        self.class_to_idx = {cls: i for i, cls in enumerate(self.class_names)}

        if mode == "train":
            for cls in self.class_names:
                class_path = os.path.join(root_dir, cls)
                for img_path in glob(os.path.join(class_path, "images", "*.JPEG")):
                    label_path = img_path.replace(".JPEG", ".txt").replace("images", "labels")
                    self.samples.append((img_path, label_path, self.class_to_idx[cls]))
        else:
            for img_path in glob(os.path.join(root_dir, "images", "*.JPEG")):
                label_path = img_path.replace(".JPEG", ".txt").replace("images", "labels")
                self.samples.append((img_path, label_path))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if self.mode == "train":
            img_path, label_path, class_idx = self.samples[idx]
        else:
            img_path, label_path = self.samples[idx]

        image = Image.open(img_path).convert("RGB")
        image_width, image_height = image.size
        inv_w, inv_h = 1.0 / image_width, 1.0 / image_height

        boxes = []
        if os.path.exists(label_path):
            with open(label_path) as f:
                for line in f:
                    all_elements = line.strip().split()
                    if self.mode == "train":
                        x1, y1, x2, y2 = map(float, all_elements)
                    else:
                        class_name = all_elements[0]
                        class_idx = self.class_to_idx[class_name]
                        x1, y1, x2, y2 = map(float, all_elements[1:])  

                    x1, x2 = map(lambda x: x * inv_w, [x1, x2])
                    y1, y2 = map(lambda y: y * inv_h, [y1, y2])
                    
                    x_center = (x1 + x2) / 2
                    y_center = (y1 + y2) / 2
                    width = x2 - x1
                    height = y2 - y1
                    boxes.append([class_idx, x_center, y_center, width, height])

        if boxes:
            boxes = torch.tensor(boxes, dtype=torch.float32)
        else:
            return self.__getitem__(random.randint(0, len(self) - 1))

        if self.transform:
            image = self.transform(image)

        return image, boxes

In [4]:
# Train needs reshaping to 224, 224
transform = T.Compose([
    T.Resize((224, 224
    )),
    T.ToTensor(),
])

mode = "train"
root_dir = "tiny-imagenet-200/train"
train_dataset = YOLODataset(root_dir, class_names, mode, transform)
train_loader = DataLoader(train_dataset, batch_size=50, shuffle=True)

for imgs, boxes in train_loader:
    print(imgs.shape)    # (batch_size, 3, 224, 224)
    print(boxes.shape)  # (batch_size, 1, 5) since there is only 1 object per image
    break

mode = "val"
root_dir = "tiny-imagenet-200/val"
val_dataset = YOLODataset(root_dir, class_names, mode, transform)
val_loader = DataLoader(val_dataset, batch_size=50)
for imgs, boxes in val_loader:
    print(imgs.shape)    # (batch_size, 3, 224, 224)
    print(boxes.shape)  # (batch_size, 1, 5) since there is only 1 object per image
    break

torch.Size([50, 3, 224, 224])
torch.Size([50, 1, 5])
torch.Size([50, 3, 224, 224])
torch.Size([50, 1, 5])


In [5]:
def IOU(box1, box2):
    """
    args:
        box1 (batch_size, 1, 5): Golden label box
        box2 (batch_size, B, 5): All boxes of that cell 
    """
    if isinstance(box1, list):
        box1 = torch.tensor(box1, dtype=float)

    if isinstance(box2, list):
        box2 = torch.tensor(box2, dtype=float)

    _, x_mid1, y_mid1, width1, height1 = box1.unbind(dim=-1) # Removes one dimension and enumerates the resulting tensor. dim = -1 will correspond to the columns
    _, x_mid2, y_mid2, width2, height2 = box2.unbind(dim=-1)

    x11, y11, x21, y21 = get_x1y1x2y2(x_mid1, y_mid1, width1, height1)
    x12, y12, x22, y22 = get_x1y1x2y2(x_mid2, y_mid2, width2, height2)

    dx = torch.minimum(x21, x22) - torch.maximum(x11, x12)
    dy = torch.minimum(y21, y22) - torch.maximum(y11, y12)
    dx = (dx > 0) * dx # Mask if dx < 0 -> No intersection because of x-axis
    dy = (dy > 0) * dy # Mask if dy < 0 -> No intersection because of y-axis
    area_intersection = dx * dy
    
    area_union = width1 * height1 + width2 * height2 - area_intersection
    clipped_area_union = torch.clamp(area_union, min=1e-6)

    return area_intersection/clipped_area_union

def get_x1y1x2y2(x_mid, y_mid, width, height):
    half_height = height / 2
    half_width = width / 2
    
    return x_mid - half_width, y_mid - half_height, x_mid + half_width, y_mid + half_height

In [6]:
box1 = torch.randn(4, 1, 5)
box2 = torch.randn(4, 1, 5)

IOU(box1, box2)

tensor([[0.],
        [0.],
        [-0.],
        [-0.]])

## Architecture of YOLO

In [7]:
class YOLO(torch.nn.Module):
    def __init__(self, S=7, B=2, C=200, slope=0.1, dropout=0.5):
        """
        Args:
            S (int): Dimensions for the final grid SxS
            B (int): Number of boxes per position on the grid
            C (int): Number of classes 
            slope (float): Negative slope in LeakyRELU
            dropout (float): Dropout Probability
        """
        
        super().__init__()
        
        # Initializing attributes
        self.S = S
        self.B = B
        self.C = C

        # Block 1
        conv1 = self.conv_block(3, [7], [64], [2], [2], [2], slope)

        # Block 2
        conv2 = self.conv_block(64, [3], [192], [1], [2], [2], slope)

        # Block 3
        conv3 = self.conv_block(192, [1, 3, 1, 3], [128, 256, 256, 512], [1] * 4, [2], [2], slope)
    
        # Block 4
        conv4 = self.conv_block(512, [1, 3] * 5, [256, 512] * 4 + [512, 1024], [1] * 10, [2], [2], slope)

        # Block 5
        #conv5 = self.conv_block(1024, [1, 3] * 2 + [3, 3], [512, 1024] * 2 + [1024] * 2, [1] * 5 + [2], [], [], slope)

        # Block 6
        #conv6 = self.conv_block(1024, [3, 3], [1024, 1024], [1, 1], [], [], slope)

        # Flatten layer
        flatten = nn.Flatten()

        # First Dense Layer - Assumes images are 448x448x3, since it is faithful to the original YOLO
        linear1 = nn.Linear(7*7*1024, 4096)

        # Activation function after Linear Layer
        act = nn.LeakyReLU(slope, inplace=True)

        # Dropout
        drop = nn.Dropout(dropout)
        
        # Last Dense Layer
        linear2 = nn.Linear(4096, S*S*(B*5 + C))

        # Create the layers object
        self.layers = nn.Sequential(*conv1,
                                    *conv2,
                                    *conv3,
                                    *conv4,
                                    flatten,
                                    linear1,
                                    act,
                                    drop,
                                    linear2
                                    )
        
        
    def conv_block(self, start_channels, size_conv, out_channels, stride_conv, size_pool, stride_pool, slope):
        """
        Args:
            start_channels (int): Number of channels of first input.
            out_channels (List[int]): Number of kernels for each convlutional layer
            size_conv (List[int]): Filter sizes for each convolutional layer.
            stride_conv (List[int]): Stride values for each convolutional layer.
            size_pool (List[int]): Size for the single pooling layer (if exists)
            stride_pool (List[int]): Stride for the single pooling layer (if exists)
            slope (float): Negative slope in LeakyRELU
        
        Output:
            layers (List[nn.Module]): List with all the layers of the block
        """
        layers = []
        in_channels = [start_channels] + out_channels[:-1]
        for inp, out, size, stride in zip(in_channels, out_channels, size_conv, stride_conv):
            layers.append(nn.Conv2d(inp, out, size, stride, size//2))
            layers.append(nn.LeakyReLU(slope))

        for size, stride in zip(size_pool, stride_pool):
            layers.append(nn.MaxPool2d(size, stride))
        
        return layers

    def forward(self, x):
        batch_size = x.shape[0]
        for layer in self.layers:
            x = layer(x)

        return torch.reshape(x, (batch_size, self.S, self.S, self.B*5 + self.C))

    def predict(self, output, IOU_threshold=0.8, conf_threshold=0.0):
        """
        Args:
            output (batch_size, S, S, 5*B + C)

        Output:
            output_boxes (batch_size, S*S*B, 6)
        """
        # Get device
        device = output.device

        # First Step - Get the relevant information from the boxes
        num_grid = self.S * self.S
        num_boxes = num_grid * self.B

        batch_size = output.shape[0]

        boxes = output[:, :, :, :5*self.B].reshape((batch_size, num_boxes, 5)) # (batch_size, S*S*B, 5)
        logits = output[:, :, :, 5*self.B:].reshape((batch_size, num_grid, self.C)) # (batch_size, S*S, 200)
        classes = torch.softmax(logits, dim=-1)

        conf, x_mid, y_mid, width, height = boxes.unbind(dim=-1) # (batch_size, S*S*B)
        x_mid, y_mid = self.grid2img(x_mid, y_mid) # Convert relative to grid to relative to image

        class_prob, class_idx = torch.max(classes, dim=-1) # (batch_size, S*S)
        
        class_prob_repeated = class_prob.unsqueeze(-1).expand(batch_size, self.S * self.S, self.B)\
            .reshape((batch_size, num_boxes)) # (batch_size, S*S, B) -> (batch_size, S*S*B)
        
        class_idx_repeated = class_idx.unsqueeze(-1).expand(batch_size, self.S * self.S, self.B)\
            .reshape((batch_size, num_boxes)) # (batch_size, S*S, B) -> (batch_size, S*S*B)
        
        new_conf = torch.mul(conf, class_prob_repeated)  # (batch_size, S*S*B)

        new_conf, x_mid, y_mid, width, height, class_idx_repeated = map(lambda z: z.unsqueeze(-1), \
                                                                        [new_conf, x_mid, y_mid, width, height, class_idx_repeated])
        
        x_mid, y_mid, width, height = map(lambda x: torch.clamp(x, min=0, max=1), [x_mid, y_mid, width, height])

        output_boxes = torch.cat((new_conf, x_mid, y_mid, width, height, class_idx_repeated), dim=-1) # (batch_size, S*S*B, 6)


        # Second step - Sort by conf
        sorted_conf, sorted_idx = torch.sort(new_conf.squeeze(-1), dim=-1, descending=True) # (batch_size, num_boxes)
        batch_idx = torch.arange(batch_size).unsqueeze(1).expand(batch_size, num_boxes)
        sorted_boxes = output_boxes[batch_idx, sorted_idx, :]
        

        # Third Step - Non Maximum Supression (NMS)
        all_mask = torch.ones((batch_size, num_boxes)).to(device)
        for i in range(num_boxes-1):
            curr_boxes = sorted_boxes[:, i, :-1] # (batch_size, 5)
            other_boxes = sorted_boxes[:, i+1:, :-1] # (batch_size, num_boxes-i, 5)
            IOU_results = IOU(curr_boxes.unsqueeze(1), other_boxes)
            mask = IOU_results <= IOU_threshold
            is_masked = all_mask[:, i].unsqueeze(-1)
            all_mask[:, i+1:] = (1 - is_masked + is_masked * mask) * all_mask[:, i+1:] 


        # Fourth Step - Threshold for confidence and get Final Boxes
        result = [[] for _ in range(batch_size)]
        for batch in range(batch_size):
            for box in range(num_boxes):
               if sorted_conf[batch, box] >= conf_threshold:
                   if all_mask[batch, box]:
                        result[batch].append(tuple(sorted_boxes[batch, box, :].cpu().detach().tolist()))
               else:
                   break


        return result


    def grid2img(self, x, y):
        """
        Args:
            x (batch_size, S*S*B)
            y (batch_size, S*S*B)
        """
        batch_size = x.shape[0] 
        device = x.device
        new_dim = self.S * self.S * self.B
        dx = torch.arange(self.S).unsqueeze(0).unsqueeze(0).unsqueeze(-1).expand(batch_size, self.S, self.S, self.B).reshape(batch_size, new_dim).to(device)
        dy = torch.arange(self.S).unsqueeze(0).unsqueeze(-1).unsqueeze(-1).expand(batch_size, self.S, self.S, self.B).reshape(batch_size, new_dim).to(device)
        inv_S = 1/self.S

        return (x + dx) * inv_S, (y + dy) * inv_S

Test the architecture for an input

In [8]:
x = torch.randn(32, 3, 224, 224).to(device)
model = YOLO(C=num_classes).to(device)
output = model(x)
final_output = model.predict(output, IOU_threshold=1, conf_threshold=0)

final_output[0][:3]

[(0.00011280200124019757,
  0.4291333556175232,
  0.145037442445755,
  0.008986678905785084,
  0.010150418616831303,
  93.0),
 (9.408080950379372e-05,
  0.572654128074646,
  0.2832615077495575,
  0.0,
  0.013043057173490524,
  138.0),
 (9.388793114339933e-05,
  0.28296566009521484,
  0.8584703207015991,
  0.010823607444763184,
  0.0,
  2.0)]

In [9]:
class YOLOLoss(nn.Module):
    def __init__(self, l_coord=5, l_noobj=0.5):
        super().__init__()
        self.l_coord = l_coord
        self.l_noobj = l_noobj
    
    def forward(self, output, target, device, b=2, c=200):
        """
        output (Batch_size, S, S, 5*B + C)
        target (Batch_size, num_objects, 5)
        """ 
        batch_size, N, _ = target.shape
        S = output.shape[1]

        # Will be useful later on
        batch_idx = torch.arange(batch_size, device=device).unsqueeze(1).expand(batch_size, N)
        object_idx = torch.arange(N, device=device).unsqueeze(0).expand(batch_size, N)

        true_conf = torch.zeros((batch_size, S, S, b), device=device)
        predicted_boxes = output[:, :, :, :5*b].reshape((batch_size, S, S, b, 5)) # Get the clean info from the boxes

        # SSE loss
        SSE = nn.MSELoss(reduction = "sum")
        
        true_class, true_x, true_y, true_width, true_height = target.unbind(dim=-1)
        true_x, true_y, true_width, true_height = map(lambda x: torch.clamp(x, min=0, max=1), [true_x, true_y, true_width, true_height])

        j = torch.clamp((true_x * S).long(), max=S-1).to(device) # (batch_size, N)
        i = torch.clamp((true_y * S).long(), max=S-1).to(device) # (batch_size, N) 

        normalized_x = (true_x * S) - j.float() # (batch_size, N)
        normalized_y = (true_y * S) - i.float() # (batch_size, N)

        true_class, normalized_x, normalized_y, true_width, true_height = map(lambda x: x.unsqueeze(-1), [true_class, normalized_x, normalized_y, true_width, true_height])
        normalized_boxes = torch.cat((true_class, normalized_x, normalized_y, true_width, true_height), dim=-1).unsqueeze(-2)\
              # (batch_size, N, 1, 5)
        
        predicted_box = predicted_boxes[batch_idx, i, j, :, :] # (batch_size, N, B, 5)
        IOU_results = IOU(normalized_boxes, predicted_box)
        max_iou,  responsible_box = torch.max(IOU_results, dim=-1) # (batch_size, N, 2)
        true_conf[batch_idx, i, j, responsible_box] = max_iou

        _, pred_x, pred_y, pred_width, pred_height = map(lambda x: x.unsqueeze(-1), \
                                                         predicted_boxes[batch_idx, i, j, responsible_box, :].unbind(dim=-1))

        pred_x, pred_y = map(lambda x: torch.clamp(x, min=0, max=1), [pred_x, pred_y])
        pred_width, pred_height = map(lambda x: torch.clamp(x, min=1e-6, max=1), [pred_width, pred_height])

        # Compute localization losses
        x_localization_loss = SSE(pred_x, normalized_x)
        y_localization_loss = SSE(pred_y, normalized_y)

        width_localization_loss = SSE(torch.sqrt(pred_width), torch.sqrt(true_width))
        height_localization_loss = SSE(torch.sqrt(pred_height), torch.sqrt(true_height))

        # Compute classification loss
        class_logits = output[batch_idx, i, j, 5*b:] # (batch_size, N, C)
        class_probabs = torch.softmax(class_logits, dim=-1)

        true_vec = torch.zeros((batch_size, N, c), device=device)
        true_vec[batch_idx, object_idx, true_class.long().squeeze(-1)] = 1
        class_loss = SSE(true_vec, class_probabs)

        # Compute confidence loss    
        all_conf = predicted_boxes[:, :, :, :, 0]
        obj_mask = (true_conf > 0).float()
        weight = (1-obj_mask) * self.l_noobj + obj_mask * 1.0
        confidence_loss = ((all_conf - true_conf) ** 2 * weight).sum().to(device)

        return self.l_coord * (x_localization_loss + y_localization_loss + width_localization_loss + height_localization_loss) + \
            confidence_loss + class_loss

In [10]:
b = 4
c = 200
N = 7

criterion = YOLOLoss()

output = torch.rand(4, 11, 11, 5*b + c).to(device)
target = torch.rand(4, N, 5).to(device)

loss = criterion(output, target, device, b, c)
loss.item()

397.5947570800781

## Train the model

In [11]:
def train_one_epoch(model, dataloader_train, optimizer, criterion, device, epoch, scheduler):
    print(f"Beginning Training on epoch {epoch}")
    model.train()
    total_loss = 0.0

    for i, (batch_X, batch_y) in enumerate(dataloader_train):
        #if i % 30 == 0 or i == len(dataloader_train) - 1:
            #print(f"{(i / len(dataloader_train)) * 100:.2f}% done")

        optimizer.zero_grad()

        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        outputs = model(batch_X)

        loss = criterion(outputs, batch_y, device, model.B, model.C)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()

        total_loss += loss.item() 

    scheduler.step()

    return total_loss / len(dataloader_train.dataset) 

def evaluate(model, dataloader_val, criterion, device):
    print("Evaluating on the Validation set")
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for i, (batch_X, batch_y) in enumerate(dataloader_val):
            #if i % 30 == 0 or i == len(dataloader_val) - 1:
                #print(f"{(i / len(dataloader_val)) * 100:.2f}% done")
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            outputs = model(batch_X)

            loss = criterion(outputs, batch_y, device, model.B, model.C)
            total_loss += loss.item() 

    return total_loss / len(dataloader_val.dataset)

def AP_class(predictions, ground_truths, class_id, iou_threshold=[0.5]):
    """
    Calculate Average Precision for a specific class at a specific IoU threshold
    """

    # Filter predictions and ground truths for this class
    class_preds = [p for p in predictions if p[5] == class_id]
    class_gts = [gt for gt in ground_truths if gt[0] == class_id]
    
    if len(class_gts) == 0:
        return None  

    # Sort predictions by confidence (descending)
    class_preds.sort(key=lambda x: x[0], reverse=True)
    
    total_result = 0.0
    for threshold in iou_threshold:
        # Reset used flags for ground truths
        for gt in class_gts:
            gt[-2] = False
        
        # Calculate precision and recall at each prediction
        true_positives = []
        false_positives = []
        
        for pred in class_preds:
            # Find best matching ground truth
            idx_needed = pred[-1]
            best_iou = 0.0
            best_gt_idx = -1
            
            for gt_idx, gt in enumerate(class_gts):
                if not gt[-2] and gt[-1] == idx_needed:
                    iou = IOU(pred[:5], gt[:5]).item()
                    if iou > best_iou:
                        best_iou = iou
                        best_gt_idx = gt_idx

                
            # Determine if it's False Positive or True positive
            if best_iou >= threshold and best_gt_idx != -1:
                true_positives.append(1)
                false_positives.append(0)
                class_gts[best_gt_idx][-2] = True

            else:
                true_positives.append(0)
                false_positives.append(1)
        
        # Calculate cumulative precision and recall
        tp_cumsum = torch.cumsum(torch.tensor(true_positives, dtype=torch.float32), dim=0)
        fp_cumsum = torch.cumsum(torch.tensor(false_positives, dtype=torch.float32), dim=0)
        
        recalls = tp_cumsum / len(class_gts)
        precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-6)
        
        mono_stack = []
        for i in range(len(precisions) - 1, -1, -1): 
            if not mono_stack or precisions[i] > mono_stack[-1][1]:
                mono_stack.append([recalls[i], precisions[i].item()])  
        mono_stack.reverse()

        ap = 0.0  
        for recall_level in torch.linspace(1.0, 0.0, 11):  
            max_precision = 0.0
            while mono_stack and mono_stack[-1][0] > recall_level:
                _, p = mono_stack.pop()
                max_precision = p

            ap += max_precision
        
        total_result += ap / 11.0

    return total_result / len(iou_threshold)


def mAP(model, dataloader_val, num_classes, IOU_thresholds=[0.5]):
    model.eval()
    all_predicts = []
    all_ground = []
    curr_idx = 0

    with torch.no_grad():
        for i, (batch_X, batch_y) in enumerate(dataloader_val): # batch_y -> (batch_size, N, 5)
            if i % 100 == 0 or i == len(dataloader_val) - 1:
                print(f"{(i / (len(dataloader_val) - 1)) * 100:.2f}% done")

            # Send data to cuda
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            # Output
            output = model(batch_X) # (batch_size, S, S, 5*B + C)
            processed_output = model.predict(output) 
            batch_size = len(processed_output)

            for idx in range(batch_size):
                curr_predict = []
                for box in processed_output[idx]:
                    conf, x, y, w, h, class_idx = box
                    curr_predict.append([conf, x, y, w, h, int(class_idx), curr_idx])

                all_predicts.extend(curr_predict)

                
                curr_ground = []
                for gt_box in batch_y[idx]:
                    class_idx, x, y, w, h = gt_box
                    curr_ground.append([int(class_idx), x, y, w, h, False, curr_idx])
            
                all_ground.extend(curr_ground)
                
                curr_idx += 1

        ap_values = []
        for class_idx in range(num_classes):
            ap = AP_class(all_predicts, all_ground, class_idx, IOU_thresholds)
            if ap is not None:
                ap_values.append(ap)

    return sum(ap_values) / len(ap_values)

In [None]:
model = YOLO().to(device)
model.eval()
mAP(model, val_loader, num_classes=200, IOU_thresholds=[0.5])

In [17]:
best_model = YOLO(C=num_classes).to(device)
weights = torch.load("best_model.pth")
best_model.load_state_dict(weights)
model.eval()
loss = evaluate(best_model, val_loader, criterion, device)

print(loss)

Evaluating on the Validation set
1.789138946533203


In [None]:
model = YOLO(C=num_classes).to(device)
model.eval()
loss = evaluate(model, train_loader, criterion, device)

print(loss)

In [14]:
from torch.optim.lr_scheduler import LambdaLR

model = YOLO(C=num_classes).to(device)
criterion = YOLOLoss().to(device)
lr = 1e-3

optimizer = torch.optim.SGD(
    model.parameters(),
    lr=1e-3,           
    momentum=0.9,
    weight_decay=0.0005
)

total_epochs = 140
warmup_epochs = 5  

def lr_lambda(epoch):
    if epoch <= warmup_epochs:
        return 1 + ((epoch-1) / warmup_epochs) * 9  
    elif epoch < warmup_epochs + 75:
        return 10 
    elif epoch < warmup_epochs + 75 + 30:
        return 1   
    else:
        return 0.1 


In [13]:
def setup_better_training():
    model = YOLO(C=200).to(device)
    
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=1e-3,  
        momentum=0.9,
        weight_decay=5e-4
    )
    
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, 
        milestones=[60, 90], 
        gamma=0.1
    )
    
    return model, optimizer, scheduler

In [None]:
def run_training():
    model, optimizer, scheduler = setup_better_training()
    
    criterion = YOLOLoss().to(device)
    
    total_epochs = 135
    best_val_loss = float('inf')
    patience = 15  
    counter = 0
    
    for epoch in range(1, total_epochs + 1):
        # Train one epoch
        train_loss = train_one_epoch(model, train_loader, optimizer, criterion, device, epoch, scheduler)
        
        # Evaluate on validation set
        val_loss = evaluate(model, val_loader, criterion, device)
        
        # Calculate mAP
        if epoch % 5 == 0:  
            mAP_result = mAP(model, val_loader, num_classes=200, IOU_thresholds=[0.5])
            print(f"mAP at epoch {epoch}: {mAP_result:.3f}")
        
        # Print results
        print(f"Epoch {epoch}")
        print(f"Train Loss: {train_loss:.4f}")
        print(f"Val Loss: {val_loss:.4f}")
        print(f"Best Val Loss: {best_val_loss:.4f}")
        print("-" * 50)

        # Early stopping logic
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            print(f"Found better model at epoch {epoch}. Saving...")
            torch.save(model.state_dict(), "best_model.pth")
            counter = 0
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stopping triggered at epoch {epoch}")
                break

In [15]:
run_training()

Beginning Training on epoch 1
Evaluating on the Validation set
Found better model at epoch 1. Saving...
Epoch 1
Train Loss: 1.9735
Val Loss: 1.8149
Best Val Loss: 1.8149
--------------------------------------------------
Beginning Training on epoch 2
Evaluating on the Validation set
Found better model at epoch 2. Saving...
Epoch 2
Train Loss: 1.8077
Val Loss: 1.8095
Best Val Loss: 1.8095
--------------------------------------------------
Beginning Training on epoch 3
Evaluating on the Validation set
Found better model at epoch 3. Saving...
Epoch 3
Train Loss: 1.7949
Val Loss: 1.8011
Best Val Loss: 1.8011
--------------------------------------------------
Beginning Training on epoch 4
Evaluating on the Validation set
Found better model at epoch 4. Saving...
Epoch 4
Train Loss: 1.7890
Val Loss: 1.7996
Best Val Loss: 1.7996
--------------------------------------------------
Beginning Training on epoch 5
Evaluating on the Validation set
0.00% done
50.25% done
100.00% done
mAP at epoch 5: 0