<a href="https://www.kaggle.com/code/renesta/tiny-yolo?scriptVersionId=218810445" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import kagglehub
import os
import pandas as pd
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
from torchvision import transforms
import cv2
import torchvision
from torch.cuda.amp import autocast, GradScaler
from torch.profiler import profile, record_function, ProfilerActivity
import torch.nn.functional as F
import re

# Fetch the latest version of both face-detection datasets through kagglehub.
# The two returned values are used further down as the dataset root directories.
_DATASET_SLUGS = (
    "iamtushara/face-detection-dataset",
    "fareselmenshawii/face-detection-dataset",
)
path1, path2 = [kagglehub.dataset_download(slug) for slug in _DATASET_SLUGS]

print("Path to dataset1 files:", path1)
print("Path to dataset2 files:", path2)

In [None]:
# Preprocessing applied to every image: convert to a tensor, then normalise
# with mean 0.5 and std 0.5.
# NOTE(review): a single mean/std value is supplied although images are loaded
# as RGB; presumably it is broadcast across the three channels - confirm.
_preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
]
transform = transforms.Compose(_preprocessing_steps)

In [None]:
class Dataset25(Dataset):
    """Face-detection dataset merging two Kaggle sources into YOLO-style targets.

    Image/label pairs are discovered once in ``__init__``; images are decoded
    and targets are built on demand in ``__getitem__``.

    Args:
        root_paths: Two dataset roots. The first must contain
            ``merged/images/train`` and ``merged/labels/train``; the second
            ``images/train`` and ``labels/train``.
        grid_sizes: Side lengths of the square prediction grids. One target
            tensor is produced per entry, in the same order.
        YOLO_shape: ``(width, height)`` every image is resized to.
        transform: Optional callable applied to the resized PIL image.
    """

    IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')

    def __init__(self, root_paths, grid_sizes=(13, 26, 52), YOLO_shape=(416, 416), transform=None):
        self.root_paths = root_paths
        self.images_folder = [os.path.join(root_paths[0], "merged/images/train"), os.path.join(root_paths[1], "images/train")]
        self.labels_folder = [os.path.join(root_paths[0], "merged/labels/train"), os.path.join(root_paths[1], "labels/train")]
        self.transform = transform
        self.YOLO_shape = YOLO_shape
        # Copy into a fresh list so instances never share (or mutate) the default.
        self.grid_sizes = list(grid_sizes)
        self.images = []
        self.targets = []

        for images_folder, labels_folder in zip(self.images_folder, self.labels_folder):
            # Sorted so the sample order does not depend on the file system.
            for filename in sorted(os.listdir(images_folder)):
                if not filename.lower().endswith(self.IMAGE_EXTENSIONS):
                    continue

                # Swap whatever extension the image has for ".txt"; this also
                # covers ".png" and upper-case extensions, which a literal
                # replace of ".jpg"/".jpeg" silently missed.
                stem, _ = os.path.splitext(filename)
                label_path = os.path.join(labels_folder, stem + '.txt')
                if os.path.exists(label_path):
                    self.images.append(os.path.join(images_folder, filename))
                    self.targets.append(label_path)
                else:
                    print(f"Warning: Label file not found for {filename}")

    def __len__(self):
        """Return the number of image/label pairs that were found."""
        return len(self.images)

    @staticmethod
    def _read_annotations(label_path):
        """Parse a label file into a list of ``(x, y, width, height)`` floats.

        Only the last four whitespace-separated fields of each line are used;
        lines with fewer than four fields are skipped.
        """
        annotations = []
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if len(parts) < 4:
                    continue
                annotations.append(tuple(float(value) for value in parts[-4:]))
        return annotations

    @staticmethod
    def _cell_index(grid_coord, grid_size):
        """Map a continuous grid coordinate to a valid cell index.

        The small epsilon keeps a coordinate that lands exactly on a cell
        border in the lower cell; clamping keeps boxes whose centre falls
        outside the image from indexing past the grid (previously an
        IndexError or a silent wrap into another row).
        """
        return min(max(int(grid_coord - 0.000001), 0), grid_size - 1)

    def _build_target(self, annotations, grid_size):
        """Build the ``(grid_size**2, 5)`` target tensor for one grid.

        Each row is ``[objectness, x_offset, y_offset, box_width, box_height]``
        in pixels of the resized image; rows are indexed ``row * grid_size + col``.
        When two boxes fall into the same cell the later one wins.
        """
        target = np.zeros((grid_size * grid_size, 5), dtype=np.float32)
        shape_w, shape_h = self.YOLO_shape

        for x, y, width, height in annotations:
            # Labels are treated as normalised to [0, 1].
            # NOTE(review): x/y are treated as the top-left corner (half the
            # box size is added to obtain the centre). Standard YOLO label
            # files store the centre already - confirm against the datasets.
            # The pixel-coordinate branch of the original was unreachable
            # (its selector was unconditionally forced to 1) and was dropped.
            x_center = (x + width / 2) * shape_w
            y_center = (y + height / 2) * shape_h
            box_width = width * shape_w
            box_height = height * shape_h

            grid_x = (grid_size / shape_w) * x_center
            grid_y = (grid_size / shape_h) * y_center

            row = self._cell_index(grid_y, grid_size)
            col = self._cell_index(grid_x, grid_size)
            target_idx = row * grid_size + col

            target[target_idx, 0] = 1.0
            target[target_idx, 1] = (grid_x - int(grid_x)) * shape_w / grid_size
            target[target_idx, 2] = (grid_y - int(grid_y)) * shape_h / grid_size
            target[target_idx, 3] = box_width
            target[target_idx, 4] = box_height

        return torch.from_numpy(target).float()

    def __getitem__(self, index):
        """Return ``(image, targets)`` for sample ``index``.

        ``image`` is the RGB image resized to ``YOLO_shape`` (and transformed
        when a transform was given); ``targets`` is a list with one tensor per
        entry of ``grid_sizes``.
        """
        image_path = self.images[index]
        label_path = self.targets[index]

        # convert() yields a fully loaded copy, so the file can be closed here.
        with Image.open(image_path) as handle:
            img = handle.convert("RGB")

        annotations = self._read_annotations(label_path)
        target_list = [self._build_target(annotations, grid_size) for grid_size in self.grid_sizes]

        img = img.resize(self.YOLO_shape)

        if self.transform:
            img = self.transform(img)

        return img, target_list

In [None]:
# Build the combined dataset from both download locations.
dt = Dataset25(root_paths=[path1, path2], transform=transform)

In [None]:
# Random 90% / 10% split into training and validation subsets.
# No generator is passed, so the split differs between runs.
train_data, validation_data = random_split(dataset=dt, lengths=[0.9, 0.1])

In [None]:
# Both loaders share the same batching options.
_loader_options = dict(batch_size=128, shuffle=True, pin_memory=True)
train_loader = DataLoader(train_data, **_loader_options)
validation_loader = DataLoader(validation_data, **_loader_options)

In [None]:
# Number of image/label pairs found across both datasets.
print(len(dt))

In [None]:
class EarlyStopping:
    """Raise a stop flag once the validation loss has stopped improving.

    Call the instance with the latest validation loss after every epoch.
    ``early_stop`` becomes ``True`` after ``patience`` consecutive calls that
    did not beat the best loss seen so far; it is never reset afterwards.

    Args:
        patience: Number of consecutive non-improving calls tolerated.
        verbose: When true, print a message each time the flag is raised.
    """

    def __init__(self, patience=10, verbose=False):
        self.patience, self.verbose = patience, verbose
        self.best_loss = float('inf')
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update the counter and the stop flag."""
        if val_loss < self.best_loss:
            # Strict improvement: remember it and start counting afresh.
            self.best_loss, self.counter = val_loss, 0
            return

        self.counter += 1
        if self.counter < self.patience:
            return

        self.early_stop = True
        if self.verbose:
            print("Early stopping triggered.")

In [None]:
def assign_box_to_best_scale(w, h, THRESH_LARGE, THRESH_MEDIUM):
    """Pick the grid size (13, 26 or 52) responsible for a box of size ``w`` x ``h``.

    The box area is compared against ``THRESH_LARGE`` first and then
    ``THRESH_MEDIUM``; both comparisons are strict. Areas above the large
    threshold map to 13, areas above the medium threshold to 26, and
    everything else to 52.
    """
    area = w * h
    # Ordered from the coarsest grid to the finest; the first match wins.
    for threshold, grid_size in ((THRESH_LARGE, 13), (THRESH_MEDIUM, 26)):
        if area > threshold:
            return grid_size
    return 52

In [None]:
class YoloLoss(nn.Module):
    """Objectness + IoU box loss over three prediction grids, plus an L2 weight penalty.

    Every ground-truth box is credited to exactly one grid, selected from its
    area (see ``assign_box_to_best_scale``). For each grid the loss combines:

    * binary cross-entropy on the objectness logit for cells holding an object
      assigned to that grid (weighted ``2 * lambda_conf``),
    * binary cross-entropy on the objectness logit for empty cells
      (weighted ``lambda_conf``),
    * ``1 - IoU`` between predicted and target boxes (weighted ``lambda_box``),
    * the L2 weight penalty (weighted ``lambda_l2``).

    Args:
        lambda_l2: Weight of the L2 penalty.
        lambda_conf: Weight of the objectness terms.
        lambda_box: Weight of the box term.
        THRESH_MEDIUM: Box area above which a box goes to the 26x26 grid.
        THRESH_LARGE: Box area above which a box goes to the 13x13 grid.
        model: Optional network whose non-bias parameters are penalised. When
            omitted, a module-level variable named ``model`` is looked up at
            call time (what the previous implementation relied on implicitly);
            if neither exists the penalty is zero.
    """

    # Grid sizes, in the order predictions and targets are expected.
    SCALES = (13, 26, 52)

    def __init__(self, lambda_l2=0.001, lambda_conf=5.0, lambda_box = 1.0, THRESH_MEDIUM = 3000, THRESH_LARGE = 10000, model=None):

        super(YoloLoss, self).__init__()
        self.lambda_l2 = lambda_l2
        self.lambda_conf = lambda_conf
        self.lambda_box = lambda_box
        self.bce_loss = nn.BCEWithLogitsLoss(reduction='none')
        # Honour the constructor arguments (they used to be overwritten by
        # hard-coded 3000 / 10000).
        self.THRESH_MEDIUM = THRESH_MEDIUM
        self.THRESH_LARGE = THRESH_LARGE
        # Wrapped in a tuple so nn.Module does not register the network as a
        # sub-module of the loss (which would leak it into state_dict()).
        self._reg_model = (model,)

    def assign_box_to_best_scale(self, w, h, THRESH_MEDIUM, THRESH_LARGE):
        """Return, per box, the grid size (13, 26 or 52) chosen from its area."""
        box_area = w * h
        # full_like keeps the constants on the same device/dtype as the input.
        large = torch.full_like(box_area, 13.0)
        medium = torch.full_like(box_area, 26.0)
        small = torch.full_like(box_area, 52.0)
        return torch.where(box_area > THRESH_LARGE, large, torch.where(box_area > THRESH_MEDIUM, medium, small))

    def compute_iou_vectorized(self, pred_boxes, target_boxes, eps=1e-6):
        """Row-wise IoU of two ``(n, 4)`` tensors of ``(xmin, ymin, xmax, ymax)`` boxes."""
        xmin_intersect = torch.max(pred_boxes[:, 0], target_boxes[:, 0])
        ymin_intersect = torch.max(pred_boxes[:, 1], target_boxes[:, 1])
        xmax_intersect = torch.min(pred_boxes[:, 2], target_boxes[:, 2])
        ymax_intersect = torch.min(pred_boxes[:, 3], target_boxes[:, 3])

        # Non-overlapping boxes yield a negative extent; clamp it to zero.
        width_intersect = torch.clamp(xmax_intersect - xmin_intersect, min=0)
        height_intersect = torch.clamp(ymax_intersect - ymin_intersect, min=0)
        area_intersect = width_intersect * height_intersect

        area_pred = (pred_boxes[:, 2] - pred_boxes[:, 0]) * (pred_boxes[:, 3] - pred_boxes[:, 1])
        area_target = (target_boxes[:, 2] - target_boxes[:, 0]) * (target_boxes[:, 3] - target_boxes[:, 1])

        # The clamp guards against 0/0 (degenerate boxes) and against the
        # negative "areas" a raw prediction with negative width/height produces.
        area_union = torch.clamp(area_pred + area_target - area_intersect, min=eps)
        return area_intersect / area_union

    @staticmethod
    def _mean_or_zero(values):
        """Mean of ``values``; zero (instead of NaN) when nothing was selected."""
        if values.numel() == 0:
            return values.sum()
        return values.mean()

    @staticmethod
    def _to_corners(boxes):
        """Convert rows of ``[_, x, y, w, h]`` into ``(xmin, ymin, xmax, ymax)``."""
        x, y, w, h = boxes[:, 1], boxes[:, 2], boxes[:, 3], boxes[:, 4]
        return torch.stack([x - w / 2, y - h / 2, x + w / 2, y + h / 2], dim=1)

    def _l2_penalty(self, device):
        """Sum of squared non-bias parameters of the regularised network."""
        reg_model = self._reg_model[0]
        if reg_model is None:
            # Backward compatibility: fall back to a module-level ``model``.
            reg_model = globals().get('model')

        penalty = torch.zeros((), device=device)
        if reg_model is None or not hasattr(reg_model, 'named_parameters'):
            return penalty
        for name, param in reg_model.named_parameters():
            if 'bias' not in name:
                penalty = penalty + param.pow(2).sum()
        return penalty

    def forward(self, all_predictions, all_targets):
        """Compute the loss over all grids.

        Args:
            all_predictions: One ``(batch, cells, 5)`` tensor per grid, in
                ``SCALES`` order; channel 0 is the raw objectness logit.
            all_targets: Matching target tensors; channel 0 is 1 for cells
                containing an object and 0 otherwise.

        Returns:
            ``(total_loss, conf_loss, box_loss, l2_loss)``. ``conf_loss`` and
            ``box_loss`` are the weighted components summed over all grids
            (they previously reflected only the last grid); ``l2_loss`` is the
            weighted L2 penalty, which is added to the total once per grid as
            before.
        """
        device = all_predictions[0].device

        # The penalty does not depend on the grid, so compute it once.
        l2_loss = self.lambda_l2 * self._l2_penalty(device)

        total_loss = torch.zeros((), device=device)
        conf_report = torch.zeros((), device=device)
        box_report = torch.zeros((), device=device)

        for predictions, targets, scale in zip(all_predictions, all_targets, self.SCALES):
            # Work in fp32 even when the network ran under fp16 autocast.
            predictions = predictions.float()
            target_conf = targets[..., 0]

            assigned_scale = self.assign_box_to_best_scale(targets[..., 3], targets[..., 4], self.THRESH_MEDIUM, self.THRESH_LARGE)
            object_mask_scale = (target_conf == 1) & (assigned_scale == scale)
            no_object_mask = target_conf == 0

            # BCEWithLogitsLoss applies the sigmoid itself, so it must receive
            # raw logits; feeding sigmoid outputs squashed every probability
            # into roughly [0.5, 0.73].
            conf_logits = predictions[..., 0]
            conf_loss_obj = self._mean_or_zero(self.bce_loss(conf_logits[object_mask_scale], target_conf[object_mask_scale]))
            conf_loss_no_obj = self._mean_or_zero(self.bce_loss(conf_logits[no_object_mask], target_conf[no_object_mask]))

            pred_boxes = predictions[object_mask_scale]
            target_boxes = targets[object_mask_scale]

            if pred_boxes.numel() > 0:
                iou = self.compute_iou_vectorized(self._to_corners(pred_boxes), self._to_corners(target_boxes))
                box_loss = (1 - iou).mean()
            else:
                box_loss = torch.zeros((), device=device)

            total_loss = total_loss + (self.lambda_box * box_loss + self.lambda_conf * 2 * conf_loss_obj + self.lambda_conf * conf_loss_no_obj + l2_loss)
            conf_report = conf_report + self.lambda_conf * (conf_loss_obj + conf_loss_no_obj)
            box_report = box_report + self.lambda_box * box_loss

        return total_loss, conf_report, box_report, l2_loss

In [None]:

def conv_bn_leaky(in_channels, out_channels, kernel_size=3, stride=1, padding=1):
    """Convenience block: Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

    The convolution is created without a bias term.
    """
    convolution = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        bias=False,
    )
    normalisation = nn.BatchNorm2d(out_channels)
    activation = nn.LeakyReLU(negative_slope=0.1, inplace=True)
    return nn.Sequential(convolution, normalisation, activation)

def conv1x1_bn_leaky(in_channels, out_channels):
    """1x1 convolution block used to squeeze or expand the channel count.

    Layout: bias-free 1x1 Conv2d -> BatchNorm2d -> LeakyReLU(0.1).
    """
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
        nn.BatchNorm2d(out_channels),
        nn.LeakyReLU(negative_slope=0.1, inplace=True),
    ]
    return nn.Sequential(*layers)

# ---------------------------
#    YOLO-like Backbone
# ---------------------------
class TinyDarknetBackbone(nn.Module):
    """Simplified backbone reminiscent of Tiny-YOLO / Darknet.

    Five conv + max-pool stages each halve the spatial resolution; a sixth
    conv-only stage widens the deepest feature map. The shape comments below
    assume a 3 x 416 x 416 input.
    """

    def __init__(self):
        super().__init__()

        self.block1 = self._conv_then_pool(3, 16)      # (3 x 416 x 416)  -> (16 x 208 x 208)
        self.block2 = self._conv_then_pool(16, 32)     # (16 x 208 x 208) -> (32 x 104 x 104)
        self.block3 = self._conv_then_pool(32, 64)     # (32 x 104 x 104) -> (64 x 52 x 52)
        self.block4 = self._conv_then_pool(64, 128)    # (64 x 52 x 52)   -> (128 x 26 x 26)
        self.block5 = self._conv_then_pool(128, 256)   # (128 x 26 x 26)  -> (256 x 13 x 13)

        # (256 x 13 x 13) -> (512 x 13 x 13): no pooling, this is the low-res output.
        self.block6 = nn.Sequential(
            conv_bn_leaky(256, 512, 3, stride=1, padding=1)
        )

    @staticmethod
    def _conv_then_pool(in_channels, out_channels):
        """3x3 conv block followed by a 2x2 max-pool (downsamples by two)."""
        return nn.Sequential(
            conv_bn_leaky(in_channels, out_channels, 3, stride=1, padding=1),
            nn.MaxPool2d(2, 2),
        )

    def forward(self, x):
        """Return the feature maps used by the detection head.

        Three maps are returned for the skip connections, coarsest first:
        ``feat_13`` (13x13), ``feat_26`` (26x26) and ``feat_52`` (52x52, taken
        right before the fourth max-pool stage).
        """
        feat_52 = self.block3(self.block2(self.block1(x)))   # (64 x 52 x 52)
        feat_26 = self.block4(feat_52)                       # (128 x 26 x 26)
        feat_13 = self.block6(self.block5(feat_26))          # (512 x 13 x 13)
        return feat_13, feat_26, feat_52

class YoloHead(nn.Module):
    """Detection head: conv blocks followed by a 1x1 conv with ``out_channels`` outputs.

    Three prediction branches are produced (13x13, 26x26, 52x52 for the
    default backbone). Coarser features are reduced with a 1x1 conv, upsampled
    by two and concatenated with the next finer backbone feature map.

    Args:
        in_channels_13: Channels of the coarsest backbone feature map.
        in_channels_26: Channels of the middle backbone feature map.
        in_channels_52: Channels of the finest backbone feature map.
        out_channels: Values predicted per grid cell.
    """
    def __init__(self, in_channels_13=512, in_channels_26=128, in_channels_52=64, 
                 out_channels=5):
        super(YoloHead, self).__init__()

        # Head on the 13x13 map.
        self.conv_13 = nn.Sequential(
            conv_bn_leaky(in_channels_13, 256, 3, 1, 1),
            nn.Conv2d(256, out_channels, 1, 1, 0)
        )

        # Head on the 26x26 map. First reduce the 13x13 features to 128
        # channels and upsample them so they can be merged with feat_26.
        self.skip_13_to_26 = conv1x1_bn_leaky(in_channels_13, 128)
        self.upsample_13 = nn.Upsample(scale_factor=2, mode='nearest')

        self.conv_26 = nn.Sequential(
            conv_bn_leaky(in_channels_26 + 128, 128, 3, 1, 1),
            nn.Conv2d(128, out_channels, 1, 1, 0)
        )

        # Head on the 52x52 map, fed by the merged 26x26 features.
        self.skip_26_to_52 = conv1x1_bn_leaky(in_channels_26 + 128, 64)
        self.upsample_26 = nn.Upsample(scale_factor=2, mode='nearest')

        self.conv_52 = nn.Sequential(
            conv_bn_leaky(in_channels_52 + 64, 64, 3, 1, 1),
            nn.Conv2d(64, out_channels, 1, 1, 0)
        )

    @staticmethod
    def _flatten_grid(feature_map):
        """Reshape ``(N, C, H, W)`` into ``(N, H * W, C)``.

        The channel count is read from the tensor rather than hard-coded to 5,
        so any ``out_channels`` value works.
        """
        batch, channels = feature_map.size(0), feature_map.size(1)
        return feature_map.permute(0, 2, 3, 1).contiguous().view(batch, -1, channels)

    def forward(self, feat_13, feat_26, feat_52):
        """Predict on all three grids.

        Args:
            feat_13: Coarsest features, ``(N, 512, 13, 13)`` by default.
            feat_26: Middle features, ``(N, 128, 26, 26)`` by default.
            feat_52: Finest features, ``(N, 64, 52, 52)`` by default.

        Returns:
            ``[out_13, out_26, out_52]``, each shaped ``(N, cells, out_channels)``.
        """
        out_13 = self.conv_13(feat_13)

        # Reduce feat_13 to 128 channels and upsample x2 -> (128, 26, 26).
        up_13 = self.upsample_13(self.skip_13_to_26(feat_13))

        # Concatenate with feat_26 -> (128 + 128, 26, 26).
        merge_26 = torch.cat([feat_26, up_13], dim=1)
        out_26 = self.conv_26(merge_26)

        # Same recipe for the 52x52 branch -> (64, 52, 52).
        up_26 = self.upsample_26(self.skip_26_to_52(merge_26))

        # Concatenate with feat_52 -> (64 + 64, 52, 52).
        merge_52 = torch.cat([feat_52, up_26], dim=1)
        out_52 = self.conv_52(merge_52)

        return [self._flatten_grid(out) for out in (out_13, out_26, out_52)]


# ---------------------------
#    Итоговая модель
# ---------------------------
class YoloModel(nn.Module):
    """Complete detector: ``TinyDarknetBackbone`` followed by ``YoloHead``.

    Args:
        out_channels: Number of values predicted per grid cell.
    """

    def __init__(self, out_channels=5):
        super().__init__()
        self.backbone = TinyDarknetBackbone()
        # Channel counts mirror what the backbone emits on each grid.
        head_config = dict(
            in_channels_13=512,
            in_channels_26=128,
            in_channels_52=64,
            out_channels=out_channels,
        )
        self.head = YoloHead(**head_config)

    def forward(self, x):
        """Run the backbone and pass its three feature maps to the head."""
        return self.head(*self.backbone(x))



In [None]:

def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Mixed precision is used only when ``device`` is a CUDA device. In that
    case the loss is scaled through ``GradScaler`` before ``backward`` so fp16
    gradients do not underflow; previously a scaler was created but never
    used. On any other device both autocast and the scaler are disabled and
    this is a plain fp32 training step.

    Args:
        model: Network to train; switched to training mode.
        dataloader: Iterable of ``(inputs, targets)`` where ``targets`` is a
            list of tensors (one per grid).
        criterion: Callable returning ``(loss, conf_loss, box_loss, l2_loss)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        The total loss averaged over the number of batches.
    """
    use_amp = torch.device(device).type == 'cuda'
    # NOTE: a new scaler per epoch restarts the loss scale from its default.
    scaler = GradScaler(enabled=use_amp)

    model.train()
    total_loss = 0

    for batch_idx, (inputs, targets) in enumerate(dataloader):

        inputs = inputs.to(device)
        targets = [t.to(device) for t in targets]

        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            outputs = model(inputs)
            loss, conf_loss, box_loss, l2_loss = criterion(outputs, targets)

        # With the scaler disabled these three calls reduce to
        # loss.backward() followed by optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()

        if batch_idx % 12 == 0:
            print(f'Batch {batch_idx}, Total Loss: {loss.item():.4f}, '
                  f'Confidence Loss: {conf_loss.item():.4f}, '
                  f'Box Loss: {box_loss.item():.4f}, '
                  f'L2 Loss: {l2_loss.item():.4f}')

    return total_loss / len(dataloader)



In [None]:
def validate_epoch(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Besides the loss, the gap between the predicted objectness probability
    (sigmoid of channel 0) and the target objectness is collected separately
    for cells that contain an object and for cells that do not.

    Returns:
        ``(avg_loss, avg_object_gap, avg_background_gap)`` where the loss is
        averaged over batches and each gap is the mean absolute difference
        over all collected cells (0 when no such cell was seen).
    """
    model.eval()
    running_loss = 0
    object_gaps = []       # prediction - target for cells holding an object
    background_gaps = []   # prediction - target for empty cells

    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = [t.to(device) for t in targets]

            outputs = model(inputs)
            loss, _conf_loss, _box_loss, _l2_loss = criterion(outputs, targets)
            running_loss += loss.item()

            # One (output, target) pair per prediction grid.
            for output, target in zip(outputs, targets):
                target_conf = target[..., 0]
                predicted_conf = torch.sigmoid(output[..., 0])

                selections = (
                    (target_conf > 0, object_gaps),
                    (target_conf == 0, background_gaps),
                )
                for mask, collected in selections:
                    if mask.any():
                        gap = predicted_conf[mask] - target_conf[mask]
                        collected.extend(gap.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    avg_object_gap = np.mean(np.abs(object_gaps)) if object_gaps else 0
    avg_background_gap = np.mean(np.abs(background_gaps)) if background_gaps else 0

    return avg_loss, avg_object_gap, avg_background_gap

In [None]:
def load_model(checkpoint_path, model_class, optimizer_class=None, scheduler_class=None, device=None):
    """Rebuild a model (and optionally optimizer/scheduler) from a checkpoint.

    The checkpoint must contain ``model_state_dict``, ``epoch`` and ``loss``;
    ``optimizer_state_dict`` is required when ``optimizer_class`` is given and
    ``scheduler_state_dict`` is used when present.

    Args:
        checkpoint_path: File written by ``torch.save``.
        model_class: Zero-argument callable returning a fresh model.
        optimizer_class: Optional callable ``optimizer_class(params)``.
        scheduler_class: Optional callable ``scheduler_class(optimizer)``. It
            is ignored when no optimizer is built, since a scheduler cannot
            exist without one.
        device: Device to map the checkpoint to and move the model onto.

    Returns:
        ``(model, optimizer, scheduler, epoch, loss)``; ``optimizer`` and
        ``scheduler`` are ``None`` when they were not requested or restorable.
    """
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Weights saved from a DataParallel wrapper carry a leading "module.".
    # Strip it only as a prefix: removing the substring anywhere would also
    # corrupt keys such as "submodule.weight".
    prefix = 'module.'
    state_dict = {
        (key[len(prefix):] if key.startswith(prefix) else key): value
        for key, value in checkpoint['model_state_dict'].items()
    }

    model = model_class()
    model.load_state_dict(state_dict)
    model.to(device)

    # The optimizer is created after the model has been moved so its restored
    # state refers to parameters on the target device.
    optimizer = None
    if optimizer_class:
        optimizer = optimizer_class(model.parameters())
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    scheduler = None
    if scheduler_class and optimizer is not None and 'scheduler_state_dict' in checkpoint:
        scheduler = scheduler_class(optimizer)
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

    epoch = checkpoint['epoch']
    loss = checkpoint['loss']

    return model, optimizer, scheduler, epoch, loss

In [None]:
# ---- Training run -----------------------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

model_class = YoloModel
criterion = YoloLoss()
optimizer_class = torch.optim.Adam
scheduler_class = torch.optim.lr_scheduler.ReduceLROnPlateau

# Resume from the checkpoint when one exists; otherwise start from scratch.
# Previously fresh objects were always built and then unconditionally replaced
# by load_model(), which crashed on the very first run (no checkpoint yet).
checkpoint_path = '/kaggle/working/checkpoint.pth'
if os.path.exists(checkpoint_path):
    model, optimizer, scheduler, start_epoch, loss = load_model(
        checkpoint_path,
        model_class,
        optimizer_class,
        scheduler_class,
        device=device
    )
else:
    model = model_class()
    optimizer = optimizer_class(model.parameters(), lr=0.01)
    scheduler = scheduler_class(optimizer, mode='min', factor=0.1, patience=2, min_lr=10**(-12))
    start_epoch, loss = 0, None

# Restart from a fixed learning rate regardless of what the checkpoint stored.
for g in optimizer.param_groups:
    g['lr'] = 0.01

# The module-level name `model` must stay bound to the network being trained:
# YoloLoss reads it to compute the L2 penalty.
model.to(device)

scheduler.patience = 3

early_stopping = EarlyStopping(patience=5, verbose=True)
num_epochs = 10
for epoch in range(num_epochs):

    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)

    # NOTE(review): the plateau scheduler is driven by the training loss while
    # early stopping watches the validation loss - confirm this is intended.
    scheduler.step(train_loss)

    val_loss, avg_prob, avg_prob_neg = validate_epoch(model, validation_loader, criterion, device)

    # Read the rate straight from the optimizer; ReduceLROnPlateau does not
    # offer get_last_lr() in every PyTorch release.
    current_lr = optimizer.param_groups[0]['lr']

    print(f'parameters: Epoch {epoch+1}/{num_epochs}, Average Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}, Probability: {avg_prob}, Negative Probability: {avg_prob_neg:.4f}, Learning Rate: {current_lr}')

    early_stopping(val_loss)
    # NOTE(review): training only stops once the learning rate has also decayed
    # to 1e-10 or below, which cannot happen within ten epochs from 0.01.
    if early_stopping.early_stop and current_lr <= 10**(-10):
        print("Training stopped.")
        break

In [None]:

def save_checkpoint(model, optimizer, scheduler, epoch, loss, filename):
    """Write a training snapshot to ``filename`` with ``torch.save``.

    The stored dictionary holds the epoch, the loss and the state
    dictionaries of the model, optimizer and scheduler.
    """
    snapshot = {'epoch': epoch}
    snapshot['model_state_dict'] = model.state_dict()
    snapshot['optimizer_state_dict'] = optimizer.state_dict()
    snapshot['scheduler_state_dict'] = scheduler.state_dict()
    snapshot['loss'] = loss
    torch.save(snapshot, filename)


# Persist the final training state; `epoch` and `train_loss` hold the values
# from the last iteration of the training loop.
# NOTE(review): this writes to a path relative to the working directory, while
# the loader reads '/kaggle/working/checkpoint.pth' - presumably the same file
# when run on Kaggle; confirm.
save_checkpoint(model, optimizer, scheduler, epoch, train_loss, 'checkpoint.pth')


In [None]:
def preprocess_image(img, device):
    """Add a leading batch dimension to ``img`` and move it to ``device``."""
    return img.unsqueeze(0).to(device)

In [None]:
def process_outputs(outputs, size):
    """Split the first sample's raw predictions into per-cell grids.

    Args:
        outputs: Predictions whose first sample has ``size * size`` cells and
            at least five channels.
        size: Side length of the square grid.

    Returns:
        ``[confidence, positions, sizes]`` where ``confidence`` is the sigmoid
        of channel 0 shaped ``(size, size)``, and ``positions`` (channels 1-2)
        and ``sizes`` (channels 3-4) are shaped ``(size, size, 2)``.
    """
    first_sample = outputs[0]
    confidence = torch.sigmoid(first_sample[..., 0]).reshape(size, size)
    positions = first_sample[..., 1:3].reshape(size, size, 2)
    sizes = first_sample[..., 3:5].reshape(size, size, 2)
    return [confidence, positions, sizes]

In [None]:
def non_max_suppression(boxes, iou_threshold):
    """Greedy non-maximum suppression.

    Args:
        boxes: Sequence of ``(x_min, y_min, x_max, y_max, score)`` entries.
        iou_threshold: A box is discarded when its IoU with an already
            selected, higher-scoring box is at or above this value.

    Returns:
        The surviving boxes, ordered by descending score.
    """
    if not boxes:
        return []

    remaining = sorted(boxes, key=lambda candidate: candidate[4], reverse=True)
    kept = []
    while remaining:
        # The head of the list is always the best remaining candidate.
        best, *others = remaining
        kept.append(best)
        remaining = [other for other in others if iou(best, other) < iou_threshold]
    return kept


In [None]:
def iou(box1, box2):
    """Intersection over union of two ``(x_min, y_min, x_max, y_max, score)`` boxes.

    The score entry is ignored. A small constant in the denominator avoids a
    division by zero for degenerate boxes.
    """
    ax0, ay0, ax1, ay1, _score_a = box1
    bx0, by0, bx1, by1, _score_b = box2

    # Extent of the overlap along each axis; negative means no overlap.
    overlap_w = max(min(ax1, bx1) - max(ax0, bx0), 0)
    overlap_h = max(min(ay1, by1) - max(ay0, by0), 0)
    intersection = overlap_w * overlap_h

    union = (ax1 - ax0) * (ay1 - ay0) + (bx1 - bx0) * (by1 - by0) - intersection
    return intersection / (union + 1e-6)

In [None]:
def draw_bounding_box(img, params, device, conf_threshold=0.999, iou_threshold=0.2):
    """Draw the detections of all grids onto the de-normalised input image.

    Args:
        img: Normalised image batch ``(1, 3, H, W)``; only the first sample
            is drawn.
        params: One ``(confidence, positions, sizes, size)`` entry per grid,
            as produced by ``process_outputs`` plus the grid side length.
        device: Unused; kept so existing callers keep working.
        conf_threshold: Minimum objectness probability for a cell to produce
            a box.
        iou_threshold: Overlap threshold passed to ``non_max_suppression``.

    Returns:
        ``uint8`` image of shape ``(H, W, 3)`` with the surviving boxes drawn.
    """
    img = img.detach().cpu()

    # Invert the Normalize(mean=0.5, std=0.5) step of the module-level
    # `transform`. The ImageNet statistics used before did not match it and
    # distorted the colours.
    img_denorm = (img * 0.5 + 0.5).clamp(0.0, 1.0)

    # (1, 3, H, W) -> contiguous (H, W, 3) so OpenCV can draw on it.
    img_np = np.ascontiguousarray(np.transpose(img_denorm.numpy()[0], (1, 2, 0)))
    img_np = (img_np * 255).astype(np.uint8)
    frame_h, frame_w = img_np.shape[:2]

    all_boxes = []

    for confidence, positions, sizes, size in params:
        confidence = confidence.detach().cpu()
        positions = positions.detach().cpu()
        sizes = sizes.detach().cpu()

        cell_w = frame_w / size
        cell_h = frame_h / size

        # Visit only the cells above the threshold instead of the whole grid.
        rows, cols = torch.nonzero(confidence > conf_threshold, as_tuple=True)
        for row, col in zip(rows.tolist(), cols.tolist()):
            cell_position = positions[row, col]
            cell_size = sizes[row, col]
            if not (torch.isfinite(cell_position).all() and torch.isfinite(cell_size).all()):
                continue

            offset_x, offset_y = cell_position.tolist()
            box_w, box_h = cell_size.tolist()

            # Mirror the Dataset25 target encoding: cells are laid out as
            # row * size + col, and the x/y offsets are measured in pixels
            # from the cell's top-left corner.
            # NOTE(review): the previous decode started from (index + 1) and
            # paired the x offset/width with the row axis; confirm the
            # corrected geometry against a trained checkpoint.
            center_x = col * cell_w + offset_x
            center_y = row * cell_h + offset_y

            # Clip to the frame so OpenCV always receives drawable integers.
            x_min = int(min(max(center_x - box_w / 2, 0), frame_w - 1))
            y_min = int(min(max(center_y - box_h / 2, 0), frame_h - 1))
            x_max = int(min(max(center_x + box_w / 2, 0), frame_w - 1))
            y_max = int(min(max(center_y + box_h / 2, 0), frame_h - 1))

            all_boxes.append((x_min, y_min, x_max, y_max, confidence[row, col].item()))

    selected_boxes = non_max_suppression(all_boxes, iou_threshold=iou_threshold)

    for x_min, y_min, x_max, y_max, _ in selected_boxes:
        img_np = cv2.rectangle(img_np, (x_min, y_min), (x_max, y_max), (255, 0, 0), 10)

    return img_np

In [None]:
def video_processing(model, path, sizes, transform = None, size = (416, 416), stream = cv2.COLOR_BGR2GRAY, device = 'cuda'):
    """Run the detector on every frame of a video and write an annotated copy.

    The result is written to ``/kaggle/working/output_video2.mp4`` with the
    frame size and frame rate of the source.

    Args:
        model: Detector returning one prediction tensor per grid.
        path: Path of the input video.
        sizes: Grid side lengths, in the order the model returns its outputs.
        transform: Callable turning a PIL image into a normalised tensor.
            Required.
        size: ``(width, height)`` frames are resized to before inference.
        stream: Unused; kept for backward compatibility.
        device: Device used for inference.

    Raises:
        ValueError: If ``transform`` is ``None``.
    """
    # Fail before touching any resources instead of on the first frame.
    if transform is None:
        raise ValueError("processing is anavailable without data transformation")

    model.to(device)
    model.eval()
    cap = cv2.VideoCapture(path)

    if not cap.isOpened():
        print("Ошибка при открытии видеофайла")
        cap.release()
        return

    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fps = cap.get(cv2.CAP_PROP_FPS)
    out = cv2.VideoWriter('/kaggle/working/output_video2.mp4', cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR, whereas the training images are
            # loaded as RGB; convert so the model sees the channel order it
            # was trained on.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            resized_frame = Image.fromarray(rgb_frame).resize(size)

            frame_tensor = preprocess_image(transform(resized_frame).float(), device)

            with torch.no_grad():
                outputs = model(frame_tensor)

            param = [process_outputs(output, sizes[index]) + [sizes[index]] for index, output in enumerate(outputs)]
            img = draw_bounding_box(frame_tensor, param, device)

            img = cv2.resize(img, (frame_width, frame_height))

            if len(img.shape) == 2:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
            else:
                # Back to BGR, the channel order VideoWriter expects.
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)

            out.write(img.astype(np.uint8))
    finally:
        # Release both handles even when a frame fails, so the output file is
        # finalised and the capture device is freed.
        cap.release()
        out.release()


In [None]:
# ---- Inference on a video ---------------------------------------------------
model_class = YoloModel
criterion = YoloLoss()
optimizer_class = torch.optim.Adam
scheduler_class = torch.optim.lr_scheduler.ReduceLROnPlateau

checkpoint_path = '/kaggle/working/checkpoint.pth'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only the weights are needed for inference, so no optimizer or scheduler is
# rebuilt from the checkpoint.
model, _, _, _, _ = load_model(
    checkpoint_path,
    model_class,
    device = device
)

# Pass the detected device explicitly: video_processing defaults to 'cuda',
# which fails on a machine without a GPU.
a = video_processing(model, '/kaggle/input/vi2fcd/download (5)', [13, 26, 52], transform, device = device)