# Лабораторная работа №7 (Проведение исследований моделями семантической сегментации)
## Выполнил студент группы М8О-406Б-21, Орусский В.Р.

### Выбор исходных данных и обоснование

При выполнении данной ЛР использовался датасет [BDD100K](https://www.kaggle.com/datasets/solesensei/solesensei_bdd100k/), содержащий аннотированные изображения дорожных сцен для систем автономного вождения.

Датасет предназначен для распознавания объектов на дороге: тротуаров, дорожных знаков, автомобилей и других объектов на дорогах общего пользования. В нём собраны видео с дорожной обстановкой, каждое в среднем по 40 секунд с частотой кадров 30 fps (то есть около 1200 кадров на видео), снятые в разрешении 720p (1280x720 px). Помимо этого, в датасет включены данные GPS, показывающие примерную траекторию движения. Датасет охватывает различные погодные условия и время суток (день / ночь).

Датасет выбран для задачи сегментации объектов в дорожной обстановке, что необходимо для создания автопилота (продвинутого круиз-контроля).

### Импорт библиотек

In [None]:
# Data process
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# AI
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torchvision
import torch.nn.functional as F
from torch.autograd import Variable

# Computer Vision
from PIL import Image
import cv2
#import albumentations as A


# utils 
import time
import os
from tqdm.notebook import tqdm
from pathlib import Path


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [None]:
%pip install -q segmentation-models-pytorch
%pip install -q torchsummary

from torchsummary import summary
import segmentation_models_pytorch as smp


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Загружаем открытый датасет с помощью `kagglehub` и запоминаем путь к каталогу с изображениями и масками для дальнейшей работы с выборками.

In [16]:
import kagglehub

# Download the dataset (or reuse the local kagglehub copy) and point `path`
# at the semantic-segmentation subset inside it.
dataset_root = kagglehub.dataset_download("solesensei/solesensei_bdd100k")
path = os.path.join(dataset_root, 'bdd100k_seg/bdd100k/seg')

print("Path to auto dataset:", path)

Path to auto dataset: C:\Users\slava\.cache\kagglehub\datasets\solesensei\solesensei_bdd100k\versions\2\bdd100k_seg/bdd100k/seg


In [17]:
# Image and mask directories of the training split.
TRAIN_IMAGES, TRAIN_MASKS = (
    os.path.join(path, subdir) for subdir in ('images/train', 'labels/train')
)

# Image and mask directories of the validation split.
VAL_IMAGES, VAL_MASKS = (
    os.path.join(path, subdir) for subdir in ('images/val', 'labels/val')
)

def check_path(path_name: str) -> None:
    """Verify that ``path_name`` exists on disk.

    Args:
        path_name: File or directory path to check.

    Raises:
        FileNotFoundError: If nothing exists at ``path_name``. This is a
            subclass of ``Exception``, so callers that caught the previous
            generic exception keep working.
    """
    if not os.path.exists(path_name):
        raise FileNotFoundError(f"Пути {path_name} не существует")

# Fail early if any of the four split directories is missing.
for required_dir in (TRAIN_IMAGES, TRAIN_MASKS, VAL_IMAGES, VAL_MASKS):
    check_path(required_dir)

### Размеры выборки

In [None]:
# Directory listings of every split (file names only, in OS order).
TRAIN_IMAGES_LIST = os.listdir(TRAIN_IMAGES)
TRAIN_MASKS_LIST = os.listdir(TRAIN_MASKS)

VAL_IMAGES_LIST = os.listdir(VAL_IMAGES)
VAL_MASKS_LIST = os.listdir(VAL_MASKS)

# Every image needs exactly one mask. Raise explicitly instead of using
# `assert`, which is stripped when Python runs with -O.
for split_name, image_names, mask_names in (
    ("train", TRAIN_IMAGES_LIST, TRAIN_MASKS_LIST),
    ("val", VAL_IMAGES_LIST, VAL_MASKS_LIST),
):
    if len(image_names) != len(mask_names):
        raise ValueError(
            f"{split_name}: {len(image_names)} images but {len(mask_names)} masks"
        )

print("Размер тренировочной выборки:", len(TRAIN_IMAGES_LIST))
print("Размер валидационной выборки:", len(VAL_IMAGES_LIST))

Размер тренировочной выборки: 7000
Размер валидационной выборки: 1000


Создаём класс для работы с датасетом: он хранит пути к файлам, применяет преобразования и возвращает пары «изображение — маска».

In [37]:
class BDD100KDataset(Dataset):
    """Paired image / segmentation-mask dataset read from two directories.

    Both directory listings are sorted and paired by position, so the i-th
    image is matched with the i-th mask. The constructor therefore requires
    both directories to contain the same number of entries.

    Args:
        image_dir: Directory with the input images.
        mask_dir: Directory with the label masks.
        transform: Optional callable applied to the RGB PIL image.
        mask_transform: Optional callable applied to the PIL mask.

    Raises:
        ValueError: If the two directories hold a different number of files,
            which would silently misalign images and masks.
    """

    def __init__(self, image_dir, mask_dir, transform=None, mask_transform=None):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.mask_transform = mask_transform
        self.images = sorted(os.listdir(image_dir))
        self.masks = sorted(os.listdir(mask_dir))

        if len(self.images) != len(self.masks):
            raise ValueError(
                f"Image/mask count mismatch: {len(self.images)} files in "
                f"{image_dir!r} vs {len(self.masks)} files in {mask_dir!r}"
            )

    def __len__(self):
        """Return the number of (image, mask) pairs."""
        return len(self.images)

    def __getitem__(self, idx):
        """Load the ``idx``-th pair.

        Returns:
            A tuple ``(image, mask)``. ``image`` is the RGB PIL image, or the
            output of ``transform`` when one is set. ``mask`` is an int64
            tensor built from the (optionally transformed) mask pixels.
        """
        img_path = os.path.join(self.image_dir, self.images[idx])
        mask_path = os.path.join(self.mask_dir, self.masks[idx])

        # Context managers close the underlying files right away; convert()
        # and copy() both return fully loaded in-memory images.
        with Image.open(img_path) as image_file:
            image = image_file.convert("RGB")
        with Image.open(mask_path) as mask_file:
            mask = mask_file.copy()

        if self.transform:
            image = self.transform(image)
        if self.mask_transform:
            mask = self.mask_transform(mask)

        mask = torch.from_numpy(np.array(mask)).long()
        return image, mask

Функция для отображения примеров изображения и маски из датасета

In [None]:
def show_images(dataset: BDD100KDataset, cnt_images: int = 5):
    """Plot the first ``cnt_images`` samples as image / mask rows.

    Each row shows the image on the left and its mask on the right; the
    figure is displayed immediately with ``plt.show()``.

    Args:
        dataset: Dataset whose items are ``(image, mask)`` pairs.
        cnt_images: Number of leading samples to draw.
    """
    plt.figure(figsize=(14, 5 * cnt_images))

    for row in range(cnt_images):
        picture, label_map = dataset[row]
        panels = (
            (picture, f'Изображение {row}'),
            (np.array(label_map).squeeze(), f'Маска {row}'),
        )

        # Subplot indices are 1-based: odd slots hold images, even slots masks.
        for column, (content, caption) in enumerate(panels, start=1):
            plt.subplot(cnt_images, 2, row * 2 + column)
            plt.imshow(content)
            plt.title(caption)

    plt.tight_layout()
    plt.show()

In [None]:
# Preview raw samples: with no transforms the dataset yields the untouched
# PIL image next to its mask tensor.
preview_dataset = BDD100KDataset(TRAIN_IMAGES, TRAIN_MASKS)
show_images(preview_dataset, 3)

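Предобработка данных. Выполняем только детерминированные преобразования (изменение размера и нормализация), поэтому синхронизировать случайные параметры между изображением и маской не требуется.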

In [None]:
# Deterministic image preprocessing: resize, convert to a float tensor and
# normalise per channel (these are the commonly used ImageNet statistics,
# matching the 'imagenet' encoder weights used later in this notebook).
image_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Masks hold class ids, so they must be resized with nearest-neighbour
# interpolation; any blending would create ids that do not exist.
# InterpolationMode is the documented argument type for Resize; the bare PIL
# integer constant triggers a deprecation warning on some torchvision releases.
mask_transform = transforms.Compose([
    transforms.Resize((256, 256), interpolation=transforms.InterpolationMode.NEAREST)
])

In [None]:
# Number of samples per batch, shared by the training and validation loaders.
BATCH_SIZE = 8

train_dataset = BDD100KDataset(TRAIN_IMAGES, TRAIN_MASKS, transform=image_transform, mask_transform=mask_transform)
val_dataset = BDD100KDataset(VAL_IMAGES, VAL_MASKS, transform=image_transform, mask_transform=mask_transform)

# Use the BATCH_SIZE constant rather than a repeated literal so the batch size
# is changed in one place. Only the training loader is shuffled.
# NOTE(review): num_workers > 0 spawns worker processes; on Windows notebooks a
# dataset class defined in the notebook itself may fail to unpickle there.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

In [None]:
# Import necessary libraries
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import segmentation_models_pytorch as smp
from sklearn.metrics import jaccard_score, precision_score, recall_score, f1_score
import numpy as np
import os
from PIL import Image
import matplotlib.pyplot as plt


# Number of output classes used for the model heads and the metric computation
# (the BDD100K semantic segmentation task defines 19 classes).
NUM_CLASSES = 19


# Evaluation metrics
def evaluate_model(model, data_loader, device, num_classes=None, ignore_index=255):
    """Compute dataset-level segmentation metrics for ``model``.

    A single confusion matrix is accumulated over every batch and the metrics
    are derived from it once at the end. Compared with scoring each image
    separately this:

    * does not penalise an image for classes that simply do not occur in it;
    * excludes pixels labelled ``ignore_index`` (and any label outside
      ``[0, num_classes)``) instead of counting them as errors;
    * avoids several full passes over the pixels of every image.

    Args:
        model: Network mapping an image batch to per-class logits of shape
            ``(batch, classes, H, W)``.
        data_loader: Iterable of ``(images, masks)`` batches, with ``masks``
            holding integer class ids of shape ``(batch, H, W)``.
        device: Device the batches are moved to before the forward pass.
        num_classes: Number of classes; defaults to the module-level
            ``NUM_CLASSES`` when omitted.
        ignore_index: Label value excluded from every metric, or ``None`` to
            disable. NOTE(review): 255 is assumed to be the void label of the
            mask files - confirm against the dataset documentation.

    Returns:
        Dict with float values for ``'mIoU'``, ``'Precision'``, ``'Recall'``
        and ``'F1'``. Each one is a macro average over the classes that occur
        in the ground truth or in the predictions; a zero denominator counts
        as 0. All values are 0.0 when no valid pixel was seen.
    """
    if num_classes is None:
        num_classes = NUM_CLASSES

    model.eval()
    # Rows index the ground-truth class, columns the predicted class.
    confusion = torch.zeros((num_classes, num_classes), dtype=torch.int64)

    with torch.no_grad():
        for images, masks in data_loader:
            images = images.to(device)
            masks = masks.to(device).long()
            preds = torch.argmax(model(images), dim=1)

            valid = (masks >= 0) & (masks < num_classes)
            if ignore_index is not None:
                valid &= masks != ignore_index

            # Encode each (truth, prediction) pair as one integer so a single
            # bincount yields the whole batch confusion matrix.
            flat_index = masks[valid] * num_classes + preds[valid]
            counts = torch.bincount(flat_index, minlength=num_classes * num_classes)
            confusion += counts.view(num_classes, num_classes).cpu()

    confusion = confusion.to(torch.float64)
    true_pos = confusion.diag()
    gt_count = confusion.sum(dim=1)
    pred_count = confusion.sum(dim=0)
    union = gt_count + pred_count - true_pos

    # Only classes that appear in the labels or the predictions are averaged.
    seen = union > 0
    if not bool(seen.any()):
        return {'mIoU': 0.0, 'Precision': 0.0, 'Recall': 0.0, 'F1': 0.0}

    def _ratio(numerator, denominator):
        # Division that yields 0 where the denominator is 0.
        safe = denominator.clamp(min=1.0)
        return torch.where(denominator > 0, numerator / safe, torch.zeros_like(numerator))

    per_class = {
        'mIoU': _ratio(true_pos, union),
        'Precision': _ratio(true_pos, pred_count),
        'Recall': _ratio(true_pos, gt_count),
        'F1': _ratio(2 * true_pos, gt_count + pred_count),
    }
    return {name: float(values[seen].mean()) for name, values in per_class.items()}

# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device,
                checkpoint_path='best_model.pth'):
    """Train ``model`` and keep the weights of its best validation epoch.

    After every epoch the model is scored with ``evaluate_model`` on
    ``val_loader``; whenever the validation mIoU improves, the model's
    ``state_dict`` is written to ``checkpoint_path``.

    Args:
        model: Network to optimise (modified in place).
        train_loader: Iterable of ``(images, masks)`` training batches.
        val_loader: Iterable of validation batches passed to ``evaluate_model``.
        criterion: Loss called as ``criterion(outputs, masks)``.
        optimizer: Optimiser built over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        checkpoint_path: Destination of the best weights. Pass a distinct path
            per experiment; with the default every run overwrites the same
            file.

    Returns:
        The best validation mIoU observed (0.0 if it never exceeded zero, in
        which case no checkpoint is written).
    """
    best_miou = 0.0
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        num_batches = 0
        for images, masks in train_loader:
            images, masks = images.to(device), masks.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            num_batches += 1

        # Count batches while iterating so an empty loader reports a loss of 0
        # instead of dividing by zero.
        mean_loss = running_loss / max(num_batches, 1)

        # Evaluate on validation set
        metrics = evaluate_model(model, val_loader, device)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}, "
              f"mIoU: {metrics['mIoU']:.4f}, Precision: {metrics['Precision']:.4f}, "
              f"Recall: {metrics['Recall']:.4f}, F1: {metrics['F1']:.4f}")

        # Save best model
        if metrics['mIoU'] > best_miou:
            best_miou = metrics['mIoU']
            torch.save(model.state_dict(), checkpoint_path)

    return best_miou

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Label value of pixels that carry no class and must not contribute to the loss.
# NOTE(review): 255 is assumed to be the void label of the BDD100K train-id
# masks - confirm against the dataset documentation. Without ignore_index such
# targets fall outside [0, NUM_CLASSES) and CrossEntropyLoss rejects them.
IGNORE_INDEX = 255

# 2. Create and Evaluate Baseline
# Define baseline model (U-Net with ResNet34 backbone)
baseline_model = smp.Unet(
    encoder_name='resnet34',
    encoder_weights='imagenet',
    in_channels=3,
    classes=NUM_CLASSES
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
optimizer = torch.optim.Adam(baseline_model.parameters(), lr=0.001)

# Train baseline
print("Training Baseline U-Net...")
train_model(baseline_model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device)

# Evaluate baseline
baseline_metrics = evaluate_model(baseline_model, val_loader, device)
print("Baseline Metrics:", baseline_metrics)

# 3. Improve Baseline
# Image-only pipeline: it may contain photometric augmentations only, because
# the dataset applies it to the image without touching the mask. A random flip
# or rotation here would leave the labels pointing at the wrong pixels.
improved_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


class HorizontalFlipPairs(Dataset):
    """Apply one shared random horizontal flip to each (image, mask) pair.

    Wraps a dataset whose items are ``(image, mask)`` tensors with the width
    as the last dimension and flips both with probability ``p``, so the two
    stay aligned.
    """

    def __init__(self, base_dataset, p=0.5):
        self.base_dataset = base_dataset
        self.p = p

    def __len__(self):
        return len(self.base_dataset)

    def __getitem__(self, idx):
        image, mask = self.base_dataset[idx]
        # One random draw decides the flip for both tensors.
        if torch.rand(1).item() < self.p:
            image = torch.flip(image, dims=[-1])
            mask = torch.flip(mask, dims=[-1])
        return image, mask


# Reload dataset with augmentations
# NOTE(review): the former image-only RandomRotation was dropped; a rotation
# has to be applied jointly to image and mask (filling the mask corners with
# the void label) before it can be re-enabled.
train_dataset_improved = HorizontalFlipPairs(
    BDD100KDataset(TRAIN_IMAGES, TRAIN_MASKS, transform=improved_transform, mask_transform=mask_transform),
    p=0.5,
)
train_loader_improved = DataLoader(train_dataset_improved, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Define improved model (DeepLabV3+ with ResNet50 backbone)
improved_model = smp.DeepLabV3Plus(
    encoder_name='resnet50',
    encoder_weights='imagenet',
    in_channels=3,
    classes=NUM_CLASSES
).to(device)

optimizer_improved = torch.optim.Adam(improved_model.parameters(), lr=0.0005)

# Train improved model
print("Training Improved DeepLabV3+...")
train_model(improved_model, train_loader_improved, val_loader, criterion, optimizer_improved, num_epochs=10, device=device)

# Evaluate improved model
improved_metrics = evaluate_model(improved_model, val_loader, device)
print("Improved Metrics:", improved_metrics)

# Compare results
print("Baseline vs Improved:")
print(f"Baseline mIoU: {baseline_metrics['mIoU']:.4f}, Improved mIoU: {improved_metrics['mIoU']:.4f}")
print(f"Baseline F1: {baseline_metrics['F1']:.4f}, Improved F1: {improved_metrics['F1']:.4f}")

# 4. Implement Custom Model
class CustomUNet(nn.Module):
    """Three-level U-Net built from plain convolution blocks.

    The encoder widens the features 64 -> 128 -> 256 with a 2x2 max-pool
    between levels, a 512-channel bottleneck follows, and the decoder mirrors
    the encoder with transposed convolutions and skip connections. A final
    1x1 convolution produces ``out_channels`` logits per pixel.

    Inputs whose height or width is not divisible by 8 are supported: an
    upsampled map that differs in size from its skip connection is resized to
    match before concatenation. Sizes divisible by 8 take the exact same path
    as before.

    Args:
        in_channels: Number of channels of the input image.
        out_channels: Number of output classes (logit channels).
    """

    def __init__(self, in_channels=3, out_channels=NUM_CLASSES):
        super().__init__()

        self.encoder1 = self._conv_block(in_channels, 64)
        self.encoder2 = self._conv_block(64, 128)
        self.encoder3 = self._conv_block(128, 256)
        self.pool = nn.MaxPool2d(2, 2)

        self.bottleneck = self._conv_block(256, 512)

        # Each decoder block takes twice the channels of its upconv output,
        # because the matching encoder features are concatenated to it.
        self.upconv3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.decoder3 = self._conv_block(512, 256)
        self.upconv2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.decoder2 = self._conv_block(256, 128)
        self.upconv1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.decoder1 = self._conv_block(128, 64)

        self.final_conv = nn.Conv2d(64, out_channels, kernel_size=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return two 3x3 conv + batch-norm + ReLU layers preserving spatial size."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    @staticmethod
    def _merge(upsampled, skip):
        """Concatenate ``upsampled`` with ``skip`` along the channel axis.

        Pooling floors odd sizes, so the upsampled map can be smaller than the
        skip connection; it is resized to the skip's size in that case.
        """
        if upsampled.shape[-2:] != skip.shape[-2:]:
            upsampled = nn.functional.interpolate(
                upsampled, size=skip.shape[-2:], mode='bilinear', align_corners=False
            )
        return torch.cat((upsampled, skip), dim=1)

    def forward(self, x):
        """Map an image batch ``(N, in_channels, H, W)`` to logits ``(N, out_channels, H, W)``."""
        # Encoder
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool(enc1))
        enc3 = self.encoder3(self.pool(enc2))

        # Bottleneck
        bottleneck = self.bottleneck(self.pool(enc3))

        # Decoder
        dec3 = self.decoder3(self._merge(self.upconv3(bottleneck), enc3))
        dec2 = self.decoder2(self._merge(self.upconv2(dec3), enc2))
        dec1 = self.decoder1(self._merge(self.upconv1(dec2), enc1))

        return self.final_conv(dec1)

# Train custom model
# The hand-written U-Net is trained from scratch on the same loader and with
# the same learning rate as the baseline, reusing the `criterion` defined
# earlier in this cell.
custom_model = CustomUNet(in_channels=3, out_channels=NUM_CLASSES).to(device)
optimizer_custom = torch.optim.Adam(custom_model.parameters(), lr=0.001)

print("Training Custom U-Net...")
train_model(custom_model, train_loader, val_loader, criterion, optimizer_custom, num_epochs=10, device=device)

# Evaluate custom model
custom_metrics = evaluate_model(custom_model, val_loader, device)
print("Custom Model Metrics:", custom_metrics)

# Compare with baseline
print("Baseline vs Custom:")
print(f"Baseline mIoU: {baseline_metrics['mIoU']:.4f}, Custom mIoU: {custom_metrics['mIoU']:.4f}")
print(f"Baseline F1: {baseline_metrics['F1']:.4f}, Custom F1: {custom_metrics['F1']:.4f}")

# Apply improved techniques to custom model
# A fresh instance is trained on the augmented loader with the lower learning
# rate that was used for the improved baseline.
custom_model_improved = CustomUNet(in_channels=3, out_channels=NUM_CLASSES).to(device)
optimizer_custom_improved = torch.optim.Adam(custom_model_improved.parameters(), lr=0.0005)

print("Training Improved Custom U-Net...")
train_model(custom_model_improved, train_loader_improved, val_loader, criterion, optimizer_custom_improved, num_epochs=10, device=device)

# Evaluate improved custom model
custom_improved_metrics = evaluate_model(custom_model_improved, val_loader, device)
print("Improved Custom Model Metrics:", custom_improved_metrics)

# Compare with improved baseline
print("Improved Baseline vs Improved Custom:")
print(f"Improved Baseline mIoU: {improved_metrics['mIoU']:.4f}, Improved Custom mIoU: {custom_improved_metrics['mIoU']:.4f}")
print(f"Improved Baseline F1: {improved_metrics['F1']:.4f}, Improved Custom F1: {custom_improved_metrics['F1']:.4f}")