In [1]:
# pip install --upgrade albumentations

In [2]:
import torch
import torch.nn as nn
from torch import optim

import torchvision
from torchvision import transforms, models

from albumentations import HorizontalFlip, RandomBrightnessContrast, Resize, Compose
from albumentations.pytorch import ToTensorV2
import numpy as np

import os
import cv2 as cv
from PIL import Image
from collections import Counter
import matplotlib.pyplot as plt

In [3]:
import zipfile

# Path to the ZIP file
zip_path = r"C:\Users\4312239\Downloads\archive (3).zip"
extract_to = "cityscapes"  # Folder where files will be extracted

# Open the ZIP file and extract its contents
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

print("Extraction complete!")

Extraction complete!


In [4]:
# Dataset paths
DATASET_DIR = r"C:\Users\4312239\Downloads\cityscapes"

train_images = os.path.join(DATASET_DIR, "train/img")
train_labels = os.path.join(DATASET_DIR, "train/label")

val_images = os.path.join(DATASET_DIR, "val/img")
val_labels = os.path.join(DATASET_DIR, "val/label")

In [5]:
# Define custom dataset
class SegmentationDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, lbl_dir, augmentations=None):

        # Path to the folder containing images & masks
        self.img_dir = img_dir
        self.lbl_dir = lbl_dir

        self.augmentations = augmentations

        # Store the sorted list of filenames for images and masks to ensure they are correctly paired
        self.images = sorted(os.listdir(img_dir))
        self.labels = sorted(os.listdir(lbl_dir))

    def __len__(self):

        # Returns the number of images
        return len(self.images)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.images[idx]) # Get image path
        lbl_path = os.path.join(self.lbl_dir, self.labels[idx]) # Get label path
        img = cv.imread(img_path) # Read the image
        lbl = cv.imread(lbl_path, cv.IMREAD_GRAYSCALE) # Read the label in grayscale

        # If augmentations are provided
        if self.augmentations:
            augmented = self.augmentations(image=img, mask=lbl)
            img, lbl = augmented["image"], augmented["mask"]

        # divide by 255 to normalize them into the range [0, 1]
        img = img.float()/255.0

        return img, lbl

In [6]:
# preprocessing and augmentation
def get_augmentations(train=True):
    if train:
        return Compose([
            Resize(96, 256),  # Resize images to (256, 512)
            HorizontalFlip(p=0.5), # Flip images horizontally with 50% probability
            RandomBrightnessContrast(p=0.2), # Randomly change brightness/contrast with 20% probability
            ToTensorV2() # Convert image/mask to PyTorch tensors
        ])
    else: # No data augmentation for validation (Only preprocessing)
        return Compose([
            Resize(96, 256),
            ToTensorV2()
        ])

In [7]:
# Load datasets
train_augmentations = get_augmentations(train=True)
val_augmentations = get_augmentations(train=False)

train_dataset = SegmentationDataset(train_images, train_labels, augmentations=train_augmentations)
val_dataset = SegmentationDataset(val_images, val_labels, augmentations=val_augmentations)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=8, shuffle=False)

In [8]:
# Define the Fully Convolutional Network (FCN)
# VGG16 as a feature extractor (encoder) & custom decoder to reconstructs the segmentation
class FCN(nn.Module):
    def __init__(self, num_classes):
        super(FCN, self).__init__()
        backbone = models.vgg16(weights=models.VGG16_Weights.DEFAULT)

        # remove fully connected layers & keep only convolutional layers
        self.encoder = backbone.features

        self.decoder = nn.Sequential(
          # nn.ConvTranspose2d(Input channels, Output channels, kernel_size, stride, padding, output_padding)

            nn.ConvTranspose2d(512, 256, 3, (2, 1), 1, (1, 0)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(256, 128, 3, (2, 1), 1, (1, 0)),
            nn.ReLU(inplace=True),
            nn.ConvTranspose2d(128, 64, 3, (2, 1), 1, (1, 0)),
            nn.ReLU(inplace=True),

            # reduce the number of channels to match the number of segmentation classes
            nn.Conv2d(64, num_classes, 1)
        )

    # x = input image [batch_size, channels, height, width]
    def forward(self, x):

        # Extracts features using VGG16 (Encoder)
        features = self.encoder(x)

        # Passing features through the Decoder
        segmentation_map = self.decoder(features)
        segmentation_map = nn.functional.interpolate(segmentation_map, size=(96, 256), mode='bilinear', align_corners=False)

        return segmentation_map

In [9]:
model = FCN(num_classes=19)
print(model)

FCN(
  (encoder): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1,

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(device)

cuda


In [13]:
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [15]:
# Training loop
num_epochs = 10
num_classes = 19

for epoch in range(num_epochs):
    train_loss = 0.0

    for images, masks in train_loader:

        # Ensurs the pixel values in masks are in the range [0, num_classes - 1].
        masks = torch.clamp(masks, 0, num_classes-1)
        images, masks = images.float().to(device), masks.long().to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = loss_func(outputs, masks)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss/len(train_loader):.4f}")

print('Training complete!')

Epoch [1/10], Loss: 0.2123
Epoch [2/10], Loss: 0.1238
Epoch [3/10], Loss: 0.1190
Epoch [4/10], Loss: 0.1172
Epoch [5/10], Loss: 0.1148
Epoch [6/10], Loss: 0.1131
Epoch [7/10], Loss: 0.1109
Epoch [8/10], Loss: 0.1087
Epoch [9/10], Loss: 0.1059
Epoch [10/10], Loss: 0.1033
Training complete!


In [16]:
torch.save(model.state_dict(), 'semantic_segmentation_model.pt')
print('Model saved!')

Model saved!


In [17]:
# Validation loop
num_epochs = 10

for epoch in range(num_epochs):
    model.eval()
    correct_pixels = 0
    total_pixels = 0

    with torch.no_grad():
        for images, masks in val_loader:
            masks = torch.clamp(masks, 0, num_classes-1)
            images, masks = images.float().to(device), masks.long().to(device)

            outputs = model(images)  # Model prediction (logits)
            predictions = torch.argmax(outputs, dim=1)  # Get class with highest probability

            correct_pixels += (predictions == masks).sum().item()
            total_pixels += masks.numel()  # Total number of pixels

accuracy = correct_pixels / total_pixels * 100  # Convert to percentage
print(f"Final Accuracy: {accuracy:.2f}%")

Final Accuracy: 97.84%
