In [80]:
! pip install kaggle



In [81]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [82]:
! cp kaggle.json ~/.kaggle/

In [83]:
! chmod 600 ~/.kaggle/kaggle.json

In [84]:
! kaggle datasets download -d zaraks/pascal-voc-2007

pascal-voc-2007.zip: Skipping, found more recently modified local copy (use --force to force download)


In [76]:
! unzip pascal-voc-2007.zip

Archive:  pascal-voc-2007.zip
replace PASCAL_VOC/PASCAL_VOC/pascal_test2007.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [85]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from torchvision import datasets, transforms
import torch.utils
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [86]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [87]:
transform = transforms.Compose([
    transforms.ToTensor(),  # Convert image to PyTorch tensor
    transforms.Resize(256),  # Resize image to 256x256 (adjust as needed)
    #transforms.Normalize(mean=[0.5,0.5,0.5], std=[0.2,0.2,0.2])
])

In [88]:
class Custom_dataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_dir = os.path.join(root_dir, "JPEGImages")
        self.mask_dir = os.path.join(root_dir, "Annotations")
        self.images = os.listdir(self.image_dir)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_name = os.path.join(self.image_dir, self.images[idx])
        mask_name = os.path.join(self.mask_dir, self.images[idx])  # Assuming mask has same name as image

        image = Image.open(img_name).convert("RGB")
        mask = Image.open(mask_name).convert("L")  # Convert to grayscale

        if self.transform:
            image = self.transform(image)
            mask = self.transform(mask)

        return image, mask


In [89]:
train_dir = "/content/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007"
test_dir = "/content/VOCtest_06-Nov-2007/VOCdevkit/VOC2007"

#using test dataset for validation purposes

In [90]:
#dataset
train_dataset = Custom_dataset(root_dir=train_dir, transform=transform)
test_dataset = Custom_dataset(root_dir=test_dir, transform=transform)

#dataloaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=4, shuffle=False)

In [91]:
class SegNet(nn.Module):
    def __init__(self, num_classes, in_channels=3, pretrained=True):
        super(SegNet, self).__init__()
        vgg_bn = models.vgg16_bn(pretrained=pretrained)
        encoder_layers = list(vgg_bn.features.children())
        if in_channels != 3:
            encoder_layers[0] = nn.Conv2d(in_channels, 64, kernel_size=3, padding=1)

        self.encoder_stages = nn.ModuleList()
        self.decoder_stages = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
        self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2)

        decoder_layers = list(reversed([layer for layer in encoder_layers if not isinstance(layer, nn.MaxPool2d)]))
        decoder_layers[-1] = nn.Conv2d(64, 64, kernel_size=3, padding=1)

        for idx in range(5):
            self.encoder_stages.append(nn.Sequential(*encoder_layers[idx*7: (idx+1)*7]))
            self.decoder_stages.append(nn.Sequential(*decoder_layers[idx*7: (idx+1)*7]))

        self.final_decoder_conv = nn.Conv2d(64, num_classes, kernel_size=3, padding=1)
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        encoder_outputs = []
        for stage in self.encoder_stages:
            x, indices = self.pool(stage(x))
            encoder_outputs.append((x, indices))

        for idx, stage in enumerate(self.decoder_stages):
            x, indices = encoder_outputs.pop()
            x = self.unpool(x, indices=indices, output_size=encoder_outputs[-1][0].shape[-2:]) if encoder_outputs else x
            x = stage(x)

        x = self.final_decoder_conv(x)
        return x


In [92]:
#hyperparameters
num_classes = 2
learning_rate = 0.001
batch_size = 4
num_epochs = 5

In [93]:
import torch.optim as optim
from tqdm import tqdm

model = SegNet(num_classes=num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
#training loop
train_losses = []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct_pixels = 0
    total_pixels = 0

    for images, masks in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False):
        images, masks = images.to(device), masks.to(device)

        outputs = model(images)
        loss = criterion(outputs, masks.squeeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct_pixels += (predicted == masks).sum().item()
        total_pixels += torch.numel(masks)

    train_loss = epoch_loss / len(train_loader)
    train_pa = correct_pixels / total_pixels #pixel accuracy

    #evaluating on test data
    model.eval()
    test_loss = 0.0
    correct_pixels = 0
    total_pixels = 0

    with torch.no_grad():
        for images, masks in test_loader:
            images, masks = images.to(device), masks.to(device)
            outputs = model(images)
            test_loss += criterion(outputs, masks.squeeze(1)).item()

            _, predicted = torch.max(outputs, 1)
            correct_pixels += (predicted == masks).sum().item()
            total_pixels += torch.numel(masks)

    #test_loss /= len(test_loader)
    #test_pa = correct_pixels / total_pixels

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.3f}, Train Pixel Accuracy: {train_pa:.3f}")

    train_losses.append(train_loss)




FileNotFoundError: [Errno 2] No such file or directory: '/content/VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007/Annotations/003783.jpg'

In [None]:
#plotting training loss vs epoch curve
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss vs Epoch')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
#part 2 - computing various segmentation metrics
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

true_labels = []
predicted_labels = []

model.eval()
with torch.no_grad():
    for images, masks in test_loader:
        images, masks = images.to(device), masks.to(device)
        outputs = model(images)

        _, predicted = torch.max(outputs, 1)

        #flatten tensors to compute metrics
        true_labels.extend(masks.cpu().numpy().flatten())
        predicted_labels.extend(predicted.cpu().numpy().flatten())

pixel_accuracy = accuracy_score(true_labels, predicted_labels) #pixel accuracy
conf_matrix = confusion_matrix(true_labels, predicted_labels) #confusion matrix
f1 = f1_score(true_labels, predicted_labels, average='weighted') #F1 score

#iou
iou_per_class = []
for i in range(num_classes):
    intersection = np.logical_and(true_labels == i, predicted_labels == i).sum()
    union = np.logical_or(true_labels == i, predicted_labels == i).sum()
    iou_per_class.append(intersection / union)

mean_iou = np.mean(iou_per_class)

print(f"Pixel Accuracy: {pixel_accuracy:.4f}")
print(f"Mean IoU: {mean_iou:.4f}")
print(f"F1 Score: {f1:.4f}")

In [97]:
#facing issues in finding the masked images for the dataset - they have different names

Part 3 - changing backbone of segnet to lighter model. Therefore, using, simple cnn instead.

In [94]:
class CNN(nn.Module):
    def __init__(self, num_classes, in_channels=3):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        self.upconv1 = nn.ConvTranspose2d(32, 16, kernel_size=2, stride=2)
        self.upconv2 = nn.ConvTranspose2d(16, num_classes, kernel_size=2, stride=2)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)

        x = F.relu(self.upconv1(x))
        x = F.relu(self.upconv2(x))

        return x

In [95]:
class segnet_CNN(nn.Module):
    def __init__(self, num_classes, in_channels=3):
        super(segnet_CNN, self).__init__()
        self.cnn = CNN(num_classes=num_classes, in_channels=in_channels)

        self.encoder_stages = nn.ModuleList()
        self.decoder_stages = nn.ModuleList()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, return_indices=True)
        self.unpool = nn.MaxUnpool2d(kernel_size=2, stride=2)

        for _ in range(5):
            self.encoder_stages.append(nn.Sequential(nn.Conv2d(32, 32, kernel_size=3, padding=1),
                                                      nn.ReLU(inplace=True)))
            self.decoder_stages.append(nn.Sequential(nn.Conv2d(32, 32, kernel_size=3, padding=1),
                                                      nn.ReLU(inplace=True)))

        self.final_decoder_conv = nn.Conv2d(32, num_classes, kernel_size=3, padding=1)
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        encoder_outputs = []
        for stage in self.encoder_stages:
            x, indices = self.pool(stage(x))
            encoder_outputs.append((x, indices))

        for idx, stage in enumerate(self.decoder_stages):
            x, indices = encoder_outputs.pop()
            x = self.unpool(x, indices=indices, output_size=encoder_outputs[-1][0].shape[-2:]) if encoder_outputs else x
            x = stage(x)

        x = self.final_decoder_conv(x)
        return x

In [96]:
numclasses = 2
epochs = 10

model_2 = segnet_CNN(num_classes=numclasses).to(device)

# Define loss function and optimizer
criterion_2 = nn.CrossEntropyLoss()
optimizer_2 = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    correct_pixels = 0
    total_pixels = 0

    for images, masks in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", leave=False):
        images, masks = images.to(device), masks.to(device)

        outputs = model_2(images)
        loss = criterion_2(outputs, masks.squeeze(1))

        optimizer_2.zero_grad()
        loss.backward()
        optimizer_2.step()

        epoch_loss += loss.item()

        _, predicted = torch.max(outputs, 1)
        correct_pixels += (predicted == masks).sum().item()
        total_pixels += torch.numel(masks)

    train_loss = epoch_loss / len(train_loader)
    train_pixel_accuracy = correct_pixels / total_pixels

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss:.3f}, Train Pixel Accuracy: {train_pixel_accuracy:.3f}")


In [None]:
model.eval()
true_labels = []
predicted_labels = []

with torch.no_grad():
    for images, masks in test_loader:
        images, masks = images.to(device), masks.to(device)
        outputs = model(images)

        _, predicted = torch.max(outputs, 1)

        true_labels.extend(masks.cpu().numpy().flatten())
        predicted_labels.extend(predicted.cpu().numpy().flatten())

pixel_accuracy = accuracy_score(true_labels, predicted_labels)
conf_matrix = confusion_matrix(true_labels, predicted_labels)
f1 = f1_score(true_labels, predicted_labels, average='weighted')

print(f"Pixel Accuracy: {pixel_accuracy:.3f}")
print(f"F1 Score: {f1:.3f}")