In [None]:
!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar

!tar -xf VOCtrainval_11-May-2012.tar -C /content/


--2024-12-13 16:33:47--  http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
Resolving host.robots.ox.ac.uk (host.robots.ox.ac.uk)... 129.67.94.152
Connecting to host.robots.ox.ac.uk (host.robots.ox.ac.uk)|129.67.94.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1999639040 (1.9G) [application/x-tar]
Saving to: ‘VOCtrainval_11-May-2012.tar’


2024-12-13 16:35:22 (20.2 MB/s) - ‘VOCtrainval_11-May-2012.tar’ saved [1999639040/1999639040]



In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import albumentations as A
from albumentations.pytorch import ToTensorV2
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Class and colormap definitions for PASCAL VOC....
VOC_CLASSES = [
    "background",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "potted plant",
    "sheep",
    "sofa",
    "train",
    "tv/monitor",
]

# One [R, G, B] triple per class; the position of a triple in this list is the
# class index that VOCDataset assigns to pixels of that colour.
VOC_COLORMAP = [
    [0, 0, 0],
    [128, 0, 0],
    [0, 128, 0],
    [128, 128, 0],
    [0, 0, 128],
    [128, 0, 128],
    [0, 128, 128],
    [128, 128, 128],
    [64, 0, 0],
    [192, 0, 0],
    [64, 128, 0],
    [192, 128, 0],
    [64, 0, 128],
    [192, 0, 128],
    [64, 128, 128],
    [192, 128, 128],
    [0, 64, 0],
    [128, 64, 0],
    [0, 192, 0],
    [128, 192, 0],
    [0, 64, 128],
]

class VOCDataset(Dataset):
    """PASCAL VOC 2012 semantic-segmentation dataset.

    Image ids are read from ``ImageSets/Segmentation/train.txt`` or ``val.txt``
    under ``root``. Images are loaded from ``JPEGImages`` and colour-coded
    masks from ``SegmentationClass``.

    Args:
        root: Path to the ``VOC2012`` directory.
        is_train: Read ``train.txt`` when True, ``val.txt`` otherwise.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            that returns a mapping with ``"image"`` and ``"mask"`` entries.
        classes: Class names; only stored on the instance.
    """

    def __init__(self, root="/content/VOCdevkit/VOC2012", is_train=True, transform=None, classes=None):
        split_name = "train.txt" if is_train else "val.txt"
        split_file = os.path.join(root, "ImageSets", "Segmentation", split_name)

        with open(split_file, 'r') as rf:
            # Strip whitespace and drop blank lines so that a stray empty line
            # never turns into an empty image id (which would later resolve to
            # the non-existent files ".jpg" / ".png").
            img_names = [line.strip() for line in rf if line.strip()]

        self.classes = classes
        self.transform = transform
        self.img_names = img_names
        self.root = root

    def __len__(self):
        """Return the number of image ids listed in the split file."""
        return len(self.img_names)

    @staticmethod
    def _read_rgb(path):
        """Read ``path`` with OpenCV and return the pixels in RGB order.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file (``cv2.imread``
                returns ``None`` instead of raising).
        """
        pixels = cv2.imread(path)
        if pixels is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        return cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)

    def _convert_to_segmentation_mask(self, mask):
        """One-hot encode an RGB mask against ``VOC_COLORMAP``.

        Args:
            mask: Array of shape (H, W, 3) holding RGB colours.

        Returns:
            Float array of shape (H, W, len(VOC_COLORMAP)); channel ``i`` is 1.0
            where the pixel colour equals ``VOC_COLORMAP[i]`` and 0.0 elsewhere.
            Pixels whose colour is not in the colormap are zero in every channel.
        """
        height, width = mask.shape[:2]
        segmentation_mask = np.zeros((height, width, len(VOC_COLORMAP)))

        for label_index, label in enumerate(VOC_COLORMAP):
            segmentation_mask[:, :, label_index] = np.all(mask == label, axis=-1).astype(float)

        return segmentation_mask

    def __getitem__(self, item):
        """Load sample ``item``.

        Returns:
            ``(img, mask)``. Without a transform, ``img`` is the RGB image array
            and ``mask`` the one-hot array from ``_convert_to_segmentation_mask``.
            With a transform, ``img`` is the transformed image and ``mask`` is
            reduced to a map of class indices.
        """
        img_name = self.img_names[item]
        img = self._read_rgb(os.path.join(self.root, "JPEGImages", img_name + ".jpg"))
        mask = self._read_rgb(os.path.join(self.root, "SegmentationClass", img_name + ".png"))

        # Convert RGB mask to segmentation mask
        mask = self._convert_to_segmentation_mask(mask)

        if self.transform:
            augmented = self.transform(image=img, mask=mask)
            img = augmented['image']
            mask = augmented['mask']
            # Collapse the one-hot channel axis into class indices. All-zero
            # pixels (colour not in VOC_COLORMAP) end up as class 0.
            if isinstance(mask, torch.Tensor):
                mask = mask.argmax(dim=2).squeeze()
            else:
                # A pipeline without a tensor-conversion step returns NumPy
                # arrays, whose argmax takes `axis` rather than `dim`.
                mask = np.argmax(mask, axis=2).squeeze()

        return img, mask


  check_for_updates()


In [None]:

# Training pipeline: resize, random horizontal flip, normalise, convert to tensor.
# The mean/std values are the commonly used ImageNet channel statistics.
transform = A.Compose([
    A.Resize(224, 224),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

# Validation pipeline: identical preprocessing but without the random flip, so
# validation metrics are deterministic and comparable between epochs.
val_transform = A.Compose([
    A.Resize(224, 224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

train_dataset = VOCDataset(root='/content/VOCdevkit/VOC2012', is_train=True, transform=transform, classes=VOC_CLASSES)
val_dataset = VOCDataset(root='/content/VOCdevkit/VOC2012', is_train=False, transform=val_transform, classes=VOC_CLASSES)

# Only the training loader shuffles; validation keeps the split-file order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")


Train dataset size: 1464
Validation dataset size: 1449


#### Classification

In [None]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
import os
import cv2
import xml.etree.ElementTree as ET

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Classification Dataset..
class ClassificationDataset(Dataset):
    """Single-label image-classification dataset built from VOC XML annotations.

    Every ``.xml`` file in ``<root>/Annotations`` contributes at most one
    sample: the image named by its ``<filename>`` element, labelled with the
    class of the FIRST ``<object>`` in the file. Files whose first object is
    missing, unnamed, or not in ``classes`` are skipped.

    Args:
        root: Directory containing ``JPEGImages`` and ``Annotations``.
        classes: Sequence of class names; a label is the position of the name
            in this sequence.
        transform: Optional callable invoked as ``transform(image=...)`` that
            returns a mapping with an ``"image"`` entry.

    Raises:
        TypeError: if ``classes`` is None.
    """

    def __init__(self, root="./VOCdevkit/VOC2012", classes=None, transform=None):
        if classes is None:
            # Fail early with a clear message instead of a late
            # "argument of type 'NoneType' is not iterable" inside the loop.
            raise TypeError("classes must be a sequence of class names, got None")

        self.image_dir = os.path.join(root, "JPEGImages")
        self.annotations_dir = os.path.join(root, "Annotations")
        self.image_paths = []
        self.labels = []
        self.classes = classes
        self.transform = transform

        # Name -> index lookup. setdefault keeps the first position of a
        # repeated name, matching what list.index would return.
        class_to_index = {}
        for position, class_name in enumerate(classes):
            class_to_index.setdefault(class_name, position)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order reproducible between runs and machines.
        for xml_file in sorted(os.listdir(self.annotations_dir)):
            if not xml_file.lower().endswith(".xml"):
                continue  # anything else in the directory is not an annotation

            annotation = ET.parse(os.path.join(self.annotations_dir, xml_file)).getroot()

            filename = annotation.findtext("filename")
            first_object = annotation.find("object")
            label_name = first_object.findtext("name") if first_object is not None else None
            if filename is None or label_name is None:
                continue  # incomplete annotation: nothing to label

            label = class_to_index.get(label_name)
            if label is not None:
                self.image_paths.append(os.path.join(self.image_dir, filename))
                self.labels.append(label)

    def __len__(self):
        """Return the number of samples that were kept."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``; the image is RGB.

        Raises:
            FileNotFoundError: if OpenCV cannot read the image file.
        """
        path = self.image_paths[idx]
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Could not read image file: {path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[idx]
        if self.transform:
            img = self.transform(image=img)["image"]
        return img, label

# Preprocessing for classification: resize, normalise, convert to tensor.
transform = A.Compose([
    A.Resize(224, 224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])

# Object names as they are spelled in the annotation XML files (no background class).
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow",
    "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

# Build the dataset once and split it. Constructing it twice from the same
# directory would make the "validation" set identical to the training set, so
# validation accuracy would only measure memorisation.
full_dataset = ClassificationDataset(root="./VOCdevkit/VOC2012", classes=VOC_CLASSES, transform=transform)

VAL_FRACTION = 0.2  # share of samples held out for validation
SPLIT_SEED = 42     # fixed seed so the split is the same on every run

val_size = int(len(full_dataset) * VAL_FRACTION)
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)


# CNN-based Classification Model...
class CNNClassifier(nn.Module):
    """Small convolutional classifier: three conv/ReLU/max-pool stages and a two-layer head.

    Each stage halves the spatial size, and the first linear layer expects
    256 * 28 * 28 flattened features (what a 224x224 input yields after three
    halvings).

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()

        # Conv -> ReLU -> MaxPool for each stage width, chaining channel counts.
        stage_layers = []
        channels_in = 3
        for channels_out in (64, 128, 256):
            stage_layers.append(nn.Conv2d(channels_in, channels_out, kernel_size=3, stride=1, padding=1))
            stage_layers.append(nn.ReLU())
            stage_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            channels_in = channels_out
        self.features = nn.Sequential(*stage_layers)

        flat_features = 256 * 28 * 28
        hidden_units = 1024
        self.classifier = nn.Sequential(
            nn.Linear(flat_features, hidden_units),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_units, num_classes),
        )

    def forward(self, x):
        """Map a batch of images (N, 3, H, W) to class logits (N, num_classes)."""
        feature_maps = self.features(x)
        # Keep the batch axis and flatten everything else for the linear head.
        flattened = feature_maps.flatten(start_dim=1)
        return self.classifier(flattened)

# Build the classifier with one output logit per entry of VOC_CLASSES and move
# its parameters to the selected device.
cnn_model = CNNClassifier(num_classes=len(VOC_CLASSES)).to(device)

# Adam with a fixed learning rate of 1e-4; cross-entropy between the logits and
# the integer class labels.
optimizer = Adam(cnn_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def train_model(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network mapping a batch of images to class logits.
        loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses averaged over the number of
        batches, and the percentage of correctly classified samples.
    """
    model.train()
    loss_sum = 0
    hits = 0
    seen = 0

    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        predicted = logits.argmax(dim=1)
        hits += predicted.eq(batch_labels).sum().item()
        seen += batch_labels.size(0)

    accuracy = (hits / seen) * 100
    mean_loss = loss_sum / len(loader)
    return mean_loss, accuracy

def validate_model(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without updating any parameters.

    Args:
        model: Network mapping a batch of images to class logits.
        loader: Iterable of ``(images, labels)`` batches; must support ``len``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses averaged over the number of
        batches, and the percentage of correctly classified samples.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    seen = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_labels).item()

            predicted = logits.argmax(dim=1)
            hits += predicted.eq(batch_labels).sum().item()
            seen += batch_labels.size(0)

    accuracy = (hits / seen) * 100
    mean_loss = loss_sum / len(loader)
    return mean_loss, accuracy

# Train for a fixed number of epochs; after each epoch evaluate on val_loader
# and print the loss/accuracy of both passes.
num_epochs = 8
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_model(cnn_model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = validate_model(cnn_model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

# Persist only the learned parameters (state_dict), not the whole module object.
torch.save(cnn_model.state_dict(), "cnn_classification_model.pth")
print("Model saved successfully!")


Epoch 1/8, Train Loss: 2.1581, Train Accuracy: 40.90%, Val Loss: 1.8314, Val Accuracy: 45.19%
Epoch 2/8, Train Loss: 1.8049, Train Accuracy: 46.83%, Val Loss: 1.4099, Val Accuracy: 54.12%
Epoch 3/8, Train Loss: 1.3831, Train Accuracy: 57.17%, Val Loss: 0.8492, Val Accuracy: 79.96%
Epoch 4/8, Train Loss: 0.7894, Train Accuracy: 74.76%, Val Loss: 0.2439, Val Accuracy: 95.16%
Epoch 5/8, Train Loss: 0.3287, Train Accuracy: 89.87%, Val Loss: 0.0726, Val Accuracy: 99.31%
Epoch 6/8, Train Loss: 0.1600, Train Accuracy: 95.15%, Val Loss: 0.0281, Val Accuracy: 99.70%
Epoch 7/8, Train Loss: 0.1066, Train Accuracy: 96.85%, Val Loss: 0.0178, Val Accuracy: 99.84%
Epoch 8/8, Train Loss: 0.0827, Train Accuracy: 97.65%, Val Loss: 0.0092, Val Accuracy: 99.87%
Model saved successfully!


#### Segmentation

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset
import albumentations as A
from albumentations.pytorch import ToTensorV2
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One [R, G, B] triple per class; the row number is the class index.
VOC_COLORMAP = [
    [0, 0, 0],
    [128, 0, 0],
    [0, 128, 0],
    [128, 128, 0],
    [0, 0, 128],
    [128, 0, 128],
    [0, 128, 128],
    [128, 128, 128],
    [64, 0, 0],
    [192, 0, 0],
    [64, 128, 0],
    [192, 128, 0],
    [64, 0, 128],
    [192, 0, 128],
    [64, 128, 128],
    [192, 128, 128],
    [0, 64, 0],
    [128, 64, 0],
    [0, 192, 0],
    [128, 192, 0],
    [0, 64, 128],
]

# Reverse lookup: colour tuple -> its row number in VOC_COLORMAP.
VOC_COLORMAP_TO_INDEX = dict(zip(map(tuple, VOC_COLORMAP), range(len(VOC_COLORMAP))))

class SegmentationDataset(Dataset):
    """Image / class-index-mask pairs for semantic segmentation.

    Pairs every file in ``<root>/JPEGImages`` with the ``.png`` of the same
    stem in ``<root>/SegmentationClass``; images without a mask are skipped.

    Args:
        root: Directory containing ``JPEGImages`` and ``SegmentationClass``.
        transform: Optional callable invoked as ``transform(image=..., mask=...)``
            that returns a mapping with ``"image"`` and ``"mask"`` entries.
    """

    def __init__(self, root="./VOCdevkit/VOC2012", transform=None):
        self.image_dir = os.path.join(root, "JPEGImages")
        self.mask_dir = os.path.join(root, "SegmentationClass")
        self.image_paths = []
        self.mask_paths = []
        self.transform = transform

        # os.listdir returns entries in arbitrary order; sorting makes the
        # sample order reproducible between runs and machines.
        for img_name in sorted(os.listdir(self.image_dir)):
            stem, _ = os.path.splitext(img_name)
            mask_path = os.path.join(self.mask_dir, stem + ".png")
            if os.path.exists(mask_path):
                self.image_paths.append(os.path.join(self.image_dir, img_name))
                self.mask_paths.append(mask_path)

    def __len__(self):
        """Return the number of image/mask pairs found."""
        return len(self.image_paths)

    @staticmethod
    def _read_rgb(path):
        """Read ``path`` with OpenCV and return the pixels in RGB order.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file (``cv2.imread``
                returns ``None`` instead of raising).
        """
        pixels = cv2.imread(path)
        if pixels is None:
            raise FileNotFoundError(f"Could not read image file: {path}")
        return cv2.cvtColor(pixels, cv2.COLOR_BGR2RGB)

    def _rgb_to_class_index(self, mask):
        """Convert RGB mask to class indices.

        Args:
            mask: Array of shape (H, W, 3) holding RGB colours.

        Returns:
            int64 array of shape (H, W). Pixels whose colour is not a key of
            ``VOC_COLORMAP_TO_INDEX`` keep the initial value 0.
        """
        h, w, _ = mask.shape
        class_indices = np.zeros((h, w), dtype=np.int64)
        for rgb, idx in VOC_COLORMAP_TO_INDEX.items():
            class_indices[(mask == rgb).all(axis=2)] = idx
        return class_indices

    def __getitem__(self, idx):
        """Return ``(image, mask)`` for sample ``idx``; the mask is a long tensor of class indices."""
        img = self._read_rgb(self.image_paths[idx])
        # OpenCV decodes colour images as BGR, while VOC_COLORMAP lists RGB
        # triples. The mask must be converted to RGB before the lookup;
        # otherwise colours with unequal red and blue components match the
        # wrong class or no class at all.
        mask = self._read_rgb(self.mask_paths[idx])

        mask = self._rgb_to_class_index(mask)

        if self.transform:
            augmented = self.transform(image=img, mask=mask)
            img = augmented["image"]
            mask = augmented["mask"]

        # as_tensor accepts both NumPy arrays and tensors without the
        # copy-construct UserWarning that torch.tensor(tensor) emits.
        return img, torch.as_tensor(mask, dtype=torch.long)

# Preprocessing for segmentation: resize, normalise, convert to tensor.
transform = A.Compose([
    A.Resize(224, 224),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2(),
])

# Build the dataset once and split it. Constructing it twice from the same
# directory would make the "validation" set identical to the training set.
full_dataset = SegmentationDataset(root="./VOCdevkit/VOC2012", transform=transform)

VAL_FRACTION = 0.2  # share of samples held out for validation
SPLIT_SEED = 42     # fixed seed so the split is the same on every run

val_size = int(len(full_dataset) * VAL_FRACTION)
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

# U-Net-like Encoder-Decoder Architecture...
class UNet(nn.Module):
    """U-Net-style encoder/decoder with three down/up-sampling stages and skip connections.

    Args:
        num_classes: Number of output channels (one logit map per class).
        output_size: Spatial size ``(height, width)`` the logits are resized
            to. The default ``(224, 224)`` keeps the original fixed-size
            behaviour; pass ``None`` to resize to the spatial size of the input.

    Note:
        The three 2x poolings are undone by three 2x transposed convolutions
        and the results are concatenated with the encoder features, so the
        input height and width must be divisible by 8 for the shapes to match.
    """

    def __init__(self, num_classes, output_size=(224, 224)):
        super().__init__()
        self.output_size = output_size

        self.encoder1 = self._conv_block(3, 64)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.encoder2 = self._conv_block(64, 128)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.encoder3 = self._conv_block(128, 256)
        self.pool3 = nn.MaxPool2d(2, 2)

        self.bottleneck = self._conv_block(256, 512)

        # Each decoder block takes twice the up-convolution's channels because
        # the matching encoder output is concatenated onto it.
        self.upconv3 = nn.ConvTranspose2d(512, 256, kernel_size=2, stride=2)
        self.decoder3 = self._conv_block(512, 256)

        self.upconv2 = nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2)
        self.decoder2 = self._conv_block(256, 128)

        self.upconv1 = nn.ConvTranspose2d(128, 64, kernel_size=2, stride=2)
        self.decoder1 = self._conv_block(128, 64)

        # 1x1 convolution producing one logit map per class.
        self.final_conv = nn.Conv2d(64, num_classes, kernel_size=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Two 3x3 convolutions (padding 1, spatial size preserved), each followed by ReLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Map images (N, 3, H, W) to logits (N, num_classes, out_h, out_w)."""
        enc1 = self.encoder1(x)
        enc2 = self.encoder2(self.pool1(enc1))
        enc3 = self.encoder3(self.pool2(enc2))
        bottleneck = self.bottleneck(self.pool3(enc3))

        dec3 = self.decoder3(torch.cat((self.upconv3(bottleneck), enc3), dim=1))
        dec2 = self.decoder2(torch.cat((self.upconv2(dec3), enc2), dim=1))
        dec1 = self.decoder1(torch.cat((self.upconv1(dec2), enc1), dim=1))

        logits = self.final_conv(dec1)
        target_size = self.output_size if self.output_size is not None else tuple(x.shape[-2:])
        # align_corners=False is what bilinear mode uses when the argument is
        # omitted; spelling it out keeps the numerics and makes the choice visible.
        return F.interpolate(logits, size=target_size, mode="bilinear", align_corners=False)

# 21 output channels: one per row of VOC_COLORMAP.
num_classes = 21
model = UNet(num_classes).to(device)

# Adam with a fixed learning rate of 1e-4; per-pixel cross-entropy between the
# logit maps and the class-index masks.
optimizer = Adam(model.parameters(), lr=1e-4)
criterion = CrossEntropyLoss()

def train_segmentation(model, loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: Network mapping a batch of images to per-pixel class logits.
        loader: Iterable of ``(images, masks)`` batches; must support ``len``.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(outputs, masks)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses averaged over the number of
        batches, and the percentage of pixels whose predicted class equals the
        mask value.
    """
    model.train()
    loss_sum = 0
    pixel_hits = 0
    pixel_count = 0

    for batch_images, batch_masks in loader:
        batch_images = batch_images.to(device)
        batch_masks = batch_masks.to(device)

        optimizer.zero_grad()
        logits = model(batch_images)
        batch_loss = criterion(logits, batch_masks)
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        # The class axis is dim 1, so argmax there yields one label per pixel.
        predicted = logits.argmax(dim=1)
        pixel_hits += predicted.eq(batch_masks).sum().item()
        pixel_count += batch_masks.numel()

    accuracy = (pixel_hits / pixel_count) * 100
    mean_loss = loss_sum / len(loader)
    return mean_loss, accuracy

def validate_segmentation(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without updating any parameters.

    Args:
        model: Network mapping a batch of images to per-pixel class logits.
        loader: Iterable of ``(images, masks)`` batches; must support ``len``.
        criterion: Loss called as ``criterion(outputs, masks)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses averaged over the number of
        batches, and the percentage of pixels whose predicted class equals the
        mask value.
    """
    model.eval()
    loss_sum = 0
    pixel_hits = 0
    pixel_count = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch_images, batch_masks in loader:
            batch_images = batch_images.to(device)
            batch_masks = batch_masks.to(device)

            logits = model(batch_images)
            loss_sum += criterion(logits, batch_masks).item()

            # The class axis is dim 1, so argmax there yields one label per pixel.
            predicted = logits.argmax(dim=1)
            pixel_hits += predicted.eq(batch_masks).sum().item()
            pixel_count += batch_masks.numel()

    accuracy = (pixel_hits / pixel_count) * 100
    mean_loss = loss_sum / len(loader)
    return mean_loss, accuracy

# Train for a fixed number of epochs; after each epoch evaluate on val_loader
# and print the loss and pixel accuracy of both passes.
num_epochs = 8
for epoch in range(num_epochs):
    train_loss, train_accuracy = train_segmentation(model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = validate_segmentation(model, val_loader, criterion, device)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.2f}%, Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.2f}%")

# Save only the learned parameters (state_dict), not the whole module object.
torch.save(model.state_dict(), "cnn_segmentation_model.pth")
print("Model saved successfully!")


  return img, torch.tensor(mask, dtype=torch.long)


Epoch 1/8, Train Loss: 0.8117, Train Acc: 86.86%, Val Loss: 0.5661, Val Acc: 90.39%
Epoch 2/8, Train Loss: 0.5379, Train Acc: 90.39%, Val Loss: 0.5289, Val Acc: 90.39%
Epoch 3/8, Train Loss: 0.5250, Train Acc: 90.39%, Val Loss: 0.5173, Val Acc: 90.39%
Epoch 4/8, Train Loss: 0.5186, Train Acc: 90.39%, Val Loss: 0.5102, Val Acc: 90.39%
Epoch 5/8, Train Loss: 0.5141, Train Acc: 90.39%, Val Loss: 0.5054, Val Acc: 90.39%


KeyboardInterrupt: 