# Semantic Segmentation Using DeepLab

In [1]:
# loading libraries
import numpy as np
import os
import pandas as pd 
import matplotlib.pyplot as plt 
import datetime


import itertools
import torch
from PIL import Image
from sklearn.metrics import confusion_matrix

import torchvision
import warnings
from tqdm import tqdm
import torchvision.models as models
import torch.nn as nn  # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
from torch.optim import lr_scheduler 
import torch.nn.functional as F  # All functions that don't have any parameters
from torch.utils.data import DataLoader  # Gives easier dataset managment and creates mini batches
import torchvision.datasets as datasets  # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms  # Transformations we can perform on our dataset

torch.set_printoptions(linewidth=120)
torch.set_grad_enabled(True)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f"using device: {device}")

using device: cuda:0


In [2]:
from torchvision.models import vgg16

In [3]:
from torchsummary import summary

In [4]:
return_layers = {'layer4': 'out'}
return_layers['layer3'] = 'aux'

In [5]:
return_layers

{'layer4': 'out', 'layer3': 'aux'}

In [6]:
# from pynvml import *
# nvmlInit()
# h = nvmlDeviceGetHandleByIndex(0)
# info = nvmlDeviceGetMemoryInfo(h)
# print(f'total    : {info.total}')
# print(f'free     : {info.free}')
# print(f'used     : {info.used}')

In [7]:
t = torch.cuda.get_device_properties(0).total_memory/(2**30)
r = torch.cuda.memory_reserved(0)/(2**30)
a = torch.cuda.memory_allocated(0)/(2**30)
f = r-a  # free inside reserved
print(t)
print(r)
print(a)
print(f)

4.0
0.0
0.0
0.0


In [8]:
torch.cuda.empty_cache()

# Data Loading

In [9]:
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
im_size = 56*2

x_transform =  transforms.Compose([#transforms.ToPILImage(),
                    transforms.Resize(im_size),
                    transforms.CenterCrop(im_size),
                    transforms.ToTensor(), 
                    transforms.Normalize(mean = [0.485, 0.456, 0.406], std = [0.229, 0.224, 0.225])])

y_transform =  transforms.Compose([#transforms.ToPILImage(),
                    transforms.Resize(im_size),
                    transforms.CenterCrop(im_size),
                    transforms.ToTensor()
                    ])

In [10]:
batch_size = 8

In [11]:
train_dataset = datasets.VOCSegmentation(root="D:\Desktop\Folder\Jupyter\data", year='2012', image_set='train', 
                                             download=False, transform=x_transform, target_transform=y_transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

test_dataset = datasets.VOCSegmentation(root="D:\Desktop\Folder\Jupyter\data", year='2012', image_set='val', 
                                             download=False, transform=x_transform, target_transform=y_transform)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Model

In [12]:
# model = models.segmentation.deeplabv3_resnet101(pretrained=True)
model = models.segmentation.fcn_resnet101(pretrained=True)

Downloading: "https://download.pytorch.org/models/fcn_resnet101_coco-7ecb50ca.pth" to C:\Users\20191039/.cache\torch\hub\checkpoints\fcn_resnet101_coco-7ecb50ca.pth


HBox(children=(IntProgress(value=0, max=217800805), HTML(value='')))




In [55]:
class Fcn8(torch.nn.Module):
    def __init__(self, num_classes = 21):
        super(Fcn8, self).__init__()
        features = list(vgg16(pretrained = True).features)
        self.features = nn.ModuleList(features).eval()
        self.vgg16 = vgg16(pretrained = True).features
        
        self.fc6 = nn.Conv2d(512, 4096, 7)
        self.relu6 = nn.ReLU(inplace=True)
        self.drop6 = nn.Dropout2d()

        # fc7
        self.fc7 = nn.Conv2d(4096, 4096, 1)
        self.relu7 = nn.ReLU(inplace=True)
        self.drop7 = nn.Dropout2d()

        self.score_fr = nn.Conv2d(4096, num_classes, 1)
        self.score_pool3 = nn.Conv2d(256, num_classes, 1)
        self.score_pool4 = nn.Conv2d(512, num_classes, 1)
        
        self.upscore2 = nn.ConvTranspose2d(
            num_classes, num_classes, 4, stride=2, bias=False)
        self.upscore8 = nn.ConvTranspose2d(
            num_classes, num_classes, 16, stride=8, bias=False)
        self.upscore_pool4 = nn.ConvTranspose2d(
            num_classes, num_classes, 4, stride=2, bias=False)
        
    def forward(self, x):
        pool_layers = []
        h = x
        for ii, layer in enumerate(self.features):
            h = layer(h)
            if ii in {16,23,30}: #the 16th layer is pool3, the 23rd layer is pool4
                pool_layers.append(h)
        
        pool3 = pool_layers[0]
        pool4 = pool_layers[1]
        pool5 = pool_layers[2]
        
        
        h = self.relu6(self.fc6(h))
        h = self.drop6(h)

        h = self.relu7(self.fc7(h))
        fc7_response = self.drop7(h)

        h = self.score_fr(fc7_response)
        h = self.upscore2(h)
        upscore2 = h  # 1/16

        h = self.score_pool4(pool4*0.01)  
        h = h[:, :, 5:5 + upscore2.size()[2], 5:5 + upscore2.size()[3]]
        score_pool4c = h  # 1/16

        h = upscore2 + score_pool4c  # 1/16
        h = self.upscore_pool4(h)
        upscore_pool4 = h  # 1/8

        h = self.score_pool3(pool3*0.0001)  
        h = h[:, :,
              9:9 + upscore_pool4.size()[2],
              9:9 + upscore_pool4.size()[3]]
        score_pool3c = h  # 1/8

        h = upscore_pool4 + score_pool3c  # 1/8

        h = self.upscore8(h)
        h = h[:, :, 31:31 + x.size()[2], 31:31 + x.size()[3]].contiguous()

In [56]:
fcn8 = Fcn8()

In [57]:
model = fcn8

In [33]:
summary(vgg16())

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       1,792
|    └─ReLU: 2-2                         --
|    └─Conv2d: 2-3                       36,928
|    └─ReLU: 2-4                         --
|    └─MaxPool2d: 2-5                    --
|    └─Conv2d: 2-6                       73,856
|    └─ReLU: 2-7                         --
|    └─Conv2d: 2-8                       147,584
|    └─ReLU: 2-9                         --
|    └─MaxPool2d: 2-10                   --
|    └─Conv2d: 2-11                      295,168
|    └─ReLU: 2-12                        --
|    └─Conv2d: 2-13                      590,080
|    └─ReLU: 2-14                        --
|    └─Conv2d: 2-15                      590,080
|    └─ReLU: 2-16                        --
|    └─MaxPool2d: 2-17                   --
|    └─Conv2d: 2-18                      1,180,160
|    └─ReLU: 2-19                        --
|    └─Conv2d: 2-20              

Layer (type:depth-idx)                   Param #
├─Sequential: 1-1                        --
|    └─Conv2d: 2-1                       1,792
|    └─ReLU: 2-2                         --
|    └─Conv2d: 2-3                       36,928
|    └─ReLU: 2-4                         --
|    └─MaxPool2d: 2-5                    --
|    └─Conv2d: 2-6                       73,856
|    └─ReLU: 2-7                         --
|    └─Conv2d: 2-8                       147,584
|    └─ReLU: 2-9                         --
|    └─MaxPool2d: 2-10                   --
|    └─Conv2d: 2-11                      295,168
|    └─ReLU: 2-12                        --
|    └─Conv2d: 2-13                      590,080
|    └─ReLU: 2-14                        --
|    └─Conv2d: 2-15                      590,080
|    └─ReLU: 2-16                        --
|    └─MaxPool2d: 2-17                   --
|    └─Conv2d: 2-18                      1,180,160
|    └─ReLU: 2-19                        --
|    └─Conv2d: 2-20              

In [32]:
summary(vgg16(pretrained = True).features, (3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
├─Conv2d: 1-1                            [-1, 64, 224, 224]        1,792
├─ReLU: 1-2                              [-1, 64, 224, 224]        --
├─Conv2d: 1-3                            [-1, 64, 224, 224]        36,928
├─ReLU: 1-4                              [-1, 64, 224, 224]        --
├─MaxPool2d: 1-5                         [-1, 64, 112, 112]        --
├─Conv2d: 1-6                            [-1, 128, 112, 112]       73,856
├─ReLU: 1-7                              [-1, 128, 112, 112]       --
├─Conv2d: 1-8                            [-1, 128, 112, 112]       147,584
├─ReLU: 1-9                              [-1, 128, 112, 112]       --
├─MaxPool2d: 1-10                        [-1, 128, 56, 56]         --
├─Conv2d: 1-11                           [-1, 256, 56, 56]         295,168
├─ReLU: 1-12                             [-1, 256, 56, 56]         --
├─Conv2d: 1-13                           [-1, 256, 56, 56]      

Layer (type:depth-idx)                   Output Shape              Param #
├─Conv2d: 1-1                            [-1, 64, 224, 224]        1,792
├─ReLU: 1-2                              [-1, 64, 224, 224]        --
├─Conv2d: 1-3                            [-1, 64, 224, 224]        36,928
├─ReLU: 1-4                              [-1, 64, 224, 224]        --
├─MaxPool2d: 1-5                         [-1, 64, 112, 112]        --
├─Conv2d: 1-6                            [-1, 128, 112, 112]       73,856
├─ReLU: 1-7                              [-1, 128, 112, 112]       --
├─Conv2d: 1-8                            [-1, 128, 112, 112]       147,584
├─ReLU: 1-9                              [-1, 128, 112, 112]       --
├─MaxPool2d: 1-10                        [-1, 128, 56, 56]         --
├─Conv2d: 1-11                           [-1, 256, 56, 56]         295,168
├─ReLU: 1-12                             [-1, 256, 56, 56]         --
├─Conv2d: 1-13                           [-1, 256, 56, 56]      

In [24]:
summary(model, (3, 224, 224))

Layer (type:depth-idx)                   Output Shape              Param #
├─ModuleList: 1                          []                        --
|    └─Conv2d: 2-1                       [-1, 64, 224, 224]        1,792
|    └─ReLU: 2-2                         [-1, 64, 224, 224]        --
|    └─Conv2d: 2-3                       [-1, 64, 224, 224]        36,928
|    └─ReLU: 2-4                         [-1, 64, 224, 224]        --
|    └─MaxPool2d: 2-5                    [-1, 64, 112, 112]        --
|    └─Conv2d: 2-6                       [-1, 128, 112, 112]       73,856
|    └─ReLU: 2-7                         [-1, 128, 112, 112]       --
|    └─Conv2d: 2-8                       [-1, 128, 112, 112]       147,584
|    └─ReLU: 2-9                         [-1, 128, 112, 112]       --
|    └─MaxPool2d: 2-10                   [-1, 128, 56, 56]         --
|    └─Conv2d: 2-11                      [-1, 256, 56, 56]         295,168
|    └─ReLU: 2-12                        [-1, 256, 56, 56]      

Layer (type:depth-idx)                   Output Shape              Param #
├─ModuleList: 1                          []                        --
|    └─Conv2d: 2-1                       [-1, 64, 224, 224]        1,792
|    └─ReLU: 2-2                         [-1, 64, 224, 224]        --
|    └─Conv2d: 2-3                       [-1, 64, 224, 224]        36,928
|    └─ReLU: 2-4                         [-1, 64, 224, 224]        --
|    └─MaxPool2d: 2-5                    [-1, 64, 112, 112]        --
|    └─Conv2d: 2-6                       [-1, 128, 112, 112]       73,856
|    └─ReLU: 2-7                         [-1, 128, 112, 112]       --
|    └─Conv2d: 2-8                       [-1, 128, 112, 112]       147,584
|    └─ReLU: 2-9                         [-1, 128, 112, 112]       --
|    └─MaxPool2d: 2-10                   [-1, 128, 56, 56]         --
|    └─Conv2d: 2-11                      [-1, 256, 56, 56]         295,168
|    └─ReLU: 2-12                        [-1, 256, 56, 56]      

In [47]:
criterion = nn.CrossEntropyLoss(ignore_index=255) # ignore (instance) contour
optimizer = optim.SGD(model.parameters(), lr=1e-3, momentum=0.8)

In [48]:
def accuracy(out,labels):
    im = out.argmax(1) # [batch_size x classes x H x W] -> [batch_size x H x W]
    correct = 0
    correct += (out.argmax(1) == labels).sum() # for each pixel check if match

    return 100*correct/(im_size*im_size*batch_size)

In [49]:
class RunningConfusionMatrix():
    def __init__(self, labels, ignore_label=255):
        
        self.labels = labels
        self.ignore_label = ignore_label
        self.overall_confusion_matrix = None
        self.SMOOTH = 1e-6
        self.miou = 0
        self.acc  = 0 
        
    def update_matrix(self, ground_truth, prediction):
        # if (ground_truth == self.ignore_label).all():
        #     return
        
        current_confusion_matrix = confusion_matrix(y_true=ground_truth,
                                                    y_pred=prediction,
                                                    labels=self.labels)
        
        if self.overall_confusion_matrix is not None:
            
            self.overall_confusion_matrix += current_confusion_matrix
        else:
            
            self.overall_confusion_matrix = current_confusion_matrix
    
    def compute_current_mean_intersection_over_union(self):
        
        intersection = np.diag(self.overall_confusion_matrix)
        ground_truth_set = self.overall_confusion_matrix.sum(axis=1)
        predicted_set = self.overall_confusion_matrix.sum(axis=0)
        union =  ground_truth_set + predicted_set - intersection

        intersection_over_union = (intersection + self.SMOOTH) / (union.astype(np.float32) + self.SMOOTH)
        mean_intersection_over_union = np.mean(intersection_over_union)

        pixel_accuracy = np.sum(intersection)/np.sum(self.overall_confusion_matrix)

        self.miou = mean_intersection_over_union
        self.acc  = pixel_accuracy


        return mean_intersection_over_union,pixel_accuracy

In [50]:
y_actu = [2, 0, 2, 2, 0, 1, 1, 2, 2, 0, 1, 2]
y_pred = [0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 2]
conf = confusion_matrix(y_actu, y_pred)
conf

array([[3, 0, 0],
       [0, 1, 2],
       [2, 1, 3]], dtype=int64)

In [51]:
dia = np.diag(conf).sum()
sumall = np.sum(conf)
print(dia)
print(sumall)
print(dia/sumall)

7
12
0.5833333333333334


In [52]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')


    plt.figure(figsize=(12,12))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

## Training

In [58]:
%%time

num_epochs = 1

model.to(device)
model.train()
for epoch in range(num_epochs):
    losses = []
    loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    for batch_idx, (data, targets) in loop:
        data = data.to(device=device)
        if (int(targets.max()) == int(1)): # scale from [0,1] -> [0,255]
            targets = targets*255
        targets = targets.squeeze(1).to(device=device, dtype=torch.int64)
        
        # forward pass
        out = model(data)['out']
        loss = criterion(out, targets)
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent
        optimizer.step()

        # update progress bar
        acc = accuracy(out, targets)
        loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
        loop.set_postfix(loss = loss.item(), acc=f"{acc:.0f}%")


    print(f"Cost at epoch {epoch} is {sum(losses)/len(losses):.5f}")





  0%|                                                                                          | 0/183 [00:00<?, ?it/s][A[A[A[A

RuntimeError: Calculated padded input size per channel: (3 x 3). Kernel size: (7 x 7). Kernel size can't be greater than actual input size

In [None]:
# one epoch with batch_size:16, im_size: 112 was 13 minutes

# Testing

In [None]:
plt.plot(losses)

In [None]:
# Define the helper function
def decode_segmap(image, nc=21):
  
  label_colors = np.array([(0, 0, 0),  # 0=background
               # 1=aeroplane, 2=bicycle, 3=bird, 4=boat, 5=bottle
               (128, 0, 0), (0, 128, 0), (128, 128, 0), (0, 0, 128), (128, 0, 128),
               # 6=bus, 7=car, 8=cat, 9=chair, 10=cow
               (0, 128, 128), (128, 128, 128), (64, 0, 0), (192, 0, 0), (64, 128, 0),
               # 11=dining table, 12=dog, 13=horse, 14=motorbike, 15=person
               (192, 128, 0), (64, 0, 128), (192, 0, 128), (64, 128, 128), (192, 128, 128),
               # 16=potted plant, 17=sheep, 18=sofa, 19=train, 20=tv/monitor
               (0, 64, 0), (128, 64, 0), (0, 192, 0), (128, 192, 0), (0, 64, 128)])

  r = np.zeros_like(image).astype(np.uint8)
  g = np.zeros_like(image).astype(np.uint8)
  b = np.zeros_like(image).astype(np.uint8)
  
  for l in range(0, nc):
    idx = image == l
    r[idx] = label_colors[l, 0]
    g[idx] = label_colors[l, 1]
    b[idx] = label_colors[l, 2]
    
  rgb = np.stack([r, g, b], axis=2)
  return rgb

In [None]:
# confusion matrix
labels = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
cm = RunningConfusionMatrix(labels=labels, ignore_label=255)

In [None]:
def test(model, test_loader, batch_size=4, overlay=False):
    model.eval()

    loop = tqdm(enumerate(test_loader), total=len(test_loader), leave=False)
    for batch_idx, (data, targets) in loop:
        image, target = next(iter(test_loader))
        image, target = image.to(device), target.to(device)
        if (int(target.max()) == int(1)):
            target = target*255
        out = model(image)['out']

        image, target, out = image.cpu(), target.cpu(), out.cpu()
        image, target, out = image.to('cpu'), target.to('cpu'), out.to('cpu')


        cm.update_matrix(target.view(-1), out.argmax(1).view(-1))

        if batch_idx == 0:
            for idx in range(batch_size):
                
                om = torch.argmax(out[idx].squeeze(), dim=0).detach().cpu().numpy()

                input_image  = image[idx].permute(1,2,0) * std[0] + torch.FloatTensor(mean)
                target_image = decode_segmap(target[idx].squeeze(0))
                output_image = decode_segmap(om)

                if (overlay):
                    axes=[]
                    fig=plt.figure(figsize=(6,6))
                    plt.imshow(torch.FloatTensor(output_image) * 0.0033 + input_image * 0.65 )
                    plt.show()
                else:
                    axes=[]
                    fig=plt.figure(figsize=(21,21))
                    axes.append( fig.add_subplot(1, 3, 1) )
                    plt.imshow( input_image  )
                    axes[-1].set_title("input")

                    axes.append( fig.add_subplot(1, 3, 2) )
                    plt.imshow( target_image )
                    axes[-1].set_title("target")

                    axes.append( fig.add_subplot(1, 3, 3) )
                    plt.imshow( output_image )
                    axes[-1].set_title("output")

                    plt.show()
        
    fig.tight_layout()
    model.train()
    print(f"mean intersection over union: {cm.compute_current_mean_intersection_over_union()}")   
    plot_confusion_matrix(cm.overall_confusion_matrix,labels)
    
    


In [None]:
test(model, test_loader, overlay=True )

In [None]:
labels = ["0: background","1: aeroplane", "2: bicycle", "3: bird", "4: boat", "5: bottle", "6: bus", "7: car", "8: cat", "9: chair", "10: cow", "11: dining table", "12: dog", "13: horse", "14: motorbike", "15: person ","16: potted plant", "17: sheep", "18: sofa", "19: train", "20: tv/monitor"]
plot_confusion_matrix(cm.overall_confusion_matrix,labels, normalize=True)

In [None]:
metrics = cm.compute_current_mean_intersection_over_union()
print("the mIoU is: \t\t%.2f"  % metrics[0])
print("the pixel accuracy is: \t%.2f"  % metrics[1])

In [None]:
# palette = torch.tensor([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
# colors = torch.as_tensor([i for i in range(21)])[:, None] * palette
# colors = (colors % 255).numpy().astype("uint8")

# # plot the semantic segmentation predictions of 21 classes in each color
# r = Image.fromarray(out[0].argmax(0).byte().cpu().numpy()).resize(im.size)
# r.putpalette(colors)

# import matplotlib.pyplot as plt
# plt.imshow(r)

In [None]:
labels = ["0: background", "1: aeroplane", "2: bicycle", "3: bird", "4: boat", "5: bottle", "6: bus", "7: car", "8: cat", "9: chair", "10: cow", "11: dining table", "12: dog", "13: horse", "14: motorbike", "15: person ","16: potted plant", "17: sheep", "18: sofa", "19: train", "20: tv/monitor"]

width = 2
height = 2
rows = 6
cols = 7

axes=[]
fig=plt.figure(figsize=(13,13))

for a in range(21):
    # b = np.random.randint(7, size=(height,width))
    axes.append( fig.add_subplot(rows, cols, a+1) )
    subplot_title=(str(labels[a]))
    axes[-1].set_title(subplot_title,fontweight="bold")
    z = np.array([[a, a],[a, a]])
    plt.imshow(decode_segmap(z))
    plt.axis('off')  
# fig.tight_layout()    
plt.show()


In [None]:
# torch.save(model.state_dict(), 'model1.ckpt')