<a href="https://colab.research.google.com/github/R12942159/DeepLearning/blob/main/DLCV_hw1_p3_VGG16FCN32s.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch


# Get cuda from GPU device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using: {device}")

Using: cuda


In [3]:
import os


train_paths = '/content/drive/MyDrive/NTU_DLCV/p3_data/train'
img_paths_train = sorted([os.path.join(train_paths, i) for i in os.listdir(train_paths) if i.endswith('.jpg')])
mask_paths_train = sorted([os.path.join(train_paths, i) for i in os.listdir(train_paths) if i.endswith('.png')])

In [4]:
import os


val_paths = '/content/drive/MyDrive/NTU_DLCV/p3_data/validation'
img_paths_val = sorted([os.path.join(val_paths, i) for i in os.listdir(val_paths) if i.endswith('.jpg')])
mask_paths_val = sorted([os.path.join(val_paths, i) for i in os.listdir(val_paths) if i.endswith('.png')])

#### Data Processing

In [6]:
import torch
import random
import numpy as np
from PIL import Image
from copy import deepcopy
from torchvision.transforms.functional import hflip, vflip, to_tensor, normalize


class LandDataset(torch.utils.data.Dataset):
    def __init__(self, img_paths, mask_paths, transform, mode, augment=False):
        self.img_paths = img_paths
        self.mask_paths = mask_paths
        self.transform = transform
        self.mode = mode

        # Data Augmentation
        def original(x): return x
        if augment:
            self.augment = [original, hflip, vflip]
        else:
            self.augment = [original]

        # double check the len of img and mask.
        assert len(self.img_paths) == len(self.mask_paths)

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        # get img path
        img_path = self.img_paths[idx]
        # Read img
        img = Image.open(img_path).convert('RGB')
        # Normalize Image with VGG16's mean and std
        # (H, W, C) -> (C, H, W)
        img = self.transform(img)

        if self.mode != 'test':
            # get mask path
            mask_path = self.mask_paths[idx]
            mask = Image.open(mask_path).convert('RGB')
            mask = np.array(mask)
            # Binarize mask from [0~255] to (0 or 1)
            mask = (mask >= 128).astype(int)

            # squeeze [a,b,c] into [x,y] with 7 category(6 classes + 1 background)
            mask = 100 * mask[:, :, 0] + 10 * mask[:, :, 1] + 1 * mask[:, :, 2]
            raw_mask = deepcopy(mask) # Only perform numerical conversion on the most original data
            mask[raw_mask == 11] = 0  # (Cyan: 011) Urban land
            mask[raw_mask == 110] = 1  # (Yellow: 110) Agriculture land
            mask[raw_mask == 101] = 2  # (Purple: 101) Rangeland
            mask[raw_mask == 10] = 3  # (Green: 010) Forest land
            mask[raw_mask == 1] = 4  # (Blue: 001) Water
            mask[raw_mask == 111] = 5  # (White: 111) Barren land
            mask[raw_mask == 0] = 6  # (Black: 000) Unknown
            mask = torch.tensor(mask)

            # random Data Augmentation
            augmentor = random.choice(self.augment)
            img = augmentor(img)
            mask = augmentor(mask)

            # mask = mask.to(torch.float)  # 将目标标签转换为浮点数类型
            return img, mask
        else:
            return img

In [7]:
import torchvision.transforms as tr


# IMG_SIZE : 512*512
BATCH_SIZE = 8

# VGG16_V1, https://pytorch.org/vision/main/models/generated/torchvision.models.vgg16.html
mean=[0.485, 0.456, 0.406]
std=[0.229, 0.224, 0.225]

train_ds = LandDataset(img_paths_train,
                       mask_paths_train,
                       transform = tr.Compose([
                          tr.ToTensor(),
                          tr.Normalize(mean=mean, std=std),
                          ]),
                       mode = 'train',
                       augment = True,)
val_ds = LandDataset(img_paths_val,
                     mask_paths_val,
                     transform = tr.Compose([
                          tr.ToTensor(),
                          tr.Normalize(mean=mean, std=std),
                          ]),
                     mode = 'val',
                     augment = False,)

# num_workers > 0: accelerate loading data by muli-process
train_loader = torch.utils.data.DataLoader(train_ds, BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = torch.utils.data.DataLoader(val_ds, BATCH_SIZE, shuffle=False, num_workers=4)

In [15]:
# https://github.com/zijundeng/pytorch-semantic-segmentation/blob/master/models/fcn32s.py#L17C4-L17C4
from torch import nn
import torchvision
from torchvision import models
from torchvision.models import vgg16


class VGG16FCN32s(nn.Module):
    def __init__(self, n_classes=7) -> None:
        super(VGG16FCN32s, self).__init__()

        pretrained_vgg16 = vgg16(pretrained=True)

        # CRCRM, CRCRM, CRCRCRM, CRCRCRM, CRCRCRM
        self.features = pretrained_vgg16.features
        # Able to capture information near the edge of the image and improve segmentation accuracy
        self.features[0].padding = (100,100)
        classifier = pretrained_vgg16.classifier


        # make Conv(512,4096,7) Conv(4096,4096,1) TransposeCoonv
        self.fc6 = nn.Sequential(
            nn.Conv2d(512, 4096, kernel_size=7),
            nn.ReLU(inplace=True),
            nn.Dropout2d() # 二维数据（通常是图像数据)的dropout
        )

        self.fc6[0].weight.data = classifier[0].weight.data.view(4096, 512, 7, 7)
        self.fc6[0].bias.data = classifier[0].bias.data

        self.fc7 = nn.Sequential(
            nn.Conv2d(4096, 4096, kernel_size=1),
            nn.ReLU(inplace=True),
            nn.Dropout2d()
        )

        self.fc7[0].weight.data = classifier[3].weight.data.view(4096, 4096, 1, 1)
        self.fc7[0].bias.data = classifier[3].bias.data

        self.score_fr = nn.Sequential(
            nn.Conv2d(4096, n_classes, kernel_size=1),
            nn.ReLU(inplace=True)
        )

        self.upscore = nn.Sequential(
            nn.ConvTranspose2d(n_classes, n_classes, kernel_size=64, stride=32)
        )

    def forward(self, x):
        x_size = x.size()
        bottleneck = self.features(x)
        f6 = self.fc6(bottleneck)
        f7 = self.fc7(f6)
        score_fr = self.score_fr(f7)
        upscore = self.upscore(score_fr)
        # crop to original size
        return upscore[:, :, upscore.shape[2]-x_size[2]:, upscore.shape[3]-x_size[3]:]

In [16]:
def mean_iou_score(pred, labels, num_classes=6):
    '''
    Compute mean IoU score over 6 classes
    '''
    mean_iou = []
    # labels = labels.squeeze(1)  # 使其与 pred 张量的形状匹配
    for i in range(num_classes):
        tp_fp = torch.sum(pred == i)
        tp_fn = torch.sum(labels == i)
        tp = torch.sum((pred == i) * (labels == i))
        # avoid nan
        if (tp_fp + tp_fn - tp) == 0:
            iou = 0.0
        else:
            iou = tp / (tp_fp + tp_fn - tp)
        # mean_iou += iou / num_classes
        mean_iou.append(iou)
    #     print('class #%d : %1.5f'%(i, iou))
    # print('\nmean_iou: %f\n' % mean_iou)

    return sum(mean_iou) / len(mean_iou)
    # return mean_iou

In [17]:
# from tqdm import tqdm


# model = my_FCN32s()
# model = model.to(device)
# loss_fn = nn.CrossEntropyLoss() # classification for each pixel
# optimizer = torch.optim.Adam(model.parameters())


# epoch_loss = 0
# epoch_iou = 0
# for batch_i, (x, y) in enumerate(tqdm(train_loader, leave=False)):
#     x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) # move data to GPU

#     optimizer.zero_grad()
#     # Ensure the target labels are of type Long
#     y = y.to(torch.long)
#     pred = model(x)
#     loss = loss_fn(pred, y)
#     loss.backward() # backpropagation to compute gradients
#     optimizer.step() # update model params

#     pred = torch.argmax(pred, dim=1) # 沿着通道维度选择具有最高分数的通道
#     pred = pred.to(torch.float)  # 将目标标签转换为浮点数类型

#     epoch_loss += loss.item() # tensor -> python value
#     epoch_iou += mean_iou_score(pred, y).item()

#     break

In [18]:
from tqdm import tqdm


def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset) # number of samples
    num_batches = len(dataloader) # batches per epoch
    model.train() # to training mode.
    epoch_loss = 0
    epoch_iou = 0
    for batch_i, (x, y) in enumerate(tqdm(dataloader, leave=False)):
        x, y = x.to(device, non_blocking=True), y.to(device, non_blocking=True) # move data to GPU

        optimizer.zero_grad()
        # Ensure the target labels are of type Long
        y = y.to(torch.long)
        pred = model(x)
        loss = loss_fn(pred, y)
        loss.backward() # backpropagation to compute gradients
        optimizer.step() # update model params

        pred = torch.argmax(pred, dim=1) # 沿着通道维度选择具有最高分数的通道
        pred = pred.to(torch.float)  # 将目标标签转换为浮点数类型

        epoch_loss += loss.item() # tensor -> python value
        epoch_iou += mean_iou_score(pred, y).item()

    # return avg loss of epoch, iou of epoch
    return epoch_loss/num_batches, epoch_iou/num_batches


def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset) # number of samples
    num_batches = len(dataloader) # batches per epoch

    model.eval() # model to test mode.
    epoch_loss = 0
    epoch_iou = 0
    # No gradient for test data
    with torch.no_grad():
        for batch_i, (x, y) in enumerate(dataloader):
            x, y = x.to(device), y.to(device)

            y = y.to(torch.long)
            # Compute prediction loss
            pred = model(x)
            loss = loss_fn(pred, y)

            pred = torch.argmax(pred, dim=1) # 沿着通道维度选择具有最高分数的通道
            pred = pred.to(torch.float)  # 将目标标签转换为浮点数类型

            # write to logs
            epoch_loss += loss.item()
            epoch_iou += mean_iou_score(pred, y).item()

    return epoch_loss/num_batches, epoch_iou/num_batches

In [20]:
EPOCHS = 100
logs = {
    'train_loss': [], 'val_loss': [],
    'train_mean_iou': [], 'val_mean_iou': [],
}

model = VGG16FCN32s()
model = model.to(device)

loss_fn = nn.CrossEntropyLoss() # classification for each pixel
optimizer = torch.optim.Adam(model.parameters())

# Earlystopping
patience = 5
counter = 0
best_IOU = 0

for epoch in tqdm(range(EPOCHS)):
    train_loss, train_mean_iou = train(train_loader, model, loss_fn, optimizer)
    val_loss, val_mean_iou = test(val_loader, model, loss_fn)
    logs['train_loss'].append(train_loss)
    logs['val_loss'].append(val_loss)
    logs['train_mean_iou'].append(train_mean_iou)
    logs['val_mean_iou'].append(val_mean_iou)

    print(f'EPOCH: {(epoch+1):04d} train_loss: {train_loss:.4f} val_loss: {val_loss:.4f}, train_mean_iou: {train_mean_iou:.3f}, val_mean_iou: {val_mean_iou:.3f}')

    # On epoch end
    torch.save(model.state_dict(), "VGG16FCN32s_lastmodel.pth")
    # check improvement
    if best_IOU < val_mean_iou:
        counter = 0
        best_IOU = val_mean_iou
        torch.save(model.state_dict(), "VGG16FCN32s_bestIOU.pth")
        print('best_IOU saved!')
    else:
        counter += 1
    if counter >= patience:
        print("Earlystop!")
        break

  0%|          | 0/100 [00:00<?, ?it/s]
  0%|          | 0/250 [00:00<?, ?it/s][A
  0%|          | 1/250 [00:01<04:13,  1.02s/it][A
  1%|          | 2/250 [00:01<03:08,  1.31it/s][A
  1%|          | 3/250 [00:02<02:46,  1.48it/s][A
  2%|▏         | 4/250 [00:02<02:36,  1.57it/s][A
  2%|▏         | 5/250 [00:03<02:30,  1.63it/s][A
  2%|▏         | 6/250 [00:03<02:26,  1.66it/s][A
  3%|▎         | 7/250 [00:04<02:23,  1.69it/s][A
  3%|▎         | 8/250 [00:05<02:22,  1.70it/s][A
  4%|▎         | 9/250 [00:05<02:20,  1.71it/s][A
  4%|▍         | 10/250 [00:06<02:19,  1.72it/s][A
  4%|▍         | 11/250 [00:06<02:18,  1.73it/s][A
  5%|▍         | 12/250 [00:07<02:17,  1.73it/s][A
  5%|▌         | 13/250 [00:07<02:16,  1.73it/s][A
  6%|▌         | 14/250 [00:08<02:16,  1.73it/s][A
  6%|▌         | 15/250 [00:09<02:15,  1.73it/s][A
  6%|▋         | 16/250 [00:09<02:14,  1.73it/s][A
  7%|▋         | 17/250 [00:10<02:14,  1.74it/s][A
  7%|▋         | 18/250 [00:10<02:13,  1.7

EPOCH: 0000 train_loss: 1.3411 val_loss: 1.3135, train_mean_iou: 0.096, val_mean_iou: 0.104


  1%|          | 1/100 [02:33<4:12:41, 153.14s/it]
  0%|          | 0/250 [00:00<?, ?it/s][A
  0%|          | 1/250 [00:00<04:03,  1.02it/s][A
  1%|          | 2/250 [00:01<03:03,  1.35it/s][A
  1%|          | 3/250 [00:02<02:44,  1.50it/s][A
  2%|▏         | 4/250 [00:02<02:35,  1.58it/s][A
  2%|▏         | 5/250 [00:03<02:29,  1.63it/s][A
  2%|▏         | 6/250 [00:03<02:26,  1.67it/s][A
  3%|▎         | 7/250 [00:04<02:24,  1.68it/s][A
  3%|▎         | 8/250 [00:05<02:22,  1.69it/s][A
  4%|▎         | 9/250 [00:05<02:21,  1.70it/s][A
  4%|▍         | 10/250 [00:06<02:19,  1.71it/s][A
  4%|▍         | 11/250 [00:06<02:19,  1.72it/s][A
  5%|▍         | 12/250 [00:07<02:18,  1.72it/s][A
  5%|▌         | 13/250 [00:07<02:17,  1.73it/s][A
  6%|▌         | 14/250 [00:08<02:16,  1.73it/s][A
  6%|▌         | 15/250 [00:09<02:15,  1.73it/s][A
  6%|▋         | 16/250 [00:09<02:15,  1.73it/s][A
  7%|▋         | 17/250 [00:10<02:14,  1.73it/s][A
  7%|▋         | 18/250 [00:10<

EPOCH: 0001 train_loss: 1.3571 val_loss: 1.3773, train_mean_iou: 0.104, val_mean_iou: 0.104


  2%|▏         | 2/100 [05:06<4:10:12, 153.18s/it]
  0%|          | 0/250 [00:00<?, ?it/s][A
  0%|          | 1/250 [00:00<04:00,  1.04it/s][A
  1%|          | 2/250 [00:01<03:02,  1.36it/s][A
  1%|          | 3/250 [00:02<02:43,  1.51it/s][A
  2%|▏         | 4/250 [00:02<02:34,  1.59it/s][A
  2%|▏         | 5/250 [00:03<02:29,  1.64it/s][A
  2%|▏         | 6/250 [00:03<02:26,  1.67it/s][A
  3%|▎         | 7/250 [00:04<02:23,  1.69it/s][A
  3%|▎         | 8/250 [00:04<02:22,  1.70it/s][A
  4%|▎         | 9/250 [00:05<02:20,  1.71it/s][A
  4%|▍         | 10/250 [00:06<02:19,  1.71it/s][A
  4%|▍         | 11/250 [00:06<02:18,  1.72it/s][A
  5%|▍         | 12/250 [00:07<02:17,  1.73it/s][A
  5%|▌         | 13/250 [00:07<02:16,  1.73it/s][A
  6%|▌         | 14/250 [00:08<02:16,  1.73it/s][A
  6%|▌         | 15/250 [00:09<02:16,  1.73it/s][A
  6%|▋         | 16/250 [00:09<02:15,  1.73it/s][A
  7%|▋         | 17/250 [00:10<02:14,  1.73it/s][A
  7%|▋         | 18/250 [00:10<

EPOCH: 0002 train_loss: 1.1347 val_loss: 0.8902, train_mean_iou: 0.146, val_mean_iou: 0.183


  3%|▎         | 3/100 [07:39<4:07:37, 153.17s/it]
  0%|          | 0/250 [00:00<?, ?it/s][A
  0%|          | 1/250 [00:00<04:01,  1.03it/s][A
  1%|          | 2/250 [00:01<03:03,  1.35it/s][A
  1%|          | 3/250 [00:02<02:44,  1.50it/s][A
  2%|▏         | 4/250 [00:02<02:34,  1.59it/s][A
  2%|▏         | 5/250 [00:03<02:29,  1.64it/s][A
  2%|▏         | 6/250 [00:03<02:26,  1.67it/s][A
  3%|▎         | 7/250 [00:04<02:23,  1.69it/s][A
  3%|▎         | 8/250 [00:05<02:21,  1.71it/s][A
  4%|▎         | 9/250 [00:05<02:20,  1.72it/s][A
  4%|▍         | 10/250 [00:06<02:19,  1.72it/s][A
  4%|▍         | 11/250 [00:06<02:18,  1.73it/s][A
  5%|▍         | 12/250 [00:07<02:18,  1.72it/s][A
  5%|▌         | 13/250 [00:07<02:17,  1.73it/s][A
  6%|▌         | 14/250 [00:08<02:16,  1.73it/s][A
  6%|▌         | 15/250 [00:09<02:16,  1.73it/s][A
  6%|▋         | 16/250 [00:09<02:15,  1.73it/s][A
  7%|▋         | 17/250 [00:10<02:14,  1.73it/s][A
  7%|▋         | 18/250 [00:10<

EPOCH: 0003 train_loss: 1.0429 val_loss: 0.9322, train_mean_iou: 0.172, val_mean_iou: 0.151


  4%|▍         | 4/100 [10:12<4:05:03, 153.16s/it]
  0%|          | 0/250 [00:00<?, ?it/s][A
  0%|          | 1/250 [00:01<04:10,  1.01s/it][A
  1%|          | 2/250 [00:01<03:06,  1.33it/s][A
  1%|          | 3/250 [00:02<02:45,  1.49it/s][A
  2%|▏         | 4/250 [00:02<02:35,  1.58it/s][A
  2%|▏         | 5/250 [00:03<02:30,  1.63it/s][A
  2%|▏         | 6/250 [00:03<02:27,  1.66it/s][A
  3%|▎         | 7/250 [00:04<02:24,  1.69it/s][A
  3%|▎         | 8/250 [00:05<02:22,  1.70it/s][A
  4%|▎         | 9/250 [00:05<02:20,  1.71it/s][A
  4%|▍         | 10/250 [00:06<02:19,  1.72it/s][A
  4%|▍         | 11/250 [00:06<02:18,  1.72it/s][A
  5%|▍         | 12/250 [00:07<02:17,  1.73it/s][A
  5%|▌         | 13/250 [00:07<02:17,  1.73it/s][A
  6%|▌         | 14/250 [00:08<02:16,  1.73it/s][A
  6%|▌         | 15/250 [00:09<02:16,  1.73it/s][A
  6%|▋         | 16/250 [00:09<02:15,  1.73it/s][A
  7%|▋         | 17/250 [00:10<02:14,  1.73it/s][A
  7%|▋         | 18/250 [00:10<

EPOCH: 0004 train_loss: 0.9855 val_loss: 0.9316, train_mean_iou: 0.178, val_mean_iou: 0.180


  4%|▍         | 4/100 [12:45<5:06:21, 191.47s/it]

Earlystop!



