In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torchvision.utils import save_image
import torchvision.transforms as transforms
from torchvision import datasets, models, transforms
import torchvision

import os
import sys
sys.path.insert(0, "../utils")
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm

import src.datasets.cityscapes_loader as cityscapes_loader
import utils.train_eval as train_eval
import importlib
import visualizations as vis

%load_ext autoreload
%autoreload 2

In [2]:
# Explicitly re-import the loader module so that the objects below are built
# from its current source.
importlib.reload(cityscapes_loader)

is_sequence = True
dataset_root_dir = "/home/nfs/inf6/data/datasets/cityscapes/"


def _load_split(split, img_size):
    """Build the Cityscapes dataset object for one split at the given img_size."""
    return cityscapes_loader.cityscapesLoader(
        root=dataset_root_dir,
        split=split,
        img_size=img_size,
        is_transform=True,
        is_sequence=is_sequence,
    )


# The train split is requested at (512, 1024), the val split at (1024, 2048).
train_ds = _load_split('train', (512, 1024))
val_ds = _load_split('val', (1024, 2048))
# Alternative kept from the original cell: validation at the training size.
#val_ds = _load_split('val', (512, 1024))

# Shuffled mini-batches of two for training; one sample at a time, in dataset
# order, for validation.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=2,
    shuffle=True,
    drop_last=True,
)
valid_loader = torch.utils.data.DataLoader(
    val_ds,
    batch_size=1,
    shuffle=False,
    drop_last=True,
)

Found 2975 train images
Found 500 val images


In [3]:
from src.architectures.architecture_configs import *
import src.architectures.Temporal_UNET_Template as Temporal_UNET_Template
import utils.utils
# Disabled alternative, kept as an inert string literal: the SmallDeep blocks
# wired into Temporal_ResUNetConfig instead of Temporal_ConvUNextConfig.
"""
encoder_blocks = SmallDeep_NetworkSize.encoder_blocks
decoder_blocks = SmallDeep_NetworkSize.decoder_blocks

config = Temporal_ResUNetConfig(
    encoder_blocks=encoder_blocks,
    decoder_blocks=decoder_blocks,
    temporal_cell= Conv2dGRUCell
    )

temp_unet = Temporal_UNET_Template.Temporal_UNet(config)
"""

# Active configuration: SmallDeep encoder/decoder blocks, ConvUNext config,
# Conv2dGRUCell as the temporal cell.
encoder_blocks, decoder_blocks = (
    SmallDeep_NetworkSize.encoder_blocks,
    SmallDeep_NetworkSize.decoder_blocks,
)
config = Temporal_ConvUNextConfig(
    temporal_cell=Conv2dGRUCell,
    encoder_blocks=encoder_blocks,
    decoder_blocks=decoder_blocks,
)

# Disabled alternative, kept as an inert string literal: the same config class
# with the MediumDeep blocks.
"""
encoder_blocks = MediumDeep_NetworkSize.encoder_blocks
decoder_blocks = MediumDeep_NetworkSize.decoder_blocks

config = Temporal_ConvUNextConfig(
        encoder_blocks=encoder_blocks,
        decoder_blocks=decoder_blocks,
        temporal_cell= Conv2dGRUCell
        )
"""

# Instantiate the network from the active config, then the loss and the Adam
# optimizer (learning rate 3e-4) over the network's parameters.
temp_unet = Temporal_UNET_Template.Temporal_UNet(config)
criterion = nn.CrossEntropyLoss()
temp_unet_optim = torch.optim.Adam(temp_unet.parameters(), lr=3e-4)

epochs = 100
load_model = True

# NOTE(review): start_epoch=62 together with load_model=True looks like a
# resumed run from a saved checkpoint - confirm against utils.train_eval.Trainer.
temp_unet_trainer = utils.train_eval.Trainer(
    temp_unet,
    temp_unet_optim,
    criterion,
    train_loader,
    valid_loader,
    "cityscapes",
    epochs,
    sequence=True,
    all_labels=20,
    start_epoch=62,
)

if load_model:
    temp_unet_trainer.load_model("cityscapes")

/home/user/sheludzk/CudaLab_Project/tboard_logs/Temporal_ConvUNextConfig_Conv2dGRUCell/Layers4_InitDim16


In [5]:
# Hand the model and the validation loader to vis.save_vis_seq under the name
# "test". The captured output below shows one tensor shape per index 0-21.
# NOTE(review): presumably this writes per-frame visualizations to disk for
# vis.create_gifs("test", ...) to pick up - confirm in visualizations.py.
vis.save_vis_seq(temp_unet, valid_loader, model_name="test")

0: torch.Size([1, 12, 20, 512, 1024])
1: torch.Size([1, 12, 20, 512, 1024])
2: torch.Size([1, 12, 20, 512, 1024])
3: torch.Size([1, 12, 20, 512, 1024])
4: torch.Size([1, 12, 20, 512, 1024])
5: torch.Size([1, 12, 20, 512, 1024])
6: torch.Size([1, 12, 20, 512, 1024])
7: torch.Size([1, 12, 20, 512, 1024])
8: torch.Size([1, 12, 20, 512, 1024])
9: torch.Size([1, 12, 20, 512, 1024])
10: torch.Size([1, 12, 20, 512, 1024])
11: torch.Size([1, 12, 20, 512, 1024])
12: torch.Size([1, 12, 20, 512, 1024])
13: torch.Size([1, 12, 20, 512, 1024])
14: torch.Size([1, 12, 20, 512, 1024])
15: torch.Size([1, 12, 20, 512, 1024])
16: torch.Size([1, 12, 20, 512, 1024])
17: torch.Size([1, 12, 20, 512, 1024])
18: torch.Size([1, 12, 20, 512, 1024])
19: torch.Size([1, 12, 20, 512, 1024])
20: torch.Size([1, 12, 20, 512, 1024])
21: torch.Size([1, 12, 20, 512, 1024])


In [6]:
# Build GIFs for the "test" run in "overlay" mode with transparency 0.45 at
# 8 frames per second.
# NOTE(review): presumably consumes what vis.save_vis_seq stored under the same
# name - confirm in visualizations.py.
vis.create_gifs("test", mode="overlay", transparency=0.45, fps=8)

## Visualizations 

### Baselines

Vanilla Original size:

<center><img style="width: 70%" src="resources/gifs/Baselines/BaselineVanillaOriginalSizes_high_res/1.gif"></center>

VanillaSmallDeep:

<center><img style="width: 70%" src="resources/gifs/Baselines/BaselineVanillaSmallDeep_high_res/1.gif"></center>

BaselineVanillaSmallShallow:

<center><img style="width: 70%" src="resources/gifs/Baselines/BaselineVanillaSmallShallow_high_res/1.gif"></center>

According to the validation metrics (mIoU and mAcc), the "Vanilla Original size" model should perform better than the other two. Visually, however, VanillaSmallDeep is the leader. All the predictions (even those of VanillaSmallDeep) show some chaotic flickering between consecutive frames.

You can find more visualizations for these three models here: resources/gifs/Baselines



### Temporal models

ResUNet with Conv2dGRUCell SmallDeep:
<center><img style="width: 70%" src="resources/gifs/Temporal/Temporal_ResUNetConfig_Conv2dGRUCell_SmallDeep_high_res/1.gif"></center>

ResUNet with Conv2dGRUCell SmallShallow:
<center><img style="width: 70%" src="resources/gifs/Temporal/Temporal_ResUNetConfig_Conv2dGRUCell_SmallShallow_high_res/1.gif"></center>

ConvUNext with Conv2dGRUCell SmallDeep:
<center><img style="width: 70%" src="resources/gifs/Temporal/Temporal_ConvUNextConfig_Conv2dGRUCell_SmallDeep_high_res/1.gif"></center>

VanillaUNet with Conv2dGRUCell SmallShallow:
<center><img style="width: 70%" src="resources/gifs/Temporal/Temporal_VanillaUNetConfig_Conv2dGRUCell_SmallShallow_high_res/1.gif"></center>

VanillaUNet  with Conv2dRNNCell SmallShallow:
<center><img style="width: 70%" src="resources/gifs/Temporal/Temporal_VanillaUNetConfig_Conv2dRNNCell_SmallShallow_high_res/1.gif"></center>


According to the validation metrics (mAcc and mIoU), the best of the temporal models should be "ResUNet with Conv2dGRUCell SmallDeep". This matches what we observe in the visualizations.<br>

In general we can observe the following pattern: <br>
* the first image in the sequence has some incorrectly predicted regions
* every following frame is predicted better than the previous one

These results demonstrate the advantage of using recurrent modules as part of the U-Net.



### Side remark

During training we observed an interesting detail.<br>

Training of all the models was done using 512x1024 frames. According to the conditions of the final project the evaluation should be done using 1024x2048 frames.
Some examples of this evaluation can be found here: /resources/gifs <br>

We have also done the evaluation of the models using 512x1024 frames.<br>
These predictions look better. Here is an example:

ResUNet with Conv2dGRUCell SmallDeep:
left-1024x2048, right-512x1024
<p float="left">
  <img style="width: 49%" src="resources/gifs/Temporal/Temporal_ResUNetConfig_Conv2dGRUCell_SmallDeep_high_res/10.gif" />
  <img style="width: 49%" src="resources/gifs/Temporal_low_res/Temporal_ResUNetConfig_Conv2dGRUCell_SmallDeep_low_res/10.gif"  /> 
</p>

We used random-crop augmentation to compensate for this effect, but could not completely get rid of it.