In [1]:
import os

# CUDA-related environment variables are exported before `import torch` below.
# NOTE(review): presumably they have to be in place before CUDA is initialised to take
# effect, which is why this precedes the torch import — confirm against the PyTorch docs.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import torch

# Global float32 matmul precision setting for this kernel session.
torch.set_float32_matmul_precision("medium")  # Try "high" as well

import sys

# Make the project-local `src` package importable. If it is not on the path already
# (e.g. the kernel was started inside the notebooks directory), add the parent
# directory to `sys.path` and retry.
try:
    import src
except ImportError:
    # Only import failures trigger the fallback. A bare `except:` would also swallow
    # KeyboardInterrupt / SystemExit and hide unrelated errors raised while importing.
    sys.path.append('../')
    import src


import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from pytorch_lightning.loggers import MLFlowLogger
import pytorch_lightning as pl
import mlflow
from typing import Tuple, List
import numpy as np

import coremltools as ct

from src.dataset import SimpleWallADE20KDataset
from src.model import WallModel
from src import config
from src.transform import get_preprocessing_transform, get_train_augmentations, get_val_augmentations, \
    get_val_augmentations_single
from src.callbacks import ModelCheckpoint, EarlyStopping, MLFlowImageLogging, LearningRateLogging

scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.


In [2]:
# Build the model: either a fresh one from the config values, or one restored from
# the checkpoint named in the config.
resume_path = config.RESUME_FROM_CHECKPOINT_PATH
if resume_path is None:
    # TODO: try training without pre-trained ImageNet weights (and the strategy used in the WallSegmentation project)
    print('Training from scratch')
    # NOTE(review): `val_size` is filled from config.TRAIN_SIZE — confirm this is intended
    # and not a copy-paste of the `train_size` line.
    model_kwargs = dict(
        architecture=config.ARCHITECTURE,
        encoder_name=config.ENCODER,
        encoder_depth=config.ENCODER_DEPTH,
        in_channels=3,
        out_classes=1,
        learning_rate=config.LEARNING_RATE,
        train_size=config.TRAIN_SIZE,
        val_size=config.TRAIN_SIZE,
        init_datasets=True,
    )
    wall_model = WallModel(**model_kwargs)
else:
    print(f'Resuming from checkpoint: {resume_path}')
    wall_model = WallModel.load_from_checkpoint(resume_path, init_datasets=True)

# Cell output: the preprocessing parameters exposed by the model.
wall_model.params

Training from scratch




{'input_space': 'RGB',
 'input_range': [0, 1],
 'mean': [0.485, 0.456, 0.406],
 'std': [0.229, 0.224, 0.225]}

In [3]:
# Optionally exclude the encoder weights from gradient updates.
if not config.FREEZE_ENCODER:
    print('Not freezing encoder')
else:
    print("Freezing encoder params")
    for encoder_param in wall_model.model.encoder.parameters():
        encoder_param.requires_grad = False

Not freezing encoder


In [4]:
# Keep a single checkpoint, selected by the highest `val_dataset_iou`, written to the
# current working directory.
# Alternative selection criterion: monitor='val_loss' with mode='min'.
checkpoint_callback = ModelCheckpoint(
    monitor='val_dataset_iou',
    mode='max',
    save_top_k=1,
    dirpath=os.getcwd(),
    filename='{epoch}-{train_loss:.4f}-{val_loss:.4f}-{train_dataset_iou:.4f}-{val_dataset_iou:.4f}',
    verbose=True,
)

# TODO: save both best and last checkpoint (to continue training)

# Early stopping watches the same metric as checkpointing (`val_dataset_iou`, maximised);
# the patience value comes from the config.
# Alternative criterion: monitor="val_loss" with mode='min'.
earlystop_callback = EarlyStopping(
    patience=config.EARLYSTOP_PATIENCE,
    monitor="val_dataset_iou",
    mode='max',
)

# Experiment name passed to both `mlflow.set_experiment` and `MLFlowLogger` in the training cell.
MLFLOW_EXPERIMENT_NAME = 'wall_segmentation'

# TODO: use model name from config.py
# logger = TensorBoardLogger("tb_logs", name="wall_model")

In [5]:
# TODO: Adjust parameters as described in MobileOne and DeepLab papers
# MobileOne:
#  - weight decay in early stages (probably not needed, since we do transfer learning)
#  - cosine schedule for learning rate
#  - S0 and S1 we use standard augmentation – random resized cropping and horizontal flipping.
#  - We also use EMA (Exponential Moving Average) weight averaging with decay constant of 0.9995 for training all versions of MobileOne.
#  - We use MobileOne as the backbone for a Deeplab V3 segmentation network [4] using the cvnets library [45].
#  - The VOC models were trained on the augmented Pascal VOC dataset [16] for 50 epochs following the training procedure of [45] (MobileVIT). 
#    The ADE 20k [65] models were trained using the same hyperparameters and augmentations
# DeepLab:
#  - TODO

# TODO: try this config: https://github.com/apple/ml-cvnets/blob/main/config/segmentation/ade20k/deeplabv3_mobilenetv2.yaml
# batch-size 4 per GPU (4 GPUs => effective 16 batch size)
# max image size: 1024, short side min: 256, short side max: 768
# loss: cross-entropy
# lr scheduler: cosine, max
# aspp_out_channels: 512
# aspp_rates: (12, 24, 36)
# DeepLabV3 (uses encoder, hence DeepLabV3+)
# Backbone: ResNet50
# optimizer: SGD, weight_decay: 1e-4, no_decay_bn_filter_bias: true, momentum: 0.9

# TODO: acknowledge that we calculate metrics incorrectly - we pad all images to square form, thus effectively increasing share of 'background' class
#   Ideally we would only consider original image size during calculations.

# Considerations
# foreground (walls) and background classes are already heavily imbalanced (see data exploration).
# Padding and augmentations add more 'background' area, thus increasing the imbalance. To mitigate that, use ignore_index in metrics and loss functions.
# To compensate for imbalance - we experimented with Focal loss and per-class loss weights (TODO)

In [6]:
# Number of validation samples handed to the image-logging callback.
NUM_IMAGES_LOG = 32

# Reuse the augmentation pipeline already attached to the model's validation dataset.
val_augmentation_fn = wall_model.val_dataset.augmentation_fn

# Arguments common to both demo datasets; they differ only in preprocessing.
shared_dataset_kwargs = dict(
    root=config.DATA_ROOT,
    length=NUM_IMAGES_LOG,
    augmentation_fn=val_augmentation_fn,
    mode='val',
)

# Model-input variant (encoder preprocessing applied).
demo_dataset = SimpleWallADE20KDataset(
    preprocessing_fn=get_preprocessing_transform(config.ENCODER),
    **shared_dataset_kwargs,
)
# Visualisation variant (no preprocessing).
demo_dataset_vis = SimpleWallADE20KDataset(preprocessing_fn=None, **shared_dataset_kwargs)

demo_dataloader = DataLoader(demo_dataset, shuffle=False, batch_size=1)

imagelog_callback = MLFlowImageLogging(dataset_vis=demo_dataset_vis, dataset=demo_dataset)

In [None]:
# Train the model inside a single MLflow run: log the configuration, fit with the
# callbacks defined above, then record which checkpoint scored best.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
with mlflow.start_run() as run:
    # Run configuration recorded as MLflow params.
    params = {
        'max_epochs': config.MAX_EPOCHS,
        'encoder_params': wall_model.params,
        'encoder_depth': wall_model.encoder_depth,
        'learning_rate': config.LEARNING_RATE,
        'batch_size': config.BATCH_SIZE,
        'val_dataset_size': len(wall_model.val_dataset),
        'train_dataset_size': len(wall_model.train_dataset),
        'architecture': config.ARCHITECTURE,
        'encoder': config.ENCODER,
        'freeze_encoder': config.FREEZE_ENCODER,
        'input_size': config.INPUT_IMAGE_SIZE,
        'checkpoint_monitor': checkpoint_callback.monitor,
        'loss': repr(wall_model.losses),
        'optimizer': type(wall_model.optimizer).__name__,
        'scheduler': type(wall_model.scheduler).__name__
    }
    if config.RESUME_FROM_CHECKPOINT_PATH is not None:
        params['resume_from_checkpoint_path'] = config.RESUME_FROM_CHECKPOINT_PATH
    # Optimizer hyper-parameters of the first param group; the parameter tensors
    # themselves are skipped.
    for (key, value) in wall_model.optimizer.param_groups[0].items():
        if key == 'params':
            continue
        params[f'optimizer__{key}'] = str(value)
    for (key, value) in wall_model.scheduler.state_dict().items():
        params[f'scheduler__{key}'] = str(value)
    mlflow.log_params(params)

    # Attach the Lightning logger to the run opened above so everything lands in one run.
    logger = MLFlowLogger(
        MLFLOW_EXPERIMENT_NAME,
        save_dir="./mlruns",
        log_model=True,
        run_id=run.info.run_id,
    )

    def log_learning_rate_metric(params):
        """Log the learning rate carried in `params` at the given global step."""
        mlflow.log_metric('learning_rate', params['learning_rate'], step=params['global_step'])

    lr_logging_callback = LearningRateLogging(
        log_fn=log_learning_rate_metric
    )

    trainer = pl.Trainer(
        devices=1,
        logger=logger,
        max_epochs=config.MAX_EPOCHS,
        callbacks=[
            checkpoint_callback,
            earlystop_callback,
            imagelog_callback,
            lr_logging_callback
        ],
        default_root_dir='./pl_logs',
        enable_checkpointing=True,
    )

    trainer.fit(wall_model, ckpt_path=config.RESUME_FROM_CHECKPOINT_PATH)

    mlflow.log_param('best_checkpoint', checkpoint_callback.best_model_path)
    # Presumably `best_model_score` is still None when no checkpoint was written (e.g. the
    # run stopped before the first validation pass); skip the metric instead of raising
    # AttributeError inside the MLflow run.
    best_model_score = checkpoint_callback.best_model_score
    if best_model_score is None:
        print('No checkpoint was saved; skipping best_checkpoint_score')
    else:
        # `.item()` converts the one-element tensor to a Python float, whatever its device.
        best_checkpoint_score = best_model_score.item()
        mlflow.log_metric('best_checkpoint_score', best_checkpoint_score)

    # TODO: log best val_loss, train_loss, and other metrics

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type          | Params
----------------------------------------
0 | model | DeepLabV3Plus | 6.1 M 
----------------------------------------
6.1 M     Trainable params
0         Non-trainable params
6.1 M     Total params
24.358    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 2759: 'val_dataset_iou' reached 0.70534 (best 0.70534), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=0-train_loss=0.1108-val_loss=0.0961-train_dataset_iou=0.6672-val_dataset_iou=0.7053.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 1, global step 5518: 'val_dataset_iou' reached 0.73650 (best 0.73650), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=1-train_loss=0.0974-val_loss=0.0860-train_dataset_iou=0.7093-val_dataset_iou=0.7365.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 2, global step 8277: 'val_dataset_iou' reached 0.74420 (best 0.74420), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=2-train_loss=0.0916-val_loss=0.0840-train_dataset_iou=0.7262-val_dataset_iou=0.7442.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 3, global step 11036: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 4, global step 13795: 'val_dataset_iou' reached 0.75704 (best 0.75704), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=4-train_loss=0.0859-val_loss=0.0814-train_dataset_iou=0.7445-val_dataset_iou=0.7570.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 5, global step 16554: 'val_dataset_iou' reached 0.76481 (best 0.76481), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=5-train_loss=0.0836-val_loss=0.0808-train_dataset_iou=0.7506-val_dataset_iou=0.7648.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 6, global step 19313: 'val_dataset_iou' reached 0.76563 (best 0.76563), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=6-train_loss=0.0818-val_loss=0.0791-train_dataset_iou=0.7557-val_dataset_iou=0.7656.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 7, global step 22072: 'val_dataset_iou' reached 0.77088 (best 0.77088), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=7-train_loss=0.0805-val_loss=0.0795-train_dataset_iou=0.7584-val_dataset_iou=0.7709.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 8, global step 24831: 'val_dataset_iou' reached 0.77243 (best 0.77243), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=8-train_loss=0.0797-val_loss=0.0782-train_dataset_iou=0.7621-val_dataset_iou=0.7724.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 9, global step 27590: 'val_dataset_iou' reached 0.77338 (best 0.77338), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=9-train_loss=0.0791-val_loss=0.0761-train_dataset_iou=0.7623-val_dataset_iou=0.7734.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 10, global step 30349: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 11, global step 33108: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 12, global step 35867: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 13, global step 38626: 'val_dataset_iou' reached 0.77660 (best 0.77660), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=13-train_loss=0.0787-val_loss=0.0793-train_dataset_iou=0.7663-val_dataset_iou=0.7766.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 14, global step 41385: 'val_dataset_iou' reached 0.78151 (best 0.78151), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=14-train_loss=0.0775-val_loss=0.0772-train_dataset_iou=0.7691-val_dataset_iou=0.7815.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 15, global step 44144: 'val_dataset_iou' reached 0.78610 (best 0.78610), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=15-train_loss=0.0765-val_loss=0.0756-train_dataset_iou=0.7722-val_dataset_iou=0.7861.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 16, global step 46903: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 17, global step 49662: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 18, global step 52421: 'val_dataset_iou' reached 0.78848 (best 0.78848), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=18-train_loss=0.0740-val_loss=0.0728-train_dataset_iou=0.7784-val_dataset_iou=0.7885.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 19, global step 55180: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 20, global step 57939: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 21, global step 60698: 'val_dataset_iou' reached 0.78926 (best 0.78926), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=21-train_loss=0.0704-val_loss=0.0766-train_dataset_iou=0.7896-val_dataset_iou=0.7893.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 22, global step 63457: 'val_dataset_iou' reached 0.79384 (best 0.79384), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=22-train_loss=0.0699-val_loss=0.0720-train_dataset_iou=0.7902-val_dataset_iou=0.7938.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 23, global step 66216: 'val_dataset_iou' reached 0.79586 (best 0.79586), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=23-train_loss=0.0691-val_loss=0.0750-train_dataset_iou=0.7934-val_dataset_iou=0.7959.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 24, global step 68975: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 25, global step 71734: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 26, global step 74493: 'val_dataset_iou' reached 0.79637 (best 0.79637), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=26-train_loss=0.0667-val_loss=0.0740-train_dataset_iou=0.7995-val_dataset_iou=0.7964.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 27, global step 77252: 'val_dataset_iou' reached 0.79978 (best 0.79978), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=27-train_loss=0.0664-val_loss=0.0726-train_dataset_iou=0.8007-val_dataset_iou=0.7998.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 28, global step 80011: 'val_dataset_iou' was not in top 1


Validation: 0it [00:00, ?it/s]

Epoch 29, global step 82770: 'val_dataset_iou' reached 0.80040 (best 0.80040), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=29-train_loss=0.0658-val_loss=0.0722-train_dataset_iou=0.8025-val_dataset_iou=0.8004.ckpt' as top 1


Validation: 0it [00:00, ?it/s]

Epoch 30, global step 85529: 'val_dataset_iou' was not in top 1


In [None]:
# Load best weights.
# `load_from_checkpoint` is a classmethod that builds and RETURNS a new model; it does not
# update the instance it is invoked on. The result must therefore be assigned, otherwise
# `wall_model` silently keeps the weights from the last training epoch.
wall_model = WallModel.load_from_checkpoint(checkpoint_callback.best_model_path)
# Inference mode on CPU for the visualisation cells that follow.
wall_model = wall_model.eval().cpu()
print(f'Loaded {checkpoint_callback.best_model_path}')

In [None]:
# Two 10-sample validation datasets that differ only in preprocessing: the first applies
# the encoder preprocessing (model input), the second applies none (for display).
# Each dataset gets its own `get_val_augmentations()` instance.
demo_dataset, demo_dataset_vis = (
    SimpleWallADE20KDataset(
        root=config.DATA_ROOT,
        length=10,
        preprocessing_fn=preprocessing_fn,
        augmentation_fn=get_val_augmentations(),
        mode='val',
    )
    for preprocessing_fn in (get_preprocessing_transform(config.ENCODER), None)
)

In [None]:
# One sample per batch, unshuffled, so that batch i can be paired with demo_dataset_vis[i]
# in the visualisation loop (assumes both datasets enumerate samples identically — confirm).
demo_dataloader = DataLoader(demo_dataset, batch_size=1, shuffle=False)

In [None]:
def show_images(images: List[Tuple[str, np.ndarray]]):
    """Display titled images side by side in a single row.

    Args:
        images: ``(title, image)`` pairs; each image is passed to ``plt.imshow``
            in its own axis-less subplot, in the given order.
    """
    panel_count = len(images)
    plt.figure(figsize=(16, 5))
    # Subplot positions are 1-based, so enumerate from 1.
    for position, (caption, picture) in enumerate(images, start=1):
        plt.subplot(1, panel_count, position)
        plt.axis('off')
        plt.title(caption)
        plt.imshow(picture)
    plt.show()

In [None]:
# Put the model in eval mode on CPU before running predictions.
# NOTE(review): repeats the `.eval().cpu()` call made in the checkpoint-loading cell —
# presumably kept so this part of the notebook can be re-run on its own.
wall_model = wall_model.eval().cpu()

In [None]:
# Show image / binarised prediction / ground truth for the first demo samples.
threshold = 0.5
num_examples = 10  # matches `length=10` used when the demo datasets were created

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    # zip() stops at the shorter of the two, so a shorter dataloader cannot raise
    # StopIteration the way a manual next() would.
    for i, (x, _) in zip(range(num_examples), demo_dataloader):
        # Un-preprocessed sample with the same index, used purely for display.
        image, mask = demo_dataset_vis[i]

        probabilities = wall_model(x).cpu().sigmoid().numpy().squeeze()
        # Binarise in one step; the result keeps the floating dtype of the probabilities.
        y = (probabilities >= threshold).astype(probabilities.dtype)

        show_images([
            ('Image', image),
            ('Prediction', y),
            ('Ground truth', mask),
        ])

In [None]:
# Binarise, in place, the prediction `y` left over from the last iteration of the
# visualisation loop (hidden state: this cell only works after that loop has run).
# NOTE(review): the loop already thresholds `y` at 0.5, so this looks redundant — confirm and remove.
y[y >= 0.5] = 1.0
y[y < 0.5] = 0.0

In [None]:
# Display the mask of the last processed sample; the trailing `;` suppresses the repr output.
plt.imshow(y);