In [1]:
import os

# CUDA allocator and device-selection settings are written to the environment
# before `import torch` below.
# NOTE(review): presumably these must be set before CUDA is initialised to take
# effect, which is why they sit in the middle of the import block — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

import torch

# Global float32 matmul precision setting for this session.
torch.set_float32_matmul_precision("medium")  # Try "high" as well

import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger, MLFlowLogger
import segmentation_models_pytorch as smp
import mlflow
from typing import Tuple, List
import numpy as np

import coremltools as ct

from src.dataset import SimpleWallADE20KDataset
from src.model import WallModel
from src import config
from src.transform import get_preprocessing_transform, get_train_augmentations, get_val_augmentations

scikit-learn version 1.2.2 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.


In [2]:
# Model selection for this run. Both values are passed to WallModel
# (as `architecture` / `encoder_name`); ENCODER is also passed to
# get_preprocessing_transform() when the datasets are built.
# TODO: move to config
ARCHITECTURE = 'DeepLabV3Plus'
ENCODER = 'mobileone_s1'

In [3]:
# Use this for debug or set to None for actual training
# TRAIN_SIZE = 200
# VAL_SIZE = 200
TRAIN_SIZE = None
VAL_SIZE = None

# Training split: training augmentations followed by the encoder-specific preprocessing.
train_augmentations = get_train_augmentations()
train_dataset = SimpleWallADE20KDataset(
    root=config.DATA_ROOT,
    mode='train',
    length=TRAIN_SIZE,
    augmentation_fn=train_augmentations,
    preprocessing_fn=get_preprocessing_transform(ENCODER)
)

# Validation split: validation augmentations, same preprocessing.
val_augmentations = get_val_augmentations()
val_dataset = SimpleWallADE20KDataset(
    root=config.DATA_ROOT,
    mode='val',
    # Fixed: this previously used TRAIN_SIZE, so VAL_SIZE was ignored and a debug
    # cap on the training set silently capped the validation set as well.
    length=VAL_SIZE,
    augmentation_fn=val_augmentations,
    preprocessing_fn=get_preprocessing_transform(ENCODER)
)

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(val_dataset)}")

# os.cpu_count() may return None when the count cannot be determined; fall back to
# one worker so DataLoader always receives an integer.
n_cpu = os.cpu_count() or 1
print(f'Number of CPUs: {n_cpu}')
# Only the training loader shuffles; validation order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=config.BATCH_SIZE, shuffle=True, num_workers=n_cpu)
val_dataloader = DataLoader(val_dataset, batch_size=config.BATCH_SIZE, shuffle=False, num_workers=n_cpu)



Train size: 11035
Validation size: 969
Number of CPUs: 20


In [4]:
# Sanity check: pull a single batch from the training loader and display the image
# tensor's type, shape and basic value statistics. `masks` is fetched but not inspected.
images, masks = next(iter(train_dataloader))
type(images), images.shape, images.min(), images.max(), images.mean(), images.std()

(torch.Tensor,
 torch.Size([8, 3, 512, 512]),
 tensor(-2.1179),
 tensor(2.6400),
 tensor(-0.6403),
 tensor(1.2807))

In [5]:
# Build the model wrapper for the selected architecture/encoder with 3 input channels,
# a single output class and the learning rate from config.
# TODO: check if encoder is frozen and try unfreezing it
wall_model = WallModel(
    architecture=ARCHITECTURE,
    encoder_name=ENCODER,  # s0, s1, s2, s3, s4
    in_channels=3,
    out_classes=1,
    learning_rate=config.LEARNING_RATE,
)
# Display the model's `params` attribute as the cell output.
wall_model.params

{'input_space': 'RGB',
 'input_range': [0, 1],
 'mean': [0.485, 0.456, 0.406],
 'std': [0.229, 0.224, 0.225]}

In [6]:
# Optionally exclude the encoder from training: when config.FREEZE_ENCODER is truthy,
# every encoder parameter has requires_grad switched off; the rest of the model is untouched.
if config.FREEZE_ENCODER:
    print("Freezing encoder params")
    for p in wall_model.model.encoder.parameters():
        p.requires_grad = False

In [7]:
# Name of the MLflow experiment that training runs are recorded under.
MLFLOW_EXPERIMENT_NAME = 'wall_segmentation'

# Keep only the single best checkpoint (lowest `val_loss`), written to the current
# working directory with the epoch and validation metrics embedded in the file name.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    mode='min',
    save_top_k=1,
    dirpath=os.getcwd(),
    filename='{epoch}-{val_loss:.2f}-{val_dataset_iou:.2f}',
    verbose=True,
)

# Early stopping watches the same metric, with a patience of 5.
earlystop_callback = EarlyStopping(patience=5, monitor="val_loss")

# TODO: use model name from config.py
# logger = TensorBoardLogger("tb_logs", name="wall_model")

In [8]:
# TODO: Adjust parameters as described in MobileOne and DeepLab papers
# MobileOne:
#  - weight decay in early stages (probably not needed, since we do transfer learning)
#  - cosine schedule for learning rate
#  - For S0 and S1, standard augmentation is used: random resized cropping and horizontal flipping.
#  - We also use EMA (Exponential Moving Average) weight averaging with decay constant of 0.9995 for training all versions of MobileOne.
#  - We use MobileOne as the backbone for a Deeplab V3 segmentation network [4] using the cvnets library [45].
#  - The VOC models were trained on the augmented Pascal VOC dataset [16] for 50 epochs following the training procedure of [45] (MobileVIT). 
#    The ADE 20k [65] models were trained using the same hyperparameters and augmentations
# DeepLab:
#  - TODO

In [None]:
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
with mlflow.start_run() as run:
    # Hyper-parameters and dataset sizes recorded on the MLflow run.
    run_params = {
        'max_epochs': config.MAX_EPOCHS,
        'encoder_params': wall_model.params,
        'learning_rate': config.LEARNING_RATE,
        'batch_size': config.BATCH_SIZE,
        'val_dataset_size': len(val_dataset),
        'train_dataset_size': len(train_dataset),
        # 'train_augmentations': repr(train_augmentations),
        # 'val_augmentations': repr(train_augmentations),
        'architecture': ARCHITECTURE,
        'encoder': ENCODER,
        'freeze_encoder': config.FREEZE_ENCODER,
    }
    mlflow.log_params(run_params)

    # Attach the Lightning logger to the run opened above (same run id), so metrics
    # logged during training are recorded alongside the parameters.
    logger = MLFlowLogger(
        MLFLOW_EXPERIMENT_NAME,
        run_id=run.info.run_id,
        save_dir="./mlruns",
        log_model=True,
    )

    training_callbacks = [checkpoint_callback, earlystop_callback]
    trainer = pl.Trainer(
        logger=logger,
        callbacks=training_callbacks,
        max_epochs=config.MAX_EPOCHS,
        devices=1,
    )

    # Training happens inside the `with` block, i.e. while the MLflow run is active.
    trainer.fit(wall_model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

# TODO: visualize training curves, ideally in real time (TensorBoard?)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type          | Params
------------------------------------------
0 | model   | DeepLabV3Plus | 5.7 M 
1 | loss_fn | DiceLoss      | 0     
------------------------------------------
5.7 M     Trainable params
0         Non-trainable params
5.7 M     Total params
22.809    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 1380: 'val_loss' reached 0.24652 (best 0.24652), saving model to '/home/ricardsku/Development/Bath/bath_wall_segmentation_model/notebooks/epoch=0-val_loss=0.25-val_dataset_iou=0.62.ckpt' as top 1


In [None]:
# Put the model into evaluation mode before the demo predictions.
# NOTE(review): assumes WallModel.eval() follows torch.nn.Module semantics and returns the module — confirm.
wall_model = wall_model.eval()

In [None]:
# Model-ready view of 10 samples: encoder preprocessing plus validation augmentations.
# No `mode` is given, so the dataset's default split is used.
demo_dataset = SimpleWallADE20KDataset(
    length=10,
    root=config.DATA_ROOT,
    preprocessing_fn=get_preprocessing_transform(ENCODER),
    augmentation_fn=get_val_augmentations(),
)

# Display view with the same root and length: no preprocessing and no augmentation
# function supplied, used for plotting the image and ground-truth mask.
demo_dataset_vis = SimpleWallADE20KDataset(length=10, root=config.DATA_ROOT, preprocessing_fn=None)

In [None]:
# One sample per batch and no shuffling, so the i-th batch corresponds to the i-th
# item of the demo dataset when predictions are plotted next to demo_dataset_vis[i].
demo_dataloader = DataLoader(demo_dataset, batch_size=1, shuffle=False)

In [None]:
def show_images(images: List[Tuple[str, np.ndarray]]):
    """Display the given named images side by side in a single row.

    Args:
        images: ``(title, image)`` pairs. Each image is handed to ``plt.imshow`` in its
            own subplot, left to right, with the title above it and the axes hidden.
    """
    n_columns = len(images)
    plt.figure(figsize=(16, 5))
    # Subplot positions are 1-based, hence start=1.
    for position, (title, picture) in enumerate(images, start=1):
        plt.subplot(1, n_columns, position)
        plt.axis('off')
        plt.title(title)
        plt.imshow(picture)
    plt.show()

In [None]:
# Plot image / prediction / ground truth for every demo sample.
#
# Inputs are moved to whatever device the model currently lives on instead of
# unconditionally to CUDA: Lightning may hand the module back on the CPU once
# `trainer.fit` has finished, in which case a CUDA input fails with a device mismatch.
device = next(wall_model.parameters()).device

# Iterating the loader directly replaces the hard-coded range(0, 10) and the manual
# iterator; batch i (batch_size=1, unshuffled) lines up with demo_dataset_vis[i].
for i, (x, _) in enumerate(demo_dataloader):
    image, mask = demo_dataset_vis[i]

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # NOTE(review): the raw model output is plotted as-is; whether it is logits or
        # probabilities depends on WallModel.forward — confirm.
        y = wall_model(x.to(device)).cpu().detach().numpy().squeeze()

    show_images([
        ('Image', image),
        ('Prediction', y),
        ('Ground truth', mask),
    ])