In [1]:
from pathlib import Path
import os
import numpy as np
import torch
import torchvision.transforms.functional as T
from matplotlib import pyplot as plt
from torchvision.utils import draw_segmentation_masks, make_grid
import pytorch_lightning as pl
import mlflow
from tqdm import tqdm
import torchmetrics
import torch.nn.functional as F
from pytorch_lightning.loggers import MLFlowLogger
from pytorch_lightning.callbacks import RichProgressBar, ModelCheckpoint

In [2]:
from inz.data.event import Event, Hold, Test, Tier1, Tier3
from inz.data.data_module import XBDDataModule
from inz.models.unet_basic import UNet
from inz.models.unet_basic_pl import SemanticSegmentor, OrdinalCrossEntropyLoss, DiceLoss

In [3]:
# Seed every RNG Lightning knows about so a Restart & Run All repeats the same run.
RANDOM_SEED = 123
pl.seed_everything(RANDOM_SEED)

# Prefer the GPU, but fall back to the CPU so `device` is always usable on
# machines without CUDA instead of failing at the first tensor transfer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Let PyTorch use its faster, slightly lower-precision float32 matmul kernels.
torch.set_float32_matmul_precision("high")

Seed set to 123


In [4]:
# Events used in this experiment, keyed by the split class they are taken from
# (Tier1 / Tier3 / Hold / Test). Only the Joplin tornado is enabled; the
# commented-out entries are the other options that can be switched on.
selected_events = {
    # Tier1: [
    #     Event.hurricane_florence,
    #     Event.hurricane_harvey,
    #     Event.hurricane_matthew,
    #     Event.hurricane_michael,
    # ],
    Tier3: [
        Event.joplin_tornado,
        # Event.moore_tornado,
        # Event.tuscaloosa_tornado
    ],
    # Hold: [
    #     Event.hurricane_florence,
    #     Event.hurricane_harvey,
    #     Event.hurricane_matthew,
    #     Event.hurricane_michael,
    # ],
    # Test: [
    #     Event.hurricane_florence,
    #     Event.hurricane_harvey,
    #     Event.hurricane_matthew,
    #     Event.hurricane_michael,
    # ],
}

dm = XBDDataModule(
    path=Path("data/xBD_processed"),
    events=selected_events,
    # NOTE(review): `val_faction` looks like a typo of `val_fraction`, but the
    # recorded output (12 of 76 batches in validation) suggests the data module
    # accepts this spelling - confirm against XBDDataModule before renaming.
    val_faction=0.15,
    test_fraction=0.0,
    train_batch_size=32,
    val_batch_size=32,
    test_batch_size=32,
)
dm.prepare_data()
dm.setup("fit")

# Report how many batches each split yields as a quick sanity check of the split.
n_train_batches = len(dm.train_dataloader())
n_val_batches = len(dm.val_dataloader())
n_test_batches = len(dm.test_dataloader())
print(f"{n_train_batches} train batches, {n_val_batches} val batches, {n_test_batches} test batches, ")

64 train batches, 12 val batches, 0 test batches, 


In [5]:
NUM_CLASSES = 6  # number of class indices counted per mask (bincount minlength)


def inverse_frequency_weights(counts: torch.Tensor) -> torch.Tensor:
    """Convert per-class pixel counts into normalised inverse-frequency weights.

    Args:
        counts: 1-D tensor of non-negative per-class pixel counts.

    Returns:
        Float tensor of the same length that sums to 1. A class with a non-zero
        count gets a weight proportional to ``counts.sum() / count``; a class
        that never occurs gets weight 0 (instead of ``inf``) so that the
        normalisation cannot turn the whole vector into NaNs.

    Raises:
        ValueError: if every count is zero, since no weights can be derived.
    """
    counts = counts.to(torch.float)
    total = counts.sum()
    if total == 0:
        raise ValueError("Cannot derive class weights: all class counts are zero.")
    inverse = total / counts
    inverse[counts == 0] = 0.0  # absent classes: avoid inf -> NaN after normalisation
    return inverse / inverse.sum()


# Count pixels per class over the whole training set.
# Assumes masks are one-hot / score maps with the class axis at dim 1 - TODO confirm.
pre_counts_per_batch = []
post_counts_per_batch = []
for batch in tqdm(dm.train_dataloader()):
    pre_images, pre_masks, post_images, post_masks = batch
    pre_counts_per_batch.append(torch.bincount(pre_masks.argmax(dim=1).reshape(-1), minlength=NUM_CLASSES))
    post_counts_per_batch.append(torch.bincount(post_masks.argmax(dim=1).reshape(-1), minlength=NUM_CLASSES))

pre_counts = torch.stack(pre_counts_per_batch).sum(dim=0)

# Localization is binary: class 0 versus every other class merged together.
loc_counts = torch.stack([pre_counts[0], pre_counts[1:].sum()]).to(torch.float)
# Classification keeps all classes of the post-event masks.
cls_counts = torch.stack(post_counts_per_batch).sum(dim=0).to(torch.float)

print(cls_counts)

# Rare classes get proportionally larger weights; move them to the training device.
loc_weights = inverse_frequency_weights(loc_counts).to(device)
cls_weights = inverse_frequency_weights(cls_counts).to(device)

print(f"Localization weights: {loc_weights}\nClassification weights: {cls_weights}")

100%|██████████| 64/64 [00:20<00:00,  3.11it/s]


tensor([1.1890e+08, 7.7670e+06, 2.1349e+06, 1.3483e+06, 2.4458e+06, 2.4971e+05])
Localization weights: tensor([0.1052, 0.8948], device='cuda:0')
Classification weights: tensor([0.0015, 0.0223, 0.0813, 0.1287, 0.0710, 0.6952], device='cuda:0')


In [6]:
# U-Net segmentor with separate localization and classification losses.
model = SemanticSegmentor(
    model=UNet(in_channels=3, out_channels=6),
    # BCEWithLogitsLoss keeps an implicit weight of 1 on negative targets, so
    # `pos_weight` has to be the negative/positive ratio. The normalised weight
    # loc_weights[1] is always < 1 and would down-weight the rare positive class.
    # loc_weights[1] / loc_weights[0] equals count(negative) / count(positive),
    # because both entries are inverse frequencies with the same normaliser.
    localization_loss=torch.nn.BCEWithLogitsLoss(pos_weight=loc_weights[1] / loc_weights[0]),
    # Alternative classification losses tried previously:
    # classification_loss=DiceLoss(weight=cls_weights.cuda()),
    # classification_loss=torch.nn.CrossEntropyLoss(weight=cls_weights.cuda(), reduction='sum'),
    classification_loss=OrdinalCrossEntropyLoss(n_classes=6, weights=cls_weights),
    n_classes=6,
)

/home/tomek/inz/inz/.venv/lib/python3.11/site-packages/pytorch_lightning/utilities/parsing.py:199: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.
/home/tomek/inz/inz/.venv/lib/python3.11/site-packages/pytorch_lightning/utilities/parsing.py:199: Attribute 'localization_loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['localization_loss'])`.
/home/tomek/inz/inz/.venv/lib/python3.11/site-packages/pytorch_lightning/utilities/parsing.py:199: Attribute 'classification_loss' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['classification_loss'])`.


In [7]:
# NOTE(review): autolog() opens its own MLflow run (see the "Created MLflow
# autologging run" log line) while MLFlowLogger() is also attached to the
# trainer, so the run appears to be tracked twice - confirm both are wanted.
mlflow.pytorch.autolog()

# Keep only the best checkpoint by the monitored "f1" metric. The "-" separator
# keeps the generated name readable ("epoch=12-f1=0.81234" rather than
# "epoch=12f1=0.81234").
checkpoint_callback = ModelCheckpoint(
    save_top_k=1,
    verbose=True,
    monitor="f1",
    mode="max",
    filename="{epoch}-{f1:.5f}",
)

trainer = pl.Trainer(
    max_epochs=300,
    callbacks=[RichProgressBar(), checkpoint_callback],
    logger=MLFlowLogger(),
    precision="16-mixed",
)
# NOTE(review): the recorded execution stopped here with a CUDA OutOfMemoryError
# while another process was holding 19.67 GiB of the GPU; free the GPU and
# re-run before relying on this cell's output.
trainer.fit(model, datamodule=dm)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2024/06/10 00:04:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '43164e6e282b499394df70d6f735d7c9', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current pytorch workflow
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 23.67 GiB of which 251.62 MiB is free. Process 27031 has 846.00 MiB memory in use. Process 269541 has 19.67 GiB memory in use. Including non-PyTorch memory, this process has 1.92 GiB memory in use. Of the allocated memory 1.23 GiB is allocated by PyTorch, and 400.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)