In [None]:
# Set up plotting options
!pip install torchmetrics
%matplotlib inline
!pip install -U git+https://github.com/albu/albumentations --no-cache-di
!pip install segmentation-models-pytorch
!pip install psutil
!pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12==24.8.* dask-cudf-cu12==24.8.* cuml-cu12==24.8.* cugraph-cu12==24.8.* cuspatial-cu12==24.8.* cuproj-cu12==24.8.* cuxfilter-cu12==24.8.* cucim-cu12==24.8.* pylibraft-cu12==24.8.* raft-dask-cu12==24.8.* cuvs-cu12==24.8.*
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
#!python rapidsai-csp-utils/colab/pip-install.py

In [None]:
import os
import sys
import time
import json
import random
import logging
import psutil
from glob import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, RandomSampler, Subset
from torch.optim.lr_scheduler import ReduceLROnPlateau

from torchmetrics import ConfusionMatrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

import albumentations as A
import segmentation_models_pytorch as smp

from tqdm import tqdm


In [None]:
class iouTracker:
    """Running confusion matrix over batches, reporting IoU of class 1 (flood).

    The matrix is accumulated from ``sklearn.metrics.confusion_matrix`` called
    with ``labels=[0, 1]``; ``get_mean`` reads TP/FP/FN for class 1 from it.
    """

    def __init__(self, n_classes=2, smooth=1e-6):
        """Store the matrix size and the denominator smoothing term, then reset."""
        self.n_classes = n_classes
        self.smooth = smooth
        self.reset()

    def reset(self):
        """Zero the accumulated confusion matrix."""
        side = self.n_classes
        self.cm = np.zeros((side, side), dtype=np.float64)

    def update(self, pred, target):
        """Accumulate one batch.

        Args:
            pred: tensor of per-class scores; labels are taken with
                ``argmax(dim=1)`` (documented by the caller as [B, 2, H, W]).
            target: tensor of ground-truth labels (documented as [B, H, W]).
        """
        # Flatten both tensors to 1-D label vectors on the CPU.
        predicted_labels = pred.argmax(dim=1).flatten().detach().cpu().numpy()
        true_labels = target.flatten().detach().cpu().numpy()

        # labels=[0, 1] pins the result to a 2x2 matrix even if a class is absent.
        batch_cm = confusion_matrix(true_labels, predicted_labels, labels=[0, 1])
        self.cm += batch_cm

    def get_mean(self):
        """Return the IoU of class 1 computed from the accumulated counts."""
        true_pos = self.cm[1, 1]
        false_pos = self.cm[0, 1]
        false_neg = self.cm[1, 0]

        # `smooth` keeps the ratio defined when no class-1 pixel was seen.
        return true_pos / (true_pos + false_pos + false_neg + self.smooth)


class ETCIDataset(Dataset):
    """Dataset of VV/VH image pairs (and flood masks) described by a DataFrame.

    Each row of ``dataframe`` must provide ``vv_image_path`` and
    ``vh_image_path``; every split other than ``'test'`` also needs
    ``flood_label_path``.

    Args:
        dataframe: pandas DataFrame with one row per sample.
        split: split name. Only the value ``'test'`` changes behaviour: no mask
            is loaded and ``transform`` is not applied.
        transform: optional callable invoked as ``transform(image=..., mask=...)``
            that returns a mapping with ``'image'`` and ``'mask'`` entries.
    """

    def __init__(self, dataframe, split, transform=None):
        self.split = split
        self.dataset = dataframe
        self.transform = transform

    def __len__(self):
        """Number of rows (samples) in the backing DataFrame."""
        return self.dataset.shape[0]

    @staticmethod
    def _read_grayscale(path):
        """Read ``path`` as a single-channel image and divide it by 255.

        Raises:
            FileNotFoundError: if OpenCV cannot read the file. ``cv2.imread``
                signals this by returning ``None`` rather than raising, which
                would otherwise surface as an opaque ``TypeError``.
        """
        image = cv2.imread(path, 0)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {path}")
        return image / 255.0

    def __getitem__(self, index):
        """Return a dict with ``'image'`` (float32, channels first) and, unless
        the split is ``'test'``, ``'mask'`` (int64).

        Raises:
            FileNotFoundError: if any required image file cannot be read.
        """
        example = {}

        df_row = self.dataset.iloc[index]

        # load vv and vh images
        vv_image = self._read_grayscale(df_row['vv_image_path'])
        vh_image = self._read_grayscale(df_row['vh_image_path'])

        # convert vv and vh images to rgb
        rgb_image = s1_to_rgb(vv_image, vh_image)

        if self.split == 'test':
            # no flood mask should be available
            example['image'] = rgb_image.transpose((2, 0, 1)).astype('float32')
        else:
            # load ground truth flood mask
            flood_mask = self._read_grayscale(df_row['flood_label_path'])

            # apply the same augmentation to the image and its mask
            if self.transform:
                augmented = self.transform(image=rgb_image, mask=flood_mask)
                rgb_image = augmented['image']
                flood_mask = augmented['mask']

            example['image'] = rgb_image.transpose((2, 0, 1)).astype('float32')
            # the int64 cast truncates, so only mask values that reach 1.0 become 1
            example['mask'] = flood_mask.astype('int64')

        return example

    
# Augmentation pipeline passed to the training dataset; ETCIDataset calls it with
# both `image=` and `mask=` so the two are transformed together.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),  # applied with probability 0.5
    A.VerticalFlip(p=0.5),  # applied with probability 0.5
    A.RandomResizedCrop(size=[256, 256])  # random crop, resized to 256x256
])


def get_filename(filepath):
    """Return the last component of ``filepath`` (empty if it ends with a separator)."""
    return os.path.basename(filepath)



def s1_to_rgb(vv_image, vh_image):
    """Stack VV, VH and ``1 - clip(VH / VV, 0, 1)`` into a three-channel image.

    Args:
        vv_image: 2-D numeric array (first output channel).
        vh_image: 2-D numeric array of the same shape (second output channel).

    Returns:
        Array of shape ``(H, W, 3)``. Where ``vv_image`` is zero the ratio is
        treated as 0 for ``0 / 0`` and as 1 for a positive value over zero.
    """
    # Zero pixels are expected, so silence the divide/invalid warnings locally
    # instead of emitting one per call.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = vh_image / vv_image

    # Name the replacement values explicitly: the second positional argument of
    # np.nan_to_num is `copy`, not the NaN fill value.
    ratio = np.nan_to_num(ratio, nan=0.0, posinf=1.0, neginf=0.0)
    ratio_image = np.clip(ratio, 0, 1)

    rgb_image = np.stack((vv_image, vh_image, 1 - ratio_image), axis=2)
    return rgb_image



def create_model():
    """Build a ``segmentation_models_pytorch`` U-Net with a resnet34 encoder.

    The encoder is created without pretrained weights, and the network takes
    3 input channels and emits 2 output classes.
    """
    unet_config = dict(
        encoder_name="resnet34",
        encoder_weights=None,
        in_channels=3,
        classes=2,
    )
    return smp.Unet(**unet_config)



# NOTE: a stray `model.to(device)` statement was removed from this spot. Neither
# `model` nor `device` is defined when this cell runs on a fresh kernel, so it
# raised NameError under Restart & Run All. `initialize_training` already moves
# the model to the target device.


def log_system_usage(task, gpu_memory_allocated, gpu_memory_reserved, cpu_percent, memory_percent, elapsed_time):
    """Write one INFO line summarising resource usage for ``task``.

    All numeric arguments are formatted with two decimals; the message labels
    the GPU figures as MB, the CPU/memory figures as percentages and the
    elapsed time as seconds. Uses the module-level ``logger``.
    """
    fields = (
        f"Task: {task}",
        f"GPU Memory Allocated: {gpu_memory_allocated:.2f} MB",
        f"GPU Memory Reserved: {gpu_memory_reserved:.2f} MB",
        f"CPU Usage: {cpu_percent:.2f}%",
        f"Memory Usage: {memory_percent:.2f}%",
        f"Elapsed Time: {elapsed_time:.2f} seconds",
    )
    logger.info(", ".join(fields))



In [None]:
# Set up the logger: INFO-level messages go both to 'training_logs.txt' and to
# the notebook output stream.
logger = logging.getLogger('TrainingLogger')
logger.setLevel(logging.INFO)

# logging.getLogger returns the same object on every call, so re-running this
# cell would stack a second pair of handlers and duplicate every message.
# Drop (and close) whatever an earlier run attached before adding fresh ones.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

fh = logging.FileHandler('training_logs.txt')
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)

ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)

logger.addHandler(fh)
logger.addHandler(ch)


In [None]:


def prepare_dataset_paths(dset_root):
    """Index the training tiles under ``<dset_root>/train`` into a DataFrame.

    VV tiles are discovered with a recursive glob (``**/vv/*.png``). For each
    one, the matching VH and flood-label paths are derived from the file name:
    the first two underscore-separated tokens name the region directory, and
    the first token alone is recorded as the region.

    Returns:
        DataFrame with columns ``vv_image_path``, ``vh_image_path``,
        ``flood_label_path`` and ``region``, ordered by sorted VV path.
    """
    train_dir = os.path.join(dset_root, 'train')
    region_dirs = glob(train_dir + '/*/')
    print('Number of training temporal-regions: {}'.format(len(region_dirs)))

    vv_image_paths = sorted(glob(train_dir + '/**/vv/*.png', recursive=True))

    vh_image_paths = []
    flood_label_paths = []
    region_names = []
    for vv_path in vv_image_paths:
        vv_name = get_filename(vv_path)
        region_name_date = '_'.join(vv_name.split('_')[:2])
        tiles_dir = os.path.join(train_dir, region_name_date, 'tiles')

        # Sibling files share the tile name, differing only in the vv/vh marker.
        vh_image_paths.append(
            os.path.join(tiles_dir, 'vh', vv_name.replace('vv', 'vh'))
        )
        flood_label_paths.append(
            os.path.join(tiles_dir, 'flood_label', vv_name.replace('_vv', ''))
        )
        region_names.append(region_name_date.split('_')[0])

    columns = {
        'vv_image_path': vv_image_paths,
        'vh_image_path': vh_image_paths,
        'flood_label_path': flood_label_paths,
        'region': region_names
    }
    return pd.DataFrame(columns)


def split_dataset(train_df, seed=4):
    """Split ``train_df`` into train / development / test frames (80/10/10).

    20% of the rows are held out first, and that hold-out is then halved. Both
    splits use ``seed`` as ``random_state``. Every frame is re-indexed with
    ``reset_index(drop=False)``, so the previous index is kept as a column;
    ``get_dataloaders`` filters the training frame on its ``'index'`` column.

    Returns:
        ``(sub_train_df, development_df, test_df)``.
    """
    sub_train_df, holdout_df = (
        part.reset_index(drop=False)
        for part in train_test_split(train_df, test_size=0.2, random_state=seed)
    )

    development_df, test_df = (
        part.reset_index(drop=False)
        for part in train_test_split(holdout_df, test_size=0.5, random_state=seed)
    )

    n_train = sub_train_df.shape[0]
    n_development = development_df.shape[0]
    n_test = test_df.shape[0]
    print(f"Training set size: {n_train}")
    print(f"Development set size: {n_development}")
    print(f"Test set size: {n_test}")

    return sub_train_df, development_df, test_df


def get_dataloaders(sub_train_df, development_df, batch_size, transform=None, coreset_indices=None):
    """Build the training and development ``DataLoader`` objects.

    Args:
        sub_train_df: training frame; must contain an ``'index'`` column when
            ``coreset_indices`` is given.
        development_df: development frame (never augmented).
        batch_size: batch size used by both loaders.
        transform: optional augmentation applied to training samples only.
        coreset_indices: optional collection of values; only training rows
            whose ``'index'`` column is in it are kept. ``None`` keeps all rows.

    Returns:
        ``(train_loader, development_loader)``.
    """
    if coreset_indices is not None:
        # Filter on the function argument. The previous code referenced an
        # undefined name (`core_set_indices`), which raised NameError or
        # silently picked up a stale notebook global.
        train_df = sub_train_df[sub_train_df['index'].isin(coreset_indices)]
        print(f"Using core set of size {len(train_df)}")
    else:
        train_df = sub_train_df
        print(f"Using full training set of size {len(train_df)}")

    train_dataset = ETCIDataset(train_df, split='train', transform=transform)
    # A dedicated, fixed-seed generator keeps the shuffling order reproducible.
    sampler = RandomSampler(train_dataset, generator=torch.Generator().manual_seed(4))

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        sampler=sampler,
        num_workers=2,
        pin_memory=True
    )

    # ETCIDataset only special-cases split == 'test'; any other value (including
    # this one, kept as spelled) loads masks.
    development_dataset = ETCIDataset(development_df, split='devlopment', transform=None)
    development_loader = DataLoader(
        development_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=2,
        pin_memory=True
    )

    return train_loader, development_loader



def initialize_training(model, learning_rate, device='cuda'):
    """Move ``model`` to ``device`` and create its optimizer, scheduler and loss.

    Args:
        model: ``torch.nn.Module`` to train.
        learning_rate: initial learning rate for Adam.
        device: target device; defaults to ``'cuda'`` as before.

    Returns:
        ``(model, optimizer, scheduler, loss_fn)`` where the scheduler is a
        ``ReduceLROnPlateau`` (mode ``'min'``, factor 0.1, patience 5) and the
        loss is ``nn.CrossEntropyLoss``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    # `verbose` is deprecated on PyTorch LR schedulers, so it is no longer
    # passed. The training loop already logs the current learning rate at the
    # start of every epoch.
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
    loss_fn = nn.CrossEntropyLoss()
    return model, optimizer, scheduler, loss_fn


def set_seed(seed=42):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Returns:
        A new ``torch.Generator`` seeded with ``seed``.
    """
    # Global RNGs of every library that is seeded here.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # Set cuDNN's deterministic flag and turn off its benchmark mode.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    return torch.Generator().manual_seed(seed)


def load_coreset(path):
    """Read the JSON document at ``path``, print its length and return it."""
    with open(path, 'r') as json_file:
        raw_text = json_file.read()
    core_set_indices = json.loads(raw_text)

    print(f"coreset size: {len(core_set_indices)}")
    return core_set_indices


def validate_model(model, dataloader, loss_fn, device='cuda'):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Each batch must be a mapping with ``'image'`` and ``'mask'`` tensors.

    Returns:
        ``(avg_loss, mean_iou)``: the per-batch loss averaged over the number
        of batches, and the class-1 IoU accumulated by ``iouTracker``.
    """
    model.eval()
    tracker = iouTracker()
    running_loss = 0

    with torch.no_grad():
        progress = tqdm(dataloader)
        for batch in progress:
            images = batch['image'].to(device)
            masks = batch['mask'].to(device)

            logits = model(images)
            batch_loss = loss_fn(logits, masks).item()
            running_loss += batch_loss

            tracker.update(logits, masks)
            # The progress bar shows the running IoU, not the per-batch value.
            progress.set_description(
                f"Loss: {batch_loss:.4f} | mIoU {tracker.get_mean():.4f}"
            )

    return running_loss / len(dataloader), tracker.get_mean()

def train_model(model, train_loader, development_loader, optimizer, scheduler, criteria, samples, epochs=100, patience=10, device='cuda'):
    """Train ``model`` with early stopping driven by the development-set loss.

    Args:
        model: network to train (already on ``device``).
        train_loader: loader yielding dicts with ``'image'`` and ``'mask'``.
        development_loader: loader evaluated after every epoch.
        optimizer: optimizer stepping ``model``'s parameters.
        scheduler: scheduler whose ``step`` accepts a metric; it is fed the
            development loss.
        criteria: loss callable ``criteria(pred, mask)``.
        samples: run identifier used as file-name prefix for checkpoints
            (``models/``) and the results JSON (``results/``).
        epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without improvement.
        device: device the batches are moved to.

    Returns:
        Dict with per-epoch lists ``train_loss``, ``train_iou``, ``dev_loss``
        and ``dev_iou`` (values rounded to 4 decimals).

    Side effects:
        Writes ``models/{samples}_epoch{N}.pt`` whenever the development loss
        improves, ``models/{samples}_current.pt`` after each completed epoch,
        a final ``models/{samples}_epoch{last}.pt`` holding the weights of the
        last epoch run, and ``results/{samples}_training_results.json``.
    """
    # torch.save and open() do not create missing directories.
    os.makedirs('models', exist_ok=True)
    os.makedirs('results', exist_ok=True)

    results_log = {"train_loss": [], "train_iou": [], "dev_loss": [], "dev_iou": []}

    best_val_loss = float('inf')
    epochs_no_improve = 0
    last_epoch = None  # stays None when `epochs` is 0

    for epoch in range(epochs):
        last_epoch = epoch
        logger.info(f"samples: {samples}")
        logger.info('Epoch: [{}/{}]'.format(epoch, epochs))

        # TRAIN
        model.train()
        iou_logger = iouTracker()
        total_loss = 0
        pbar = tqdm(train_loader)
        current_lr = optimizer.param_groups[0]['lr']
        logger.info(f"Current Learning Rate: {current_lr:.6f}")

        for batch in pbar:
            image = batch['image'].to(device)
            mask = batch['mask'].to(device)

            pred = model(image)
            loss = criteria(pred, mask)
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            iou_logger.update(pred, mask)
            mIoU = iou_logger.get_mean()
            pbar.set_description(f"Loss: {loss.item():.4f} | mIoU {mIoU:.4f}")

        mean_iou = iou_logger.get_mean()
        avg_loss = total_loss / len(train_loader)
        results_log["train_loss"].append(round(float(avg_loss), 4))
        results_log["train_iou"].append(round(float(mean_iou), 4))
        logger.info(f"Training Set Evaluation - Loss: {avg_loss:.4f}, mIoU: {mean_iou:.4f}")

        # VALIDATION
        val_loss, val_iou = validate_model(model, development_loader, criteria, device)
        results_log["dev_loss"].append(round(float(val_loss), 4))
        results_log["dev_iou"].append(round(float(val_iou), 4))
        logger.info(f"Development Set Evaluation - Loss: {val_loss:.4f}, mIoU: {val_iou:.4f}")

        # --- EARLY STOPPING ---
        # Compare the development loss. The previous code compared the training
        # loss while logging it as "validation loss", so overfitting never
        # triggered the stop and "best" checkpoints tracked the wrong metric.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_no_improve = 0
            logger.info("Validation loss improved, saving the model.")
            torch.save(model.state_dict(), f'models/{samples}_epoch{epoch}.pt')
        else:
            epochs_no_improve += 1
            logger.info(f"No improvement in validation loss for {epochs_no_improve} epochs.")
            if epochs_no_improve >= patience:
                logger.info(f"Early stopping triggered. No improvement for {patience} epochs.")
                break

        # Drive the plateau scheduler with the same metric as early stopping.
        scheduler.step(val_loss)
        torch.save(model.state_dict(), f'models/{samples}_current.pt')

    with open(f'results/{samples}_training_results.json', 'w') as outfile:
        json.dump(results_log, outfile)

    if last_epoch is None:
        # epochs == 0: nothing was trained, so there is no final checkpoint.
        logger.warning("No epochs were run; no final model was saved.")
        return results_log

    final_model_name = f'models/{samples}_epoch{last_epoch}.pt'
    torch.save(model.state_dict(), final_model_name)
    logger.info("Model trained on full set saved as " + final_model_name)

    return results_log




In [None]:
# run training
# Index the training tiles and split them 80/10/10 into train/development/test.
dset_root = 'dataset/ETCI_2021_Competition_Dataset/'
train_df = prepare_dataset_paths(dset_root)
sub_train_df, development_df, test_df = split_dataset(train_df)


# Row identifiers of the coreset; get_dataloaders keeps only training rows whose
# 'index' column appears in this collection.
coreset_indices = load_coreset('experiments/kmeans_encoder_sampling/samples_kmeans_03_encoder_01.json')

train_loader, development_loader = get_dataloaders(
    sub_train_df,
    development_df,
    batch_size=64,
    transform=transform,
    coreset_indices=coreset_indices  # pass None to train on the full training split
)

# Seed the RNGs before the model is created.
set_seed(42)
model = create_model()
model, optimizer, scheduler, criteria = initialize_training(model, learning_rate=1e-3)

# `samples` is the run identifier used as prefix for the checkpoint and results files.
results_log = train_model(
    model=model,
    train_loader=train_loader,
    development_loader=development_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    criteria=criteria,
    samples='samples_kmeans_03_encoder_01',
    epochs=100,
    patience=10,
    device='cuda'
)


In [None]:
# run testing

def evaluate_model(model, dataloader, device='cuda', num_classes=2):
    """Accumulate a confusion matrix for ``model`` over ``dataloader``.

    Each batch must be a mapping with ``'image'`` and ``'mask'`` tensors;
    predictions are the argmax over dim 1 of the model output.

    Returns:
        The matrix computed by torchmetrics' multiclass ``ConfusionMatrix``,
        converted to a NumPy array on the CPU.
    """
    model.eval()
    confusion = ConfusionMatrix(task="multiclass", num_classes=num_classes).to(device)

    with torch.no_grad():
        for batch in tqdm(dataloader):
            images = batch['image'].to(device)
            masks = batch['mask'].to(device)

            predicted_labels = model(images).argmax(dim=1)
            confusion.update(predicted_labels, masks)

    return confusion.compute().cpu().numpy()

def plot_confusion_matrix(cm, class_names, filename):
    """Render ``cm`` as an annotated heatmap and save it to ``filename``.

    Drawn with matplotlib only: the earlier implementation called ``sns``
    (seaborn), which this notebook never imports, so it raised NameError.

    Args:
        cm: square 2-D array of counts; rows are labelled "True Labels" and
            columns "Predicted Labels".
        class_names: tick labels, one per class, in matrix order.
        filename: output path passed to ``Figure.savefig``.
    """
    counts = np.asarray(cm)
    fig, ax = plt.subplots(figsize=(6, 5))

    image = ax.imshow(counts, cmap='Blues')
    fig.colorbar(image, ax=ax)

    tick_positions = range(len(class_names))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(class_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(class_names)

    # Annotate every cell; switch to white text on dark cells for legibility.
    threshold = counts.max() / 2 if counts.size else 0
    for row in range(counts.shape[0]):
        for col in range(counts.shape[1]):
            value = counts[row, col]
            ax.text(
                col, row, format(int(value), 'd'),
                ha='center', va='center',
                color='white' if value > threshold else 'black',
            )

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    fig.savefig(filename)
    # Close the figure so it is not rendered again or kept in memory.
    plt.close(fig)
    print(f"Confusion matrix saved as {filename}")

def compute_metrics(cm):
    """Derive binary segmentation metrics from a 2x2 confusion matrix.

    ``cm[i, j]`` is read as the count of true class ``i`` predicted as class
    ``j``, with class 1 as the positive (flood) class. Every denominator is
    padded with 1e-6, so the ratios are 0 rather than undefined on empty counts.

    Returns:
        Dict with the raw counts (``TN``, ``FP``, ``FN``, ``TP``) and
        ``IoU_bg``, ``IoU_flood``, ``Precision``, ``Recall``, ``F1``.
    """
    eps = 1e-6
    TN, FP = cm[0, 0], cm[0, 1]
    FN, TP = cm[1, 0], cm[1, 1]

    metrics = {"TN": TN, "FP": FP, "FN": FN, "TP": TP}

    metrics["IoU_bg"] = TN / (TN + FN + FP + eps)
    metrics["IoU_flood"] = TP / (TP + FP + FN + eps)

    precision = TP / (TP + FP + eps)
    recall = TP / (TP + FN + eps)
    metrics["Precision"] = precision
    metrics["Recall"] = recall
    metrics["F1"] = 2 * precision * recall / (precision + recall + eps)

    return metrics


# Evaluate a saved checkpoint on the held-out test split. The split name passed
# here is not 'test', so ETCIDataset also loads the ground-truth masks needed
# for the confusion matrix.
test_dataset = ETCIDataset(test_df, split='devlopment', transform=None)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=2, pin_memory=True)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Rebuild the architecture with create_model(), the same factory that produced
# the saved state dict. The previous code instantiated `UNetWithEmbeddings`,
# which is not defined in this notebook and raised NameError on a fresh kernel.
model = create_model()
model.load_state_dict(torch.load('models/samples_kmeans_03_encoder_01_epoch68.pt', map_location=device))
model.to(device)

cm = evaluate_model(model, test_loader, device=device)
metrics = compute_metrics(cm)

# Ratios are printed with four decimals, raw counts as they are.
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

plot_confusion_matrix(cm, ["Background", "Flood"], "confusion_matrix_samples_kmeans_03_encoder_01.png")
