In [1]:
import sys

# Add the parent directory to the import path so the `src` package can be
# imported from this notebook's location. Must run before the `src` import.
sys.path.append('..')
from src.tools.experiments import print_env, set_seed
# Print the runtime environment (date, versions, hardware) for the record.
print_env()
# Fix the seed for this run (presumably seeds the RNGs used by the project;
# verify against src.tools.experiments.set_seed).
set_seed(42)

DATE : 2023-09-19
Pyton Version : 3.10.12
PyTorch Version : 2.0.1
OS : Linux 5.15.0-78-generic
CPU spec : x86_64
RAM spec : 122.84 GB
Device 0:
Name: NVIDIA GeForce RTX 3090
Total Memory: 24576.0 MB
Driver Version: 530.41.03
Device 1:
Name: NVIDIA GeForce RTX 3090
Total Memory: 24576.0 MB
Driver Version: 530.41.03


In [2]:
import os
import cv2
from PIL import Image
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2

from src.tools.rle_encoder import rle_encode
from src.data.dataset import SourceDataset, TargetDataset
# Optional: restrict the visible GPUs before selecting the device, e.g.
#   os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [3]:
import torch
import albumentations as A

from src.data.fisheye_transform import FisheyeTransform

# Preprocessing constants shared by both pipelines.
INPUT_SIZE = 224
IMAGENET_MEAN = [0.485, 0.456, 0.406]  # normalize with ImageNet statistics
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline. Only resize / normalize / tensor conversion are active.
# Disabled augmentation candidates, kept for reference:
#   FisheyeTransform(p=0.5)
#   A.RandomCrop(width=224, height=224)
#   A.RandomScale(scale_limit=0.2, p=0.2)
#   A.RandomRotate90(p=0.5)
#   A.HorizontalFlip(p=0.2)
#   A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.2)
#   A.ColorJitter(p=0.3)
#   A.GaussianBlur(blur_limit=(3, 7), p=0.2)
#   A.CoarseDropout(max_holes=8, max_height=16, max_width=16, fill_value=0, p=0.5)
augmentation = A.Compose(
    [
        A.Resize(INPUT_SIZE, INPUT_SIZE),
        A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ToTensorV2(),
    ]
)

# Evaluation pipeline: the same deterministic preprocessing, built from
# separate transform instances.
transform = A.Compose(
    [
        A.Resize(INPUT_SIZE, INPUT_SIZE),
        A.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
        ToTensorV2(),
    ]
)

In [4]:
from src.data.dataset import SourceDataset, TargetDataset

# Source-domain training data. The training loop unpacks these batches as
# (image, label) pairs.
train_dataset = SourceDataset(
    csv_file='train_source.csv',
    transform=augmentation,
    is_training=True,
)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

# Source-domain validation data, preprocessed with the evaluation pipeline
# and iterated in a fixed order.
valid_dataset = SourceDataset(
    csv_file='val_source.csv',
    transform=transform,
    is_training=True,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

# Target-domain data. The training loop consumes these batches as images
# only (no labels), for the domain-classification branch.
target_domain = TargetDataset(
    csv_file='train_target.csv',
    transform=augmentation,
    is_training=False,
)
target_loader = DataLoader(
    target_domain,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

In [5]:
# DeepLabV3 with a MobileNet backbone (selected by backbone='mobilenet'),
# built through the project's wrapper with pretrained=True and mode='all'.
from src.model.pretrained import DeepLabV3

model = DeepLabV3(backbone='mobilenet', pretrained=True, mode='all')
# Move the model to the selected device. As the last expression of the cell,
# this also displays the module tree.
model.to(device)

DeepLabV3(
  (model): DeepLabV3(
    (backbone): IntermediateLayerGetter(
      (0): Conv2dNormActivation(
        (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        (2): Hardswish()
      )
      (1): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
            (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
            (2): ReLU(inplace=True)
          )
          (1): Conv2dNormActivation(
            (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          )
        )
      )
      (2): InvertedResidual(
        (block): Sequential(
          (0): Conv2dNormAc

In [6]:
from src.model import DomainAdaptation

# Pull the feature extractor and the segmentation heads (main + auxiliary,
# requested with aux=True) out of the wrapped DeepLabV3 model.
backbone = DomainAdaptation.get_backbone(model)
semantic_classifier, aux_classifier = DomainAdaptation.get_classifier(model, aux=True)


# Assemble the domain-adversarial model from those parts.
# NOTE(review): the instance is bound to the name `DANN`, the same identifier
# as the class `DomainAdaptation.DANN`; later cells rely on this name.
DANN = DomainAdaptation.DANN(feature_backbone=backbone, semantic_classifier=semantic_classifier, aux_classifier=aux_classifier)

In [7]:
# Display the module tree of the assembled DANN model.
DANN

DANN(
  (feature_backbone): IntermediateLayerGetter(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        )
      )
    )
    (2): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 64, kernel_size=(1, 1), str

In [8]:
# Move the DANN model to the selected device (the cell output shows the module tree).
DANN.to(device)

DANN(
  (feature_backbone): IntermediateLayerGetter(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        )
      )
    )
    (2): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 64, kernel_size=(1, 1), str

In [9]:
class AuxMixLoss(nn.Module):
    """
    Combine the losses of a main head and an auxiliary head.

    The wrapped ``criterion`` is applied to ``inputs['out']`` and
    ``inputs['aux']`` against the same targets, and the result is
    ``main + alpha * aux``.

    Args:
        criterion: Callable loss taking ``(prediction, target)``.
        alpha: Weight applied to the auxiliary-head loss.
    """

    def __init__(self, criterion, alpha):
        super().__init__()
        self.criterion = criterion
        self.alpha = alpha

    def forward(self, inputs, targets):
        """Return ``criterion(out) + alpha * criterion(aux)`` for a dict of head outputs."""
        # Evaluate the criterion per head, main head first.
        head_losses = {
            head: self.criterion(inputs[head], targets)
            for head in ('out', 'aux')
        }
        return head_losses['out'] + self.alpha * head_losses['aux']

def aux_mix_loss(criterion, output, mask, alpha, valid=False):
    """
    Compute the segmentation loss from a dict of head outputs.

    Args:
        criterion: Callable loss taking ``(prediction, target)``.
        output: Mapping with the main prediction under ``'out'`` and, for
            training, the auxiliary prediction under ``'aux'``.
        mask: Target passed to ``criterion``.
        alpha: Weight applied to the auxiliary-head loss.
        valid: When True, return only the main-head loss.

    Returns:
        ``criterion(out)`` when ``valid`` is True, otherwise
        ``criterion(out) + alpha * criterion(aux)``.
    """
    main_loss = criterion(output['out'], mask)
    if valid:
        # The auxiliary head is not needed for validation, so it is neither
        # evaluated nor required to be present in `output`.
        return main_loss
    return main_loss + alpha * criterion(output['aux'], mask)

In [25]:
from typing import Optional, Union
import datetime
import os
import torch
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim.lr_scheduler import StepLR, ExponentialLR, OneCycleLR
from tqdm import tqdm
from src.tools.metrics import compute_mIoU
from src.visualization.plotting import monitor_training_process


class TrainingConfig:
    """
    A configuration and utility class for training a DANN model.

    Holds the hyper-parameters and provides the adversarial training loop,
    validation, mIoU evaluation, checkpoint I/O, and optimizer / scheduler
    construction.

    Note:
        Constructing an instance has side effects: it calls
        ``model.freeze_bn()`` when ``freeze_bn`` is true and creates a
        timestamped checkpoint directory relative to the current working
        directory.
    """

    def __init__(self,
                 model: torch.nn.Module,
                 learning_rate: float = 0.001,
                 epochs: int = 1000,
                 batch_size: int = 32,
                 weight_decay: float = 0,
                 device: str = "cuda" if torch.cuda.is_available() else "cpu",
                 verbose: bool = True,
                 print_every: int = 10,
                 criterion: torch.nn.Module = nn.CrossEntropyLoss(),
                 optimizer_choice: str = 'Adam',
                 scheduler_choice: Optional[str] = None,
                 optimizer_params: Optional[dict] = None,
                 scheduler_params: Optional[dict] = None,
                 early_stopping_patience: int = 10,
                 freeze_bn: bool = True
                 ):
        """
        Initialize the training configuration.

        Args:
            model (torch.nn.Module): The model to be trained.
            learning_rate (float, optional): Learning rate for the optimizer. Default is 0.001.
            epochs (int, optional): Number of epochs for training. Default is 1000.
            batch_size (int, optional): Batch size for the dataloaders. Default is 32.
            weight_decay (float, optional): Weight decay for the optimizer. Default is 0.
            device (str, optional): Device for training ("cuda" or "cpu"). Default is "cuda" if available, otherwise "cpu".
            verbose (bool, optional): Whether to print training progress. Default is True.
            print_every (int, optional): How often (in epochs) to print training progress. Default is 10.
            criterion (torch.nn.Module, optional): Loss applied to the semantic outputs of the
                source domain during training. Default is ``nn.CrossEntropyLoss()``.
            optimizer_choice (str, optional): Choice of optimizer ("Adam", "AdamW" or "SGD"). Default is "Adam".
            scheduler_choice (str, optional): Choice of learning rate scheduler
                ("StepLR", "ExponentialLR" or "OneCycleLR"). Default is None.
            optimizer_params (dict, optional): Additional parameters for the optimizer. Default is None.
            scheduler_params (dict, optional): Additional parameters for the scheduler; they
                override the built-in defaults of the chosen scheduler. Default is None.
            early_stopping_patience (int, optional): Stored on the instance. The training loop
                does not currently act on it. Default is 10.
            freeze_bn (bool, optional): When True, ``model.freeze_bn()`` is called once at
                construction time. Default is True.
        """
        self.model = model
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.weight_decay = weight_decay
        self.device = device
        self.verbose = verbose
        self.print_every = print_every
        self.criterion = criterion
        self.optimizer_choice = optimizer_choice
        self.scheduler_choice = scheduler_choice
        self.optimizer_params = optimizer_params or {}
        self.scheduler_params = scheduler_params or {}
        self.early_stopping_patience = early_stopping_patience
        self.freeze_bn = freeze_bn
        if self.freeze_bn:
            # NOTE(review): the training loop later calls model.train(); whether the
            # freeze survives that depends on the model's freeze_bn() — confirm.
            self.model.freeze_bn()

        # Set the checkpoint directory: "<yymmdd_HHMM>_<ModelClassName>/".
        self.model_name = type(model).__name__
        self.start_time = datetime.datetime.now().strftime('%y%m%d_%H%M')
        self.checkpoint_dir = os.path.join(self.start_time + '_' + self.model_name)
        os.makedirs(self.checkpoint_dir, exist_ok=True)
        self.checkpoint_path = os.path.join(self.checkpoint_dir, f"best_model_{self.start_time}.pth")

    @staticmethod
    def _next_batch(iterator, dataloader):
        """Return ``(batch, iterator)``, restarting the iterator when the loader is exhausted."""
        try:
            return next(iterator), iterator
        except StopIteration:
            iterator = iter(dataloader)
            return next(iterator), iterator

    @staticmethod
    def _segmentation_logits(outputs):
        """Return the main segmentation logits from either a dict of heads or a plain tensor."""
        if isinstance(outputs, dict):
            return outputs['out']
        return outputs

    def train_DANN(self, source_dataloader, target_dataloader, val_dataloader=None, save_checkpoint_every=None):
        """
        Train a DANN model using the provided data and configuration.

        Each epoch runs ``max(len(source_dataloader), len(target_dataloader))``
        steps; the shorter loader is restarted when it runs out. Statistics,
        validation, checkpointing and monitoring plots only happen on epochs
        that are a multiple of ``print_every`` and only when ``verbose`` is set.

        Args:
            source_dataloader (DataLoader): DataLoader for the source domain data, yielding (data, labels).
            target_dataloader (DataLoader): DataLoader for the target domain data, yielding data only.
            val_dataloader (DataLoader, optional): DataLoader for validation data.
            save_checkpoint_every (int, optional): Epoch interval to save checkpoints. A checkpoint is
                written only when validation mIoU improved on such an epoch. If None, checkpoints
                are not saved.

        Returns:
            nn.Module: Trained DANN model (``self.model``).
        """
        source_iter = iter(source_dataloader)
        target_iter = iter(target_dataloader)

        # Honour optimizer_choice / weight_decay / optimizer_params.
        optimizer = self._initialize_optimizer(self.model.parameters())

        # Learning Rate Scheduler Initialization (if provided)
        scheduler = None
        if self.scheduler_choice is not None:
            scheduler = self._initialize_scheduler(optimizer)

        best_mIoU = 0.0
        steps_per_epoch = max(len(source_dataloader), len(target_dataloader))

        for epoch in tqdm(range(self.epochs), desc="Training"):
            self.model.train()
            total_loss = 0.0
            total_domain_loss = 0.0
            total_semantic_loss = 0.0

            for _ in range(steps_per_epoch):
                # Fetch a batch from the source dataloader.
                (source_data, source_labels), source_iter = self._next_batch(source_iter, source_dataloader)
                source_data = source_data.float().to(self.device)
                source_labels = source_labels.long().to(self.device)

                # Fetch a batch from the target dataloader.
                target_data, target_iter = self._next_batch(target_iter, target_dataloader)
                target_data = target_data.float().to(self.device)

                # Training step
                semantic_loss, domain_loss, loss = self._train_step(self.model, source_data, source_labels, target_data, optimizer)
                total_loss += loss.item()
                total_domain_loss += domain_loss.item()
                total_semantic_loss += semantic_loss.item()

            # Print training stats
            if self.verbose and (epoch+1) % self.print_every == 0:
                # Average over the number of steps actually run this epoch.
                num_steps = max(steps_per_epoch, 1)
                avg_loss = total_loss / num_steps
                avg_domain_loss = total_domain_loss / num_steps
                avg_semantic_loss = total_semantic_loss / num_steps
                print(f"Epoch [{epoch+1}/{self.epochs}], Average Loss: {avg_loss:.4f}, Domain Loss: {avg_domain_loss:.4f}, Semantic Loss: {avg_semantic_loss:.4f}")

                # Compute mIoU for source domain training data and validation data (if provided)
                train_mIoU = self._evaluate_segmentation_mIoU(self.model, source_dataloader)
                print(f"Epoch [{epoch+1}/{self.epochs}],Training mIoU: {train_mIoU:.4f}")

                # Validation and checkpointing
                if val_dataloader is not None and len(val_dataloader) > 0:
                    avg_val_loss, val_mIoU = self._valid_step(self.model, val_dataloader)
                    print(f"Epoch [{epoch+1}/{self.epochs}], Valid Loss: {avg_val_loss:.4f},Validation mIoU: {val_mIoU:.4f}")

                    # Save checkpoint if the current model has better performance
                    if val_mIoU > best_mIoU:
                        best_mIoU = val_mIoU
                        if save_checkpoint_every and (epoch + 1) % save_checkpoint_every == 0:
                            self._save_checkpoint(self.model, epoch, best_mIoU, self.checkpoint_path)

                # Display segmentation results on training data
                monitor_training_process(source_dataloader, self.model, self.device, is_domain_classification=False)

                # Display domain classification results on target data
                monitor_training_process(target_dataloader, self.model, self.device, is_domain_classification=True)

            # Update the learning rate if the scheduler is provided (once per epoch)
            if scheduler:
                scheduler.step()

        return self.model

    def _train_step(self, model, source_data, source_labels, target_data, optimizer):
        """
        Run one optimization step on a source batch and a target batch.

        Args:
            model: Model returning ``(semantic_outputs, domain_outputs)``.
            source_data: Source-domain inputs.
            source_labels: Segmentation labels for ``source_data``.
            target_data: Target-domain inputs (unlabelled).
            optimizer: Optimizer updating ``model``.

        Returns:
            tuple: ``(semantic_loss, domain_loss, total_loss)`` as tensors.
        """
        # Forward pass for source domain (domain label 0)
        model.train()
        src_semantic_outputs, src_domain_outputs = model(source_data)
        src_semantic_loss = self.criterion(src_semantic_outputs, source_labels)
        src_domain_loss = F.binary_cross_entropy_with_logits(src_domain_outputs, torch.zeros_like(src_domain_outputs))

        # Forward pass for target domain (domain label 1)
        _, tgt_domain_outputs = model(target_data, lamda=-1)  # Set lamda=-1 for gradient reversal
        tgt_domain_loss = F.binary_cross_entropy_with_logits(tgt_domain_outputs, torch.ones_like(tgt_domain_outputs))

        # Combine losses
        domain_loss = src_domain_loss + tgt_domain_loss
        loss = src_semantic_loss + domain_loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        return src_semantic_loss, domain_loss, loss

    def _valid_step(self, model, val_dataloader):
        """
        Evaluate the model on the validation loader.

        Args:
            model: Model returning ``(semantic_outputs, domain_outputs)``.
            val_dataloader: Loader yielding ``(data, labels)`` batches.

        Returns:
            tuple: ``(average cross-entropy loss on the main head, mIoU)``.
        """
        model.eval()
        total_val_loss = 0.0
        num_batches = 0
        with torch.no_grad():
            for val_data, val_labels in val_dataloader:
                # Same casts as in training: float inputs, integer class labels.
                val_data = val_data.float().to(self.device)
                val_labels = val_labels.long().to(self.device)
                val_outputs, _ = model(val_data)
                val_loss = F.cross_entropy(self._segmentation_logits(val_outputs), val_labels)
                total_val_loss += val_loss.item()
                num_batches += 1

        avg_val_loss = total_val_loss / num_batches if num_batches else 0.0
        # Compute mIoU
        mIoU = self._evaluate_segmentation_mIoU(model, val_dataloader)

        return avg_val_loss, mIoU

    def _evaluate_segmentation_mIoU(self, model, dataloader):
        """
        Average the per-batch mIoU of the model's main segmentation head.

        Args:
            model: Model returning ``(semantic_outputs, domain_outputs)``.
            dataloader: Loader yielding ``(images, labels)`` batches.

        Returns:
            Mean of ``compute_mIoU`` over the batches, or 0.0 for an empty loader.
        """
        model.eval()
        total_mIoU = 0
        num_batches = 0

        with torch.no_grad():
            for images, labels in dataloader:
                images, labels = images.float().to(self.device), labels.to(self.device)
                outputs, _ = model(images)
                # The semantic output may be a dict of heads; predict from the main one.
                preds = self._segmentation_logits(outputs).argmax(dim=1)
                total_mIoU += compute_mIoU(preds, labels)
                num_batches += 1

        return total_mIoU / num_batches if num_batches else 0.0

    def _save_checkpoint(self, model, epoch, best_mIoU, filename):
        """
        Save the model checkpoint.

        Args:
            model (nn.Module): The DANN model to save.
            epoch (int): Current epoch.
            best_mIoU (float): Best mIoU score so far.
            filename (str): Name of the file to save the checkpoint.
        """
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'best_mIoU': best_mIoU
        }
        torch.save(checkpoint, filename)
        print(f"Checkpoint saved to {filename} at epoch {epoch} with mIoU {best_mIoU:.4f}.")

    def load_checkpoint(self, model, filename):
        """
        Load the model checkpoint.

        Args:
            model (nn.Module): The DANN model to load.
            filename (str): Name of the checkpoint file to load.

        Returns:
            epoch (int): Epoch of the loaded checkpoint.
            best_mIoU (float): Best mIoU of the loaded checkpoint.
        """
        # Map tensors onto the configured device so a checkpoint written on a GPU
        # can still be loaded on a machine without one.
        checkpoint = torch.load(filename, map_location=self.device)
        model.load_state_dict(checkpoint['model_state_dict'])
        print(f"Checkpoint loaded from {filename} at epoch {checkpoint['epoch']} with mIoU {checkpoint['best_mIoU']:.4f}.")
        return checkpoint['epoch'], checkpoint['best_mIoU']

    def _initialize_optimizer(self, parameters) -> torch.optim.Optimizer:
        """
        Initialize the optimizer based on user's choice and parameters.

        Only the chosen optimizer is constructed, over the parameters that
        require gradients. Entries in ``optimizer_params`` override the
        ``lr`` / ``weight_decay`` taken from the configuration.

        Args:
            parameters: Parameters of the model (any iterable, including a generator).

        Returns:
            Initialized optimizer.

        Raises:
            ValueError: If ``optimizer_choice`` is not one of the supported names.
        """
        optimizer_classes = {'Adam': Adam, 'AdamW': AdamW, 'SGD': SGD}
        if self.optimizer_choice not in optimizer_classes:
            raise ValueError(
                f"Unknown optimizer_choice {self.optimizer_choice!r}; "
                f"expected one of {sorted(optimizer_classes)}."
            )

        # Materialize once: `parameters` is typically a one-shot generator.
        trainable = [p for p in parameters if p.requires_grad]
        kwargs = {'lr': self.learning_rate, 'weight_decay': self.weight_decay}
        kwargs.update(self.optimizer_params)
        return optimizer_classes[self.optimizer_choice](trainable, **kwargs)

    def _initialize_scheduler(self, optimizer):
        """
        Initialize the learning rate scheduler selected by ``scheduler_choice``.

        Only the chosen scheduler is constructed, because building a scheduler
        modifies the optimizer's learning-rate state. Entries in
        ``scheduler_params`` override the defaults below.

        Args:
            optimizer: Optimizer whose learning rate is scheduled.

        Returns:
            Initialized scheduler.

        Raises:
            ValueError: If ``scheduler_choice`` is not one of the supported names.
        """
        scheduler_specs = {
            'StepLR': (StepLR, {'step_size': 200, 'gamma': 0.5}),
            'ExponentialLR': (ExponentialLR, {'gamma': 0.95}),
            # The training loop steps the scheduler once per epoch, so one cycle
            # spans `epochs` steps.
            'OneCycleLR': (OneCycleLR, {'max_lr': 1e-2, 'total_steps': self.epochs,
                                        'div_factor': 25, 'pct_start': 0.3, 'anneal_strategy': 'cos'}),
        }
        if self.scheduler_choice not in scheduler_specs:
            raise ValueError(
                f"Unknown scheduler_choice {self.scheduler_choice!r}; "
                f"expected one of {sorted(scheduler_specs)}."
            )

        scheduler_cls, kwargs = scheduler_specs[self.scheduler_choice]
        kwargs.update(self.scheduler_params)
        return scheduler_cls(optimizer, **kwargs)

    

In [26]:
# Hyper-parameters for this run, defined once and passed to the trainer below.
learning_rate = 1e-5
epochs = 1000
batch_size = 32
weight_decay = 1e-4
device = "cuda" if torch.cuda.is_available() else "cpu"
verbose = True
print_every = 10
# Main-head cross-entropy plus 0.3 x auxiliary-head cross-entropy.
criterion = AuxMixLoss(criterion=nn.CrossEntropyLoss(), alpha=0.3)
optimizer_choice = 'AdamW'
scheduler_choice = 'OneCycleLR'
# No trailing commas here: `None,` would create the tuple (None,) instead of None.
optimizer_params = None
scheduler_params = None
early_stopping_patience = 20
freeze_bn = True


trainer = TrainingConfig(
    model=DANN,
    learning_rate=learning_rate,
    epochs=epochs,
    batch_size=batch_size,
    weight_decay=weight_decay,
    device=device,
    verbose=verbose,
    print_every=print_every,
    criterion=criterion,
    optimizer_choice=optimizer_choice,
    scheduler_choice=scheduler_choice,
    optimizer_params=optimizer_params,
    scheduler_params=scheduler_params,
    early_stopping_patience=early_stopping_patience,
    freeze_bn=freeze_bn,
)

In [27]:
# Run DANN training on the source and target loaders, validating on valid_loader.
# The recorded output shows this run was interrupted manually (KeyboardInterrupt).
trained_model = trainer.train_DANN(train_loader, target_loader, valid_loader)

Training:   0%|          | 0/1000 [00:03<?, ?it/s]


KeyboardInterrupt: 