In [1]:
!nvidia-smi

Thu Feb 29 02:19:45 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.113.01             Driver Version: 535.113.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla V100-SXM2-32GB           Off | 00000000:06:00.0 Off |                    0 |
| N/A   32C    P0              43W / 300W |      0MiB / 32768MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           Off | 00000000:07:00.0 Off |  

In [2]:
import torch

if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU device:',torch.cuda.get_device_name(0))
else:
    device = torch.device('cpu')
    print('No GPU avaialable, Using CPU')

torch.cuda.set_device(4)

GPU device: Tesla V100-SXM2-32GB


# Maintaining Distribution with Stratified Sampling

- To ensure that the distribution of the dataset splits is maintained in the subsets, especially when dealing with imbalanced datasets like DAA, we should ideally use stratified sampling. 
- Stratified sampling involves dividing the dataset into homogeneous subgroups before sampling, then drawing samples from these subgroups in such a way that the proportion of each subgroup in the sample matches the proportion in the full dataset.

In [3]:
train_dir = "/net/polaris/storage/deeplearning/sur_data/rgb_daa/split_0/train"
val_dir = "/net/polaris/storage/deeplearning/sur_data/rgb_daa/split_0/val"
test_dir = "/net/polaris/storage/deeplearning/sur_data/rgb_daa/split_0/test"

In [5]:
import os
import sys
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import logging
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

# Local Imports
sys.path.append('/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline')
from torchvision import transforms
from torch.utils.data import DataLoader
from src.components import data_setup

In [6]:
pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
pretrained_vit = torchvision.models.vit_b_16(weights=pretrained_vit_weights)

# Freeze the base parameters
for parameter in pretrained_vit.parameters():
    parameter.requires_grad = False

# Change the classifier head
pretrained_vit.heads = nn.Linear(in_features=768, out_features=34)
pretrained_vit_transforms = pretrained_vit_weights.transforms()

# Make sure the head parameters are trainable: two print statements for two parameters : (Weight & Biases)
for param in pretrained_vit.heads.parameters():
    print(f" The parameters of the head layer in model requires gradient or are trainable ? Answer: {param.requires_grad}")


 The parameters of the head layer in model requires gradient or are trainable ? Answer: True
 The parameters of the head layer in model requires gradient or are trainable ? Answer: True


# Dataset for Grid Seach Experiment with slight modifications

In [7]:
import os
import pathlib
import torch
from typing import Tuple, Dict, List
from torch.utils.data import Dataset
from PIL import Image

class ImageFolderCustom(Dataset):
    def __init__(self, targ_dir: str, transform=None, target_transform=None) -> None:
        # Get all image paths
        self.paths = list(pathlib.Path(targ_dir).glob("*/*/*.png"))  # Adjust for different file types as needed
        # Setup transforms
        self.transform = transform
        self.target_transform = target_transform
        # Create classes and class_to_idx attributes
        self.classes, self.class_to_idx = self.find_classes(targ_dir)
        # Extract labels for all images
        self.labels = [self.class_to_idx[path.parent.parent.name] for path in self.paths]

    def load_image(self, index: int) -> Image.Image:
        "Opens an image via a path and returns it."
        image_path = self.paths[index]
        image = Image.open(image_path).convert("RGB")
        return image

    def __len__(self) -> int:
        "Returns the total number of samples."
        return len(self.paths)

    def find_classes(self, directory: str) -> Tuple[List[str], Dict[str, int]]:
        classes = sorted(entry.name for entry in os.scandir(directory) if entry.is_dir())
        if not classes:
            raise FileNotFoundError(f"Couldn't find any classes in {directory}.")
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
        "Returns one sample of data, data and label (X, y)."
        image = self.load_image(index)
        class_name = self.paths[index].parent.parent.name
        class_idx = self.class_to_idx[class_name]
        
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            class_idx = self.target_transform(class_idx)

        return image, class_idx

In [8]:
train_dataset = ImageFolderCustom(train_dir, transform=pretrained_vit_transforms)
val_dataset = ImageFolderCustom(val_dir, transform=pretrained_vit_transforms)
test_dataset = ImageFolderCustom(test_dir, transform=pretrained_vit_transforms)

In [9]:
print(f"The length of the train dataset is : {train_dataset.__len__()}")
print(f"The length of the val dataset is : {val_dataset.__len__()}")
print(f"The length of the test dataset is : {test_dataset.__len__()}")

The length of the train dataset is : 259865
The length of the val dataset is : 56024
The length of the test dataset is : 87315


In [10]:
sample = train_dataset[0]
print(f" The label of the first sample in the train dataset is : {sample[1]}")

 The label of the first sample in the train dataset is : 2


In [12]:
len(train_dataset.labels)

259865

# DAA Dataset : rgb_daa/split_0
- The length of the train dataset is : 259865
- The length of the val dataset is : 56024
- The length of the test dataset is : 87315

# Original Proportions: 
- 403204 Total Samples, 64.445% Train, 13.89% Val, 21.65% Test Set

- Train : 20% 259865 = 51973
- Val: 20% 56024 = 11204
- Test: 20% 87315 = 17463
- Total samples = 80640

- New Proportions: 64.45% Train, 13.89% Val, 21.65% Test


# Calculate Stratified Indices: 

- First, we need to use the train_test_split function from sklearn.model_selection to generate indices for stratified sampling. 
- This requires knowledge of the labels for each sample in your dataset.

In [13]:
# We are taking 20% of the entire dataset for Grid search
# This means we will take 20% samples by preserving their distribution from
# each of the train, test and validation datasets.

from sklearn.model_selection import train_test_split

def get_stratified_indices(labels, test_size=0.2):
    """
    The get_stratified_indices function:
    It is a utility for generating a stratified subset of a dataset in PyTorch using indices, 
    leveraging sklearn's train_test_split for stratification. 
    This ensures that the class distribution in the subset matches that of the original dataset, 
    which is important for maintaining the integrity of machine learning models, 
    especially when dealing with imbalanced classes as in DAA.
    """
    # Generate indices for a stratified split
    # X_train, X_test, y_train, y_test
    _, stratified_idx, _, _ = train_test_split(
        range(len(labels)), labels, test_size=test_size, stratify=labels, random_state=42)
    
    return stratified_idx


In [14]:
# Generate stratified indices for each split
train_indices = get_stratified_indices(train_dataset.labels)
val_indices = get_stratified_indices(val_dataset.labels)
test_indices = get_stratified_indices(test_dataset.labels)

In [17]:
print(f" the length of the stratified train indices is: {len(train_indices)}")
print(f" the length of the stratified val indices is: {len(val_indices)}")
print(f" the length of the stratified test indices is: {len(test_indices)}")

 the length of the stratified train indices is: 51973
 the length of the stratified val indices is: 11205
 the length of the stratified test indices is: 17463


- the length of the stratified train indices is: 51973
- the length of the stratified val indices is: 11205
- the length of the stratified test indices is: 17463

In [16]:
import json

def save_indices(train_indices, val_indices, test_indices, file_name='dataset_indices.json'):
    """
    This function takes the train_indices, val_indices, and test_indices along with an optional 
    file_name parameter specifying the name of the file to save the indices to. 
    It first converts the indices into lists (necessary if your indices are in a format 
    that is not directly serializable to JSON, such as NumPy arrays or PyTorch tensors) 
    and then writes them to a file in JSON format.

    Args:
    train_indices: A list of stratified train samples subset, 
    val_indices: A list of stratified val samples subset, 
    test_indices:A list of stratified test samples subset,
    file_name: A file name to store the data.

    """
    # Create a dictionary to hold the indices
    indices_dict = {
        'train_indices': train_indices,  # Convert to list if using numpy arrays or tensors .tolist()
        'val_indices': val_indices,
        'test_indices': test_indices,
    }
    
    # Open a file in write mode and save the JSON
    with open(file_name, 'w') as file:
        json.dump(indices_dict, file)

# train_indices, val_indices, test_indices are available
save_indices(train_indices, val_indices, test_indices)


In [None]:
def load_indices(file_name='dataset_indices.json'):
    """
    This function reads the indices from a JSON file specified by file_name,
    assuming the structure matches what was saved by save_indices. 
    It then returns the train_indices, val_indices, and test_indices 
    for use in our dataset handling or training script.
    """
    # Open the file and load the JSON
    with open(file_name, 'r') as file:
        indices_dict = json.load(file)
    
    # Convert lists back to the desired format, e.g., lists, numpy arrays, or tensors
    train_indices = indices_dict['train_indices']
    val_indices = indices_dict['val_indices']
    test_indices = indices_dict['test_indices']
    
    return train_indices, val_indices, test_indices

train_indices, val_indices, test_indices = load_indices()

# Apply Stratified Indices to PyTorch Datasets:

In [18]:
from torch.utils.data import Subset

# Creating subsets
train_subset = Subset(train_dataset, train_indices)
val_subset = Subset(val_dataset, val_indices)
test_subset = Subset(test_dataset, test_indices)


In [19]:
print(f" the length of the stratified train subset is: {len(train_subset)}")
print(f" the length of the stratified val subset is: {len(val_subset)}")
print(f" the length of the stratified test subset is: {len(test_subset)}")

 the length of the stratified train subset is: 51973
 the length of the stratified val subset is: 11205
 the length of the stratified test subset is: 17463


In [20]:
train_subset[0][1]

5

# Create DataLoaders

In [32]:
# Turn images into data loaders
train_dataloader = DataLoader(train_subset,batch_size=1024,shuffle=True,drop_last = False)

In [33]:
# Turn images into data loaders
val_dataloader = DataLoader(val_subset,batch_size=1024,shuffle=False,drop_last = False)

# Trainer Class for experimentation

In [34]:
"""
Dataset: ImageNet | Steps: 20,000 | BaseLR {0.003, 0.01, 0.03, 0.06}

Hyperparameters for fine-tuning (DAA Dataset):
    SGD with a momentum of 0.9, 
    cosine learning rate decay,
    a batch size of 1024, 
    no weight decay, 
    and grad clipping at global norm 1.
    fine-tuning resolution is 224.
    Epochs>79: 90 | Steps: 22,860
    No Grid Search for Best LR.
    Base LR: 0.003
"""

class Trainer:
    def __init__(self,
                 model: torch.nn.Module, 
                 train_dataloader, 
                 val_dataloader, 
                 optimizer_choice, 
                 scheduler_choice, 
                 lr, 
                 momentum, 
                 weight_decay, 
                 gpu_id, 
                 num_classes,
                 num_epochs, 
                 log_dir, 
                 exp_name, 
                 save_every
                 ):
        self.gpu_id = gpu_id
        self.model = model.to(gpu_id)
        self.num_classes = num_classes
        self.num_epochs = num_epochs
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
        self.log_dir = log_dir
        self.experiment_name = exp_name
        self.save_every = save_every
        self.writer = SummaryWriter(log_dir=self.log_dir)
        self.logger = self.configure_logger()
        self.optimizer = self.configure_optimizer(optimizer_choice, lr, momentum, weight_decay)
        self.lr_scheduler = self.configure_scheduler(scheduler_choice, lr)

    def configure_logger(self):
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        log_format = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
        log_file_path = os.path.join(self.log_dir, "training_log.log")
        file_handler = logging.FileHandler(log_file_path)
        file_handler.setFormatter(log_format)
        logger.addHandler(file_handler)
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(log_format)
        logger.addHandler(console_handler)
        return logger

    def calculate_balanced_accuracy(self, y_pred, y_true, num_classes):
        """
        Calculates the balanced accuracy score using PyTorch operations.
        (y_pred == c): Creates a boolean tensor where each element is True 
        if the predicted label equals class c, and False otherwise.

        (y_true == c): Creates another boolean tensor where each element is True 
        if the true label equals class c, and False otherwise.

        &: Performs a logical AND operation between the two boolean tensors. 
        The result is a tensor where each element is True only if both conditions 
        are met: the predicted label is class c, and the true label is also class c. 
        This effectively filters out the true positives for class c.

        .sum(): Sums up the True values in the resultant tensor, which corresponds
        to the count of true positive predictions for class c.

        Args:
            y_pred (torch.Tensor): Tensor of predicted class labels( No Logits & Probabilities, only labels).
            y_true (torch.Tensor): Tensor of true class labels.
            num_classes (int): Number of classes.

        Returns:
            float: The balanced accuracy score.
        """
        correct_per_class = torch.zeros(num_classes, device=y_pred.device)
        total_per_class = torch.zeros(num_classes, device=y_pred.device)

        for c in range(num_classes):
            # The number of true positive predictions for class c. 
            # True positives are instances that are correctly identified as 
            # belonging to class c by the classifier.
            true_positives = ((y_pred == c) & (y_true == c)).sum()
            # Condition Positive: total number of instances that actually belong to class c, 
            # regardless of whether they were correctly identified by the classifier or not.
            condition_positives = (y_true == c).sum()
            
            correct_per_class[c] = true_positives.float()
            total_per_class[c] = condition_positives.float()

        # .clamp(min=1) function ensures that no value in the total_per_class tensor is less than 1
        recall_per_class = correct_per_class / total_per_class.clamp(min=1)
        balanced_accuracy = recall_per_class.mean().item()  # Convert to Python scalar for compatibility

        return balanced_accuracy

    def configure_optimizer(self, optimizer_choice, initial_lr, momentum, weight_decay):
        if optimizer_choice.lower() == 'adam':
            optimizer = optim.Adam(self.model.parameters(), lr=initial_lr, weight_decay=weight_decay)
        elif optimizer_choice.lower() == 'sgd':
            optimizer = optim.SGD(self.model.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay)
        else:
            raise ValueError("Invalid optimizer choice. Choose 'adam' or 'sgd'.")
        return optimizer

    def configure_scheduler(self, scheduler_choice, initial_lr):
        if scheduler_choice.lower() == 'cosineannealinglr':
            lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=self.num_epochs)  # T_max=80, adjusted to "num_epochs"
        elif scheduler_choice.lower() == 'lambdalr':
            lr_lambda = lambda epoch: 0.1 ** (epoch // 30)
            lr_scheduler = optim.lr_scheduler.LambdaLR(self.optimizer, lr_lambda=lr_lambda)
        else:
            raise ValueError("Invalid scheduler choice. Choose 'cosineannealinglr' or 'lambdalr'.")
        return lr_scheduler

    def _train_epoch(self):
        self.model.train()
        running_loss,  num_samples = 0, 0
        y_pred_all, y_all = [], []
        for batch, (X, y) in enumerate(self.train_dataloader):
            X, y = X.to(self.gpu_id), y.to(self.gpu_id)
            self.optimizer.zero_grad()
            y_pred = self.model(X)
            loss = F.cross_entropy(y_pred, y)
            loss.backward()

            # Applying gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1)

            self.optimizer.step()
            running_loss += loss.item() * X.size(0)
            y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
            
            num_samples += X.size(0)
            y_pred_all.append(y_pred_class)
            y_all.append(y)
        metrics = self._calculate_metrics(running_loss, num_samples, y_pred_all, y_all)
        return metrics

    def _validation_epoch(self):
        self.model.eval()
        running_loss, num_samples = 0, 0
        y_pred_all, y_all = [], []
        with torch.no_grad():
            for X, y in self.val_dataloader:
                X, y = X.to(self.gpu_id), y.to(self.gpu_id)
                y_pred = self.model(X)
                loss = F.cross_entropy(y_pred, y)
                running_loss += loss.item() * X.size(0)
                y_pred_class = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
                
                num_samples += X.size(0)
                y_pred_all.append(y_pred_class)
                y_all.append(y)
        metrics = self._calculate_metrics(running_loss, num_samples, y_pred_all, y_all)
        return metrics

    def _calculate_metrics(self, running_loss,  num_samples, y_pred_all, y_all):
        avg_loss = running_loss / num_samples
        balanced_accuracy = self.calculate_balanced_accuracy(torch.concatenate(y_pred_all), torch.concatenate(y_all), self.num_classes)
        return avg_loss, balanced_accuracy

    def training_validation(self, max_epochs, resume=False, checkpoint_path=None):
        total_start_time = time.time()
        start_epoch = 0
        if resume:
            start_epoch = self.load_checkpoint(checkpoint_path)
            self._log(f"Resuming training from epoch {start_epoch}")

        for epoch in tqdm(range(start_epoch, max_epochs)):
            # server_file.setup_ccname()
            epoch_start_time = time.time()
            train_metrics = self._train_epoch()
            total_epoch_duration = time.time() - epoch_start_time
            self._log(f"Total training time for epoch {epoch}: {self._format_time(total_epoch_duration)}.")

            val_epoch_start_time = time.time()
            val_metrics = self._validation_epoch()
            total_epoch_val_duration = time.time() - val_epoch_start_time
            self._log(f"Total validation time for epoch {epoch}: {self._format_time(total_epoch_val_duration)}.")

            # Log metrics to TensorBoard and console
            self.writer.add_scalar("Train/Loss", train_metrics[0], epoch)
            self.writer.add_scalar("Train/Balanced_Accuracy", train_metrics[1], epoch)
            self.writer.add_scalar("Validation/Loss", val_metrics[0], epoch)
            self.writer.add_scalar("Validation/Balanced_Accuracy", val_metrics[1], epoch)

            self._log(f"Epoch: {epoch} | Train Loss: {train_metrics[0]:.4f} | Train Balanced Acc: {train_metrics[1] * 100:.4f} % | Val Loss: {val_metrics[0]:.4f} | Val Balanced Acc: {val_metrics[1] * 100:.4f} % ")

            self.lr_scheduler.step()

            if (epoch + 1) % self.save_every == 0:
                self._save_checkpoint(epoch, train_metrics, val_metrics)

        total_duration = time.time() - total_start_time
        self._log(f"Total training and validation time: {self._format_time(total_duration)}.")
        self.writer.close()

    def _save_checkpoint(self, epoch, train_metrics, val_metrics):
        checkpoint_dir = os.path.join(self.log_dir, "checkpoints")
        os.makedirs(checkpoint_dir, exist_ok=True)
        checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_{self.experiment_name}_epoch_{epoch}.pth")
        
        # Prepare the checkpoint dictionary
        checkpoint_dict = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'train_loss': train_metrics[0],
            'val_loss': val_metrics[0],
            'train_balanced_accuracy': train_metrics[1],
            'val_balanced_accuracy': val_metrics[1],
            # Saving the current learning rate (from the first param group)
            'current_lr': self.optimizer.param_groups[0]['lr']
        }
        
        # If a learning rate scheduler is used, save its state as well
        if hasattr(self, 'lr_scheduler') and self.lr_scheduler is not None:
            checkpoint_dict['scheduler_state_dict'] = self.lr_scheduler.state_dict()
        
        torch.save(checkpoint_dict, checkpoint_path)
        self._log(f"Saved checkpoint at epoch {epoch}: {checkpoint_path}")

    def _log(self, message):
        # print(message) # Uncomment if you want to print it
        self.logger.info(message)

    def _format_time(self, seconds):
        hours = seconds // 3600
        minutes = (seconds % 3600) // 60
        seconds = seconds % 60
        return f"{int(hours)}h:{int(minutes)}m:{int(seconds)}s"
    
    def load_checkpoint(self, checkpoint_path):
        checkpoint = torch.load(checkpoint_path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        
        # Load the current learning rate back into the optimizer, if it was saved
        if 'current_lr' in checkpoint:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = checkpoint['current_lr']
        
        # If a learning rate scheduler state was saved, load it as well
        if hasattr(self, 'lr_scheduler') and self.lr_scheduler is not None and 'scheduler_state_dict' in checkpoint:
            self.lr_scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        return checkpoint['epoch']

# LR: {0.003, 0.01, 0.03, 0.06}

In [36]:
# Configurations of the experiment 1
experiment_name = "Exp_001_LR_0.01"
num_epochs = 80
num_classes = 34
save_every = 20
optimizer = 'SGD'
scheduler = 'CosineAnnealingLR'
lr = 0.01 # {0.003,0.01,0.03,0.06}
momentum = 0.9
weight_decay = 0

In [37]:
# Main function for Experiment
def main(train_dataloader, val_dataloader):
    log_dir = os.path.join("experiments", experiment_name, "runs")
    os.makedirs(log_dir, exist_ok=True)

    trainer = Trainer(
        model = pretrained_vit,
        train_dataloader=train_dataloader,
        val_dataloader=val_dataloader,
        optimizer_choice=optimizer,
        scheduler_choice=scheduler,
        lr=lr,
        momentum=momentum,
        weight_decay=weight_decay,
        gpu_id=device,
        num_classes=num_classes,
        num_epochs=num_epochs,
        log_dir=log_dir,
        exp_name=experiment_name,
        save_every=save_every
    )

    trainer.training_validation(max_epochs=num_epochs, resume=False, checkpoint_path=None)

In [38]:
main(train_dataloader, val_dataloader)

  return F.conv2d(input, weight, bias, self.stride,


2024-02-29 03:12:02,817 - INFO - Total training time for epoch 0: 0h:21m:2s.
2024-02-29 03:12:02,817 - INFO - Total training time for epoch 0: 0h:21m:2s.
2024-02-29 03:29:49,794 - INFO - Total validation time for epoch 0: 0h:17m:46s.
2024-02-29 03:29:49,794 - INFO - Total validation time for epoch 0: 0h:17m:46s.


  0%|          | 0/80 [38:49<?, ?it/s]


IndexError: tuple index out of range