In [39]:
import torch

# Pick the compute device used by the rest of the notebook.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('GPU device:',torch.cuda.get_device_name(0))
    # Select CUDA device 0 only when CUDA exists. The original called this
    # unconditionally, which raises on a CPU-only host right after the
    # CPU fallback branch has run.
    torch.cuda.set_device(0)
else:
    device = torch.device('cpu')
    print('No GPU avaialable, Using CPU')

GPU device: Tesla V100-SXM2-32GB


In [1]:
# Standard Library Imports
import os
import sys
import time
import logging
import getpass
from glob import glob
from pathlib import Path
import random
from typing import Dict, List, Tuple, Optional

# Third-Party Library Imports
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Local Imports
sys.path.append('/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline')
from src.components import data_setup
from src.components.dataset import ImageFolderCustom
from src.components import utils
from src.components.config_manager_baseline import get_config

In [3]:
def format_time(seconds):
    """Render a duration given in seconds as "H hours, M minutes, S seconds".

    Each component is truncated to an integer for display.
    """
    whole_hours, past_the_hour = divmod(seconds, 3600)
    whole_minutes = past_the_hour // 60
    leftover_seconds = seconds % 60
    h, m, s = int(whole_hours), int(whole_minutes), int(leftover_seconds)
    return f"{h} hours, {m} minutes, {s} seconds"

In [5]:
def calculate_balanced_accuracy(y_pred, y_true, num_classes, epsilon=1e-9):
    """
    Calculates the balanced accuracy score (mean of the per-class recalls).

    The confusion matrix is built with a single ``torch.bincount`` call instead
    of a Python loop over every sample, so the cost no longer grows with one
    indexed tensor write per prediction.

    Args:
        y_pred (torch.Tensor): Predicted class indices (any shape, flattened).
        y_true (torch.Tensor): True class indices (same number of elements).
        num_classes (int): Number of classes in the dataset.
        epsilon (float): A small value added to denominators to prevent division by zero.

    Returns:
        float: Balanced accuracy score. A class with no samples in ``y_true``
        contributes a recall of 0 to the mean (0 / epsilon), as before.

    Raises:
        ValueError: If ``y_pred`` and ``y_true`` hold a different number of elements.
        IndexError: If any label lies outside ``[0, num_classes)``.
    """
    # Counting happens on the CPU: the result is a Python float anyway, and one
    # bulk transfer replaces the per-sample indexing of the original loop.
    pred_idx = y_pred.detach().reshape(-1).long().cpu()
    true_idx = y_true.detach().reshape(-1).long().cpu()

    if pred_idx.numel() != true_idx.numel():
        raise ValueError(
            f"y_pred has {pred_idx.numel()} elements but y_true has {true_idx.numel()}."
        )

    if true_idx.numel() > 0:
        lowest = min(pred_idx.min().item(), true_idx.min().item())
        highest = max(pred_idx.max().item(), true_idx.max().item())
        # Without this check an out-of-range label would be folded into the
        # wrong cell by the flattened (row * num_classes + col) encoding below.
        if lowest < 0 or highest >= num_classes:
            raise IndexError(
                f"Labels must lie in [0, {num_classes}); got values in [{lowest}, {highest}]."
            )

    # Encode each (true, predicted) pair as one flat index; rows are true
    # classes and columns are predicted classes.
    flat_index = true_idx * num_classes + pred_idx
    pair_counts = torch.bincount(flat_index, minlength=num_classes * num_classes)
    confusion_matrix = pair_counts.view(num_classes, num_classes).to(torch.get_default_dtype())

    # Recall per class = true positives (diagonal) / all samples of that class
    # (row sum); epsilon keeps the division defined for classes without samples.
    recall = torch.diag(confusion_matrix) / (confusion_matrix.sum(1) + epsilon)

    # Balanced accuracy is the unweighted mean of the per-class recalls.
    balanced_accuracy = recall.mean().item()

    return balanced_accuracy

In [6]:
def setup_and_create_dataloaders(batch_size, train_dir, val_dir, num_workers, prefetch_factor):
    """Build a frozen ViT-B/16 with a new 34-way head plus train/val dataloaders.

    The dataloaders are produced by ``data_setup.create_dataloaders`` using the
    preprocessing transforms that ship with the pretrained weights.

    Returns:
        tuple: ``(train_dataloader, val_dataloader, class_names, model)``.
    """
    vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
    vit_model = torchvision.models.vit_b_16(weights=vit_weights)

    # Freeze every pretrained parameter; the head assigned below is created
    # afterwards and therefore stays trainable.
    vit_model.requires_grad_(False)

    # Swap in a fresh classification head.
    vit_model.heads = nn.Linear(in_features=768, out_features=34)

    loader_options = dict(
        train_dir=train_dir,
        val_dir=val_dir,
        transform=vit_weights.transforms(),
        batch_size=batch_size,
        num_workers=num_workers,
        prefetch_factor=prefetch_factor,
    )
    train_dataloader, val_dataloader, class_names = data_setup.create_dataloaders(**loader_options)

    return train_dataloader, val_dataloader, class_names, vit_model

In [7]:
def load_train_objs(model, num_epochs, optimizer_choice, scheduler_choice, initial_lr, momentum, weight_decay_adam, wd_sgd):
    """Create the optimizer and learning-rate scheduler for a training run.

    Args:
        model: Module whose ``parameters()`` are handed to the optimizer.
        num_epochs (int): Used as ``T_max`` for ``CosineAnnealingLR``.
        optimizer_choice (str): ``'ADAM'`` or ``'SGD'``; matched case-insensitively,
            so the lower-case spellings the error message used to advertise work too.
        scheduler_choice (str): ``'CosineAnnealingLR'`` or ``'LambdaLR'`` (exact match).
        initial_lr (float): Initial learning rate.
        momentum (float): Momentum, used by SGD only.
        weight_decay_adam (float): Weight decay, used by Adam only.
        wd_sgd (float): Weight decay, used by SGD only.

    Returns:
        tuple: ``(optimizer, lr_scheduler)``.

    Raises:
        ValueError: If the optimizer or scheduler choice is not recognised.
    """
    # Setup the optimizer
    optimizer_key = str(optimizer_choice).upper()
    if optimizer_key == 'ADAM':
        optimizer = optim.Adam(
            params=model.parameters(),
            lr=initial_lr,
            betas=(0.9, 0.999),
            weight_decay=weight_decay_adam
        )
    elif optimizer_key == 'SGD':
        optimizer = optim.SGD(
            params=model.parameters(),
            lr=initial_lr,
            momentum=momentum,
            weight_decay=wd_sgd
        )
    else:
        raise ValueError(
            f"Invalid optimizer choice {optimizer_choice!r}. Choose 'ADAM' or 'SGD' (case-insensitive)."
        )

    # Step schedule used by the 'LambdaLR' option.
    def lr_lambda(epoch):
        # Decrease the learning rate by a factor of 10 every 30 epochs
        return 0.1 ** (epoch // 30)

    # Setup the learning rate scheduler
    if scheduler_choice == 'CosineAnnealingLR':
        lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=num_epochs
        )
    elif scheduler_choice == 'LambdaLR':
        lr_scheduler = optim.lr_scheduler.LambdaLR(
            optimizer,
            lr_lambda=lr_lambda
        )
    else:
        raise ValueError(
            f"Invalid scheduler choice {scheduler_choice!r}. Choose 'LambdaLR' or 'CosineAnnealingLR'"
        )

    return optimizer, lr_scheduler

In [61]:
def train_step_sample_wise(gpu_id, model, train_dataloader, optimizer):
    """Run one training epoch and return sample-weighted metrics.

    Args:
        gpu_id: Device (or device spec) that each batch is moved to.
        model: Model to train; it is put into train mode.
        train_dataloader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimizer stepped once per batch.

    Returns:
        tuple: ``(avg_loss, avg_acc, predictions, targets, lr)`` where
        ``avg_loss`` and ``avg_acc`` are averaged over samples (not batches),
        ``predictions`` / ``targets`` are the concatenated class indices for
        the epoch, and ``lr`` is the learning rate (a float) of the first
        parameter group, read after the last optimizer step.

    Raises:
        ValueError: If ``train_dataloader`` yields no samples.
    """
    model.train()
    y_pred_all = []
    y_all = []
    running_train_loss, train_acc, num_samples = 0, 0, 0
    print(f"In the beginning the value of running train loss is: {running_train_loss} , train accuracy is : {train_acc}, number of samples is: {num_samples}")
    for batch, (X, y) in enumerate(train_dataloader):
        X, y = X.to(gpu_id), y.to(gpu_id)
        print(f"The shape of the image batch: {batch} , is : {X.size()}")
        print(f"The shape of the ground truth batch : {batch} , is : {y.size()}")
        optimizer.zero_grad()
        y_pred = model(X)
        print(f"The shape of the predictions for batch: {batch} , is: {y_pred.size()}")
        loss = F.cross_entropy(y_pred, y)
        print(f"The calculated Loss value for batch: {batch} , is : {loss}")
        # Diagnostic only: computed on detached logits so no second autograd
        # graph is built just for a log line.
        loss_sum = F.cross_entropy(y_pred.detach(), y, reduction='sum')
        print(f"The calculated Loss value as sum for batch: {batch} , is : {loss_sum}")
        loss.backward()
        optimizer.step()
        # F.cross_entropy returns the mean loss per batch,
        # and we need the total loss to calculate the average loss over all samples after the loop.
        running_train_loss += loss.item() * X.size(0)
        # argmax over the logits directly; softmax is monotonic and cannot
        # change which class is largest.
        y_pred_class = torch.argmax(y_pred, dim=1)
        train_acc += (y_pred_class == y).type(torch.float).sum().item()
        num_samples += X.size(0)
        y_pred_all.append(y_pred_class)
        y_all.append(y)

        print(f"After batch {batch} the value of running train loss is: {running_train_loss} , train accuracy is : {train_acc}, number of samples is: {num_samples}")

    if num_samples == 0:
        raise ValueError("train_dataloader yielded no samples; cannot compute epoch averages.")

    avg_loss = running_train_loss / num_samples
    # Average accuracy = number of correct predictions / number of samples
    avg_acc = train_acc / num_samples

    # Plain float. The original wrapped this in braces, which built a
    # one-element set and printed as "{0.001}".
    before_lr_rate = optimizer.param_groups[0]["lr"]

    # Concatenate all the predictions and ground truths per epoch
    train_y_pred_all = torch.cat(y_pred_all)
    train_y_all = torch.cat(y_all)
    return avg_loss, avg_acc, train_y_pred_all, train_y_all, before_lr_rate

In [51]:
def val_step_sample_wise(gpu_id, model, val_dataloader):
    """Evaluate the model for one pass over ``val_dataloader``.

    Gradients are disabled and the model is put into eval mode. Loss and
    accuracy are averaged over samples rather than over batches.

    Returns:
        tuple: ``(avg_loss, avg_acc, predictions, targets)`` where the last two
        are the concatenated class indices for the whole pass.
    """
    model.eval()
    loss_total = 0
    correct_total = 0
    seen = 0
    pred_chunks = []
    target_chunks = []

    with torch.no_grad():
        for batch, (X, y) in enumerate(val_dataloader):
            X = X.to(gpu_id)
            y = y.to(gpu_id)
            logits = model(X)
            n_in_batch = X.size(0)

            # cross_entropy yields the batch mean; scale by the batch size so
            # the final division gives a per-sample average.
            batch_mean_loss = F.cross_entropy(logits, y)
            loss_total += batch_mean_loss.item() * n_in_batch

            predicted = torch.softmax(logits, dim=1).argmax(dim=1)
            correct_total += (predicted == y).float().sum().item()
            seen += n_in_batch

            pred_chunks.append(predicted)
            target_chunks.append(y)
            print(f"After batch {batch} the value of running val loss is: {loss_total} , val accuracy is : {correct_total}, number of samples is: {seen}")

    # Per-sample averages for the whole pass.
    avg_loss = loss_total / seen
    avg_acc = correct_total / seen

    # Stitch the per-batch predictions and targets into single tensors.
    return avg_loss, avg_acc, torch.cat(pred_chunks), torch.cat(target_chunks)

In [52]:
def training(max_epochs: int, num_classes: int, lr_scheduler, gpu_id, model, train_dataloader, optimizer, val_dataloader):
    """Run the train/validate loop for ``max_epochs`` epochs and print metrics.

    Per epoch: one training pass, one scheduler step, then one validation pass;
    loss, accuracy and balanced accuracy are printed for each pass.

    Error handling is best-effort: a ``RuntimeError`` inside an epoch is
    printed and the loop moves on to the next epoch; any other exception is
    printed and ends the loop. The total wall-clock time is always printed.
    Nothing is returned.
    """
    model = model.to(gpu_id)
    run_started = time.time()  # wall-clock start of the whole run
    try:
        first_epoch = 0
        for epoch in tqdm(range(first_epoch, max_epochs)):
            try:
                # ---- training pass ----
                (avg_train_loss,
                 avg_train_acc,
                 train_preds,
                 train_targets,
                 before_lr_rate) = train_step_sample_wise(gpu_id, model, train_dataloader, optimizer)
                train_balanced_accuracy = calculate_balanced_accuracy(train_preds, train_targets, num_classes)
                # Drop the epoch-sized tensors as soon as the metric is computed.
                del train_preds, train_targets
                print(f"Epoch: {epoch} | Average train loss: {avg_train_loss} | Average train accuracy: {avg_train_acc} | Balanced Accuracy: {train_balanced_accuracy}")
                print(f"Epoch: {epoch} | learning rate: {before_lr_rate}" )
                lr_scheduler.step()

                # ---- validation pass ----
                (avg_val_loss,
                 avg_val_acc,
                 val_preds,
                 val_targets) = val_step_sample_wise(gpu_id, model, val_dataloader)
                val_balanced_accuracy = calculate_balanced_accuracy(val_preds, val_targets, num_classes)
                del val_preds, val_targets
                print(f"Epoch: {epoch} | Average val loss: {avg_val_loss} | Average val accuracy: {avg_val_acc} | Balanced Accuracy: {val_balanced_accuracy}")

            except RuntimeError as e:
                # Report and skip whatever remains of this epoch.
                print(f"Runtime error occurred in epoch {epoch}: {e}")
                continue

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    finally:
        elapsed = time.time() - run_started
        formatted_duration = format_time(elapsed)
        print(f"Total training and validation time: {formatted_duration}.")

In [53]:
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

def prepare_dataset(image_path=None):
    """Build train/val ``ImageFolder`` datasets and a frozen ViT-B/16 with a new head.

    Args:
        image_path (str | Path | None): Directory containing ``train`` and
            ``test`` sub-folders in ``ImageFolder`` layout. Defaults to the
            pizza/steak/sushi directory this notebook was written against, so
            calling ``prepare_dataset()`` with no arguments behaves as before.

    Returns:
        tuple: ``(train_dataset, val_dataset, class_names, pretrained_vit)``.
        The ``test`` folder is used as the validation dataset.
    """
    if image_path is None:
        image_path = "/home/sur06423/hiwi/vit_exp/vision_tranformer_baseline/src/ddp_code/data_pizza/pizza_steak_sushi"
    # Accept either a string or a Path.
    image_path = Path(image_path)
    train_dir = image_path / "train"
    test_dir = image_path / "test"

    pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
    pretrained_vit = torchvision.models.vit_b_16(weights=pretrained_vit_weights)

    # Freeze the base parameters
    for parameter in pretrained_vit.parameters():
        parameter.requires_grad = False

    pretrained_vit_transforms = pretrained_vit_weights.transforms()

    # Use ImageFolder to create dataset(s)
    train_dataset = datasets.ImageFolder(str(train_dir), transform=pretrained_vit_transforms)
    val_dataset = datasets.ImageFolder(str(test_dir), transform=pretrained_vit_transforms)

    # Get class names
    class_names = train_dataset.classes

    # Change the classifier head. Its width follows the classes found on disk
    # instead of a hard-coded 3, so other datasets work without edits. It is
    # created after the freeze loop and therefore stays trainable.
    pretrained_vit.heads = nn.Linear(in_features=768, out_features=len(class_names))

    return train_dataset, val_dataset, class_names, pretrained_vit

In [54]:
def prepare_dataloader(dataset: Dataset, batch_size: int, num_workers, prefetch_factor, shuffle: bool = False):
    """Wrap ``dataset`` in a ``DataLoader`` that keeps the final partial batch.

    Args:
        dataset: Dataset to iterate over.
        batch_size: Number of samples per batch.
        num_workers: Number of loader worker processes. ``None`` (a possible
            return value of ``os.cpu_count()``) is treated as 0.
        prefetch_factor: Batches prefetched per worker. Only forwarded when
            worker processes are used, because ``DataLoader`` rejects an
            explicit value when ``num_workers`` is 0.
        shuffle: Whether to reshuffle every epoch. Defaults to ``False`` to
            match the previous behavior; pass ``True`` for a training loader.

    Returns:
        DataLoader: The configured loader.
    """
    worker_count = num_workers or 0

    loader_kwargs = dict(
        batch_size=batch_size,
        num_workers=worker_count,
        # Pinned memory only helps host-to-GPU copies; requesting it without
        # CUDA merely produces a warning.
        pin_memory=torch.cuda.is_available(),
        shuffle=shuffle,
        drop_last=False,
    )
    if worker_count > 0:
        loader_kwargs["prefetch_factor"] = prefetch_factor

    return DataLoader(dataset, **loader_kwargs)

In [55]:
# Experiment configuration for the small pizza/steak/sushi run.
# Here you can change things for experimentation
batch_size = 64
prefetch_factor = 2
# Forwarded to load_train_objs below, where it becomes T_max for CosineAnnealingLR.
num_epochs = 10

# NOTE(review): `optimizer` holds the choice string here and is rebound to the
# optimizer object at the bottom of this cell; the cell must be re-run from the
# top before load_train_objs can be called again.
optimizer = 'SGD'
scheduler = 'CosineAnnealingLR'
lr = 0.001
momentum = 0.9
# Passed as the SGD weight decay (last positional argument of load_train_objs).
weight_decay = 0.0001
# Passed as the Adam weight decay; not used when optimizer == 'SGD'.
w_decay_adam = 0.03

if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.cuda.set_device(0)
else:
    device = torch.device('cpu')
    print('No GPU avaialable, Using CPU')

# device = "cuda" if torch.cuda.is_available() else "cpu"
# Seeding happens before the model and dataloaders are created.
# NOTE(review): the exact effect of utils.set_seeds is defined outside this notebook.
utils.set_seeds(1)
# One loader worker per CPU reported by the OS.
num_workers = os.cpu_count()

train_dataset, val_dataset, class_names, pretrained_vit = prepare_dataset()
train_dataloader = prepare_dataloader(dataset= train_dataset, batch_size = batch_size, num_workers = num_workers, prefetch_factor = prefetch_factor)
val_dataloader = prepare_dataloader(dataset= val_dataset, batch_size = batch_size, num_workers = num_workers, prefetch_factor = prefetch_factor)

# Positional order must match load_train_objs: the Adam weight decay comes
# before the SGD weight decay.
optimizer, lr_scheduler = load_train_objs(pretrained_vit, 
                                            num_epochs, 
                                            optimizer, 
                                            scheduler, 
                                            lr, 
                                            momentum, 
                                            w_decay_adam, 
                                            weight_decay
                                        )

In [56]:
# Sanity check: sample counts (len of .dataset) and batch counts (len of the
# loader) for both splits, plus the configured batch size.
print(len(train_dataloader.dataset))
print(len(train_dataloader))
print(len(val_dataloader.dataset))
print(len(val_dataloader))
print(train_dataloader.batch_size)


225
4
75
2
64


In [62]:
# Short smoke run on the 3-class dataset.
# NOTE(review): max_epochs is 2 here, while lr_scheduler was built in the
# config cell with num_epochs = 10 as T_max - confirm the mismatch is intended.
training(max_epochs = 2, 
         num_classes = 3, 
         lr_scheduler= lr_scheduler,  
         gpu_id = device, 
         model= pretrained_vit, 
         train_dataloader= train_dataloader, 
         optimizer = optimizer, 
         val_dataloader = val_dataloader)

  0%|          | 0/2 [00:00<?, ?it/s]

In the beginning the value of running train loss is: 0 , train accuracy is : 0, number of samples is: 0
The shape of the image batch: 0 , is : torch.Size([64, 3, 224, 224])
The shape of the ground truth batch : 0 , is : torch.Size([64])
The shape of the predictions for batch: 0 , is: torch.Size([64, 3])
The calculated Loss value for batch: 0 , is : 0.35381051898002625
The calculated Loss value as sum for batch: 0 , is : 22.64387321472168
After batch 0 the value of running train loss is: 22.64387321472168 , train accuracy is : 57.0, number of samples is: 64
The shape of the image batch: 1 , is : torch.Size([64, 3, 224, 224])
The shape of the ground truth batch : 1 , is : torch.Size([64])
The shape of the predictions for batch: 1 , is: torch.Size([64, 3])
The calculated Loss value for batch: 1 , is : 0.4343680441379547
The calculated Loss value as sum for batch: 1 , is : 27.7995548248291
After batch 1 the value of running train loss is: 50.44342803955078 , train accuracy is : 114.0, numb

 50%|█████     | 1/2 [00:10<00:10, 10.25s/it]

Epoch: 0 | Average val loss: 0.31162968158721926 | Average val accuracy: 0.92 | Balanced Accuracy: 0.912529706954956
In the beginning the value of running train loss is: 0 , train accuracy is : 0, number of samples is: 0
The shape of the image batch: 0 , is : torch.Size([64, 3, 224, 224])
The shape of the ground truth batch : 0 , is : torch.Size([64])
The shape of the predictions for batch: 0 , is: torch.Size([64, 3])
The calculated Loss value for batch: 0 , is : 0.3277164697647095
The calculated Loss value as sum for batch: 0 , is : 20.973854064941406
After batch 0 the value of running train loss is: 20.973854064941406 , train accuracy is : 57.0, number of samples is: 64
The shape of the image batch: 1 , is : torch.Size([64, 3, 224, 224])
The shape of the ground truth batch : 1 , is : torch.Size([64])
The shape of the predictions for batch: 1 , is: torch.Size([64, 3])
The calculated Loss value for batch: 1 , is : 0.3945465087890625
The calculated Loss value as sum for batch: 1 , is : 

100%|██████████| 2/2 [00:20<00:00, 10.09s/it]

Epoch: 1 | Average val loss: 0.2955687038103739 | Average val accuracy: 0.92 | Balanced Accuracy: 0.912529706954956
Total training and validation time: 0 hours, 0 minutes, 20 seconds.





In [13]:
# Experiment configuration for the full run on split 1 (34-class head, see
# setup_and_create_dataloaders).
# Here you can change things for experimentation
# NOTE(review): experiment_name is not read anywhere in the visible cells.
experiment_name = "Exp_01_SGD_Sp_1"
#device = 0

# NOTE(review): this value is overwritten by `num_workers = os.cpu_count()`
# further down, before it is ever used - decide which one is intended.
num_workers = 20
batch_size = 1024
prefetch_factor = 2

train_dir = "/net/polaris/storage/deeplearning/sur_data/rgb_daa/split_1/train"
val_dir = "/net/polaris/storage/deeplearning/sur_data/rgb_daa/split_1/val"
# Forwarded to load_train_objs below, where it becomes T_max for CosineAnnealingLR.
num_epochs = 10

# NOTE(review): `optimizer` is the choice string here and is rebound to the
# optimizer object at the bottom of this cell.
optimizer = 'SGD'
scheduler = 'CosineAnnealingLR'
# NOTE(review): duplicate of the batch_size assignment above (same value).
batch_size = 1024
lr = 0.001
momentum = 0.9
# Passed as the SGD weight decay (last positional argument of load_train_objs).
weight_decay = 0.0001
# Passed as the Adam weight decay; not used when optimizer == 'SGD'.
w_decay_adam = 0.03

if torch.cuda.is_available():
    device = torch.device('cuda')
    torch.cuda.set_device(0)
else:
    device = torch.device('cpu')
    print('No GPU avaialable, Using CPU')

# device = "cuda" if torch.cuda.is_available() else "cpu"
# Seeding happens before the model and dataloaders are created.
utils.set_seeds(1)
# Replaces the `num_workers = 20` set at the top of this cell.
num_workers = os.cpu_count()
# The class names returned by the helper are discarded here.
train_dataloader, val_dataloader, _, model_vit = setup_and_create_dataloaders(batch_size, 
                                                                                train_dir, 
                                                                                val_dir, 
                                                                                num_workers, 
                                                                                prefetch_factor,
                                                                            )
# Positional order must match load_train_objs: the Adam weight decay comes
# before the SGD weight decay.
optimizer, lr_scheduler = load_train_objs(model_vit, 
                                            num_epochs, 
                                            optimizer, 
                                            scheduler, 
                                            lr, 
                                            momentum, 
                                            w_decay_adam, 
                                            weight_decay
                                        )

In [21]:
# Report the name of the selected GPU.
# NOTE(review): presumably only valid when `device` is a CUDA device - confirm
# the behavior on a CPU-only host.
print('GPU device:',torch.cuda.get_device_name(device))

GPU device: Tesla V100-SXM2-32GB


In [22]:
# Number of batches the training loader yields per epoch.
print(len(train_dataloader))


279


In [28]:
# Number of samples in the training dataset.
print(len(train_dataloader.dataset))

284807


In [29]:
# Number of samples in the validation dataset.
print(len(val_dataloader.dataset))

54541


In [23]:
# Number of batches the validation loader yields per pass.
print(len(val_dataloader))

54
