In [5]:
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle
from torchvision.datasets import ImageFolder
from sklearn.metrics import balanced_accuracy_score

In [6]:
# Constants
EPOCHS = 20
N_TRIALS = 20
CLASSES = 10  # StateFarm has 10 classes

In [7]:
def define_model():
    """
    Defines the pretrained ViT_B_16 model with a modified last linear layer and frozen base layers.
    """
    pretrained_vit_weights = torchvision.models.ViT_B_16_Weights.DEFAULT
    pretrained_vit = torchvision.models.vit_b_16(weights=pretrained_vit_weights)
    
    # Freeze the base parameters
    for parameter in pretrained_vit.parameters():
        parameter.requires_grad = False

    # Modify the final layer for 10 classes (StateFarm)
    pretrained_vit.heads = nn.Linear(in_features=768, out_features=CLASSES)
    return pretrained_vit, pretrained_vit_weights.transforms()

In [8]:
def get_data_loaders(transform):
    """
    Creates the train and validation dataloaders.
    """
    train_dir = "/home/sur06423/wacv_paper/wacv_paper/data/imbalanced_v2/train"
    val_dir = "/home/sur06423/wacv_paper/wacv_paper/data/imbalanced_v2/validation"
    
    trainset = ImageFolder(root=train_dir, transform=transform)
    train_loader = torch.utils.data.DataLoader(trainset, batch_size=1024, shuffle=True)
    valset = ImageFolder(root=val_dir, transform=transform)
    val_loader = torch.utils.data.DataLoader(valset, batch_size=1024, shuffle=True)

    return train_loader, val_loader

In [9]:
def train_model(model, optimizer, train_loader, device):
    """
    Training function.
    """
    model.train()
    total_loss, total_correct = 0, 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = F.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total_correct += (predicted == labels).sum().item()
    return total_loss / len(train_loader.dataset), total_correct / len(train_loader.dataset)

def validate_model(model, val_loader, device):
    """
    Validation function.
    """
    model.eval()
    total_loss, total_correct = 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, labels)
            total_loss += loss.item() * inputs.size(0)
            _, predicted = torch.max(outputs.data, 1)
            total_correct += (predicted == labels).sum().item()
    return total_loss / len(val_loader.dataset), total_correct / len(val_loader.dataset)


In [10]:
class TrainViT(tune.Trainable):
    """
    Trainable class for Ray Tune.
    """
    def setup(self, config):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model, transforms = define_model()
        self.model.to(self.device)
        self.train_loader, self.val_loader = get_data_loaders(transforms)
        self.optimizer = optim.SGD(self.model.parameters(), lr=config["lr"], momentum=config["momentum"])

    def step(self):
        train_loss, train_acc = train_model(self.model, self.optimizer, self.train_loader, self.device)
        val_loss, val_acc = validate_model(self.model, self.val_loader, self.device)
        return {"train_loss": train_loss, "train_acc": train_acc, "val_loss": val_loss, "val_acc": val_acc}

    def save_checkpoint(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        torch.save({
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
        }, checkpoint_path)
        return checkpoint_dir

    def load_checkpoint(self, checkpoint_dir):
        checkpoint_path = os.path.join(checkpoint_dir, "model.pth")
        checkpoint = torch.load(checkpoint_path)
        self.model.load_state_dict(checkpoint["model_state_dict"])
        self.optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

In [14]:
import ray
import os

ray.shutdown()  # Ensure Ray is not already running
ray.init(num_cpus=24, num_gpus=4, include_dashboard=True)  # Explicitly set the number of GPUs

print(ray.available_resources())


2024-10-23 11:46:01,347	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


{'CPU': 24.0, 'memory': 363576301773.0, 'node:__internal_head__': 1.0, 'object_store_memory': 160104129331.0, 'GPU': 4.0, 'node:10.56.7.46': 1.0}


In [15]:


# Define the directories to be added to LD_LIBRARY_PATH
library_paths = [
    "/usr/lib/xorg-nvidia-525.116.04/lib/x86_64-linux-gnu",
    "/usr/lib/xorg/lib/x86_64-linux-gnu",
    "/usr/lib/xorg-nvidia-535.113.01/lib/x86_64-linux-gnu"
]

# Current LD_LIBRARY_PATH from the environment
current_ld_library_path = os.environ.get('LD_LIBRARY_PATH', '')

# Adding each path only if it is not already in the LD_LIBRARY_PATH
new_paths = [path for path in library_paths if path not in current_ld_library_path]

# Join all new paths with the existing LD_LIBRARY_PATH
os.environ['LD_LIBRARY_PATH'] = ':'.join(new_paths + [current_ld_library_path])

# Verify the update
print("Updated LD_LIBRARY_PATH:")
print(os.environ['LD_LIBRARY_PATH'])


Updated LD_LIBRARY_PATH:
/usr/lib/xorg-nvidia-525.116.04/lib/x86_64-linux-gnu:/usr/lib/xorg/lib/x86_64-linux-gnu:/usr/lib/xorg-nvidia-535.113.01/lib/x86_64-linux-gnu:


In [16]:
config = {
    "lr": tune.loguniform(1e-4, 1e-1),
    "momentum": tune.uniform(0.8, 0.99)
}

scheduler = ASHAScheduler(
    metric="val_acc",
    mode="max",
    max_t=100,
    grace_period=5,
    reduction_factor=2,
    brackets=3
)

analysis = tune.run(
    TrainViT,
    resources_per_trial={"cpu": 2, "gpu": 1},
    num_samples=2,
    scheduler=scheduler,
    config=config
)

2024-10-23 11:47:01,327	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2024-10-23 12:04:15
Running for:,00:17:13.64
Memory:,21.0/503.4 GiB

Trial name,status,loc,lr,momentum,iter,total time (s),train_loss,train_acc,val_loss
TrainViT_c0a7d_00000,RUNNING,10.56.7.46:733248,0.00495431,0.846764,15,987.36,0.699733,0.83842,3.11659
TrainViT_c0a7d_00001,RUNNING,10.56.7.46:733249,0.0241355,0.886337,15,978.528,0.310892,0.923399,2.87255


[36m(TrainViT pid=733249)[0m   return F.conv2d(input, weight, bias, self.stride,


Trial name,train_acc,train_loss,val_acc,val_loss
TrainViT_c0a7d_00000,0.83842,0.699733,0.302021,3.11659
TrainViT_c0a7d_00001,0.923399,0.310892,0.403389,2.87255


2024-10-23 12:04:15,039	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/home/sur06423/ray_results/TrainViT_2024-10-23_11-47-01' in 0.0205s.
2024-10-23 12:04:25,061	INFO tune.py:1041 -- Total run time: 1043.73 seconds (1033.62 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)


In [None]:
print("Best hyperparameters found were: ", analysis.best_config)