In [1]:
# GPU selection: both environment variables are assigned before `torch` is
# imported, so they are already in place when CUDA gets initialised.
import os
os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' # aligns the GPU ids with what is displayed in nvidia-smi/nvtop
os.environ['CUDA_VISIBLE_DEVICES'] = '3' # restrict the available GPUs to the device with id 3

import torch

# Project-local model wrappers used by the cells below.
from models import TUMViTG, TUMViTG_modified, timmTUMViTG

# "medium" permits reduced internal precision for float32 matmuls so that Tensor Cores can be used.
torch.set_float32_matmul_precision("medium") # make use of Tensor Cores
# Suppress TorchDynamo errors instead of raising them.
torch._dynamo.config.suppress_errors = True # makes torch.compile work, regardless of xFormers

In [None]:
# Original implementation: build TUMViTG from the prepared checkpoint file
# /mnt/data/nfs03-R6/TUMViTG.pth, move it to the GPU and switch it to eval mode.
model = (
    TUMViTG("/mnt/data/nfs03-R6/TUMViTG.pth", output_mode="class+mean")
    .cuda()
    .eval()
)

In [None]:
# timm variant: the pretrained weights are mapped onto the timm implementation.
# With torch.compile, the timm implementation ran about 1.5x to 2.5x faster than
# the original xformers-based implementation -- at least on the author's setup.
#
# Caveat: the timm implementation does not reproduce the original implementation's
# outputs exactly.
# -> In the author's tests the difference was noticeable in the training and
#    validation curves, but had no considerable impact on the final results.
# -> Best to try both and decide whether the speed-up is worth the slight
#    deviation from the original implementation.
timm_model = torch.compile(
    timmTUMViTG("/mnt/data/nfs03-R6/TUMViTG.pth", output_mode="class+mean").cuda().eval(),
    fullgraph=True,
)

In [None]:
# Smoke test for the original model: feed it a random tensor of shape
# (4, 3, 224, 224) -- presumably (batch, channels, height, width); confirm against
# the model definition -- and print the shape of what comes back.
# NOTE: the name `input` shadows Python's builtin `input` in this notebook's namespace.
input = torch.randn((4, 3, 224, 224), device="cuda")

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(input)

print(output.shape)

In [None]:
# Modified variant: same checkpoint and output mode, but constructed with
# img_size=(448, 896); moved to the GPU and switched to eval mode.
modified_model = (
    TUMViTG_modified(
        "/mnt/data/nfs03-R6/TUMViTG.pth",
        output_mode="class+mean",
        img_size=(448, 896),
    )
    .cuda()
    .eval()
)

In [None]:
# Smoke test for the modified model: feed it a random tensor of shape
# (4, 3, 448, 896) and print the shape of the result.
modified_input = torch.randn((4, 3, 448, 896), device="cuda")

# Forward pass only, so gradient tracking is switched off.
with torch.no_grad():
    modified_output = modified_model(modified_input)

print(modified_output.shape)