### Load The VisualBERT Model

In [None]:
from transformers import BertTokenizer, VisualBertModel

# Load the pre-trained VisualBERT model
model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

### Load the COCO Dataset

In [None]:
import torch
import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Define data transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load data
train_dataset = datasets.CocoCaptions(root = r'val2014\val2014', annFile = r'annotations_trainval2014\annotations\captions_val2014.json', transform = transform)

#select 7000 samples from the dataset
train_dataset = torch.utils.data.Subset(train_dataset, range(7000))

# Split data into training, validation, and testing sets
train_size = int(0.8 * len(train_dataset))
val_size = int(0.1 * len(train_dataset))
test_size = len(train_dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(train_dataset, [train_size, val_size, test_size])

# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

### Extract Image Features

In [None]:
encoder = torch.hub.load('pytorch/vision:v0.6.0', 'resnet18', pretrained=True)
features_list = []

for i, (images, captions) in enumerate(train_loader):
    features = encoder(images)
    features_list.append(features)

features = torch.cat(features_list, dim=0)

### Fine Tune VisualBERT

In [17]:
# fine tune VisualBERT model
model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 1
for epoch in range(num_epochs):
    for i, (images, captions) in enumerate(train_loader):
        #select the first caption of the 5 captions available for each image
        captions = [captions[0]]
        # tokenize captions
        tokenized_captions = tokenizer(captions[0], padding=True, truncation=True, return_tensors="pt")

        visual_token_type_ids = torch.ones(features.shape[:-1], dtype=torch.long)
        visual_attention_mask = torch.ones(features.shape[:-1], dtype=torch.float)

        tokenized_captions.update({'visual_embeds': features, 'visual_token_type_ids': visual_token_type_ids, 'visual_attention_mask': visual_attention_mask})
        # forward pass
        outputs = model(**tokenized_captions)
        
        # calculate loss
        loss = outputs.loss

        # backpropagate
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if i % 100 == 0:
            print("Epoch: {}, Iteration: {}, Loss: {}".format(epoch, i, loss.item()))

RuntimeError: Tensors must have same number of dimensions: got 2 and 1