In [1]:
!pip install -q transformers
!pip install datasets



In [2]:
from transformers import ViltProcessor, ViltForQuestionAnswering
import requests
from PIL import Image
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import itertools

In [3]:
# Set device and move model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [5]:
class VQADataset(Dataset):
    
    def __init__(self, data, processor,max_length):
        # Load your data from the provided path (replace with your logic)
        self.data = data
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        image = self.data['image'][idx]
        questions = self.data['question'][idx]

        encoding = self.processor(image, questions, padding="max_length", truncation=True, return_tensors="pt")
        # remove batch dimension
        for k,v in encoding.items():
            encoding[k] = v.squeeze()
        # add labels
        labels = self.data['answer'][idx]
        scores = self.data['weight'][idx]
        targets = torch.zeros(len(id2label))
        targets[labels] = scores
        encoding["labels"] = targets

        return encoding

In [9]:
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")

2024-05-10 18:17:24.219525: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 18:17:24.219650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 18:17:24.471035: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


preprocessor_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [83]:
dataset = load_dataset("flaviagiammarino/vqa-rad")['train'][:]
dataset['weight'] = torch.ones(len(dataset['image']))


def replace_str(inputs):
    return label2id[inputs]

def replace_ids(inputs):
    return id2label[inputs]

labels = dataset['answer']

unique_labels = []

for label in labels:
    if label not in unique_labels:
        unique_labels.append(label)



label2id = {"<S>":0,"<E>":1}
num_id = 2
for label in unique_labels:
    label2id[label] = num_id
    num_id+=1

label2id

id2label = {idx: label for label, idx in label2id.items()}


answers = []
for data in dataset['answer']:
    answers.append(replace_str(data))

dataset['answer'] = answers

In [90]:
id2label[101]

'non-enhanced'

In [86]:
len(dataset['image'])

1793

In [87]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-mlm",id2label=id2label, label2id=label2id)

for name, param in model.named_parameters():
    if not name.startswith("classifier"):
        param.requires_grad = False

model.classifier = torch.nn.Sequential(
    nn.Linear(in_features=768, out_features=1536, bias=True),
  nn.LayerNorm((1536,), eps=1e-05, elementwise_affine=True),
  nn.GELU(approximate='none'),
  nn.Linear(in_features=1536, out_features=len(id2label), bias=True),
)


model.to(device)

Some weights of ViltForQuestionAnswering were not initialized from the model checkpoint at dandelin/vilt-b32-mlm and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.weight', 'classifier.3.bias', 'classifier.3.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViltForQuestionAnswering(
  (vilt): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=76

In [88]:
def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    pixel_values = [item['pixel_values'] for item in batch]
    attention_mask = [item['attention_mask'] for item in batch]
    token_type_ids = [item['token_type_ids']for item in batch]
    labels = [item['labels'] for item in batch]

    my_dict = {'input_ids':input_ids,'attention_mask':attention_mask}

    image_encoding = processor.image_processor.pad(pixel_values, return_tensors="pt")
    text_encoding = processor.tokenizer.pad(my_dict,padding = 'max_length',return_tensors = "pt")

    batch = {}
    batch['input_ids'] = text_encoding['input_ids']
    batch['attention_mask'] = text_encoding['attention_mask']
    batch['pixel_values'] = image_encoding['pixel_values']
    batch['pixel_mask'] = image_encoding['pixel_mask']
    batch['labels'] = torch.stack(labels)

    return batch


train_dataset = VQADataset(dataset, processor,max_length = max)
train_loader = torch.utils.data.DataLoader(train_dataset, collate_fn = collate_fn, batch_size=4,shuffle= True)

In [91]:
for i, batch in enumerate(train_loader):
    print(batch['labels'])
    

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])


In [97]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 300

model.train()
for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        model_inputs = {
        "input_ids": batch["input_ids"].to(device),
        "attention_mask": batch["attention_mask"].to(device),
        "pixel_values": batch["pixel_values"].to(device),
        }

        labels = batch["labels"].to(device)

        model_inputs['labels'] = labels

        outputs = model(**model_inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Epoch: {epoch+1}/{num_epochs}, Step: {i+1}/{len(batch['input_ids'])}, Loss: {outputs.loss.item()}")

Epoch: 1/300, Step: 1/4, Loss: 16.93903350830078
Epoch: 2/300, Step: 1/4, Loss: 16.242759704589844
Epoch: 3/300, Step: 1/4, Loss: 15.597319602966309
Epoch: 4/300, Step: 1/4, Loss: 14.989378929138184
Epoch: 5/300, Step: 1/4, Loss: 14.416669845581055
Epoch: 6/300, Step: 1/4, Loss: 13.875
Epoch: 7/300, Step: 1/4, Loss: 13.360313415527344
Epoch: 8/300, Step: 1/4, Loss: 12.86999225616455
Epoch: 9/300, Step: 1/4, Loss: 12.402443885803223
Epoch: 10/300, Step: 1/4, Loss: 11.956439971923828
Epoch: 11/300, Step: 1/4, Loss: 11.530654907226562
Epoch: 12/300, Step: 1/4, Loss: 11.12398910522461
Epoch: 13/300, Step: 1/4, Loss: 10.73564624786377
Epoch: 14/300, Step: 1/4, Loss: 10.364906311035156
Epoch: 15/300, Step: 1/4, Loss: 10.011009216308594
Epoch: 16/300, Step: 1/4, Loss: 9.673189163208008
Epoch: 17/300, Step: 1/4, Loss: 9.350709915161133
Epoch: 18/300, Step: 1/4, Loss: 9.042869567871094
Epoch: 19/300, Step: 1/4, Loss: 8.748971939086914
Epoch: 20/300, Step: 1/4, Loss: 8.468330383300781
Epoch: 21/

In [24]:
def save_model(model, path):
    torch.save(model.state_dict(), path)

path = "my_fine_tuned_model.pth"
save_model(model, path)


In [69]:
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering

def load_model(model_class, path):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model_class()
    model.load_state_dict(torch.load(path, map_location=device))
    model.eval()  # Set the model to evaluation mode for inference
    return model

path = '/kaggle/working/my_fine_tuned_model.pth'
model = load_model(ViltForQuestionAnswering(config.id2label = id2label,config.label2id = label2id), path)


In [98]:


indexes = [19,545,712,668,109,545,888]
all_logits = []
all_outs = []
all_answers = []
for index in indexes:
    image = dataset['image'][index]
    question = dataset['question'][index]
    answer = dataset['answer'][index]
    weight = dataset['weight'][index]
    all_answers.append(replace_ids(answer))
    example = processor(image,question,return_tensors = 'pt')
    example.to(device)
    
    # forward pass
    outputs = model(**example)

    logits = outputs.logits
    all_logits.append(logits)
    predicted_classes = torch.softmax(logits,dim = 1)

    probs, classes = torch.topk(predicted_classes, 5)
    maximum = 0
    out = ""
    for prob, class_idx in zip(probs.squeeze().tolist(), classes.squeeze().tolist()):
        if prob > maximum:
            out = model.config.id2label[class_idx]
            maximum = prob
            
    all_outs.append(out)

In [None]:
all_outs

['pa',
 '5.6cm focal, predominantly hypodense',
 'hydropneumothorax',
 'yes',
 'axial',
 '5.6cm focal, predominantly hypodense',
 'yes']