In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [None]:
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image

In [None]:
# imports
import os
import re
import time
import json
import math
import shutil
import random
import pandas as pd
import numpy as np
from PIL import Image
from collections import Counter, defaultdict
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torchvision.models import resnet152, ResNet152_Weights
import torch.optim as optim
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [None]:
!wget http://images.cocodataset.org/zips/val2014.zip
!unzip /content/val2014.zip
!rm /content/val2014.zip

!wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip
!unzip /content/v2_Questions_Val_mscoco.zip
!rm /content/v2_Questions_Val_mscoco.zip
!mv /content/v2_OpenEnded_mscoco_val2014_questions.json /content/val2014questions.json

!wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip
!unzip /content/v2_Annotations_Val_mscoco.zip
!rm /content/v2_Annotations_Val_mscoco.zip
!mv /content/v2_mscoco_val2014_annotations.json /content/val2014answers.json

!mkdir /content/questions
!mkdir /content/answers

!mv /content/val2014questions.json /content/questions/val.json
!mv /content/val2014answers.json /content/answers/val.json

In [None]:
class VQADataset(Dataset):
    """Dataset of (image, question, answers) samples built from two JSON files.

    The question file must hold a ``"questions"`` list and the answer file an
    ``"annotations"`` list; entries are paired by position and a pair is
    dropped when the two ``image_id`` values differ.

    Args:
        phase: Split name; selects ``<questions_dir>/<phase>.json`` and
            ``<answers_dir>/<phase>.json``.
        questions_dir: Directory containing the question JSON file.
        answers_dir: Directory containing the annotation JSON file.
        images_dir: Directory holding the images as ``<image_id>.jpg``, either
            inside a ``<phase>`` sub-directory or directly in ``images_dir``.
    """

    def __init__(self, phase, questions_dir, answers_dir, images_dir):
        self.phase = phase
        self.questions_json = questions_dir + "/" + self.phase + ".json"
        self.answers_json = answers_dir + "/" + self.phase + ".json"
        self.images_dir = images_dir

        self.dataset = self.create_dataset()

    @staticmethod
    def _confident_answers(annotation):
        """Return the distinct lowercased answers whose confidence is ``"yes"``.

        Order of first appearance is preserved. The membership test is done on
        the lowercased text so answers differing only in case are kept once.
        """
        confident = []
        for answer in annotation["answers"]:
            text = answer["answer"].lower()
            if answer["answer_confidence"] == "yes" and text not in confident:
                confident.append(text)
        return confident

    def _images_root(self):
        """Return ``<images_dir>/<phase>`` if that directory exists, else ``images_dir``."""
        phase_dir = self.images_dir + "/" + self.phase
        # Images in this notebook are stored flat in `images_dir`; only use the
        # per-phase sub-directory when it is really there.
        return phase_dir if os.path.isdir(phase_dir) else self.images_dir

    def create_dataset(self):
        """Load both JSON files and build the shuffled list of sample dicts.

        Returns:
            A list of dicts with keys ``"image_path"``, ``"question"`` and
            ``"answers"``, shuffled in place with ``random.shuffle``.
        """
        with open(self.questions_json) as f:
            questions = json.load(f)["questions"]
        with open(self.answers_json) as f:
            annotations = json.load(f)["annotations"]

        images_root = self._images_root()

        dataset = []
        file_loop = tqdm(zip(questions, annotations),
                         total=len(questions),
                         colour="green",
                         desc=f"Generating {self.phase} data")
        for q, a in file_loop:
            # Questions and annotations are matched by position only; skip
            # pairs that visibly do not belong to the same image.
            if q["image_id"] != a["image_id"]:
                continue

            dataset.append({
                "image_path": images_root + "/" + str(q["image_id"]) + ".jpg",
                "question": q["question"],
                "answers": self._confident_answers(a),
            })

        random.shuffle(dataset)
        return dataset

    def __len__(self):
        """Return the number of samples."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(image, question, answers)`` for the sample at ``index``.

        ``image`` is a PIL image converted to RGB, ``question`` a string and
        ``answers`` the list of accepted answer strings.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        sample = self.dataset[index]
        # `convert` returns a fully loaded copy, so the file can be closed
        # right away instead of being left for the garbage collector.
        with Image.open(sample["image_path"]) as raw_image:
            image = raw_image.convert("RGB")

        return image, sample["question"], sample["answers"]

In [None]:
def rename_image_dataset(phase, input_dir, num_samples=None):
    """Rename image files in ``input_dir`` to ``<image_id><ext>``.

    The new name is the part of the file name after the last ``"_"`` with the
    leading zeros of its stem removed, e.g.
    ``COCO_val2014_000000000042.jpg`` -> ``42.jpg``. Files are renamed in
    place with ``os.rename``, so their bytes are left untouched. Files that
    already carry their target name are skipped, which makes the function
    safe to run more than once on the same directory.

    Args:
        phase: Split name; only used in the progress-bar description.
        input_dir: Directory whose files are renamed.
        num_samples: If given, only this many randomly chosen files are
            renamed; the others keep their original names.

    Returns:
        None.
    """
    images = os.listdir(input_dir)
    if len(images) == 0:
        print("Input directory {} is empty".format(input_dir))
        return

    if num_samples is not None:
        random.shuffle(images)
        images = images[:num_samples]

    file_loop = tqdm(images,
                     total=len(images),
                     colour="green",
                     desc=f"Renaming {phase} images...")
    for image_name in file_loop:
        input_image_path = os.path.join(input_dir, image_name)
        if not os.path.isfile(input_image_path):
            continue

        # Strip zeros from the stem only, so an all-zero id becomes "0"
        # instead of an empty name such as ".jpg".
        stem, ext = os.path.splitext(image_name.split("_")[-1])
        new_name = (stem.lstrip("0") or "0") + ext
        output_image_path = os.path.join(input_dir, new_name)

        if output_image_path == input_image_path:
            # Already renamed on a previous run.
            continue
        if os.path.exists(output_image_path):
            # Never overwrite another image that maps to the same name.
            print("Skipping {}: {} already exists".format(image_name, new_name))
            continue

        try:
            os.rename(input_image_path, output_image_path)
        except OSError as e:
            print("Error while renaming {}: {}".format(image_name, e))

In [None]:
# Rename the extracted validation images so each file name is the part after
# the last "_" with leading zeros stripped; the evaluation loop below looks
# images up as "<image_id>.jpg" in this same directory.
rename_image_dataset(phase="val", input_dir="/content/val2014")

Resizing val images...: 100%|[32m██████████[0m| 40504/40504 [16:22<00:00, 41.24it/s]


In [None]:
val_dataset = VQADataset(phase="val",
                         questions_dir="/content/questions",
                         answers_dir="/content/answers",
                         images_dir="/content/val2014")

# Each sample is (PIL image, question string, list of answer strings). The
# default collate function cannot batch PIL images, so regroup the samples
# into (images, questions, answers) tuples and leave the items untouched.
val_loader = DataLoader(val_dataset,
                        batch_size=1,
                        shuffle=False,
                        collate_fn=lambda batch: tuple(zip(*batch)))

Generating val data: 100%|[32m██████████[0m| 214354/214354 [08:45<00:00, 408.25it/s]


In [None]:
# `device` is needed by the evaluation loop below; define it here so the
# notebook also works on a fresh kernel. Use the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
# The model is only used for inference: move it to `device` (the inputs are
# moved there too) and put it in evaluation mode.
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa").to(device).eval()

Downloading:   0%|          | 0.00/251 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/320 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/136k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/470M [00:00<?, ?B/s]

In [None]:
# Read the validation questions and their annotations; the two lists are
# paired by position in the evaluation loop.
questions_json = "/content/questions/val.json"
answers_json = "/content/answers/val.json"
with open(questions_json) as questions_file, open(answers_json) as annotations_file:
    questions = json.load(questions_file)["questions"]
    answers = json.load(annotations_file)["annotations"]

In [None]:
images_dir = "/content/val2014"

# Position in the validation split at which evaluation starts.
# NOTE(review): presumably left over from resuming an interrupted run - the
# counters below only cover questions from this offset on. Set to 0 to score
# the whole split.
START_INDEX = 79225
# Number of highest-scoring answers kept per question.
TOP_K = 5

# Slice into new names so re-running this cell does not keep shrinking
# `questions` / `answers`.
eval_questions = questions[START_INDEX:]
eval_answers = answers[START_INDEX:]

# Per-answer counters: every accepted answer is one trial and is a hit when
# it appears among the model's top-k predictions.
correct = 0
total = 0

# Per-question counters: a question is a hit when the model's top-1
# prediction is one of its accepted answers.
correct_q = 0
total_q = 0

# Send the inputs to wherever the model's weights actually live.
model_device = next(model.parameters()).device

file_loop = tqdm(zip(eval_questions, eval_answers),
                 total=len(eval_questions),
                 colour="green",
                 desc="Testing on validation data")
for q, a in file_loop:
    # Questions and annotations are paired by position; skip visible mismatches.
    if q["image_id"] != a["image_id"]:
        continue
    image_path = images_dir + "/" + str(q["image_id"]) + ".jpg"

    # Accepted answers: confidence "yes", lowercased, each kept once. The
    # membership test uses the lowercased text so case variants are merged.
    all_answers = []
    for answer in a["answers"]:
        answer_text = answer["answer"].lower()
        if answer["answer_confidence"] == "yes" and answer_text not in all_answers:
            all_answers.append(answer_text)

    # `convert` returns a loaded copy, so the file handle can be closed at once.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    question = q["question"]

    encodings = processor(image, question, return_tensors="pt").to(model_device)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(**encodings).logits
    _, answer_index_top5 = torch.topk(logits, TOP_K)
    predicted_answer = [model.config.id2label[answer_index.item()]
                        for answer_index in answer_index_top5[0, :]]

    total_q += 1
    if predicted_answer[0] in all_answers:
        correct_q += 1

    for accepted_answer in all_answers:
        total += 1
        if accepted_answer in predicted_answer:
            correct += 1

Testing on validation data:  31%|[32m███▏      [0m| 42319/135129 [39:49<1:26:24, 17.90it/s]

In [None]:
# Per-question accuracy (%): share of evaluated questions whose top-1
# prediction is one of the accepted answers.
print("Accuracy :", correct_q/total_q *100)

Accuracy : 84.65844896093839


In [None]:
# Per-answer hit rate (%): share of accepted answers, summed over all
# evaluated questions, that appear among the model's top-5 predictions.
# This is a different metric from the per-question accuracy printed above.
print("Accuracy :", correct/total *100)

Accuracy : 64.3590399501735


In [None]:
# Debug inspection: accepted answers of the last question the loop processed.
# NOTE(review): the saved output below contains repeated entries, which the
# loop's "not in all_answers" check would not produce for lowercase answers -
# it looks like it came from an earlier version of the loop; re-run to confirm.
all_answers

['down',
 'down',
 'at table',
 'skateboard',
 'down',
 'table',
 'down',
 'down',
 'down',
 'down']