# VQAv2 Dataset preprocessing

#### Multiple CHoice Questions creation

### Libraries

In [1]:
import json
import torch.utils.data as data
import os
import random 
import io
import base64
import matplotlib.pyplot as plt
import pandas as pd
import swifter
import gensim.downloader as api

from mcq import MCQ
from torchvision import transforms as T
from PIL import Image
from sense2vec import Sense2Vec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from gensim.models import Word2Vec
from datasets import Dataset


[nltk_data] Downloading package stopwords to /home/lander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Local Dataset

In [2]:
# get images and annotations from https://visualqa.org/download.html
VQAV2_ROOT = '../flamingo-train_task/training/VQAV2/val2014'
VQAV2_ANN_VAL = '../flamingo-train_task/training/VQAV2/v2_mscoco_val2014_annotations.json'
VQAV2_ANN_QUEST_VAL = '../flamingo-train_task/training/VQAV2/v2_OpenEnded_mscoco_val2014_questions.json'

In [3]:
class VQAv2(data.Dataset):
    def __init__(self, image_folder, questions_file, annotations_file, transform=None, target_transform=None):
        self.image_folder = image_folder
        print(os.getcwd())
        self.questions = json.load(open(questions_file))['questions']
        self.annotations = json.load(open(annotations_file))['annotations']
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        target = self.questions[index]['question']
        label = self.annotations[index]['multiple_choice_answer']
        image_id = self.annotations[index]['image_id']
        image_path = os.path.join(self.image_folder, 'COCO_val2014_' + str(image_id).zfill(12) + '.jpg')
        image = Image.open(image_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        if self.target_transform is not None:
            target = self.target_transform(target)
        
        return image, target, label

    def __getcaption__(self,index):
        return self.questions["question"][index]
    
    def __getanswer__(self,index):
        return self.annotations["answer"][index]
    
    def __len__(self):
        return len(self.questions)

In [4]:
transform = T.Compose([ #Con cierta probabilidad da la vuelta a la imagen y procesa la imagen con Clip
    T.Resize((224, 224)),                       
])

def target_transform(data):
    return f"{random.choice(['', ' '])}[QA][CONTEXT]<image>{data}[ANSWER]"
    
dataset = VQAv2(
        image_folder=VQAV2_ROOT,
        questions_file=VQAV2_ANN_QUEST_VAL,
        annotations_file=VQAV2_ANN_VAL,
        target_transform=target_transform,
    )

/home/lander/Documentos/GitHub/MUCSI_Modal/preprocessing


In [5]:
print("Total annotations: {}, Total questions:{}".format(len(dataset.annotations), len(dataset.questions)))

Total annotations: 214354, Total questions:214354


In [6]:
print(dataset.questions[0])
print(dataset.annotations[0])

{'image_id': 262148, 'question': 'Where is he looking?', 'question_id': 262148000}
{'question_type': 'none of the above', 'multiple_choice_answer': 'down', 'answers': [{'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'at table', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'skateboard', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 262148, 'answer_type': 'other', 'question_id': 262148000}


In [None]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [7]:
model = api.load('word2vec-google-news-300')
INDEXES = ["(A) ", "(B) ", "(C) ", "(D) "]
mcq = MCQ()
s2v = Sense2Vec().from_disk('s2v_old')
sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')

def generate_distractors(row):
    choices_tmp = '[OPTIONS]'

    if row['answer_type'] == "yes/no":
        choices_tmp += '(A) Yes (B) No '
    else:
        distractors = []
        word = row['answer']
        if word in model.key_to_index:
            distractors = [similar_word.capitalize() for similar_word, _ in model.most_similar(word, topn=5)]

        if len(distractors) < 3:
            distractors_mcq = mcq.get_distractors(word=word.capitalize(),
                                                  origsentence=row['question'],
                                                  sense2vecmodel=s2v,
                                                  sentencemodel=sentence_transformer_model,
                                                  top_n=40,
                                                  lambdaval=0.2)
        
            for distractor in distractors_mcq:
                distractors.append(distractor)
                
        while len(distractors) < 3:
            distractors.append(mcq.r.get_random_word().capitalize())
            
        distractors = distractors[0:3]
        distractors.append(word.capitalize())
        random.shuffle(distractors)

        for idx, distractor in zip(INDEXES, distractors):
            choices_tmp += idx + distractor + " "

    return choices_tmp

In [8]:
images, questions, answers,answer_types = [], [], [], []
dataset_id = 0
last_image_id = None
LIMIT = 10000

for idx, question, annotation in tqdm(zip(range(len(dataset.questions)), dataset.questions, dataset.annotations), total=len(dataset.questions)):
    answer = annotation['multiple_choice_answer']
    answer_type = annotation['answer_type']
    actual_image_id = question['image_id']
    question = question['question']

    img, _, _ = dataset.__getitem__(idx)
    
    if actual_image_id != last_image_id:
        images.append(img)
        questions.append('[QUESTION]' + question)
        answers.append(answer.capitalize())
        answer_types.append(answer_type)
        last_image_id = actual_image_id

    if len(images) >= LIMIT:
        break


 25%|██▍       | 52731/214354 [01:34<04:48, 560.58it/s]


In [9]:
data = {
    'image': images,
    'question': questions,
    'answer': answers,
    'answer_type': answer_types
}
df = pd.DataFrame.from_dict(data)
df['choices'] = df.swifter.apply(generate_distractors, axis=1)

Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

In [11]:
df2 = df
replace_func = lambda text: text.replace('_', ' ')
df2['choices'] = df2['choices'].swifter.apply(replace_func)

Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

In [12]:
df2.head()

Unnamed: 0,image,question,answer,answer_type,choices
0,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]Where is he looking?,Down,other,[OPTIONS](A) Up (B) Down (C) Down chicken sand...
1,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]What website copyrighted the picture?,Foodiebakercom,other,[OPTIONS](A) Giulio (B) Sunback (C) Foodiebake...
2,<PIL.Image.Image image mode=RGB size=640x480 a...,[QUESTION]What is the man doing in the street?,Walking,other,[OPTIONS](A) Walking (B) Strolling (C) Jogging...
3,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]Why is there a gap between the roof ...,Yes,other,[OPTIONS](A) Oh yes (B) Yeah (C) Yes (D) Well
4,<PIL.Image.Image image mode=RGB size=640x427 a...,[QUESTION]What color is the grass in this pict...,Green,other,[OPTIONS](A) Greener (B) Blue (C) Green (D) Wh...


In [13]:
data_dict = df2.to_dict(orient='list')
dataset = Dataset.from_dict(data_dict)
dataset_name = "landersanmi/VQAv2".format(dataset_id)
dataset.push_to_hub(dataset_name)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]