# VQAv2 Dataset preprocessing

#### Multiple CHoice Questions creation

### Libraries

In [1]:
import json
import torch.utils.data as data
import os
import random 
import io
import base64
import matplotlib.pyplot as plt
import pandas as pd
import swifter
import gensim.downloader as api

from mcq import MCQ
from torchvision import transforms as T
from PIL import Image
from sense2vec import Sense2Vec
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from gensim.models import Word2Vec
from datasets import Dataset


[nltk_data] Downloading package stopwords to /home/lander/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load Local Dataset

In [4]:
# get images and annotations from https://visualqa.org/download.html
VQAV2_ROOT = '../flamingo-train_task/training/VQAV2/val2014'
VQAV2_ANN_VAL = '../flamingo-train_task/training/VQAV2/v2_mscoco_val2014_annotations.json'
VQAV2_ANN_QUEST_VAL = '../flamingo-train_task/training/VQAV2/v2_OpenEnded_mscoco_val2014_questions.json'

In [5]:
class VQAv2(data.Dataset):
    def __init__(self, image_folder, questions_file, annotations_file, transform=None, target_transform=None):
        self.image_folder = image_folder
        print(os.getcwd())
        self.questions = json.load(open(questions_file))['questions']
        self.annotations = json.load(open(annotations_file))['annotations']
        self.transform = transform
        self.target_transform = target_transform

    def __getitem__(self, index):
        target = self.questions[index]['question']
        label = self.annotations[index]['multiple_choice_answer']
        image_id = self.annotations[index]['image_id']
        image_path = os.path.join(self.image_folder, 'COCO_val2014_' + str(image_id).zfill(12) + '.jpg')
        image = Image.open(image_path).convert('RGB')
        
        if self.transform:
            image = self.transform(image)

        if self.target_transform is not None:
            target = self.target_transform(target)
        
        return image, target, label

    def __getcaption__(self,index):
        return self.questions["question"][index]
    
    def __getanswer__(self,index):
        return self.annotations["answer"][index]
    
    def __len__(self):
        return len(self.questions)

In [6]:
transform = T.Compose([ #Con cierta probabilidad da la vuelta a la imagen y procesa la imagen con Clip
    T.Resize((224, 224)),                       
])

def target_transform(data):
    return f"{random.choice(['', ' '])}[QA][CONTEXT]<image>{data}[ANSWER]"
    
dataset = VQAv2(
        image_folder=VQAV2_ROOT,
        questions_file=VQAV2_ANN_QUEST_VAL,
        annotations_file=VQAV2_ANN_VAL,
        target_transform=target_transform,
    )

/home/lander/Documentos/GitHub/MUCSI_Modal/preprocessing


In [7]:
print("Total annotations: {}, Total questions:{}".format(len(dataset.annotations), len(dataset.questions)))

Total annotations: 214354, Total questions:214354


In [8]:
print(dataset.questions[0])
print(dataset.annotations[0])

{'image_id': 262148, 'question': 'Where is he looking?', 'question_id': 262148000}
{'question_type': 'none of the above', 'multiple_choice_answer': 'down', 'answers': [{'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 1}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 2}, {'answer': 'at table', 'answer_confidence': 'yes', 'answer_id': 3}, {'answer': 'skateboard', 'answer_confidence': 'yes', 'answer_id': 4}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 5}, {'answer': 'table', 'answer_confidence': 'yes', 'answer_id': 6}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 7}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 8}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 9}, {'answer': 'down', 'answer_confidence': 'yes', 'answer_id': 10}], 'image_id': 262148, 'answer_type': 'other', 'question_id': 262148000}


In [9]:
#mcq = MCQ()
images, questions, answers,answer_types = [], [], [], []
#s2v = Sense2Vec().from_disk('s2v_old')
#sentence_transformer_model = SentenceTransformer('msmarco-distilbert-base-v3')

for idx, question, annotation in tqdm(zip(range(len(dataset.questions)), dataset.questions, dataset.annotations), total=len(dataset.questions)):
    answer = annotation['multiple_choice_answer']
    answer_type = annotation['answer_type']
    question = question['question']
    img, _, _ = dataset.__getitem__(idx)
    images.append(img)
    """
    with io.BytesIO() as buffer:
        img.save(buffer, format='JPEG')
        base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')
        images.append(str(base64_image))
    """
    questions.append('[QUESTION]' + question)
    answers.append(answer.capitalize())
    answer_types.append(answer_type)

    


 24%|██▍       | 51027/214354 [02:07<07:04, 384.44it/s]IOStream.flush timed out
 25%|██▌       | 54239/214354 [03:02<5:49:24,  7.64it/s] 

: 

: 

In [59]:
data = {
    'image': images,
    'question': questions,
    #'options': choices,
    'answer': answers,
    'answer_type': answer_types
}
print(data)

{'image': [<PIL.Image.Image image mode=RGB size=640x512 at 0x7FF3DD30C430>, <PIL.Image.Image image mode=RGB size=640x512 at 0x7FF374D33340>, <PIL.Image.Image image mode=RGB size=640x512 at 0x7FF27BF6EA00>, <PIL.Image.Image image mode=RGB size=640x428 at 0x7FF207549C40>, <PIL.Image.Image image mode=RGB size=640x428 at 0x7FF207549580>, <PIL.Image.Image image mode=RGB size=640x428 at 0x7FF2075497C0>, <PIL.Image.Image image mode=RGB size=640x428 at 0x7FF207549B20>, <PIL.Image.Image image mode=RGB size=640x480 at 0x7FF2075A38E0>, <PIL.Image.Image image mode=RGB size=640x480 at 0x7FF2075E60A0>, <PIL.Image.Image image mode=RGB size=640x480 at 0x7FF2075E6880>, <PIL.Image.Image image mode=RGB size=640x512 at 0x7FF2075E6940>, <PIL.Image.Image image mode=RGB size=640x512 at 0x7FF2075E6BE0>, <PIL.Image.Image image mode=RGB size=640x512 at 0x7FF2075E6730>, <PIL.Image.Image image mode=RGB size=640x427 at 0x7FF2075E6760>, <PIL.Image.Image image mode=RGB size=640x427 at 0x7FF374D33040>, <PIL.Image.Ima

In [60]:
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,image,question,answer,answer_type
0,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]Where is he looking?,Down,other
1,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]What are the people in the backgroun...,Watching,other
2,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]What is he on top of?,Picnic table,other
3,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]What website copyrighted the picture?,Foodiebakercom,other
4,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]Is this a creamy soup?,No,yes/no


In [64]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
--2023-06-02 00:43:10--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolviendo s3.amazonaws.com (s3.amazonaws.com)... 52.216.2.27, 52.217.104.62, 54.231.165.168, ...
Conectando con s3.amazonaws.com (s3.amazonaws.com)[52.216.2.27]:443... conectado.
Petición HTTP enviada, esperando respuesta... 404 Not Found
2023-06-02 00:43:11 ERROR 404: Not Found.



In [70]:
model = api.load('word2vec-google-news-300')
INDEXES = ["(A) ", "(B) ", "(C) ", "(D) "]

def generate_distractors(row):
    choices_tmp = '[OPTIONS]'

    if row['answer_type'] == "yes/no":
        choices_tmp += '(A) Yes (B) No '
    else:
        distractors = []
        word = row['answer']
        if word in model.key_to_index:
            distractors = [similar_word.capitalize() for similar_word, _ in model.most_similar(word, topn=5)]

        
        distractors = distractors[0:3]
        distractors.append(answer.capitalize())
        random.shuffle(distractors)

        for idx, distractor in zip(INDEXES, distractors):
            choices_tmp += idx + distractor + " "

    return choices_tmp

In [71]:
df['choices'] = df.swifter.apply(generate_distractors, axis=1)

Pandas Apply: 100%|██████████| 32/32 [00:04<00:00,  7.48it/s]


In [50]:
df.head(20)

Unnamed: 0,image,question,answer,answer_type,choices
0,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]Where is he looking?,Down,other,[OPTIONS](A) Yes (B) Mambo (C) No (D) Fourth
1,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]What are the people in the backgroun...,Watching,other,[OPTIONS](A) Yes (B) Mambo (C) Fourth (D) No
2,<PIL.Image.Image image mode=RGB size=640x512 a...,[QUESTION]What is he on top of?,Picnic table,other,[OPTIONS](A) Fourth (B) No (C) Yes (D) Mambo
3,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]What website copyrighted the picture?,Foodiebakercom,other,[OPTIONS](A) No (B) Yes (C) Fourth (D) Mambo
4,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]Is this a creamy soup?,No,yes/no,[OPTIONS](A) Yes (B) No
5,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]Is this rice noodle soup?,Yes,yes/no,[OPTIONS](A) Yes (B) No
6,<PIL.Image.Image image mode=RGB size=640x428 a...,[QUESTION]What is to the right of the soup?,Chopsticks,other,[OPTIONS](A) Fourth (B) No (C) Yes (D) Mambo
7,<PIL.Image.Image image mode=RGB size=640x480 a...,[QUESTION]What is the man doing in the street?,Walking,other,[OPTIONS](A) Fourth (B) Yes (C) No (D) Mambo
8,<PIL.Image.Image image mode=RGB size=640x480 a...,[QUESTION]How many photo's can you see?,1,number,[OPTIONS](A) Yes (B) Mambo (C) Fourth (D) No
9,<PIL.Image.Image image mode=RGB size=640x480 a...,[QUESTION]What does the truck on the left sell?,Ice cream,other,[OPTIONS](A) Fourth (B) Yes (C) Mambo (D) No


In [None]:
# Assuming your DataFrame is named 'df'
data_dict = df.to_dict(orient='list')
dataset = Dataset.from_dict(data_dict)
dataset_name = "your-username/your-dataset-name"  # Replace with your desired dataset name
dataset.save_to_disk(dataset_name)