In [None]:
!pip install tqdm
!pip install datasets
!pip install jsonlines

In [None]:
from tqdm.auto import tqdm
from datasets import load_dataset
import torch

In [None]:
# Columns dropped right after loading; the later cells only read
# "image_id", "question" and "multiple_choice_answer".
_unused_columns = ['question_type', 'answers', 'answer_type', 'question_id', 'image']

VQAv2_train = load_dataset("HuggingFaceM4/VQAv2", split="train")
VQAv2_train = VQAv2_train.remove_columns(_unused_columns)

# NOTE(review): confirm that "val" is a split name this dataset actually exposes
# (Hub datasets often call it "validation").
VQAv2_val = load_dataset("HuggingFaceM4/VQAv2", split="val")
VQAv2_val = VQAv2_val.remove_columns(_unused_columns)

In [None]:
def train_img_format(img_id):
  """Return "COCO_train2014_" followed by ``img_id`` zero-padded to 12 digits."""
  padded_id = format(img_id, "012d")
  return "COCO_train2014_" + padded_id

def val_img_format(img_id):
  """Return "COCO_val2014_" followed by ``img_id`` zero-padded to 12 digits."""
  padded_id = format(img_id, "012d")
  return "COCO_val2014_" + padded_id

In [None]:
# VQAv2_train
image_id_train, answer_train, question_train = [], [], []

# Build three index-aligned lists; samples whose answer or question is longer
# than 500 characters are skipped entirely.
for row in tqdm(VQAv2_train):
    answer_text = row["multiple_choice_answer"]
    question_text = row["question"]
    if len(answer_text) > 500 or len(question_text) > 500:
        continue

    answer_train.append(answer_text)
    image_id_train.append(train_img_format(row["image_id"]))
    question_train.append(question_text)

In [None]:
# VQAv2_val
image_id_val, answer_val, question_val = [], [], []

# Build three index-aligned lists; samples whose answer or question is longer
# than 500 characters are skipped entirely.
for row in tqdm(VQAv2_val):
    answer_text = row["multiple_choice_answer"]
    question_text = row["question"]
    if len(answer_text) > 500 or len(question_text) > 500:
        continue

    answer_val.append(answer_text)
    image_id_val.append(val_img_format(row["image_id"]))
    question_val.append(question_text)

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def translation(model, tokenizer, sentences, batch_size):
    """Translate ``sentences`` into Russian ("rus_Cyrl") in batches.

    Args:
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``; its vocabulary must contain
            the ``"rus_Cyrl"`` language token.
        sentences: List of source strings.
        batch_size: Number of sentences passed to one ``generate`` call.

    Returns:
        list[str]: One decoded string per input sentence, in input order.
        An empty input yields an empty list.
    """
    sentences_ru = []

    # Send inputs to the device the model actually lives on; fall back to the
    # CUDA-if-available choice only when the model does not expose ``device``.
    fallback_device = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = getattr(model, "device", fallback_device)

    # Resolve the target-language token through the vocabulary. This is the
    # lookup shown in the NLLB docs; ``tokenizer.lang_code_to_id`` is
    # deprecated/unavailable in newer transformers releases.
    target_lang_id = tokenizer.convert_tokens_to_ids("rus_Cyrl")

    with torch.no_grad():
        for i in tqdm(range(0, len(sentences), batch_size)):
            batch = sentences[i: i + batch_size]

            # Pad only to the longest sentence of the batch (not to 512) and
            # keep the attention mask so padding positions are ignored.
            encoded = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            ).to(device)
            generated_tokens = model.generate(
                **encoded,
                max_length=512,
                forced_bos_token_id=target_lang_id,
            )
            decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

            # extend() appends in place; repeated list concatenation would
            # copy the accumulated results on every batch.
            sentences_ru.extend(decoded)

    return sentences_ru

In [None]:
checkpoint = "facebook/nllb-200-distilled-600M"

# Prefer the first CUDA device when one is available, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)
# Switch to evaluation mode before generating translations.
model.eval()

In [None]:
batch_size = 200

# VQAv2_train: translate the questions, then the answers, with the same model.
questions_train_ru = translation(
    model=model,
    tokenizer=tokenizer,
    sentences=question_train,
    batch_size=batch_size,
)
answers_train_ru = translation(
    model=model,
    tokenizer=tokenizer,
    sentences=answer_train,
    batch_size=batch_size,
)

In [None]:
# VQAv2_val: translate the questions, then the answers, with the same model.
questions_val_ru = translation(
    model=model,
    tokenizer=tokenizer,
    sentences=question_val,
    batch_size=batch_size,
)
answers_val_ru = translation(
    model=model,
    tokenizer=tokenizer,
    sentences=answer_val,
    batch_size=batch_size,
)

In [None]:
# VQAv2_train
# Shallow copies of the translated lists.
questions_train = list(questions_train_ru)
answers_train = list(answers_train_ru)
# Record count is taken from the untranslated question list.
max_len_train = len(question_train)

In [None]:
# VQAv2_val
# Shallow copies of the translated lists.
questions_val = list(questions_val_ru)
answers_val = list(answers_val_ru)
# Record count is taken from the translated question list.
max_len_val = len(questions_val)

In [None]:
# One record per training sample: image id plus translated question and answer.
result_train = []
for idx in range(max_len_train):
    result_train.append({
        'image_id': image_id_train[idx],
        'question': questions_train[idx],
        'answer': answers_train[idx],
    })

In [None]:
# One record per validation sample: image id plus translated question and answer.
result_val = []
for idx in range(max_len_val):
    result_val.append({
        'image_id': image_id_val[idx],
        'question': questions_val[idx],
        'answer': answers_val[idx],
    })

In [None]:
import jsonlines

# Write one JSON object per line. write_all() iterates over the records, whereas
# write() would serialize the whole list as a single JSON array on one line.
with jsonlines.open('VQAv2_train_translation.jsonl', mode='w') as writer:
  writer.write_all(result_train)

In [None]:
# Write one JSON object per line. write_all() iterates over the records, whereas
# write() would serialize the whole list as a single JSON array on one line.
with jsonlines.open('VQAv2_val_translation.jsonl', mode='w') as writer:
  writer.write_all(result_val)