In [None]:
%reset -f

In [None]:
!pip install -q transformers sentencepiece datasets evaluate accelerate transformers[torch]

In [None]:
import json
import torch
import transformers
from transformers import pipeline, ViTImageProcessor, ViTForImageClassification, ViTModel, AutoTokenizer, T5Tokenizer, MT5Model, AutoFeatureExtractor, MT5ForConditionalGeneration
import pandas as pd
from torchvision.io import read_image
from PIL import Image
from torchvision.transforms import ToTensor
import zipfile
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
from datasets import Dataset, load_from_disk
import evaluate
import gc
from torch.utils.data import DataLoader
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
!pip install pynvml
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one GPU.

    Args:
        device_index: NVML index of the GPU to query (defaults to 0, the
            device the original helper always inspected).

    The NVML session opened by ``nvmlInit`` is always closed again with
    ``nvmlShutdown``, even if the query itself fails.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # info.used is in bytes; integer-divide down to mebibytes.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        nvmlShutdown()
print_gpu_utilization()

GPU memory occupied: 2792 MB.


We want to use a ViT + mT5 model to try multilingual VQA. ViT is a vision model built on a transformer encoder, similar to BERT. mT5 is a multilingual sequence-to-sequence transformer model (encoder-decoder).

To combine these two models, we need to introduce the visual information into the question.



In [None]:
#annot_path = '/content/drive/Shareddrives/CS263_final/models/data/evjvqa_train.json'
#img_zip_path = '/content/drive/Shareddrives/CS263_final/models/data/train-images.zip'


In [None]:
import os

# Pick the dataset location from the working directory: '/content' is treated
# as a Google Colab session (the shared drive is mounted first); anything else
# falls back to the local Windows drive path.
if os.getcwd() == '/content':
  from google.colab import drive
  drive.mount('/content/drive')
  train_val_dataset = load_from_disk('/content/drive/Shareddrives/CS263_final/models/data/evjvqa_train_PIL_image')
else:
  train_val_dataset = load_from_disk('G:/Shared drives/CS263_final/models/data/evjvqa_train_PIL_image')


In [None]:

# Hard-coded row-index ranges, one per language, that are later concatenated
# into the validation split.
# NOTE(review): presumably these follow the row ordering of the saved dataset
# (en / vi / ja contiguous) — confirm against the data.
en_list = list(range(5702, 7204))
vi_list = list(range(14023, 15524))
ja_list = list(range(22283, 23785))

In [None]:
val_list = en_list + vi_list + ja_list

In [None]:
# Hard-coded per-language row-index ranges for the held-out test split.
# These rebind en_list / ja_list / vi_list, so the cell order matters: the
# validation list must already have been built from the previous values.
# NOTE(review): the vi range is roughly 1000 rows longer than en and ja —
# confirm this is intentional.
en_list = list(range(4001, 5201))
ja_list = list(range(20582, 21783))
vi_list = list(range(11322, 13522))

In [None]:
test_list = en_list + vi_list + ja_list

In [None]:
full_list = list(range(0, 23785))
# Training rows are everything not held out for test or validation.
# A set makes each membership check O(1); scanning the two lists for every
# index is quadratic and noticeably slow at this size.
_held_out_indices = set(test_list) | set(val_list)
train_list = [i for i in full_list if i not in _held_out_indices]

In [None]:
test_dataset = train_val_dataset.select(test_list)
train_dataset = train_val_dataset.select(train_list)
val_dataset = train_val_dataset.select(val_list)

In [None]:
test_dataset

Dataset({
    features: ['id', 'question', 'answer', 'image'],
    num_rows: 7136
})

In [None]:
train_val_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'image'],
        num_rows: 12486
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'image'],
        num_rows: 4163
    })
})

In [None]:
# Text side: checkpoint name and its SentencePiece-based T5 tokenizer.
txt_checkpoint = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(txt_checkpoint)

# Image side: checkpoint name and the matching image preprocessor.
img_checkpoint = 'facebook/vit-mae-base'
image_processor = AutoFeatureExtractor.from_pretrained(img_checkpoint)



In [None]:
class MultimodalCollator:
  """Collate VQA examples into model-ready inputs.

  Questions and answers are tokenized to a fixed length and images are run
  through the image preprocessor. Accepts either a list of example dicts
  (as produced by a DataLoader) or a single dict / dict of columns with the
  keys 'question', 'answer' and 'image'.

  Args:
    tokenizer: callable text tokenizer (called with text=..., max_length=...,
      padding=..., truncation=..., return_tensors=..., return_attention_mask=...).
    preprocessor: callable image preprocessor (called with the images and
      return_tensors="pt"); its result must contain 'pixel_values'.
    max_length: fixed token length for both questions and answers.
  """
  tokenizer: "AutoTokenizer"
  preprocessor: "AutoFeatureExtractor"
  max_length: int

  def __init__(self, tokenizer, preprocessor, max_length = 80):
    self.tokenizer = tokenizer
    self.preprocessor = preprocessor
    self.max_length = max_length

  def _encode(self, texts):
    """Tokenize `texts` to exactly `self.max_length` tokens per row."""
    return self.tokenizer(
        text=texts,
        max_length=self.max_length,
        padding='max_length',
        # Without truncation a text longer than max_length produces a ragged
        # batch, which cannot be stacked into a tensor.
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )

  def tokenize_text(self, texts, targets):
    """Return encoder inputs for `texts` and labels/decoder mask for `targets`."""
    encoded_question = self._encode(texts)
    encoded_label = self._encode(targets)
    return {
        "input_ids": encoded_question['input_ids'],
        "labels": encoded_label['input_ids'],
        "attention_mask": encoded_question['attention_mask'],
        "decoder_attention_mask": encoded_label['attention_mask']
    }

  def preprocess_images(self, images):
    """Run the image preprocessor and keep only its 'pixel_values'."""
    processed_images = self.preprocessor(
        images,
        return_tensors="pt",
    )
    return {
        "pixel_values": processed_images['pixel_values'],
    }

  @staticmethod
  def _column(examples, key):
    """Fetch field `key` from a dict, or gather it across a list of dicts."""
    if isinstance(examples, dict):
      return examples[key]
    return [example[key] for example in examples]

  def __call__(self, examples):
    return {
        **self.tokenize_text(
            self._column(examples, 'question'),
            self._column(examples, 'answer'),
        ),
        **self.preprocess_images(self._column(examples, 'image')),
    }

In [None]:
collator = MultimodalCollator(tokenizer, image_processor)

In [None]:
from transformers import MT5ForConditionalGeneration, AutoModel
image_encoder = AutoModel.from_pretrained(img_checkpoint)

Some weights of the model checkpoint at facebook/vit-mae-base were not used when initializing ViTMAEModel: ['decoder.decoder_layers.6.intermediate.dense.bias', 'decoder.decoder_layers.2.attention.attention.value.bias', 'decoder.decoder_layers.5.layernorm_before.bias', 'decoder.decoder_layers.3.attention.attention.query.bias', 'decoder.decoder_layers.3.intermediate.dense.bias', 'decoder.decoder_layers.7.output.dense.bias', 'decoder.decoder_layers.4.layernorm_before.bias', 'decoder.decoder_layers.3.attention.attention.query.weight', 'decoder.decoder_layers.5.attention.attention.query.weight', 'decoder.decoder_layers.1.layernorm_after.bias', 'decoder.decoder_layers.6.layernorm_before.bias', 'decoder.decoder_layers.0.output.dense.bias', 'decoder.decoder_layers.7.attention.output.dense.bias', 'decoder.decoder_layers.6.intermediate.dense.weight', 'decoder.decoder_layers.2.layernorm_before.weight', 'decoder.decoder_layers.3.attention.output.dense.bias', 'decoder.decoder_layers.0.attention.att

In [None]:
from typing import Optional, Tuple, Union
from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput

class VQA_Model(MT5ForConditionalGeneration):
  """
  mT5 conditional-generation model whose decoder cross-attends to image
  features in addition to the encoded question.

  In `forward`, when `encoder_outputs` is not supplied, the question is run
  through the mT5 encoder, the image through `visualEncoder`, and the image
  hidden states (projected to `config.d_model` if necessary) are prepended to
  the text hidden states along the sequence axis. The attention mask is
  extended with ones for every image position. The loss ignores label id 0.

  NOTE(review): `generate()` appears to build `encoder_outputs` from the text
  encoder alone (via `get_encoder()`), in which case the image branch below is
  skipped at inference time — confirm against the installed transformers
  version before relying on generated answers being image-conditioned.

  Args (constructor):
    config: the mT5 configuration.
    visualEncoder: module called as `visualEncoder(pixel_values=...)` whose
      output exposes 'last_hidden_state'.
    img_hidden_dim: hidden size of `visualEncoder`'s output.
  """

  def __init__(self, config, visualEncoder, img_hidden_dim = 768):
    super().__init__(config)
    self.visualEncoder = visualEncoder
    # Always define the flag; forward() reads it unconditionally.
    self.needConvert = img_hidden_dim != config.d_model
    if self.needConvert:
      self.dim_change = nn.Linear(img_hidden_dim, config.d_model)

  def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
    """Run the fused image+text encoder (if needed), the decoder and the LM head.

    Returns a `Seq2SeqLMOutput`, or a plain tuple when `return_dict` is False
    (with the loss first when `labels` is given).
    """

    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
    if head_mask is not None and decoder_head_mask is None:
        if self.config.num_layers == self.config.num_decoder_layers:
            import warnings
            warnings.warn(
                "The input argument `head_mask` was split into two arguments `head_mask` and "
                "`decoder_head_mask`. Currently, `decoder_head_mask` is set to copy `head_mask`, "
                "but this feature is deprecated and will be removed in future versions.",
                FutureWarning,
            )
            decoder_head_mask = head_mask

    # Encode if needed (training, first prediction pass)
    if encoder_outputs is None:
        # Convert encoder inputs in embeddings if needed
        text_encoder_outputs = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        text_state = text_encoder_outputs[0]

        image_encoder_outputs = self.visualEncoder(pixel_values = pixel_values)
        image_state = image_encoder_outputs['last_hidden_state']
        if self.needConvert:
          image_state = self.dim_change(image_state)

        # With no mask supplied every text position is attended to; build the
        # explicit mask so it can be concatenated with the image mask.
        if attention_mask is None:
          attention_mask = torch.ones(text_state.shape[:2], dtype=torch.long, device=text_state.device)
        # Every image position is visible to the decoder.
        img_mask = torch.ones(image_state.shape[:2], dtype=attention_mask.dtype, device=attention_mask.device)

        # Image tokens come first, then the text tokens.
        hidden_states = torch.cat([image_state, text_state], dim = 1)
        attention_mask = torch.cat([img_mask, attention_mask,], dim = 1)

        encoder_outputs = BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=text_encoder_outputs[1] if len(text_encoder_outputs) > 1 else None,
            attentions=text_encoder_outputs[2] if len(text_encoder_outputs) > 2 else None,
        )
    elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
        encoder_outputs = BaseModelOutput(
            last_hidden_state=encoder_outputs[0],
            hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
            attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
        )

    # Must be bound before the model-parallel block below, which moves it
    # between devices; it is also the only source when encoder_outputs was
    # passed in by the caller.
    hidden_states = encoder_outputs[0]

    if self.model_parallel:
        torch.cuda.set_device(self.decoder.first_device)

    if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
        # get decoder inputs from shifting lm labels to the right
        decoder_input_ids = self._shift_right(labels)

    # Set device for model parallelism
    if self.model_parallel:
        torch.cuda.set_device(self.decoder.first_device)
        hidden_states = hidden_states.to(self.decoder.first_device)
        if decoder_input_ids is not None:
            decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
        if attention_mask is not None:
            attention_mask = attention_mask.to(self.decoder.first_device)
        if decoder_attention_mask is not None:
            decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

    # Decode
    decoder_outputs = self.decoder(
        input_ids=decoder_input_ids,
        attention_mask=decoder_attention_mask,
        inputs_embeds=decoder_inputs_embeds,
        past_key_values=past_key_values,
        encoder_hidden_states=hidden_states,
        encoder_attention_mask=attention_mask,
        head_mask=decoder_head_mask,
        cross_attn_head_mask=cross_attn_head_mask,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    sequence_output = decoder_outputs[0]

    # Set device for model parallelism
    if self.model_parallel:
        torch.cuda.set_device(self.encoder.first_device)
        self.lm_head = self.lm_head.to(self.encoder.first_device)
        sequence_output = sequence_output.to(self.lm_head.weight.device)

    if self.config.tie_word_embeddings:
        # Rescale output before projecting on vocab
        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
        sequence_output = sequence_output * (self.model_dim**-0.5)

    lm_logits = self.lm_head(sequence_output)

    loss = None
    if labels is not None:
        # Label id 0 is excluded from the loss (the collator pads labels to a
        # fixed length, and the generation config lists 0 as the pad id).
        loss_fct = nn.CrossEntropyLoss(ignore_index=0)
        # move labels to correct device to enable PP
        labels = labels.to(lm_logits.device)
        loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
        # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

    if not return_dict:
        # encoder_outputs may be a BaseModelOutput here (it is built above even
        # when return_dict is False); convert it so tuple concatenation works.
        if hasattr(encoder_outputs, "to_tuple"):
            encoder_tuple = encoder_outputs.to_tuple()
        else:
            encoder_tuple = tuple(encoder_outputs)
        output = (lm_logits,) + decoder_outputs[1:] + encoder_tuple
        return ((loss,) + output) if loss is not None else output

    return Seq2SeqLMOutput(
        loss=loss,
        logits=lm_logits,
        past_key_values=decoder_outputs.past_key_values,
        decoder_hidden_states=decoder_outputs.hidden_states,
        decoder_attentions=decoder_outputs.attentions,
        cross_attentions=decoder_outputs.cross_attentions,
        encoder_last_hidden_state=encoder_outputs.last_hidden_state,
        encoder_hidden_states=encoder_outputs.hidden_states,
        encoder_attentions=encoder_outputs.attentions,
    )


In [None]:
from transformers import MT5Config
model = VQA_Model.from_pretrained(txt_checkpoint, image_encoder)
#model = VQA_Model.from_pretrained('G:/Shared drives/CS263_final/models/ViTMAE_mt5/checkpoint-24976')

Some weights of VQA_Model were not initialized from the model checkpoint at google/mt5-small and are newly initialized: ['visualEncoder.encoder.layer.7.attention.attention.query.bias', 'visualEncoder.encoder.layer.5.output.dense.bias', 'visualEncoder.encoder.layer.4.attention.output.dense.weight', 'visualEncoder.encoder.layer.2.layernorm_before.bias', 'visualEncoder.encoder.layer.1.layernorm_after.bias', 'visualEncoder.encoder.layer.2.attention.attention.query.weight', 'visualEncoder.encoder.layer.3.output.dense.bias', 'visualEncoder.encoder.layer.2.attention.attention.query.bias', 'visualEncoder.encoder.layer.5.attention.attention.query.weight', 'visualEncoder.encoder.layer.8.attention.attention.value.weight', 'visualEncoder.encoder.layer.5.output.dense.weight', 'visualEncoder.encoder.layer.11.output.dense.weight', 'visualEncoder.encoder.layer.4.layernorm_after.weight', 'visualEncoder.encoder.layer.0.attention.attention.query.bias', 'visualEncoder.encoder.layer.10.output.dense.weight'

In [None]:

model.generation_config

GenerationConfig {
  "_from_model_config": true,
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.29.2"
}

In [None]:
# Freeze the image and text encoders
# Freeze the shared token embeddings, the text encoder and the image encoder
# by turning off gradients for every one of their parameters.
for frozen_module in (model.shared, model.encoder, model.visualEncoder):
  for param in frozen_module.parameters():
    param.requires_grad = False


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig

# Checkpoints go to the shared drive; the path depends on whether the
# notebook runs from '/content' (Colab) or locally.
if os.getcwd() == '/content':
  save_location = '/content/drive/Shareddrives/CS263_final/models/ViTMAE_mt5/'
else:
  save_location = 'G:/Shared drives/CS263_final/models/ViTMAE_mt5/'

# Single learning rate shared by the training arguments and the optimizer.
lr = 3e-4
# Evaluate, log and save once per epoch; bf16 is enabled and fp16 disabled.
train_args = Seq2SeqTrainingArguments(
    output_dir = save_location,
    overwrite_output_dir = True,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 4,
    evaluation_strategy = "epoch",
    learning_rate = lr,
    num_train_epochs = 4,
    logging_strategy = "epoch",
    save_strategy = "epoch",
    seed = 2023,
    fp16 = False,
    bf16 = True,
    load_best_model_at_end = True,
    generation_config = GenerationConfig.from_pretrained(txt_checkpoint)
)

# The optimizer is handed every model parameter, including those whose
# requires_grad was switched off in the freezing cell.
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), eps=1e-8)
# Multiply the learning rate by 0.9 every 400 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                               step_size=400,
                                               gamma=0.9)

class CustomSeq2SeqTrainer(Seq2SeqTrainer):
  """Seq2SeqTrainer that builds plain DataLoaders around the trainer's collator.

  The datasets hold raw questions, answers and images, so batching is done
  entirely by `self.data_collator`. Only the training loader shuffles;
  evaluation and test loaders keep dataset order so that predictions line up
  with their examples.
  """

  def _build_loader(self, dataset, batch_size, shuffle):
    """Create a single-process DataLoader using the trainer's data collator."""
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
        collate_fn=self.data_collator,
    )

  def get_train_dataloader(self):
    """Shuffled loader over `self.train_dataset`."""
    return self._build_loader(self.train_dataset, self.args.per_device_train_batch_size, shuffle=True)

  def get_eval_dataloader(self, eval_dataset=None):
    """Ordered loader over `eval_dataset`, falling back to `self.eval_dataset`."""
    eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
    return self._build_loader(eval_dataset, self.args.per_device_eval_batch_size, shuffle=False)

  def get_test_dataloader(self, test_dataset):
    """Ordered loader over `test_dataset` (required; the trainer stores no test set)."""
    if test_dataset is None:
      raise ValueError("get_test_dataloader requires a test_dataset")
    return self._build_loader(test_dataset, self.args.per_device_eval_batch_size, shuffle=False)

def compute_metric(eval_pred):
  """Score greedy (argmax) token predictions against the labels.

  Args:
    eval_pred: pair `(logits, labels)`; `logits` may also be a tuple whose
      first element holds the vocabulary logits.

  Returns:
    Whatever `metric.compute` returns.

  NOTE(review): `metric` is a module-level object that is not created in the
  visible cells — it must be loaded before this function is used.
  """
  logits, labels = eval_pred
  # Some models return extra tensors next to the logits; keep the logits only.
  if isinstance(logits, tuple):
    logits = logits[0]
  predictions = np.argmax(logits, axis = -1)
  # `compute` takes predictions/references as keyword arguments.
  return metric.compute(predictions = predictions, references = labels)

# Build the trainer with the custom optimizer/scheduler pair; metric
# computation is currently disabled (see the commented-out argument).
trainer = CustomSeq2SeqTrainer(
                  model,
                  args = train_args,
                  data_collator = collator,
                  train_dataset = train_dataset,
                  eval_dataset = val_dataset,
                  optimizers = (optimizer, scheduler),
                  # compute_metrics=compute_metric
                 )
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

VQA_Model(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): Linear(i

In [None]:
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.2974,1.963122
2,2.2442,1.854659
3,2.0831,1.836426
4,2.026,1.828433


TrainOutput(global_step=14680, training_loss=2.4127052244763285, metrics={'train_runtime': 2778.927, 'train_samples_per_second': 21.129, 'train_steps_per_second': 5.283, 'total_flos': 7280166514360320.0, 'train_loss': 2.4127052244763285, 'epoch': 4.0})

In [None]:
trainer.lr_scheduler.get_last_lr()

[6.758519863481758e-06]

In [None]:
torch.cuda.empty_cache()
from transformers import GenerationConfig
# Generation settings from the text checkpoint, capped at 80 tokens.
config = GenerationConfig.from_pretrained(txt_checkpoint, max_length = 80)
# Collate the first four test examples and move every tensor to the device.
data = collator(test_dataset[:4])
for k, v in data.items():
  data[k] = v.to(device)
# Labels and the decoder mask are training-only inputs; drop them before generate().
labels = data.pop('labels' )
data.pop("decoder_attention_mask")
# NOTE(review): temperature presumably has no effect unless sampling is
# enabled (do_sample) — confirm whether sampling was intended here.
generation_output = model.generate(**data, generation_config = config, return_dict_in_generate = True, output_scores = True, temperature = 2 )



In [None]:
tokenizer.batch_decode(generation_output['sequences'])

['<pad> there are two pillars in front of thetower</s><pad>',
 '<pad> the green scale is green</s><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> the wall is hanging on the wall in the room</s><pad>',
 '<pad> the households are hang on the side of the road</s>']

In [None]:
test_dataset[:4]['answer']

['there are two pillars in front of the tower',
 'in front of the woman hunching over hher back',
 'the drawings',
 'the red flag with yellow star']

In [None]:
test_dataset[:4]['question']

['how many pillars are there in front of the tower?',
 'where is the green scale?',
 'what is hanging on the wall in the room?',
 'what flags do the households here hang on the side of the road?']

In [None]:
generation_output = model.generate(**data, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 4 )

In [None]:
tokenizer.batch_decode(generation_output['sequences'])

['<pad> there are two pillars in front of thetower</s><pad>',
 '<pad> the green scale is green</s><pad><pad><pad><pad><pad><pad><pad>',
 '<pad> the wall is hanging on the wall in the room</s><pad>',
 '<pad> the households are hang on the side of the road</s>']

In [None]:
generation_output['sequences']

tensor([[    0, 77919,     1,     0,     0,     0,     0,     0,     0,     0,
             0],
        [    0,  1672,   259,   272,  2832,   282,  3325,   259,   272,  3456,
             1]], device='cuda:0')

In [None]:
from torchmetrics import F1Score

In [None]:
f1 = F1Score(task = 'multiclass', num_classes = 250112, top_k=1, ignore_index=0)

In [None]:
from torchmetrics import F1Score
from langdetect import detect
# Greedy decoding (num_beams = 1): score every test example individually and
# average the per-example F1 values.
greedy_F1_scores = []
for data in test_dataset:
  # (The per-question language detection that used to run here was unused,
  # and language detection can raise on text it cannot classify.)
  inputs = collator(data)
  for k, v in inputs.items():
    inputs[k] = v.to(device)
  # Labels and the decoder mask are training-only inputs; drop them before generate().
  labels = inputs.pop('labels' )
  inputs.pop("decoder_attention_mask")
  generation_output = model.generate(**inputs, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 1)
  pred_str = tokenizer.batch_decode(generation_output['sequences'], skip_special_tokens = True)
  # Re-tokenize prediction and reference together so both are padded to the same length.
  padded_str = tokenizer(pred_str +  [data['answer']], padding = 'longest', return_attention_mask = False, return_tensors='pt')
  padded_pred_token = padded_str['input_ids'][0]
  padded_label_token = padded_str['input_ids'][1]
  score = f1(padded_pred_token, padded_label_token)
  greedy_F1_scores.append(score)
  gc.collect()
  torch.cuda.empty_cache()
greedy_f1_score_avg = np.mean(greedy_F1_scores)



In [None]:
greedy_f1_score_avg

0.19819523

In [None]:
from torchmetrics import F1Score
# Beam search (num_beams = 4) over the whole test set: score every example
# individually and average the per-example F1 values.
beam4_F1_scores = []
for data in test_dataset:
  inputs = collator(data)
  for k, v in inputs.items():
    inputs[k] = v.to(device)
  # Labels and the decoder mask are training-only inputs; drop them before generate().
  labels = inputs.pop('labels' )
  inputs.pop("decoder_attention_mask")
  generation_output = model.generate(**inputs, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 4)
  pred_str = tokenizer.batch_decode(generation_output['sequences'], skip_special_tokens = True)
  # Re-tokenize prediction and reference together so both are padded to the same length.
  padded_str = tokenizer(pred_str +  [data['answer']], padding = 'longest', return_attention_mask = False, return_tensors='pt')
  padded_pred_token = padded_str['input_ids'][0]
  padded_label_token = padded_str['input_ids'][1]
  score = f1(padded_pred_token, padded_label_token)
  beam4_F1_scores.append(score)
  gc.collect()
  torch.cuda.empty_cache()
beam4_f1_score_avg = np.mean(beam4_F1_scores)



In [None]:
beam4_f1_score_avg

0.20050485

In [None]:
# Fixed-seed random subset of 1000 test examples, scored with beam search
# (num_beams = 4). This overwrites the beam4_* results from the full-set run.
shuffled_set = test_dataset.shuffle(seed = 2023).select(list(range(1000)))
beam4_F1_scores = []
for data in shuffled_set:
  inputs = collator(data)
  for k, v in inputs.items():
    inputs[k] = v.to(device)
  # Labels and the decoder mask are training-only inputs; drop them before generate().
  labels = inputs.pop('labels' )
  inputs.pop("decoder_attention_mask")
  generation_output = model.generate(**inputs, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 4)
  pred_str = tokenizer.batch_decode(generation_output['sequences'], skip_special_tokens = True)
  # Re-tokenize prediction and reference together so both are padded to the same length.
  padded_str = tokenizer(pred_str +  [data['answer']], padding = 'longest', return_attention_mask = False, return_tensors='pt')
  padded_pred_token = padded_str['input_ids'][0]
  padded_label_token = padded_str['input_ids'][1]
  score = f1(padded_pred_token, padded_label_token)
  beam4_F1_scores.append(score)
  gc.collect()
  torch.cuda.empty_cache()
beam4_f1_score_avg = np.mean(beam4_F1_scores)

Loading cached shuffled indices for dataset at G:\Shared drives\CS263_final\models\data\evjvqa_train_PIL_image\cache-ccdcb774d819f7c1.arrow


In [None]:
beam4_f1_score_avg

0.19243912

In [None]:
!pip install nltk
import string
from nltk.translate import bleu
from nltk.translate.bleu_score import SmoothingFunction
from langdetect import detect
smoothie = SmoothingFunction().method4
def bleu_score(reference, candidate):
  """Smoothed 4-gram BLEU of `candidate` against a single `reference`.

  ASCII punctuation is removed from both strings first. The language of the
  reference decides the tokenization: English ("en") and Vietnamese ("vi")
  are split on whitespace; anything else (including a failed detection) is
  treated like Japanese, i.e. spaces are dropped and every character is a
  token.

  Args:
    reference: ground-truth answer string.
    candidate: predicted answer string.

  Returns:
    The BLEU score computed with equal 1- to 4-gram weights and the
    module-level `smoothie` smoothing function.
  """
  # ignore punctuations
  strip_punctuation = str.maketrans('', '', string.punctuation)
  reference = reference.translate(strip_punctuation)
  candidate = candidate.translate(strip_punctuation)

  try:
      lang = detect(reference)
  except Exception:
      # Detection fails on text with no usable features (e.g. empty strings).
      lang = "error"
      print("error for detecting: ", reference)

  # Japanese answers are not split by spaces, so they are scored on
  # characters. The test must be "neither en nor vi": `!= "en" or != "vi"`
  # is true for every language and forced character-level scoring everywhere.
  if lang not in ("en", "vi"):
    # ignore possible spaces in Japanese sentences
    ref_list = list(reference.replace(" ", ""))
    can_list = list(candidate.replace(" ", ""))
  else:
    ref_list = reference.split()
    can_list = candidate.split()

  score = bleu([ref_list], can_list,smoothing_function=smoothie, weights = [0.25,0.25,0.25,0.25])
  return score

# Evaluate model performance on test dataset
def test_eval(test_data, model):
  """Generate an answer for every example and score it with F1 and BLEU.

  Each example is collated on its own, decoded with beam search
  (num_beams = 4), and compared with its reference answer. Scores are also
  bucketed by the detected language of the reference answer: "en", "vi", and
  everything else (including failed detection) counted as Japanese.

  Args:
    test_data: iterable of example dicts with 'question', 'answer', 'image'.
    model: the model whose `generate` method is used; `model.config.vocab_size`
      sets the number of classes for the token-level F1 metric.

  Returns:
    Tuple of lists: (f1_torchmetric, bleu_l, pred_token_l, pred_word_l,
    f1_en, f1_ja, f1_vi, bleu_en, bleu_ja, bleu_vi). `pred_token_l` holds the
    generated token ids and `pred_word_l` the decoded strings, one per example.
  """
  f1_torchmetric, bleu_l, pred_token_l, pred_word_l = [],[],[],[]
  f1_en, f1_ja, f1_vi = [],[],[]
  bleu_en, bleu_ja, bleu_vi = [],[],[]
  # One class per vocabulary entry: token ids are the "classes" being compared.
  # Id 0 is ignored so padding does not count.
  f1 = F1Score(task="multiclass", num_classes=model.config.vocab_size, top_k = 1, ignore_index = 0)
  for data in test_data:
    inputs = collator(data)
    # Labels and the decoder mask are training-only inputs; drop them before generate().
    inputs.pop('labels')
    inputs.pop("decoder_attention_mask")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    generation_output = model.generate(**inputs, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 4)
    pred_str = tokenizer.batch_decode(generation_output['sequences'], skip_special_tokens = True)
    pred_token_l.append(generation_output['sequences'][0].tolist())
    pred_word_l.append(pred_str[0])

    # Re-tokenize prediction and reference together so both are padded to the same length.
    padded_str = tokenizer(pred_str +  [data['answer']], padding = 'longest', return_attention_mask = False, return_tensors='pt')
    padded_pred_token = padded_str['input_ids'][0]
    padded_label_token = padded_str['input_ids'][1]

    # F1 score from torch.metric
    s3 = f1(padded_pred_token, padded_label_token)
    f1_torchmetric.append(s3)

    # Compute bleu score
    s2 = bleu_score(data['answer'], pred_str[0])
    bleu_l.append(s2)

    try:
      lang = detect(data['answer'])
    except Exception:
      # Undetectable answers fall into the "ja" bucket below.
      lang = "error"

    if lang == "en":
      f1_en.append(s3)
      bleu_en.append(s2)
    elif lang == "vi":
      f1_vi.append(s3)
      bleu_vi.append(s2)
    else:
      f1_ja.append(s3)
      bleu_ja.append(s2)

  return f1_torchmetric, bleu_l, pred_token_l, pred_word_l, f1_en, f1_ja, f1_vi, bleu_en, bleu_ja, bleu_vi



In [None]:
gc.collect()
torch.cuda.empty_cache()
f1_torchmetric, bleu_l, pred_token_l, pred_word_l, f1_en, f1_ja, f1_vi, bleu_en, bleu_ja, bleu_vi = test_eval(shuffled_set,model)



In [None]:
print("average f1 score: ", sum(f1_torchmetric)/len(f1_torchmetric))
print("average f1 score in en: ", sum(f1_en)/len(f1_en))
print("average f1 score in ja: ", sum(f1_ja)/len(f1_ja))
print("average f1 score in vi: ", sum(f1_vi)/len(f1_vi))
print("average bleu score: ", sum(bleu_l)/len(bleu_l))
print("average bleu score in en: ", sum(bleu_en)/len(bleu_en))
print("average bleu score in ja: ", sum(bleu_ja)/len(bleu_ja))
print("average bleu score in vi: ", sum(bleu_vi)/len(bleu_vi))

average f1 score:  tensor(0.1924)
average f1 score in en:  tensor(0.2036)
average f1 score in ja:  tensor(0.1932)
average f1 score in vi:  tensor(0.1867)
average bleu score:  0.2650023270519166
average bleu score in en:  0.27667069040263736
average bleu score in ja:  0.1830267181497897
average bleu score in vi:  0.3152150600395817


In [None]:
generation_output['sequences']

tensor([[     0,    259, 237638, 160699, 145710,    535, 215851,    306,      1]],
       device='cuda:0')

In [None]:
# Raw strings keep the Windows backslashes literal; in a normal string
# sequences such as "\S" or "\m" are invalid escapes that newer Python
# versions warn about.
annot_path = r'G:\Shared drives\CS263_final\models\data\official_evjvqa_public_test.json'
img_zip_path = r'G:\Shared drives\CS263_final\models\data\public-test-images.zip'

# Data file structure:
# {
#   images: {
#     'id': image_id
#     'filename': reference file
#   },
#   'annotations': {
#     'id': annotation_id
#     'image_id': refers to id in images
#     'question': question about image
#     'answer': answer to question
#   }
# }
with open(annot_path, encoding = 'utf-8') as f:
  # Despite the name, this holds the public *test* annotations.
  train_data = json.load(f)
# Image metadata indexed by image id, for filename lookups.
img_reference = pd.DataFrame(train_data['images']).set_index('id')

In [None]:
class EVJVQA_Dataset(torch.utils.data.Dataset):
  """
  Dataset class for the EVJVQA dataset.

  Annotations are read from a JSON file and images are read lazily from a zip
  archive. Each item is a dict with 'id', 'question', 'answer' and 'image'.

  NOTE(review): 'image' is returned as a tensor (via ToTensor), whereas the
  training data is loaded from a dataset named "..._PIL_image" — confirm the
  image preprocessor treats both input kinds the same way.
  """
  def __init__(self, annotation_file, img_dir, zip_subpath):
    """
    Inputs:
      annotation_file - file path for the annotation json file
      img_dir - file path for the image zip file
      zip_subpath - subfolder in the zip folder
    """
    with open(annotation_file, encoding = 'utf-8') as f:
      json_file = json.load(f)
    self.annotations = pd.DataFrame(json_file['annotations'])
    self.img_reference = pd.DataFrame(json_file['images']).set_index('id')
    self.img_dir = img_dir
    self.transform = ToTensor()
    self.subpath = zip_subpath + '/'

  def __len__(self):
    """Number of annotated question/answer pairs."""
    return len(self.annotations)

  def __getitem__(self, idx):
    """Return the example at position `idx` (negative indices count from the end).

    Raises:
      IndexError: if `idx` is out of range. This is what lets a plain
        `for item in dataset` loop stop after the last example; a KeyError
        from the DataFrame lookup would propagate as a crash instead.
    """
    n_items = len(self.annotations)
    if idx < 0:
      idx += n_items
    if not 0 <= idx < n_items:
      raise IndexError(f"index {idx} is out of range for dataset of size {n_items}")

    annot_id = self.annotations.loc[idx, 'id']
    image_id = self.annotations.loc[idx, 'image_id']
    question = self.annotations.loc[idx, 'question']
    answer = self.annotations.loc[idx, 'answer']
    img_file = self.img_reference.loc[image_id, 'filename']
    with zipfile.ZipFile(self.img_dir, 'r') as zip_ref:
       # convert() fully decodes the image, so the member can be closed afterwards.
       with zip_ref.open(self.subpath + img_file) as imgdata:
         img = Image.open(imgdata).convert('RGB')
    img = self.transform(img)
    return {
        'id': annot_id,
        'question': question,
        'answer': answer,
        'image': img,
    }

In [None]:
dataset = EVJVQA_Dataset(annot_path, img_zip_path, 'public-test-images')

In [None]:
loader = DataLoader(dataset, collate_fn = collator)

In [None]:
# Generate an answer for every public-test example, keyed by annotation id.
results = {}
# Index explicitly up to len(dataset): iterating the dataset object directly
# relies on __getitem__ raising IndexError past the end, and a KeyError from
# the lookup aborts the loop instead.
for idx in range(len(dataset)):
  sample = dataset[idx]
  annot_id = sample['id']
  inputs = collator(sample)
  for k, v in inputs.items():
    inputs[k] = v.to(device)
  # Labels and the decoder mask are training-only inputs; drop them before generate().
  labels = inputs.pop('labels' )
  inputs.pop("decoder_attention_mask")
  generation_output = model.generate(**inputs, generation_config = config, return_dict_in_generate = True, output_scores = True, num_beams = 4)
  pred_str = tokenizer.batch_decode(generation_output['sequences'], skip_special_tokens = True)
  results[str(annot_id)] = pred_str[0]






KeyError: ignored

In [None]:
with open('G:/Shared drives/CS263_final/public_results.json', 'w') as f:
  json.dump(results, f)