In [25]:
#https://www.kaggle.com/code/nguynkoi/vlai-imgcap2vqa

In [31]:
from PIL import Image
import pandas as pd
import os

import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset
from torchvision import transforms
from torchvision.models import resnet34, ResNet34_Weights
import torch.optim as optim

from torchinfo import summary
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertModel
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer



  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Experiment settings gathered in one mapping and looked up by key.
# Of these, max_length / min_length / num_beams are the entries copied into
# gen_kwargs and forwarded to the captioning model's generate() call.
config = dict(
    seed=59,
    dataset='dataset',
    val_set_ratio=0.1,
    train_batch_size=16,
    val_batch_size=16,
    max_seq_len=30,
    aug_methods='imagecaptioning',
    fusion_strategy='concat',
    imagecaptioning_id='vit-gpt2-image-captioning',
    max_length=20,
    min_length=10,
    num_beams=4,
    aug_strategy='sum',
    activation_F='GELU',
    text_encoder_id='bert-base-uncased',
    img_encoder_id='resnet34',
    joint_embedding_dim=1024,
    mlp_input_dim=2048,
    mlp_strategy='2l:(*1_0.5)',
    mlp_hidden_unit=1024,
    learning_rate=1e-5,
    weight_decay=1e-5,
    epochs=50,
    patience=5,
)

In [9]:
# Load the question/answer table; the preview below shows the columns
# question, answer and image_id.
df = pd.read_csv("dataset/data.csv")
# Show the first rows as the cell output.
df.head()


Unnamed: 0,question,answer,image_id
0,what is on the left side of the white oven on ...,garbage_bin,image1
1,what is on the left side of the fire extinguis...,table,image1
2,what is between the the two white and black ga...,chair,image1
3,how many objects are between the fire extingui...,3,image1
4,what is the largest object in this picture,washing_machine,image1


In [30]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [11]:
# Per-column summary (count / unique / top / freq) of the loaded table.
df.describe()

Unnamed: 0,question,answer,image_id
count,12468,12468,12468
unique,5886,1443,1447
top,what is on the table,2,image8
freq,248,554,31


# Image Captioning

In [16]:
# Directory that holds the images to be captioned.
IMAGE_PATH_ROOT = "dataset/images"
# os.listdir returns entries in arbitrary order; sort them so the file list,
# and therefore the batches built from it, is the same on every run.
FOLDER = sorted(os.listdir(IMAGE_PATH_ROOT))

In [17]:
# Prefix every directory entry with the image root to get a usable path.
FILES = [f'{IMAGE_PATH_ROOT}/{file}' for file in FOLDER]

In [32]:
# The model, its image processor and its tokenizer all come from the same
# Hugging Face checkpoint, so name it once.
imgcap_checkpoint = "nlpconnect/vit-gpt2-image-captioning"

imgcap_model = VisionEncoderDecoderModel.from_pretrained(imgcap_checkpoint)
feature_extractor = ViTImageProcessor.from_pretrained(imgcap_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(imgcap_checkpoint)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.1"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
 

In [34]:
# Move the captioning model onto the selected device. As the cell's last
# expression, the returned module is echoed as the cell output.
imgcap_model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTSdpaAttention(
            (attention): ViTSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(i

In [35]:
# Keyword arguments for imgcap_model.generate(), copied from config by key.
gen_kwargs = {
    option: config[option]
    for option in ("max_length", "min_length", "num_beams")
}

In [36]:
def predict_step(image_paths):
    """Generate one caption per image with the ViT-GPT2 captioning model.

    Args:
        image_paths: Iterable of image file paths that ``PIL.Image.open``
            can read.

    Returns:
        A list of caption strings with surrounding whitespace stripped,
        one per decoded output sequence. An empty input yields ``[]``.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to caption: return before the processor and model would be
        # invoked on an empty batch.
        return []

    images = []
    for image_path in image_paths:
        # Image.open keeps the file handle open until the image is closed.
        # convert() returns an in-memory RGB copy, so the handle can be
        # released right away instead of leaking one per image in the batch.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = imgcap_model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [37]:
# Empty mapping; nothing is stored in it within this cell.
img_caption = {}

# Cut FILES into consecutive slices of at most 256 paths each.
batchs = [FILES[start:start + 256] for start in range(0, len(FILES), 256)]
print('Batchs = ', len(batchs))

Batchs =  6


In [None]:
# Caption each batch in order; imgcap ends up with one list of captions per
# batch, and progress always equals the number of batches finished so far.
imgcap = []
progress = 0
for batch in batchs:
    captions = predict_step(batch)
    imgcap.append(captions)
    progress = len(imgcap)
    print(f'Progress: {progress}/{len(batchs)}')

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Progress: 1/6
