In [13]:
import torch
import torch.nn as nn
# import torchtext
import os
import random
import numpy as np
import pandas as pd
import spacy
import timm
import matplotlib.pyplot as plt

from PIL import Image
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator
from torchvision import transforms
from transformers import ViTFeatureExtractor, ViTModel, RobertaTokenizer

In [3]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator plus the CUDA seeding entry points), then switches cuDNN
    to deterministic mode with benchmarking turned off.

    Args:
        seed: Integer seed handed unchanged to each library.
    """
    # The seeding calls are independent of one another, so order is irrelevant.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Global seed for the whole notebook.
seed = 59
set_seed(seed)

### Chia bộ train-test

In [4]:
import re

# Example annotation text: an image file name carrying a "#<n>" suffix,
# followed by more text.
line = "COCO_val2014_000000396568.jpg#0 extra text"

# Remove the "#" together with the run of non-whitespace characters that
# follows it, then trim surrounding whitespace. Anything after the first
# whitespace character is left untouched.
line = re.sub(r'#\S*', '', line).strip()

print(line)  # Prints: COCO_val2014_000000396568.jpg extra text

COCO_val2014_000000396568.jpg extra text


In [5]:
train_set_path = './vqa_coco_dataset/vaq2.0.TrainImages.txt'
val_set_path = './vqa_coco_dataset/vaq2.0.DevImages.txt'
test_set_path = './vqa_coco_dataset/vaq2.0.TestImages.txt'


def load_vqa_split(split_path, append_question_mark=False):
    """Parse one annotation file into a list of sample dicts.

    Every non-blank line must contain a tab separating the image field from
    the question/answer field, and the question/answer field must contain at
    least one ``?``. The text after the LAST ``?`` is taken as the answer, so
    a question that itself contains ``?`` no longer corrupts the answer.

    The ``#<suffix>`` marker is removed from the image field only; the
    question text is never modified by that clean-up.

    Args:
        split_path: Path of the annotation text file to read.
        append_question_mark: When True, ``'?'`` is re-attached to the end of
            each question. Defaults to False so that all splits share the
            same question format.

    Returns:
        list[dict]: One dict per sample with the keys ``'image_path'``,
        ``'question'`` and ``'answer'``.

    Raises:
        ValueError: If a non-blank line has no tab or no ``?`` separator.
    """
    import re  # local import: keeps this cell independent of earlier cells

    samples = []
    with open(split_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(
                    f"{split_path}:{line_no}: expected a tab-separated line, "
                    f"got {line!r}")

            question, mark, answer = fields[1].rpartition('?')
            if not mark:
                raise ValueError(
                    f"{split_path}:{line_no}: no '?' between question and "
                    f"answer in {line!r}")

            samples.append({
                'image_path': re.sub(r'#\S*', '', fields[0]),
                'question': question + '?' if append_question_mark else question,
                'answer': answer.strip(),
            })
    return samples


# Each split is read from its OWN file. Previously the validation and test
# loops re-opened the training file, so all three lists held training data.
train_data = load_vqa_split(train_set_path)
val_data = load_vqa_split(val_set_path)
test_data = load_vqa_split(test_set_path)

### Xây dựng tokenize

In [None]:
# Sample image used to inspect what the ViT preprocessing produces.
img_path = './vqa_coco_dataset/val2014-resised/COCO_val2014_000000000133.jpg'
img = Image.open(img_path).convert('RGB')

# NOTE: a function that is also called `feature_extractor` is defined in a
# later cell of this notebook; running that cell rebinds this name.
feature_extractor = ViTFeatureExtractor.from_pretrained(
    'google/vit-base-patch16-224-in21k')

# Take the "pixel_values" tensor, drop its size-1 axes, move the first
# remaining axis to the end (presumably channels-first -> channels-last for
# plotting; confirm) and convert the result to a NumPy array.
inputs = (
    feature_extractor(images=img, return_tensors="pt")['pixel_values']
    .squeeze()
    .permute(1, 2, 0)
    .numpy()
)



1

In [22]:
# fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# # Original image
# axes[0].imshow(img)
# axes[0].axis('off')
# axes[0].set_title('Original Image')

# # Features image
# axes[1].imshow(inputs)
# axes[1].axis('off')
# axes[1].set_title('Feature Image')

# plt.tight_layout()
# plt.show()

In [None]:
from transformers import RobertaTokenizer

# Load the pretrained RoBERTa tokenizer used for the questions.
tokenizer = RobertaTokenizer.from_pretrained("FacebookAI/roberta-base")
# Encode a sample string with max_length=20, "max_length" padding and
# truncation enabled, then display the "input_ids" tensor with its leading
# axis squeezed away (this expression is the cell's output).
tokenizer("Hello world hhihihi", max_length=20, padding="max_length",
          truncation=True, return_tensors="pt")["input_ids"].squeeze(0)

We need to remove 7 to truncate the input but the first sequence has a length 7. 


tensor([    0, 31414,   232,  1368,   298,  4001,  4001,   118,     2])

In [None]:
# Loaded tokenizers keyed by pretrained name, so repeated calls (one per
# dataset sample) do not reload the vocabulary every time.
_TOKENIZER_CACHE = {}


def _get_roberta_tokenizer(pretrained_name):
    """Return the cached ``RobertaTokenizer`` for a name, loading it once."""
    if pretrained_name not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[pretrained_name] = RobertaTokenizer.from_pretrained(
            pretrained_name)
    return _TOKENIZER_CACHE[pretrained_name]


def tokenize(text, max_seq_length, pretrained_name="FacebookAI/roberta-base"):
    """Encode ``text`` into a fixed-length 1-D tensor of RoBERTa token ids.

    The requested ``max_seq_length`` is passed to the tokenizer as
    ``max_length`` (it used to be hard-coded to 20), so padding and
    truncation are both performed by the tokenizer itself rather than by
    slicing the id tensor afterwards.

    Args:
        text: The string to encode.
        max_seq_length: Target length; the tokenizer is asked to pad
            (``padding="max_length"``) and truncate to this length.
        pretrained_name: Name of the pretrained tokenizer to load.

    Returns:
        The ``"input_ids"`` tensor with its leading axis removed.
    """
    tokenizer = _get_roberta_tokenizer(pretrained_name)
    encoded = tokenizer(text, max_length=max_seq_length, padding="max_length",
                        truncation=True, return_tensors="pt")
    return encoded["input_ids"].squeeze(0)

# Loaded ViT feature extractors keyed by pretrained name, so the
# preprocessing configuration is fetched once instead of once per image.
_VIT_EXTRACTOR_CACHE = {}


def _get_vit_feature_extractor(pretrained_name):
    """Return the cached ``ViTFeatureExtractor`` for a name, loading it once."""
    if pretrained_name not in _VIT_EXTRACTOR_CACHE:
        _VIT_EXTRACTOR_CACHE[pretrained_name] = ViTFeatureExtractor.from_pretrained(
            pretrained_name)
    return _VIT_EXTRACTOR_CACHE[pretrained_name]


def feature_extractor(img_path,
                      pretrained_name='google/vit-base-patch16-224-in21k'):
    """Load an image and run it through the ViT preprocessing.

    Args:
        img_path: Path of the image file to open.
        pretrained_name: Name of the pretrained feature extractor to load.

    Returns:
        numpy.ndarray: The extractor's ``"pixel_values"`` for this image with
        the leading axis removed and the next axis moved to the end
        (presumably channels-last; confirm against the consumer).
    """
    # The context manager closes the file handle right away; the converted
    # RGB copy does not depend on the file staying open.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert('RGB')

    # A distinct local name avoids shadowing this function's own name.
    extractor = _get_vit_feature_extractor(pretrained_name)
    pixel_values = extractor(images=img, return_tensors="pt")['pixel_values']

    # squeeze(0) removes only the leading axis, so another axis that happens
    # to have size 1 is never dropped by accident.
    return pixel_values.squeeze(0).permute(1, 2, 0).numpy()

### Xây dựng mapping labels dictionary

In [None]:
# Distinct answers seen in the training split.
labels = set(
    sample['answer'] for sample in train_data
)

# Enumerate the labels in sorted order: iterating a set of strings directly
# has no guaranteed order, so the label -> id assignment could change from
# one kernel session to the next and silently invalidate saved checkpoints.
label2id = {label: i for i, label in enumerate(sorted(labels))}
id2label = {i: label for label, i in label2id.items()}

print(label2id)

### Xây dựng Pytorch datasets

In [None]:
class VQADatasets(Dataset):
    """Map-style dataset yielding ``(image, question_ids, label_id)`` triples.

    Args:
        data: Sequence of dicts with the keys ``'image_path'``,
            ``'question'`` and ``'answer'``.
        label2idx: Mapping from answer string to integer class id.
        max_seq_length: Length forwarded to ``tokenize`` as second argument.
        transform: Optional callable applied to the value returned by
            ``feature_extractor``.
        tokenize: Callable ``(question, max_seq_length) -> token ids``.
        feature_extractor: Callable ``(image_path) -> image features``.
        img_dir: Directory that each sample's ``'image_path'`` is joined to.

    Raises:
        TypeError: If ``tokenize`` or ``feature_extractor`` is not callable.
    """

    def __init__(self, data, label2idx, max_seq_length=20, transform=None,
                 tokenize=None, feature_extractor=None,
                 img_dir='./vqa_coco_dataset/val2014-resised'):
        super().__init__()
        # Both callables are used unconditionally in __getitem__, so reject a
        # missing one here instead of failing on the first item access.
        for arg_name, fn in (('tokenize', tokenize),
                             ('feature_extractor', feature_extractor)):
            if not callable(fn):
                raise TypeError(
                    f"{arg_name} must be a callable, got {type(fn).__name__}")

        self.data = data
        self.label2idx = label2idx
        self.max_seq_length = max_seq_length
        self.transform = transform
        self.tokenize = tokenize
        self.feature_extractor = feature_extractor
        self.img_dir = img_dir

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Build the ``(image, question_ids, label_id)`` triple for ``index``.

        Any error raised while processing the sample is reported together
        with the offending sample and then re-raised.
        """
        # Looked up outside the try block so an out-of-range index raises a
        # plain IndexError and the handler below never has to re-index.
        sample = self.data[index]
        try:
            img_path = os.path.join(self.img_dir, sample['image_path'])
            img = self.feature_extractor(img_path)

            # NOTE(review): the transform receives whatever feature_extractor
            # returned; confirm the two are compatible (e.g. array vs. PIL).
            if self.transform:
                img = self.transform(img)

            # as_tensor reuses an existing tensor instead of copy-constructing
            # it, which avoids the UserWarning torch.tensor(tensor) emits.
            questions = torch.as_tensor(
                self.tokenize(sample['question'], self.max_seq_length))

            id_label = torch.tensor(self.label2idx[sample['answer']])

            return img, questions, id_label
        except Exception as e:
            print(f"Error at index {index}: {e}")
            print(f"Data at index {index}: {sample}")
            raise  # re-raise so a broken sample stops the run

### Xây dựng Transforms

In [None]:
# Image pipelines keyed by split name.
# NOTE(review): 'train' ends with a 180x180 centre crop while 'val' keeps the
# 224x224 resize, so the two splits produce different image sizes; confirm
# that the downstream model accepts both.
data_transform = {
    'train': transforms.Compose([
        transforms.Resize(size=(224, 224)),
        transforms.CenterCrop(size=180),
        # A single random flip: the pipeline previously listed this transform
        # twice, and two independent 50% flips still amount to a 50% net flip.
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1),
        transforms.GaussianBlur(3),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]),
    'val': transforms.Compose([
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
}