In [1]:
import json
import os
from PIL import Image

Open image from a path

In [2]:
def getImageFromJson(json_file_path, images_folder):
    with open(json_file_path, 'r') as file:
        image_map = json.load(file)

    # Iterate over the items in the JSON file
    for image_name, question in image_map.items():
        # Construct the full path to the image
        image_path = os.path.join(images_folder, image_name)

        try:
            # Open the image
            with Image.open(image_path) as img:
                img.show()  # This will display the image using the default image viewer
                print(f"Opened {image_name} with question: {question}")
        except IOError:
            # Handle scenarios where the image cannot be opened
            print(f"Failed to open image {image_name} at {image_path}")
        

# json_file_path = '../dataset/question_image_mapping.json'
# images_folder = './segmentedImage'
# getImageFromJson(json_file_path, images_folder)

Let us begin QAVIT here

In [3]:
import torch
import torch.nn as nn
from transformers import T5Tokenizer, T5EncoderModel, ViTModel, T5ForConditionalGeneration
from torchvision import transforms
from PIL import Image
import os
from transformers import get_cosine_schedule_with_warmup, AdamW
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import Dataset, DataLoader
import json
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


QA-VIT

In [4]:
class QAViT(nn.Module):
    def __init__(self, vision_model):
        super(QAViT, self).__init__()
        self.vision_model = vision_model

    def forward(self, image):

        visual_outputs = self.vision_model(pixel_values=image)
        visual_features = visual_outputs.last_hidden_state
        print("hola amigo",visual_features.shape)
        #visual_outputs.last_hidden_state = visual_features
        return visual_outputs

Loading dataset

In [5]:
# So I have json which maps question_id to the segmentedImage. Now I will add this information of segmentedImage in the combined json file
# and finally my dataset will be annotaion[question] , annotation [image_file], annotation [multiple_choice_answer]
class QADataset():
    def __init__(self, image_dir, questionId_image_path, annotations, tokenizer, transform=None):
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.annotations= annotations
        self.transform = transform #or transforms.ToTensor()  # Default transform

        # Load the question-image mapping
        with open(questionId_image_path, 'r') as f:
            print("boyy")
            questionId_image_map = json.load(f)


# Reversing keys and values
        self.reversed_dict = {value: key for key, value in questionId_image_map.items()}
        # Load the combined data JSON
        # with open(combined_data_path, 'r') as f:
        #     print("hood")
        #     combined_data = json.load(f)
        
        # print(combined_data)
        # # Merge the data
        # self.annotations = []
        # for data in self.combined_data:
        #     question_id = data["question_id"]
        #     # Match image file with the question_id from the question_image_map
        #     if str(question_id) in self.question_image_map:
        #         image_file = self.questionId_image_map[str(question_id)]
        #         data["image_file"] = image_file  # Add image file name to the data
        #         self.annotations.append(data)

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        annotation = self.annotations[idx]
        question_id = annotation["question_id"]
        image_file = self.reversed_dict[question_id]
        image_path = os.path.join(self.image_dir, image_file)

        image = Image.open(image_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        question = annotation["question"]
        answer = annotation["multiple_choice_answer"]
        question_tokens = self.tokenizer(question, return_tensors="pt", padding=True, truncation=True)

        return {
            "image": image,
            "question_tokens": {k: v.squeeze(0) for k, v in question_tokens.items()},
            "answer": answer
        }


In [6]:
def qa_collate_fn(batch):
    # Stack images
    images = torch.stack([item["image"] for item in batch])

    # Prepare questions
    question_tokens = {k: torch.cat([item["question_tokens"][k] for item in batch]) for k in batch[0]["question_tokens"]}
    padded_question_tokens = {
        "input_ids": torch.nn.utils.rnn.pad_sequence([item["question_tokens"]["input_ids"] for item in batch], batch_first=True, padding_value=0),
        "attention_mask": torch.nn.utils.rnn.pad_sequence([item["question_tokens"]["attention_mask"] for item in batch], batch_first=True, padding_value=0),
    }

    # Tokenize and pad answers
    answers = [item["answer"] for item in batch]
    answer_tokens = tokenizer(answers, return_tensors="pt", padding=True, truncation=True)

    return {
        "image": images,
        "question_tokens": padded_question_tokens,
        "answer": answer_tokens
    }


In [7]:
def load_and_split_dataset(image_dir, questionId_image_path,json_path, tokenizer, transform):
    # Load the annotations from JSON
    #json_path is combined json
    with open(json_path, 'r') as file:
        data = json.load(file)
    annotations = data["annotations"]

    # Split into train (80%), val (10%), and test (10%)
    train_annotations, test_annotations = train_test_split(annotations, test_size=0.2, random_state=42)
    val_annotations, test_annotations = train_test_split(test_annotations, test_size=0.5, random_state=42)

    # Create Dataset objects

    train_dataset = QADataset(image_dir,questionId_image_path, train_annotations, tokenizer, transform)
    val_dataset = QADataset(image_dir,questionId_image_path, val_annotations, tokenizer, transform)
    test_dataset = QADataset(image_dir,questionId_image_path, test_annotations, tokenizer, transform)

    return train_dataset, val_dataset, test_dataset

In [8]:
def apply_lora_to_t5(t5_model):
    lora_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        target_modules=["q", "k"]
    )
    return get_peft_model(t5_model, lora_config)

In [9]:
def train_qavit_with_validation(qavit_model, t5_model, train_loader, val_loader, tokenizer, num_epochs, device):
    qavit_model.train()
    t5_model.train()

    # Apply LoRa to T5 model
    t5_model = apply_lora_to_t5(t5_model)

    # Optimizer and Scheduler
    optimizer = AdamW([
        {"params": qavit_model.parameters(), "lr": 1e-4},
        {"params": t5_model.parameters(), "lr": 5e-5}
    ])
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=1000,
        num_training_steps=len(train_loader) * num_epochs
    )

    for epoch in range(num_epochs):
        # Training
        total_loss = 0
        qavit_model.train()
        t5_model.train()
        for batch in train_loader:
            images = batch["image"].to(device)
            question_tokens = {k: v.to(device) for k, v in batch["question_tokens"].items()}
            answers = {k: v.to(device) for k, v in batch["answer"].items()}

            optimizer.zero_grad()

            # Forward pass through QA-ViT model
            visual_outputs = qavit_model(images)
            visual_features = visual_outputs.last_hidden_state.mean(dim=1)

            # Forward pass through T5 model
            encoder_outputs = (visual_features.unsqueeze(1).repeat(1, question_tokens["input_ids"].size(1), 1), None, None)

            t5_outputs = t5_model(
                input_ids=question_tokens["input_ids"],
                attention_mask=question_tokens["attention_mask"],
                encoder_outputs=encoder_outputs,
                labels=answers["input_ids"]
            )

            loss = t5_outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss:.4f}")

        # Validation
        qavit_model.eval()
        t5_model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                images = batch["image"].to(device)
                question_tokens = {k: v.to(device) for k, v in batch["question_tokens"].items()}
                answers = {k: v.to(device) for k, v in batch["answer"].items()}

                # Forward pass through QA-ViT model
                visual_outputs = qavit_model(images)
                visual_features = visual_outputs.last_hidden_state.mean(dim=1)

                # Forward pass through T5 model
                encoder_outputs = (visual_features.unsqueeze(1).repeat(1, question_tokens["input_ids"].size(1), 1), None, None)

                t5_outputs = t5_model(
                    input_ids=question_tokens["input_ids"],
                    attention_mask=question_tokens["attention_mask"],
                    encoder_outputs=encoder_outputs,
                    labels=answers["input_ids"]
                )

                loss = t5_outputs.loss
                val_loss += loss.item()

        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")


In [11]:
# Define the data paths and questions/answers

image_dir = "../segmentedImage"
json_path = "../dataset/combined_data.json"#"dataset/combined_data.json"


questionId_path ="../dataset/questionId_image_mapping.json" #"dataset/questionId_image_mapping.json"

# image_paths = ["/home/dpadalia_umass_edu/685proj/pink_bear.jpg"]
# questions = ["What is the color of the object?"]
# answers = ["pink"]

# Load and preprocess the images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Tokenizer and model initialization
pretrained_model = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(pretrained_model)

# Create the dataset and dataloader

train_dataset, val_dataset, test_dataset = load_and_split_dataset(image_dir,questionId_path, json_path, tokenizer, transform)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=qa_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=qa_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=qa_collate_fn)

# Load the ViT base model
vit_model = ViTModel.from_pretrained("google/vit-base-patch16-224")

# Initialize the QA-ViT model
fusion_layers = 4
qavit_model = QAViT(vit_model)

# Load the T5 model for conditional generation
t5_model = T5ForConditionalGeneration.from_pretrained(pretrained_model)

# Move models to the appropriate device (e.g., GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qavit_model.to(device)
t5_model.to(device)

# Train the QA-ViT model
num_epochs = 5
# train_qavit(qavit_model, t5_model, train_loader, tokenizer, num_epochs, device)
train_qavit_with_validation(qavit_model, t5_model, train_loader, val_loader, tokenizer, num_epochs, device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


boyy
boyy
boyy


Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


hola amigo torch.Size([8, 197, 768])
Epoch 1/5, Training Loss: 40.3262
hola amigo torch.Size([1, 197, 768])
Epoch 1/5, Validation Loss: 7.6297
hola amigo torch.Size([8, 197, 768])
Epoch 2/5, Training Loss: 39.4640
hola amigo torch.Size([1, 197, 768])
Epoch 2/5, Validation Loss: 7.5847
hola amigo torch.Size([8, 197, 768])
Epoch 3/5, Training Loss: 40.7731
hola amigo torch.Size([1, 197, 768])
Epoch 3/5, Validation Loss: 7.4913
hola amigo torch.Size([8, 197, 768])
Epoch 4/5, Training Loss: 38.0103
hola amigo torch.Size([1, 197, 768])
Epoch 4/5, Validation Loss: 7.3608
hola amigo torch.Size([8, 197, 768])
Epoch 5/5, Training Loss: 39.9284
hola amigo torch.Size([1, 197, 768])
Epoch 5/5, Validation Loss: 7.2132


In [13]:
from transformers.modeling_outputs import BaseModelOutput

def infer_qavit(qavit_model, t5_model, image_path, question, tokenizer, device):
    # Load and preprocess the image
    image = Image.open(image_path).convert("RGB")
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image = transform(image).unsqueeze(0).to(device)

    # Tokenize the question
    question_tokens = tokenizer(question, return_tensors="pt", padding=True, truncation=True)
    question_tokens = {k: v.to(device) for k, v in question_tokens.items()}

    # Perform inference with the QA-ViT model
    qavit_model.eval()
    with torch.no_grad():
        visual_outputs = qavit_model(image)
        
        visual_features = visual_outputs.last_hidden_state.mean(dim=1)
    #print("visual features",visual_features.shape)
       

        
    # Construct the encoder outputs
    encoder_hidden_state = visual_features.unsqueeze(1).repeat(1, question_tokens["input_ids"].size(1), 1)
    encoder_outputs = BaseModelOutput(last_hidden_state=encoder_hidden_state)

    # Generate output with T5 model
    t5_model.eval()
    with torch.no_grad():
        output_ids = t5_model.generate(
            input_ids=question_tokens["input_ids"],
            attention_mask=question_tokens["attention_mask"],
            encoder_outputs=encoder_outputs,
            max_length=10,
            num_beams=1,
            early_stopping=True
        )

    # Decode the generated output
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).split()[0]

    return output_text

In [14]:
image_path = "/Users/sudhanshu/Desktop/UMASS_COURSES_SEMESTERS/SEM_2/NLP/VITLLMs/src/segmentedImage/262148_262148000_image.png"
question = "Where is he looking?"

output_text = infer_qavit(qavit_model, t5_model, image_path, question, tokenizer, device)
print("Generated Output:", output_text)

hola amigo torch.Size([1, 197, 768])




Generated Output: Subscribe


In [15]:
image_path = "./segmentedImage/262148_262148000_image.png"
question = "Where is he looking?"

output_text = infer_qavit(qavit_model, t5_model, image_path, question, tokenizer, device)
print("Generated Output:", output_text)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/sudhanshu/Desktop/UMASS_COURSES_SEMESTERS/SEM_2/NLP/VITLLMs/src/scripts/segmentedImage/262148_262148000_image.png'