In [1]:
# Install necessary packages
!pip install torch torchvision transformers gradio gdown datasets sentence-transformers

Collecting gradio
  Downloading gradio-4.31.4-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_

In [2]:
# Download and unzip dataset
!gdown --id 1QBzgZ7hQtC80IqJcdMOPceddwK3JxP67
!unzip VQA.zip

Downloading...
From (original): https://drive.google.com/uc?id=1QBzgZ7hQtC80IqJcdMOPceddwK3JxP67
From (redirected): https://drive.google.com/uc?id=1QBzgZ7hQtC80IqJcdMOPceddwK3JxP67&confirm=t&uuid=932bb750-2212-43a7-9ff2-12dd83ca357a
To: /content/VQA.zip
100% 431M/431M [00:06<00:00, 65.9MB/s]
Archive:  VQA.zip
  inflating: dataset/all_qa_pairs.txt  
  inflating: dataset/answer_space.txt  
  inflating: dataset/data.csv        
  inflating: dataset/data_eval.csv   
  inflating: dataset/data_train.csv  
  inflating: dataset/images/image1.png  
  inflating: dataset/images/image10.png  
  inflating: dataset/images/image100.png  
  inflating: dataset/images/image1000.png  
  inflating: dataset/images/image1001.png  
  inflating: dataset/images/image1002.png  
  inflating: dataset/images/image1003.png  
  inflating: dataset/images/image1004.png  
  inflating: dataset/images/image1005.png  
  inflating: dataset/images/image1006.png  
  inflating: dataset/images/image1007.png  
  inflating: data

In [3]:
# Mount Google Drive to access the saved model.
# The checkpoint loaded further down is read from a path under /content/drive,
# so this cell has to run before the model-loading cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Import necessary libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image
import os
from datasets import load_dataset
from transformers import BertTokenizer
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np

In [24]:
# Define the VQA model
import torch.nn as nn
from transformers import BertModel

class VQAModel1(nn.Module):
    """Visual question answering classifier over a fixed answer vocabulary.

    A ResNet-50 encodes the image and BERT encodes the question; the two
    feature vectors are concatenated and passed through a two-layer head that
    emits one logit per candidate answer.

    Args:
        num_answers: Number of candidate answers, i.e. the number of output
            logits produced by the classification head.
    """

    def __init__(self, num_answers):
        super().__init__()
        # Image feature extractor. `weights=` replaces the deprecated
        # `pretrained=True`; IMAGENET1K_V1 is the weight set that flag selected.
        self.cnn = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
        image_dim = self.cnn.fc.in_features  # 2048 for ResNet-50
        self.cnn.fc = nn.Identity()  # Remove the final classification layer

        # Question feature extractor
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        question_dim = self.bert.config.hidden_size  # 768 for bert-base

        # Fusion and final classification.
        # NOTE(review): there is no activation between fc1 and fc2. Left as-is
        # because adding one would change what an existing checkpoint computes.
        self.fc1 = nn.Linear(image_dim + question_dim, 1024)
        self.fc2 = nn.Linear(1024, num_answers)
        self.dropout = nn.Dropout(0.5)

    def forward(self, images, input_ids, attention_mask):
        """Compute answer logits for a batch of (image, question) pairs.

        Args:
            images: Batch of image tensors fed to the ResNet backbone.
            input_ids: Token ids of the tokenized questions.
            attention_mask: Mask marking real tokens versus padding.

        Returns:
            Tensor with one row per example and one logit per candidate answer.
        """
        # Extract image features
        image_features = self.cnn(images)

        # Extract question features: hidden state of the first ([CLS]) token.
        # Keyword arguments keep the call independent of BERT's positional
        # parameter order.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        question_features = outputs.last_hidden_state[:, 0, :]  # shape: (batch_size, hidden)

        # Concatenate features along the feature dimension
        combined_features = torch.cat((image_features, question_features), dim=1)

        # Classification
        x = self.fc1(combined_features)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [25]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 582 is the number of answer classes the head predicts; presumably the number
# of entries in dataset/answer_space.txt - TODO confirm against that file.
model = VQAModel1(582)
model.to(device)
# Load the best model checkpoint
checkpoint_path = "/content/drive/MyDrive/best_model.pth"
# map_location remaps tensors that were saved from a GPU, so the checkpoint
# also loads on a CPU-only runtime instead of raising a CUDA deserialization error.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
# Inference mode: disables dropout in the model.
model.eval()
print("Best model loaded successfully.")




Best model loaded successfully.


In [26]:
def preprocess_answer(answer):
    """Return `answer` with every underscore turned into a single space."""
    # Splitting on '_' and re-joining with ' ' swaps each underscore one-for-one.
    pieces = answer.split('_')
    return ' '.join(pieces)

In [27]:
# Answer vocabulary: each line of the file becomes one list entry, and a
# predicted class index is later looked up in this list.
answer_space_path = os.path.join("dataset", "answer_space.txt")
with open(answer_space_path) as answer_file:
    answer_space = answer_file.read().splitlines()

# Image preprocessing applied before the CNN: resize to 224x224, convert to a
# tensor, then normalize each channel with the given mean / std.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(preprocessing_steps)

In [28]:
# Shared tokenizer instance, created on first use by _get_tokenizer().
_tokenizer = None


def _get_tokenizer():
    """Return the BERT tokenizer, loading it once instead of on every request."""
    global _tokenizer
    if _tokenizer is None:
        _tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    return _tokenizer


def vqa_inference(image, question):
    """Answer a natural-language question about an image.

    Args:
        image: PIL image to ask about, or None when no image was supplied.
        question: Question text.

    Returns:
        The predicted answer with underscores replaced by spaces, or a short
        prompt message when the image or the question is missing.
    """
    # A submit with a missing input would otherwise crash inside the
    # transform / tokenizer; return a readable message instead.
    if image is None:
        return "Please upload an image."
    if question is None or not question.strip():
        return "Please enter a question."

    # Force three channels: the normalization above uses 3-channel statistics,
    # so grayscale or RGBA images would fail without the conversion.
    image = transform(image.convert("RGB")).unsqueeze(0).to(device)

    # No padding to max_length: with a single question, padded positions are
    # masked out anyway and only add compute. Truncation still caps the input
    # at BERT's 512-token limit.
    inputs = _get_tokenizer()(question, return_tensors='pt', truncation=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(image, input_ids, attention_mask)
        # Index of the highest-scoring answer class.
        predicted = outputs.argmax(dim=1)

    answer_index = predicted.item()
    answer = answer_space[answer_index]
    print(f"Predicted answer index: {answer_index}")
    print(f"Predicted answer: {answer}")
    return preprocess_answer(answer)

In [30]:
# Set up Gradio interface: one image input and one question box feed
# vqa_inference, whose returned string is shown in the output textbox.
image_input = gr.components.Image(type="pil")
text_input = gr.components.Textbox(
    lines=1,
    placeholder="Enter your question here",
)
output = gr.components.Textbox()

demo = gr.Interface(
    fn=vqa_inference,
    inputs=[image_input, text_input],
    outputs=output,
    title="Visual Question Answering Model",
)
demo.launch(debug=False)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://2c3f38791f8a13925e.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


