In [11]:
!pip install torch torchvision transformers pillow gradio --upgrade



In [12]:
import torch

# Hugging Face checkpoint identifier for the BLIP VQA model
MODEL_NAME = "Salesforce/blip-vqa-base"

# Compute device: use the GPU when CUDA is available, otherwise fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [13]:
from transformers import BlipProcessor, BlipForQuestionAnswering

# NOTE: these assignments re-declare MODEL_NAME and device with the same values
# as the previous cell (they are not merely read from it); keep both in sync.
# `torch` itself still comes from the import in the previous cell.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
MODEL_NAME = "Salesforce/blip-vqa-base"

def load_vqa_model():
    """
    Fetch the pretrained BLIP VQA processor and model named by MODEL_NAME.

    The model is moved to the notebook-level `device` before it is returned.
    Progress messages are printed before and after loading.

    Returns:
    - tuple: (processor, model), in that order.
    """
    print("Loading the VQA model and processor...")
    vqa_processor = BlipProcessor.from_pretrained(MODEL_NAME)
    vqa_model = BlipForQuestionAnswering.from_pretrained(MODEL_NAME)
    vqa_model = vqa_model.to(device)
    print("VQA model and processor loaded successfully.")
    return vqa_processor, vqa_model

In [14]:
# ipython-input-6-d237fb1fd8e9
import torch

# The target device is taken from the model itself (`model.device`), so this
# function does not depend on a `device` global (or a separate `config` module)
# having been defined by an earlier cell.

def answer_question(image, question, processor, model):
    """
    Process an image and question to generate an answer using the VQA model.

    Args:
    - image (PIL.Image or None): The input image. If None (for example, nothing
      has been uploaded in the UI yet), a prompt to upload an image is returned.
    - question (str or None): The question to be answered about the image. If
      None or blank, a prompt to enter a question is returned.
    - processor (BlipProcessor): The text-image processor.
    - model (BlipForQuestionAnswering): The VQA model. The prepared inputs are
      moved to `model.device` before generation.

    Returns:
    - str: The formatted answer, a prompt asking for the missing input, or an
      error message if processing fails. This function does not raise for
      processing errors; they are reported in the returned string.
    """
    # Missing inputs are a normal UI state, not an error: answer with a clear
    # prompt instead of letting `None.resize` surface as a cryptic message.
    if image is None:
        return "Please upload an image."
    if question is None or not str(question).strip():
        return "Please enter a question about the image."

    try:
        # Resize image for model compatibility
        image = image.resize((480, 480))

        # Create a detailed question prompt
        detailed_question = f"Please provide a detailed answer: {question}"

        # Prepare inputs and place them on the same device as the model, so the
        # two can never disagree.
        inputs = processor(images=image, text=detailed_question, return_tensors="pt").to(model.device)

        # Generate answer with controlled decoding parameters; no gradients are
        # needed for inference.
        with torch.no_grad():
            out = model.generate(**inputs, max_length=100, num_beams=5, early_stopping=True)

        # Decode the first (only) generated sequence into text
        answer = processor.decode(out[0], skip_special_tokens=True)

        # Format the detailed answer output
        detailed_answer = f"The model's answer is: '{answer}'. If you need more information, feel free to ask follow-up questions related to the image or clarify specific aspects you'd like to know about."

        return detailed_answer

    except Exception as e:
        # Report failures as text so the UI shows a message instead of crashing.
        return f"An error occurred: {e}"

In [15]:
!pip install gradio --upgrade



In [18]:
import gradio as gr


# Load the model and processor once; the callback below reuses them for every request
processor, model = load_vqa_model()

# Gradio interface configurations
title = "Visual Question Answering (VQA)"
description = "Upload an image and ask any question about it. The model will provide an appropriate answer based on the visual content of the image. Developed by Panchadip"


def _vqa_predict(image, question):
    """Gradio callback: answer `question` about `image` using the loaded processor and model."""
    return answer_question(image, question, processor, model)


# Define the Gradio interface.
# `live=True` is deliberately not set: it re-runs the callback whenever an input
# changes, which would trigger a full beam-search generation while the user is
# still typing. Without it, inference runs when the user presses Submit.
interface = gr.Interface(
    fn=_vqa_predict,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, placeholder="Ask a question about the image...")],
    outputs="text",
    title=title,
    description=description,
    allow_flagging="never",
)

# Launch the interface. `share=True` requests a public share URL for the app.
if __name__ == "__main__":
    interface.launch(share=True)


Loading the VQA model and processor...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.54G [00:00<?, ?B/s]

VQA model and processor loaded successfully.




Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://435f712687655170d5.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
