In [None]:
# Import required libraries
import requests
from PIL import Image
from transformers import  ViltProcessor, ViltForQuestionAnswering
from IPython.display import display, Javascript, HTML
from google.colab.output import eval_js
from google.colab import files
import cv2
import numpy as np
import base64
import matplotlib.pyplot as plt  # For displaying images in Colab

# Function to capture image from the webcam
def capture_image_from_webcam():
    """Grab one frame from the browser webcam and return it as a data URL.

    A JavaScript helper named ``captureImage`` is injected into the notebook
    output and then invoked through ``eval_js``. The helper shows a live
    video preview with a 'Capture Image' button; once the button is clicked
    it draws the current frame onto a canvas, stops the media stream,
    removes the preview elements and resolves to the canvas contents.

    Returns:
        Whatever ``captureImage()`` resolves to, i.e. the result of
        ``canvas.toDataURL('image/png')``.
    """
    capture_script = '''
        async function captureImage() {
            const div = document.createElement('div');
            document.body.appendChild(div);
            const video = document.createElement('video');
            video.style.display = 'block';
            div.appendChild(video);

            // Request access to the webcam
            const stream = await navigator.mediaDevices.getUserMedia({video: true});
            video.srcObject = stream;
            await video.play();

            // Create a button to capture the frame
            const button = document.createElement('button');
            button.innerText = 'Capture Image';
            div.appendChild(button);

            await new Promise((resolve) => button.onclick = resolve);

            // Capture the frame from the webcam
            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            const ctx = canvas.getContext('2d');
            ctx.drawImage(video, 0, 0);

            // Stop the video stream
            stream.getTracks().forEach(track => track.stop());

            const imageDataUrl = canvas.toDataURL('image/png');
            div.remove();
            return imageDataUrl;
        }
    '''
    # Register the helper in the notebook front end first ...
    display(Javascript(capture_script))
    # ... then run it and hand its result straight back to the caller.
    return eval_js('captureImage()')

# Function to upload an image from the device
def upload_image_from_device():
    """Open the Colab upload dialog and return the first uploaded image.

    Returns:
        The first uploaded file decoded as an RGB ``PIL.Image.Image``.

    Raises:
        ValueError: If the dialog is closed without uploading any file.
    """
    import io  # local import: only this function needs it

    uploaded = files.upload()
    if not uploaded:
        # The old loop fell through and returned None here, which only
        # surfaced later as a confusing failure when the image was used.
        raise ValueError("No file was uploaded.")

    # Only the first uploaded file is used, as before.
    content = next(iter(uploaded.values()))

    # Decode from the uploaded bytes instead of re-opening by filename, so
    # the result does not depend on how/where the file was saved on disk.
    # Convert to RGB so this path yields the same channel layout as the
    # webcam path (convert_data_url_to_image always returns RGB).
    return Image.open(io.BytesIO(content)).convert("RGB")

# Function to convert Data URL to image
def convert_data_url_to_image(data_url):
    """Decode a base64 image data URL into an RGB PIL image.

    Args:
        data_url: String of the form ``<header>,<base64 payload>``, e.g.
            the value produced by ``canvas.toDataURL('image/png')``.

    Returns:
        The decoded picture as an RGB ``PIL.Image.Image``.

    Raises:
        ValueError: If ``data_url`` has no comma-separated payload, the
            payload is empty, or the decoded bytes are not a readable
            image. (Malformed base64 surfaces as ``binascii.Error``,
            which is itself a ``ValueError`` subclass.)
    """
    # Everything after the first comma is the base64 payload.
    _header, separator, encoded_data = data_url.partition(',')
    if not separator or not encoded_data:
        raise ValueError("Data URL does not contain a base64 image payload.")

    nparr = np.frombuffer(base64.b64decode(encoded_data), np.uint8)
    image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if image is None:
        # imdecode signals undecodable data by returning None; fail here
        # with a clear message instead of inside cvtColor.
        raise ValueError("Data URL payload could not be decoded as an image.")

    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # Convert BGR (OpenCV format) to RGB (PIL format)
    return Image.fromarray(image)

# Function to let the user choose between uploading or capturing an image
def get_image_from_user():
    """Ask the user how to supply an image and return the resulting image.

    Prints a two-option menu and reads the choice from standard input.
    Option 1 delegates to ``upload_image_from_device``; option 2 captures a
    frame with ``capture_image_from_webcam`` and decodes it with
    ``convert_data_url_to_image``.

    Returns:
        The image returned by the selected helper.

    Raises:
        ValueError: If the entered choice is neither "1" nor "2".
    """
    print("Choose an option:")
    print("1. Upload an image from your device")
    print("2. Capture a photo from your camera")
    # Strip surrounding whitespace so inputs such as "1 " are still accepted.
    choice = input("Enter your choice (1 or 2): ").strip()

    if choice == "1":
        print("Please upload an image from your device.")
        return upload_image_from_device()

    if choice == "2":
        print("Please capture an image from the webcam.")
        data_url = capture_image_from_webcam()
        return convert_data_url_to_image(data_url)

    raise ValueError("Invalid choice. Please choose 1 or 2.")

# Step 1: Get the image from the user
raw_image = get_image_from_user()

# Display the captured/uploaded image
print("Image:")
plt.imshow(raw_image)
plt.axis('off')  # Hide axis for cleaner display
plt.show()

# Step 2: Load the ViLT visual-question-answering model and processor.
# The checkpoint loaded here is ViLT (not BLIP), so the status message names
# it accordingly. The checkpoint id is kept in one place so the processor
# and the model cannot drift apart.
VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
print("Loading ViLT model... Please wait.")
processor = ViltProcessor.from_pretrained(VQA_CHECKPOINT)
model = ViltForQuestionAnswering.from_pretrained(VQA_CHECKPOINT)

In [None]:

# Step 3: Ask a question about the image
# Put the question in the literal below; if it is left empty, the user is
# prompted for one instead of sending an empty question to the model.
question = ""
if not question.strip():
    question = input("Enter your question about the image: ")
print(f"Question: {question}")
inputs = processor(raw_image, question, return_tensors="pt")

# Step 4: Predict an answer using the ViLT model
# ViltForQuestionAnswering is a classification head, not a text generator:
# run a forward pass, take the highest-scoring logit and map its index to
# the answer string through the model config.
print("Processing image and generating an answer...")
outputs = model(**inputs)
answer_idx = outputs.logits.argmax(-1).item()
answer = model.config.id2label[answer_idx]

# Display the answer
print("Answer:", answer)