<a href="https://colab.research.google.com/github/SamiAhmed432/VLM-model/blob/main/VLM_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install bitsandbytes accelerate transformers dets peft google-generativeai

In [None]:
import os
from base64 import b64decode
from getpass import getpass

import google.generativeai as genai
import matplotlib.pyplot as plt
import numpy as np
from google.colab.output import eval_js
from IPython.display import display, Javascript
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

In [None]:
# Load the BLIP image-captioning checkpoint: the processor turns images into
# model inputs (and decodes generated ids), the model generates the caption.
model_id = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(
    pretrained_model_name_or_path=model_id,
)
model = BlipForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_id,
)

In [None]:
def take_photo(filename='photo.jpg', quality=0.8):
    """Capture one webcam frame in the browser and save it as a JPEG file.

    Injects a small JavaScript widget (live video preview plus a "Capture"
    button) into the cell output, waits for the button to be clicked, and
    writes the captured frame to disk.

    Args:
        filename: Path of the file the JPEG bytes are written to.
        quality: Value forwarded to ``canvas.toDataURL('image/jpeg', quality)``
            in the browser.

    Returns:
        The ``filename`` that was written.
    """
    capture_script = Javascript('''
    async function takePhoto(quality) {
      const div = document.createElement('div');
      const capture = document.createElement('button');
      capture.textContent = 'Capture';
      div.appendChild(capture);

      const video = document.createElement('video');
      video.style.display = 'block';
      div.appendChild(video);
      document.body.appendChild(div);

      const stream = await navigator.mediaDevices.getUserMedia({video: true});

      // Show the video stream in the video element
      video.srcObject = stream;
      await video.play();

      // Resize the output to fit the video element.
      google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

      // Wait for the capture button to be clicked.
      await new Promise((resolve) => capture.onclick = resolve);

      const canvas = document.createElement('canvas');
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      canvas.getContext('2d').drawImage(video, 0, 0);
      stream.getVideoTracks()[0].stop();
      div.remove();

      const dataUrl = canvas.toDataURL('image/jpeg', quality);
      return dataUrl;
    }
    ''')
    display(capture_script)

    # The browser returns a data URL ("<header>,<base64 payload>"); keep only
    # the payload after the comma and decode it to raw JPEG bytes.
    data_url = eval_js(f'takePhoto({quality})')
    image_bytes = b64decode(data_url.split(',')[1])

    with open(filename, 'wb') as out_file:
        out_file.write(image_bytes)
    return filename

In [None]:
# Take a webcam snapshot and load it as an RGB PIL image
filename = take_photo()
raw_image = Image.open(filename).convert("RGB")

# Preview the snapshot with the axes hidden
fig, ax = plt.subplots()
ax.imshow(np.asarray(raw_image))
ax.axis('off')
plt.show()

In [None]:
# Preprocess the captured image into PyTorch tensors for the model
inputs = processor(raw_image, return_tensors="pt")

# Caption generation settings: deterministic beam search.
# `temperature` and `top_p` were removed because they only take effect when
# sampling is enabled (`do_sample=True`); with plain beam search they were
# ignored and merely triggered generation-config warnings.
generation_args = {
    "max_length": 100,
    "num_beams": 5,
    "early_stopping": True,
}

output = model.generate(**inputs, **generation_args)

# Decode the first returned sequence and print the caption
answer = processor.decode(output[0], skip_special_tokens=True)
print("Generated Caption:", answer)

In [None]:
# Google Generative AI setup.
# The API key is read from the GOOGLE_API_KEY environment variable and, when
# that is unset, requested interactively, so no secret is stored in the notebook.
# NOTE(review): the key that was previously hardcoded here should be treated
# as leaked and revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY") or getpass(
    "Enter your Google Generative AI API key: "
)
genai.configure(api_key=gemini_api_key)

# Use a dedicated name so the BLIP captioning `model` loaded earlier is not
# overwritten (re-running the caption cell would otherwise fail).
gemini_model = genai.GenerativeModel("gemini-1.0-pro")
chat = gemini_model.start_chat(history=[])

def get_gemini_response(answer, prompt_type):
    """Ask the Gemini chat session for a short text about a caption.

    Args:
        answer: Text (the image caption) the generated piece should be about.
        prompt_type: Kind of text to request, e.g. "story" or "poem".

    Returns:
        The streaming response returned by ``chat.send_message``, or ``None``
        if building or sending the prompt raised an exception (the error is
        printed).
    """
    try:
        request_text = f"Generate a simple and short {prompt_type} for: {answer}"
        reply_stream = chat.send_message(request_text, stream=True)
    except Exception as err:
        print(f"Error: {err}")
        return None
    else:
        return reply_stream
# Ask the user which kind of text to generate
print("Select the type of text to generate:")
print("1. Story")
print("2. Poem")
print("3. Journal Entry")
# Strip surrounding whitespace so inputs such as " 1" or "2 " are accepted
choice = input("Enter the number of your choice (1, 2, 3): ").strip()

# Map each menu number to the text type used in the prompt
prompt_types_by_choice = {
    "1": "story",
    "2": "poem",
    "3": "journal entry",
}

if choice in prompt_types_by_choice:
    prompt_type = prompt_types_by_choice[choice]
else:
    print("Invalid choice. Defaulting to 'story'.")
    prompt_type = "story"


In [None]:
# Generate text based on the user's choice and stream it to the output
cap_response = get_gemini_response(answer, prompt_type)
if cap_response is not None:
    # The response arrives as a sequence of chunks; print their text back to
    # back so no line break is inserted at each chunk boundary.
    for chunk in cap_response:
        print(chunk.text, end="", flush=True)
    print()  # finish the streamed text with a single newline
else:
    print("Failed to generate the text.")