In [None]:
# ============================================================
# CELL 1: INSTALL DEPENDENCIES (Run only once per setup)
# ============================================================

print("üì¶ Installing dependencies...")
!pip install -q transformers pillow gTTS opencv-python

# Imports
from transformers import BlipProcessor, BlipForConditionalGeneration
from gtts import gTTS
from PIL import Image
from IPython.display import Audio, display
import torch, os, cv2, time

print("‚úÖ All libraries installed and imported successfully!")

# Pick the compute device once; later cells read the global `device`.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"üöÄ Using device: {device.upper()}")

# Load the BLIP captioning checkpoint: `processor` prepares images and decodes
# token ids, `model` generates the caption tokens. Both are used as globals by
# generate_caption().
model_id = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id)
model = model.to(device)

print("‚úÖ BLIP model loaded successfully!")


üì¶ Installing dependencies...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m98.2/98.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


‚úÖ All libraries installed and imported successfully!
üöÄ Using device: CPU


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

‚úÖ BLIP model loaded successfully!


In [None]:
# ============================================================
# CELL 2: SUPPORTING FUNCTIONS (Caption, Audio, Frame Extraction)
# ============================================================

def generate_caption(image_path: str) -> str:
    """Return a caption for the image stored at ``image_path``.

    The image is opened with PIL, converted to RGB, pre-processed by the
    global ``processor``, moved to the global ``device`` and passed to the
    global ``model`` for generation (at most 50 new tokens). The first
    generated sequence is decoded with special tokens stripped.

    Any exception is printed and replaced by the placeholder string
    "Error generating caption.", so the caller always receives a string.
    """
    try:
        rgb_image = Image.open(image_path).convert("RGB")
        model_inputs = processor(rgb_image, return_tensors="pt")
        model_inputs = model_inputs.to(device)
        token_ids = model.generate(**model_inputs, max_new_tokens=50)
        return processor.decode(token_ids[0], skip_special_tokens=True)
    except Exception as e:
        print(f"‚ùå Caption error: {e}")
        return "Error generating caption."

def speak_text(text: str, filename="caption.mp3", lang="en"):
    """Convert ``text`` to speech with gTTS, save it and play it inline.

    Args:
        text: Text to speak. Only the first 400 characters are sent to gTTS.
            Empty, whitespace-only or ``None`` text is ignored silently.
        filename: Path of the MP3 file to (over)write.
        lang: gTTS language code. Defaults to ``"en"`` so existing callers
            keep their behaviour; the parameter mirrors the later
            redefinition of this helper in the notebook.

    Returns:
        None. Failures (network, unsupported language, file errors) are
        printed rather than raised so one bad clip does not stop a run.
    """
    try:
        # Guard against None as well as blank strings before touching gTTS.
        if not text or not text.strip():
            return
        gTTS(text=text[:400], lang=lang).save(filename)
        display(Audio(filename, autoplay=True))
    except Exception as e:
        print(f"‚ö†Ô∏è Audio error: {e}")

def extract_frames(video_path, target_frames=6):
    """Save up to ``target_frames`` evenly spaced frames of a video as JPEGs.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        target_frames: Maximum number of frames to extract. Values below 1
            are treated as 1.

    Returns:
        List of written file names (``frame_<index>.jpg`` in the current
        working directory), in playback order. Empty when OpenCV reports no
        frames for the video.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            print("‚ö†Ô∏è No frames detected in video.")
            return []

        # Never ask for more frames than exist, and never fewer than one.
        wanted_count = min(max(int(target_frames), 1), total)
        # Evenly spaced indices; unlike a fixed `total // target` stride this
        # can never yield more than `wanted_count` frames.
        wanted = {(i * total) // wanted_count for i in range(wanted_count)}
        last_wanted = max(wanted)

        count = 0
        # Stop decoding as soon as the last requested frame has been seen.
        while count <= last_wanted:
            success, frame = vidcap.read()
            if not success:
                break
            if count in wanted:
                frame_path = f"frame_{count}.jpg"
                # Only report frames that were actually written to disk.
                if cv2.imwrite(frame_path, frame):
                    frames.append(frame_path)
            count += 1
    finally:
        # Release the capture on every path, including the early return.
        vidcap.release()
    print(f"‚úÖ Extracted {len(frames)} key frames.")
    return frames

def describe_image(image_path):
    """Caption one image, print the caption and speak it.

    Args:
        image_path: Path passed unchanged to ``generate_caption``.

    Returns:
        The string returned by ``generate_caption`` (which is its error
        placeholder when captioning failed).
    """
    caption = generate_caption(image_path)
    print(f"üñºÔ∏è  {caption}")
    # Uses speak_text's defaults: the audio is written to "caption.mp3".
    speak_text(caption)
    return caption

def describe_video(video_path):
    """Caption a handful of frames from a video and speak a joined summary.

    Six frames are requested from ``extract_frames``; each one is captioned,
    printed and spoken. The summary is the distinct captions, in first-seen
    order, joined with ". "; it is printed and spoken from "summary.mp3".

    Args:
        video_path: Path of the video file to describe.

    Returns:
        None.
    """
    print(f"\nüé¨ Processing video: {video_path}")
    frames = extract_frames(video_path, target_frames=6)
    if not frames:
        print("‚ùå No frames to process.")
        return
    captions = []
    try:
        for i, frame in enumerate(frames, 1):
            print(f"\nüß† Frame {i}/{len(frames)}:")
            caption = generate_caption(frame)
            print("üìù", caption)
            captions.append(caption)
            speak_text(caption)
    finally:
        # Remove every temporary frame, even if the loop was interrupted
        # (e.g. the user stops a slow CPU run), and tolerate frames that
        # were never written.
        for frame in frames:
            if os.path.exists(frame):
                os.remove(frame)
    # dict.fromkeys drops duplicate captions while keeping their order.
    summary = ". ".join(dict.fromkeys(captions))
    print("\nüìú Summary:\n", summary)
    speak_text(summary, "summary.mp3")


In [None]:
# Sanity check: confirm that `transformers` imports in this kernel and show
# which version is active.
import transformers
print(f"‚úÖ 'transformers' library is installed and version is: {transformers.__version__}")

‚úÖ 'transformers' library is installed and version is: 4.57.1


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Re-derive the compute device so this cell also works when it is run without
# the model-loading cell above.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Pre-trained summarization checkpoint to load.
summarizer_model_id = "sshleifer/distilbart-cnn-12-6"

# Tokenizer and seq2seq model for that checkpoint; the model is moved to the
# selected device.
# NOTE(review): no cell visible in this notebook calls summarizer_tokenizer or
# summarizer_model - confirm whether this load is still needed.
summarizer_tokenizer = AutoTokenizer.from_pretrained(summarizer_model_id)
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_model_id)
summarizer_model = summarizer_model.to(device)

print(f"‚úÖ Summarization model '{summarizer_model_id}' loaded successfully on {device.upper()}!")

‚úÖ Summarization model 'sshleifer/distilbart-cnn-12-6' loaded successfully on CPU!


In [None]:
# Ensure gTTS is installed if not already present in the environment
!pip install -q gTTS

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
import os
from google.colab import files
from IPython.display import display, Javascript, Audio
from google.colab.output import eval_js
import base64
from io import BytesIO
from PIL import Image
from gtts import gTTS
import cv2
import time


In [21]:
# ============================================================
# CONSOLIDATED SUPPORTING FUNCTIONS AND MAIN EXECUTION (Moved from original b91b940f)
# ============================================================

# Supporting functions (Caption, Audio, Frame Extraction)
def generate_caption(image_path: str) -> str:
    """Caption the image at ``image_path`` with the globally loaded BLIP model.

    Steps: open the file with PIL and convert it to RGB, build tensors with
    the global ``processor``, move them to the global ``device``, generate
    at most 50 new tokens with the global ``model`` and decode the first
    sequence without special tokens.

    On any exception the error is printed and the placeholder
    "Error generating caption." is returned instead of raising.
    """
    try:
        picture = Image.open(image_path).convert("RGB")
        batch = processor(picture, return_tensors="pt")
        batch = batch.to(device)
        generated = model.generate(**batch, max_new_tokens=50)
        first_sequence = generated[0]
        return processor.decode(first_sequence, skip_special_tokens=True)
    except Exception as e:
        print(f"‚ùå Caption error: {e}")
        return "Error generating caption."

def speak_text(text: str, filename="caption.mp3", lang="en"):
    """Synthesize ``text`` with gTTS, save it to ``filename`` and autoplay it.

    Whitespace-only text is skipped. Only the first 400 characters are
    spoken. ``lang`` is the gTTS language code. Every exception is printed
    instead of raised, and the function always returns None.
    """
    try:
        if text.strip():
            snippet = text[:400]
            speech = gTTS(text=snippet, lang=lang)
            speech.save(filename)
            player = Audio(filename, autoplay=True)
            display(player)
    except Exception as e:
        print(f"‚ö†Ô∏è Audio error: {e}")

def extract_frames(video_path, target_frames=6):
    """Save up to ``target_frames`` evenly spaced frames of a video as JPEGs.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        target_frames: Maximum number of frames to extract. Values below 1
            are treated as 1.

    Returns:
        List of written file names (``frame_<index>.jpg`` in the current
        working directory), in playback order. Empty when OpenCV reports no
        frames for the video.
    """
    vidcap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total <= 0:
            print("‚ö†Ô∏è No frames detected in video.")
            return []

        # Never ask for more frames than exist, and never fewer than one.
        wanted_count = min(max(int(target_frames), 1), total)
        # Evenly spaced indices; unlike a fixed `total // target` stride this
        # can never yield more than `wanted_count` frames.
        wanted = {(i * total) // wanted_count for i in range(wanted_count)}
        last_wanted = max(wanted)

        count = 0
        # Stop decoding as soon as the last requested frame has been seen.
        while count <= last_wanted:
            success, frame = vidcap.read()
            if not success:
                break
            if count in wanted:
                frame_path = f"frame_{count}.jpg"
                # Only report frames that were actually written to disk.
                if cv2.imwrite(frame_path, frame):
                    frames.append(frame_path)
            count += 1
    finally:
        # Release the capture on every path, including the early return.
        vidcap.release()
    print(f"‚úÖ Extracted {len(frames)} key frames.")
    return frames

def describe_image(image_path, lang=None):
    """Caption one image, print the caption and speak it.

    Args:
        image_path: Path passed unchanged to ``generate_caption``.
        lang: gTTS language code for the spoken caption. When omitted, the
            notebook-level ``selected_language`` is used if it has been set,
            otherwise ``"en"``. The fallback stops the function from raising
            NameError when it is called before the language prompt has run.

    Returns:
        The string returned by ``generate_caption``.
    """
    if lang is None:
        lang = globals().get("selected_language", "en")
    caption = generate_caption(image_path)
    print(f"üñºÔ∏è  {caption}")
    speak_text(caption, lang=lang)
    return caption

def describe_video(video_path, lang=None):
    """Caption a handful of frames from a video and speak a joined summary.

    Six frames are requested from ``extract_frames``; each one is captioned,
    printed and spoken. The summary is the distinct captions, in first-seen
    order, joined with ". "; it is printed and spoken from "summary.mp3".

    Args:
        video_path: Path of the video file to describe.
        lang: gTTS language code for all audio. When omitted, the
            notebook-level ``selected_language`` is used if it has been set,
            otherwise ``"en"``.

    Returns:
        None.
    """
    if lang is None:
        lang = globals().get("selected_language", "en")
    print(f"\nüé¨ Processing video: {video_path}")
    frames = extract_frames(video_path, target_frames=6)
    if not frames:
        print("‚ùå No frames to process.")
        return
    captions = []
    try:
        for i, frame in enumerate(frames, 1):
            print(f"\nüß† Frame {i}/{len(frames)}:")
            caption = generate_caption(frame)
            print("üìù", caption)
            captions.append(caption)
            speak_text(caption, lang=lang)
    finally:
        # Remove every temporary frame, even if the loop was interrupted
        # (e.g. the user stops a slow CPU run), and tolerate frames that
        # were never written.
        for frame in frames:
            if os.path.exists(frame):
                os.remove(frame)
    # dict.fromkeys drops duplicate captions while keeping their order.
    summary = ". ".join(dict.fromkeys(captions))
    print("\nüìú Summary:\n", summary)
    speak_text(summary, "summary.mp3", lang=lang)

# Helper functions for file input/camera capture
def choose_file():
    """Prompt for an upload through Colab and return the uploaded file name.

    ``files.upload()`` returns a mapping of file name to content; only the
    first file name is used when several files are uploaded.

    Returns:
        The first uploaded file name, or None when nothing was uploaded.
    """
    print("Please upload an image or video file.")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return None
    # Take the first key without materialising the whole key list.
    file_path = next(iter(uploaded))
    print(f"File '{file_path}' uploaded.")
    return file_path

def capture_from_camera(filename="capture.jpg"):
    """Capture one webcam frame through the Colab front end and save it.

    A JavaScript helper is displayed in the cell output and then invoked with
    ``eval_js``. The helper requests a video stream, draws the current video
    frame onto a canvas, stops the first video track, removes its temporary
    DOM element and returns the canvas as a JPEG data URL. The Python side
    decodes the base64 payload and saves it with PIL.

    Args:
        filename: Path the captured image is saved to.

    Returns:
        ``filename`` on success; None when no data came back or when any
        exception occurred (the exception is printed, not raised).
    """
    print("üì∏ Initiating camera capture. Please grant camera access if prompted.")
    # The script only *defines* captureFromCamera(); it is called further down
    # through eval_js. Note that the frame is grabbed right after play()
    # resolves - there is no capture button or delay.
    js = Javascript('''
        async function captureFromCamera() {
            const div = document.createElement('div');
            const video = document.createElement('video');
            video.style.display = 'block';
            const stream = await navigator.mediaDevices.getUserMedia({video: true});

            document.body.appendChild(div);
            div.appendChild(video);
            video.srcObject = stream;
            await video.play();

            // Resize the output to fit the video element.
            google.colab.output.setIframeHeight(document.documentElement.scrollHeight, true);

            const canvas = document.createElement('canvas');
            canvas.width = video.videoWidth;
            canvas.height = video.videoHeight;
            canvas.getContext('2d').drawImage(video, 0, 0);
            stream.getVideoTracks()[0].stop();
            div.remove();
            return canvas.toDataURL('image/jpeg');
        }
        ''')
    # The function must be displayed before eval_js can call it.
    display(js)
    try:
        # Execute the JavaScript function and get the base64 image data
        data = eval_js('captureFromCamera()')

        if data:
            # The data URL is split on the first comma: the part after it is
            # the base64 payload that gets decoded and saved.
            binary_data = base64.b64decode(data.split(',')[1])
            img = Image.open(BytesIO(binary_data))
            img.save(filename)
            print(f"‚úÖ Captured {filename}")
            return filename
        else:
            print("‚ùå No photo captured.")
            return None
    except Exception as e:
        # Covers failures raised by eval_js as well as decode/save errors.
        print(f"‚ùå Camera capture error: {e}")
        return None

# ---- LANGUAGE SELECTION ----
# Language codes offered to the user, mapped to their display names.
# NOTE(review): the selection only changes the gTTS voice language; captions
# are passed to speak_text untranslated - confirm this is intended.
language_options = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'pt': 'Portuguese',
    'zh-CN': 'Chinese (Mandarin)',
    'hi': 'Hindi',
    'ru': 'Russian',
    'ar': 'Arabic'
}

print("\nüåç Please choose a language for the audio descriptions:")
for code, name in language_options.items():
    print(f"[{code}] {name}")

# The answer is lower-cased before matching, so look it up through a
# lower-case index and map it back to the canonical key. Without this the
# mixed-case code 'zh-CN' could never be selected.
_language_lookup = {code.lower(): code for code in language_options}

selected_language = 'en' # Default language
while True:
    choice = input("Enter language code (e.g., 'en' for English): ").strip().lower()
    if choice in _language_lookup:
        selected_language = _language_lookup[choice]
        print(f"‚úÖ Language set to {language_options[selected_language]}.")
        break
    else:
        print("‚ö†Ô∏è Invalid language code. Please try again.")

# ---- MAIN EXECUTION ----
# Interactive loop: keep asking for an input source until the user types
# 'stop'. Each branch ends its iteration explicitly so the file-handling path
# reads straight down without nesting.
while True:
    print("\nüìÅ Choose an image/video from your device or capture via webcam.")
    print("üëâ Type 'file' to choose a file, 'camera' to capture, or 'stop' to exit:")

    choice = input("Enter choice: ").strip().lower()

    if choice == "stop":
        print("Execution stopped by user.")
        break

    if choice == "camera":
        photo = capture_from_camera()
        if photo:
            describe_image(photo)
        continue

    if choice != "file":
        print("‚ö†Ô∏è Invalid choice.")
        continue

    # choice == "file": dispatch on the uploaded file's extension.
    path = choose_file()
    if not path:
        print("‚ùå No file selected.")
        continue

    if path.lower().endswith(('.jpg', '.jpeg', '.png')):
        describe_image(path)
    elif path.lower().endswith(('.mp4', '.mov', '.avi')):
        describe_video(path)
    else:
        print("‚ö†Ô∏è Unsupported file type.")


üåç Please choose a language for the audio descriptions:
[en] English
[es] Spanish
[fr] French
[de] German
[it] Italian
[pt] Portuguese
[zh-CN] Chinese (Mandarin)
[hi] Hindi
[ru] Russian
[ar] Arabic
Enter language code (e.g., 'en' for English): ar
‚úÖ Language set to Arabic.

üìÅ Choose an image/video from your device or capture via webcam.
üëâ Type 'file' to choose a file, 'camera' to capture, or 'stop' to exit:
Enter choice: camera
üì∏ Initiating camera capture. Please grant camera access if prompted.


<IPython.core.display.Javascript object>

‚úÖ Captured capture.jpg
üñºÔ∏è  there is a man sitting at a table with a plate of food



üìÅ Choose an image/video from your device or capture via webcam.
üëâ Type 'file' to choose a file, 'camera' to capture, or 'stop' to exit:
Enter choice: stop
Execution stopped by user.
