<a href="https://colab.research.google.com/github/Vicky270506/GenAI/blob/main/MultiModalRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RAG Multimodal Image Analyser



In [None]:
# Import necessary libraries
import os
import re
import gradio as gr
import requests
import json
from pytube import YouTube
from PIL import Image
import base64
from io import BytesIO
import textwrap
from IPython.display import Markdown, display

# Function to fetch a YouTube video's title and description (no transcript is downloaded)
def get_youtube_transcript(youtube_url):
    """Build a text blob from a YouTube video's title and description.

    Despite the name, no transcript is retrieved: the returned text is made
    only of the ``title`` and ``description`` attributes of the ``YouTube``
    object created for ``youtube_url``.

    Args:
        youtube_url: URL of the video, passed straight to ``YouTube``.

    Returns:
        A string with a "Title:" section followed by a "Description:"
        section, or a string starting with "Error processing YouTube video:"
        if any exception is raised along the way.
    """
    try:
        video = YouTube(youtube_url)
        # Title is read before description, then both are joined with a
        # blank line between the two sections.
        return "Title: {}\n\nDescription: {}".format(video.title, video.description)
    except Exception as exc:
        return "Error processing YouTube video: " + str(exc)

# Function to determine input type and process accordingly
def determine_input_type(user_input):
    """Classify raw user input as a YouTube link, an image URL, or plain text.

    Leading/trailing whitespace is ignored when deciding whether the input is
    a URL, and an image extension is recognised both at the very end of the
    URL and at the end of its path component, so links such as
    ``https://host/pic.png?w=100`` are treated as images.

    Args:
        user_input: The string entered by the user.

    Returns:
        A ``(kind, content)`` tuple. ``kind`` is ``"youtube"``,
        ``"image_url"`` or ``"text"``. For the two URL kinds ``content`` is
        the input with surrounding whitespace removed; for ``"text"`` it is
        the input exactly as given.
    """
    from urllib.parse import urlparse

    candidate = user_input.strip()

    # Check if it's a YouTube URL (anchored at the start of the input only).
    youtube_pattern = r'(https?://)?(www\.)?(youtube\.com/watch\?v=|youtu\.be/)[\w-]+'
    if re.match(youtube_pattern, candidate):
        return "youtube", candidate

    # Check if it's an image URL.
    image_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp')
    if candidate.startswith(('http://', 'https://')):
        try:
            url_path = urlparse(candidate).path.lower()
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): fall back to the
            # plain suffix check below.
            url_path = ""
        # The whole-string suffix check is kept so every URL that was
        # previously recognised still is; the path check additionally
        # ignores any query string or fragment.
        if candidate.lower().endswith(image_extensions) or url_path.endswith(image_extensions):
            return "image_url", candidate

    # Otherwise, treat as text.
    return "text", user_input

# Function to load image from URL
def load_image_from_url(image_url):
    """Download an image over HTTP and open it with Pillow.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        The opened PIL image on success. On any failure (network error,
        timeout, HTTP error status, undecodable data) a string starting with
        "Error loading image:" is returned instead, so callers tell the two
        apart with ``isinstance(result, str)``.
    """
    try:
        # Without a timeout a stalled server would block the UI indefinitely.
        response = requests.get(image_url, timeout=30)
        # Surface 4xx/5xx as an HTTP error rather than letting Pillow fail on
        # an HTML error page with a less helpful message.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception as e:
        return f"Error loading image: {str(e)}"

# Function to encode image to base64
def encode_image(image):
    """Serialise a PIL image as base64-encoded JPEG text.

    Args:
        image: A PIL image object.

    Returns:
        The JPEG bytes of ``image`` encoded as a base64 ``str``.
    """
    # JPEG cannot store alpha or palette data, so modes such as RGBA, LA and
    # P (typical for PNG and GIF sources) make ``save`` raise. Convert
    # anything that is not already a JPEG-compatible mode to RGB first.
    if image.mode not in ("L", "RGB", "CMYK"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()

# Function to get model list from OpenRouter
def get_openrouter_models(api_key):
    """Fetch the model catalogue from the OpenRouter ``/models`` endpoint.

    Args:
        api_key: OpenRouter API key, sent as a bearer token.

    Returns:
        The decoded JSON body when the server answers with status 200.
        Otherwise a dict with a single ``"error"`` key describing either the
        non-200 status code or the exception that was raised.
    """
    try:
        headers = {"Authorization": f"Bearer {api_key}"}
        # Bound the wait so a stalled connection cannot hang the UI callback.
        response = requests.get(
            "https://openrouter.ai/api/v1/models",
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            return {"error": f"Failed to retrieve models: {response.status_code}"}
        return response.json()
    except Exception as e:
        return {"error": f"Error fetching models: {str(e)}"}

# Function to test API connection
def test_openrouter_api(api_key):
    """Check whether the given API key can list models on OpenRouter.

    Args:
        api_key: OpenRouter API key to verify.

    Returns:
        A ``(success, message)`` tuple: ``(True, <confirmation text>)`` when
        the model listing contains no ``"error"`` entry, ``(False, <error>)``
        when it does, and ``(False, "API Error: ...")`` if an exception is
        raised while checking.
    """
    try:
        listing = get_openrouter_models(api_key)
        if "error" not in listing:
            return True, "API connection successful! ✓"
        return False, listing["error"]
    except Exception as exc:
        return False, f"API Error: {str(exc)}"

# Function to get summary using OpenRouter API
def get_openrouter_summary(content, content_type="text", api_key=None, model_id=None):
    """Ask an OpenRouter chat model to summarise text or describe an image.

    Args:
        content: For ``content_type="text"`` the text to summarise. For
            ``content_type="image"`` either an image URL (``str``) or an
            already loaded PIL image.
        content_type: ``"text"`` or ``"image"``.
        api_key: OpenRouter API key; required.
        model_id: Model identifier; falls back to
            ``"anthropic/claude-3-opus:beta"`` when empty or ``None``.

    Returns:
        The model's reply text on success. Every failure is reported as a
        plain string as well: a missing key, an unsupported content type, an
        image download error, a non-200 API status, an empty ``choices``
        list, or any exception raised while building or sending the request.
    """
    if not api_key:
        return "API key is required"

    # Reject unknown types up front; otherwise no payload would be built and
    # the failure would surface later as an unrelated-looking error.
    if content_type not in ("text", "image"):
        return f"Unsupported content type: {content_type}"

    # Default model if none specified
    if not model_id:
        model_id = "anthropic/claude-3-opus:beta"  # Good default for both text and vision

    try:
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        if content_type == "text":
            # Text-only request
            user_content = f"Please provide a comprehensive summary of the following content:\n\n{content}"
        else:
            if isinstance(content, str):  # Still a URL: download it first
                image = load_image_from_url(content)
                if isinstance(image, str):  # Error message from the loader
                    return image
            else:  # Already a PIL Image
                image = content
            img_base64 = encode_image(image)

            # Multi-part message: instruction text plus the inline image.
            user_content = [
                {"type": "text", "text": "Describe and summarize what you see in this image:"},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img_base64}"}}
            ]

        payload = {
            "model": model_id,
            "messages": [{"role": "user", "content": user_content}]
        }

        # Debug info
        print(f"Using model: {model_id}")
        print(f"Payload type: {content_type}")

        # Make request to OpenRouter API; the timeout keeps a stalled
        # connection from blocking the caller forever.
        response = requests.post(
            "https://openrouter.ai/api/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=120
        )

        if response.status_code != 200:
            return f"API Error: {response.status_code} - {response.text}"

        result = response.json()
        # Treat a missing, null or empty "choices" entry alike.
        if result.get("choices"):
            return result["choices"][0]["message"]["content"]
        return f"No content in response: {result}"

    except Exception as e:
        return f"Error getting summary: {str(e)}"

# Main function to process user input and generate summary
def process_input(api_key, model_name, user_input):
    """Classify the user's input, summarise it, and label the result.

    Args:
        api_key: OpenRouter API key forwarded to the summary call.
        model_name: Value selected in the model dropdown; used directly as
            the model id.
        user_input: Text, image URL, or YouTube URL entered by the user.

    Returns:
        A string for display: the summary prefixed with a heading naming the
        detected input kind, a prompt to enter something when the input is
        blank, the YouTube error text when the video could not be read, or
        "Error: ..." if an exception is raised.
    """
    # Nothing to summarise: avoid spending an API call on blank input.
    if not user_input or not user_input.strip():
        return "Please enter some text, an image URL, or a YouTube URL."

    try:
        # Determine input type and process accordingly
        input_type, content = determine_input_type(user_input)

        # The dropdown stores the actual model id, so no mapping is needed.
        model_id = model_name

        if input_type == "youtube":
            video_content = get_youtube_transcript(content)
            # A failed fetch comes back as an error string; show it to the
            # user instead of asking the model to summarise the error text.
            if video_content.startswith("Error processing YouTube video:"):
                return video_content
            summary = get_openrouter_summary(video_content, "text", api_key, model_id)
            return f"YouTube Video Summary:\n\n{summary}"

        if input_type == "image_url":
            summary = get_openrouter_summary(content, "image", api_key, model_id)
            return f"Image Summary:\n\n{summary}"

        # Text input
        summary = get_openrouter_summary(content, "text", api_key, model_id)
        return f"Text Summary:\n\n{summary}"

    except Exception as e:
        return f"Error: {str(e)}"

# Function to populate model dropdown
def update_model_dropdown(api_key):
    """Return the sorted list of OpenRouter model ids for the dropdown.

    Args:
        api_key: OpenRouter API key used to query the model catalogue.

    Returns:
        A ``(models, message)`` tuple. ``models`` is an alphabetically sorted
        list of model id strings (empty on any failure or when no key is
        given) and ``message`` is a status text for the UI.
    """
    if not api_key:
        return [], "Enter API key to view available models"

    try:
        # Get models from OpenRouter
        models_data = get_openrouter_models(api_key)

        if "error" in models_data:
            return [], models_data["error"]

        # Keep only real, non-empty string ids: entries without an id would
        # otherwise show up as blank dropdown choices (or break sorting).
        raw_ids = (model.get("id") for model in models_data.get("data", []))
        models = sorted(
            model_id for model_id in raw_ids
            if isinstance(model_id, str) and model_id
        )

        return models, "Models loaded successfully!"

    except Exception as e:
        return [], f"Error loading models: {str(e)}"

# Create Gradio interface
def create_interface():
    """Build the Gradio Blocks UI for the summariser.

    The layout contains an API-key field, a status box, a connection test
    button, a model dropdown with a refresh button, a hidden debug box, the
    user input box, a submit button, the output box, and a set of example
    inputs. Event handlers wire the buttons and the key field to the
    module's API helper functions.

    Returns:
        The assembled ``gr.Blocks`` object (not yet launched).
    """
    with gr.Blocks(title="OpenRouter Summarizer Chatbot") as demo:
        gr.Markdown("# OpenRouter Summarizer Chatbot")
        gr.Markdown("This chatbot can summarize text, image URLs, and YouTube videos using OpenRouter API, which provides access to various LLMs.")

        # API Key input
        api_key_input = gr.Textbox(
            label="OpenRouter API Key",
            placeholder="Enter your OpenRouter API key here",
            type="password"
        )

        # Status message
        status_message = gr.Textbox(label="Status", interactive=False)

        # Test API button
        test_button = gr.Button("Test API Connection")

        # Model selection dropdown (will be populated after API key is entered)
        with gr.Row():
            model_dropdown = gr.Dropdown(
                label="Select Model",
                choices=[],
                interactive=False,
                info="Select a model that supports your input type"
            )
            refresh_models = gr.Button("Refresh Models")

        # Debug info (hidden; not wired to any handler here)
        api_debug = gr.Textbox(label="API Debug Info", visible=False)

        # User input
        user_input = gr.Textbox(
            label="Input (Text, Image URL, or YouTube URL)",
            placeholder="Enter text, image URL, or YouTube video URL",
            lines=5
        )

        # Submit button
        submit_button = gr.Button("Get Summary")

        # Output
        output = gr.Textbox(label="Summary Output", lines=10)

        # Set up event handlers
        def test_api(key):
            # Only the message is shown; the success flag is not needed here.
            _, message = test_openrouter_api(key)
            return message

        test_button.click(
            fn=test_api,
            inputs=api_key_input,
            outputs=status_message
        )

        def load_models(key):
            # Enable the dropdown only when there is something to choose from.
            models, message = update_model_dropdown(key)
            return gr.Dropdown(choices=models, value=models[0] if models else None, interactive=bool(models)), message

        refresh_models.click(
            fn=load_models,
            inputs=api_key_input,
            outputs=[model_dropdown, status_message]
        )

        submit_button.click(
            fn=process_input,
            inputs=[api_key_input, model_dropdown, user_input],
            outputs=output
        )

        # Automatically update models when API key is entered
        api_key_input.change(
            fn=load_models,
            inputs=api_key_input,
            outputs=[model_dropdown, status_message]
        )

        # Add examples. Each row must hold exactly one value per input
        # component; with a single input box, a leading empty column would be
        # what gets mapped onto it instead of the example text.
        gr.Examples(
            examples=[
                ["What are the main challenges of climate change?"],
                ["https://www.youtube.com/watch?v=dQw4w9WgXcQ"],
                ["https://images.unsplash.com/photo-1575936123452-b67c3203c357?q=80&w=1000&auto=format&fit=crop"]
            ],
            inputs=[user_input]
        )

    return demo

# Run the app in Colab
def main():
    """Install dependencies when running in Colab, then launch the app.

    The install step is best-effort: a failed ``pip`` run does not stop the
    launch. Note that the file's top-level ``import gradio`` / ``pytube``
    statements execute before this function, so on a fresh environment those
    packages must already be present for the module to load at all.

    Returns:
        The string "Gradio app is running!" once ``launch`` returns.
    """
    try:
        import google.colab  # noqa: F401  (only used to detect Colab)
    except ImportError:
        # Not running in Colab: nothing to install.
        pass
    else:
        import subprocess
        import sys

        print("Running in Colab, installing required packages...")
        # Plain-Python equivalent of the notebook-only `!pip install` escape;
        # `sys.executable -m pip` targets the interpreter running this code.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "-q", "gradio", "pytube", "pillow"],
            check=False,
        )

    demo = create_interface()
    demo.launch(debug=True, share=True)
    return "Gradio app is running!"

# Run the app
# Compare __name__ against each accepted value explicitly: a bare non-empty
# string on the right-hand side of `or` is always truthy, which would start
# the app even when this file is merely imported.
if __name__ in ("__colab__", "__main__"):
    print("Starting OpenRouter Summarizer Chatbot...")
    main()