In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 🛠 *Environment Configuration and Dependencies*

*This section establishes the foundational environment for our multimodal content analysis platform. We begin by configuring the Python environment and installing essential dependencies that enable various AI capabilities:*

- ***Google Generative AI**:    Core library for accessing advanced language and vision models*
- ***LangChain**:    Framework for building language model applications*
- ***FAISS**:    Efficient similarity search and clustering of dense vectors*
- ***PyPDF**:    PDF processing capabilities*
- ***Pydub**:    Audio file manipulation*
- ***Pillow**:    Image processing*
- ***Pytube**:    YouTube video handling*

*The selection of these specific packages was driven by their proven reliability in production environments and their ability to work seamlessly together in a multimodal context.*

In [2]:
# Uninstall the JupyterLab packages first (-q: quiet, -y: no confirmation prompt).
!pip uninstall -qy jupyterlab jupyterlab-lsp

# Install or upgrade (-U) the notebook's dependencies, one package per line.
# None of the packages is version-pinned.
# NOTE(review): the recorded output of this cell shows a dependency conflict
# (google-generativeai 0.8.4 requires google-ai-generativelanguage==0.6.15,
# but 0.6.17 was installed) -- consider pinning mutually compatible versions.
!pip install -qU google-generativeai
!pip install -qU langchain
!pip install -qU langchain-community
!pip install -qU langchain-google-genai
!pip install -qU faiss-cpu
!pip install -qU python-dotenv
!pip install -qU pypdf
!pip install -qU chromadb
!pip install -qU pydub
!pip install -qU pillow
!pip install -qU requests
!pip install -qU streamlit
!pip install -qU pytube
!pip install -qU ffmpeg-python

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.3/423.3 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-generativeai 0.8.4 requires google-ai-generativelanguage==0.6.15, but you have google-ai-generativelanguage 0.6.17 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3

In [3]:
import os
import getpass
import re
import json
import requests
import io
import tempfile
import google.generativeai as genai
from google.genai import types
from IPython.display import Markdown, HTML, display
from kaggle_secrets import UserSecretsClient
from PIL import Image

  warn(


In [4]:
# Display the version of the google.generativeai package imported above as `genai`.
genai.__version__

'0.8.4'

# ⚙ *API Configuration and Model Setup*

*This section handles the secure configuration of the Google Generative AI API and model initialization. We implement several key security and performance considerations:*

- *Secure API key management using Kaggle's secrets*
- *Model configuration with optimized parameters for different content types*
- *Safety settings that define the blocking threshold for each harm category (currently `BLOCK_NONE` for every category, i.e. nothing is blocked)*
- *Temperature and token settings optimized for our use case*

*The configuration ensures both security and optimal performance across different content modalities.*

In [5]:
# Read the API key from Kaggle's secret store (secret name: "GOOGLE_API_KEY")
# so the key itself never appears in the notebook source.
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")

# Configure the google.generativeai SDK with that key
genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
from google.generativeai.types import HarmCategory, HarmBlockThreshold

# Generation parameters and per-category safety thresholds, grouped in one dict.
# NOTE(review): this dict is not passed to any of the GenerativeModel(...)
# calls below, so within this cell the settings are defined but not applied
# -- confirm whether they were meant to be used.
text_generation_config = {
    "generation_config": {
        "temperature": 0.7,
        "top_p": 0.9,
        "top_k": 40,
        "max_output_tokens": 2048,
        "candidate_count": 1,
    },
    # Every listed harm category uses the BLOCK_NONE threshold.
    "safety_settings": {
        HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    }
}

# Initialize the models
# Text and vision use the same model name; each gets its own GenerativeModel instance.
text_model = genai.GenerativeModel(model_name='gemini-2.0-flash')
vision_model = genai.GenerativeModel(model_name='gemini-2.0-flash')
# Embedding model identifier, stored as a plain string.
embedding_model = 'models/embedding-001'

# 📃 *Document Processing Module*

*The DocumentProcessor class implements a sophisticated document analysis system with the following key features:*

- ***Chunk-based Processing**:    Implements recursive text splitting for efficient processing of large documents*
- ***RAG Implementation**:    Combines retrieval and generation for context-aware responses*
- ***Multi-format Support**:    Handles both PDF and text documents seamlessly*
- ***Keyword Search**:    Implements keyword-based search with simple relevance scoring (number of query words found in each document)*

*This module forms the foundation for our document understanding capabilities, enabling sophisticated analysis of textual content.*

In [7]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.schema.document import Document
import tempfile
import re

class DocumentProcessor:
    """Document store with keyword retrieval and RAG-style answering.

    Documents are kept in ``self.documents``. ``search_documents`` ranks them
    by how many query words occur in each document's text, and
    ``generate_rag_response`` passes the best matches to ``text_model`` as
    context for answering a query.
    """
    
    def __init__(self, text_model=None):
        """Create an empty document store.

        Args:
            text_model: Object exposing ``generate_content(prompt)`` whose
                result has a ``.text`` attribute. May be ``None``; in that
                case ``generate_rag_response`` returns an error message
                instead of an answer.
        """
        # Splitter used by process_text_string to cut raw text into chunks.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, 
            chunk_overlap=100,
            separators=["\n\n", "\n", " ", ""]
        )
        self.documents = []
        self.text_model = text_model
        
    def load_pdf(self, pdf_path):
        """Load a PDF with PyPDFLoader and add its documents to the store.

        The loaded documents are stored as returned by the loader; they are
        not passed through ``self.text_splitter``.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A status string: a success message with the number of loaded
            pages, or ``"Error loading PDF: ..."`` if loading failed.
        """
        try:
            loader = PyPDFLoader(pdf_path)
            documents = loader.load()
            self.documents.extend(documents)
            return f"Loaded PDF: {pdf_path} with {len(documents)} pages"
        except Exception as e:
            return f"Error loading PDF: {str(e)}"
    
    def load_text(self, text_path):
        """Load a text file with TextLoader and add its documents to the store.

        The loaded documents are stored as returned by the loader; they are
        not passed through ``self.text_splitter``.

        Args:
            text_path: Path of the text file.

        Returns:
            A status string: a success message, or
            ``"Error loading text file: ..."`` if loading failed.
        """
        try:
            loader = TextLoader(text_path)
            documents = loader.load()
            self.documents.extend(documents)
            return f"Loaded text file: {text_path}"
        except Exception as e:
            return f"Error loading text file: {str(e)}"
    
    def process_text_string(self, text, metadata=None):
        """Split a raw string into chunks and add them to the store.

        Args:
            text: The text to split with ``self.text_splitter``.
            metadata: Optional metadata for the chunks. Defaults to
                ``{"source": "direct input"}``.

        Returns:
            A status string with the number of chunks created.
        """
        if metadata is None:
            metadata = {"source": "direct input"}
        
        chunks = self.text_splitter.split_text(text)
        # Give every chunk its own copy of the metadata so that editing one
        # Document's metadata cannot silently change the others (or the
        # caller's dict).
        docs = [Document(page_content=chunk, metadata=dict(metadata)) for chunk in chunks]
        self.documents.extend(docs)
        return f"Processed text input with {len(docs)} chunks"
    
    def process_documents(self, documents):
        """Add already-built documents to the store.

        Args:
            documents: Iterable of document objects; an empty or ``None``
                value adds nothing.

        Returns:
            A status string describing what was added.
        """
        if not documents:
            return "No documents to process"
            
        self.documents.extend(documents)
        return f"Added {len(documents)} documents to the store"
    
    def search_documents(self, query, k=5):
        """Rank stored documents by keyword overlap with ``query``.

        The query is lower-cased and split into ``\\w+`` words. A document
        scores one point for each query word that occurs as a substring of
        its lower-cased ``page_content``. Documents with equal scores keep
        their insertion order.

        Args:
            query: Search string.
            k: Maximum number of documents to return.

        Returns:
            Up to ``k`` matching documents, best first. When the store is
            empty or nothing matches, a one-element list holding an
            explanatory string is returned instead.
        """
        if not self.documents:
            return ["No documents have been processed yet"]
        
        query_words = re.findall(r'\w+', query.lower())
        scored_docs = []
        
        for doc in self.documents:
            content_lower = doc.page_content.lower()
            # Substring test: "cat" also counts as present in "category".
            score = sum(1 for word in query_words if word in content_lower)
            if score > 0:
                scored_docs.append((score, doc))
        
        # Sort on the score only, so documents themselves are never compared.
        scored_docs.sort(key=lambda x: x[0], reverse=True)
        results = [doc for _, doc in scored_docs[:k]]
        
        return results if results else ["No relevant documents found"]
    
    def generate_rag_response(self, query, k=5):
        """Answer ``query`` using the top ``k`` matching documents as context.

        Args:
            query: The question to answer.
            k: Number of documents to retrieve as context.

        Returns:
            The model's answer text, or an explanatory string when the store
            is empty, nothing relevant is found, no model is configured, or
            the model call fails.
        """
        if not self.documents:
            return "No documents have been processed yet. Please add documents first."
        
        # Search for relevant context
        relevant_docs = self.search_documents(query, k=k)
        
        if not relevant_docs or relevant_docs[0] == "No relevant documents found":
            return "No relevant information found to answer the query."
        
        # Fail with a clear message instead of the AttributeError text that a
        # missing model would otherwise produce further down.
        if self.text_model is None:
            return "Error generating response: no text model configured"
        
        # Results may be plain strings or objects with .page_content.
        if isinstance(relevant_docs[0], str):
            context_text = "\n\n".join(relevant_docs)
        else:
            context_text = "\n\n".join([doc.page_content for doc in relevant_docs])
        
        # Create the prompt with context
        prompt = f"""
        The following information is relevant to the query:
        
        {context_text}
        
        Based only on the information provided above, answer the following query. If the information needed is not 
        provided in the context, state that you don't have enough information:
        
        Query: {query}
        """
        
        # Any failure of the model call is reported as text, not raised.
        try:
            response = self.text_model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {str(e)}"

# 🖼 *Image Understanding Module*

*The ImageProcessor class provides comprehensive image analysis capabilities:*

- ***Multi-source Loading**:    Supports both local files and remote URLs*
- ***Object Detection**:    Identifies and classifies objects with confidence scoring*
- ***OCR Integration**:    Extracts text from images*
- ***Structured Analysis**:    Returns results in standardized JSON format*

*The implementation focuses on practical applications while maintaining flexibility for different use cases.*

In [8]:
class ImageProcessor:
    """Image analysis helpers built on a multimodal generative model.

    Images may be passed as already-loaded objects, local file paths, or
    http(s) URLs. All methods report failures as return values rather than
    raising.
    """
    
    def __init__(self, vision_model):
        """Store the model used for all image prompts.

        Args:
            vision_model: Object exposing ``generate_content([prompt, image])``
                whose result has a ``.text`` attribute.
        """
        self.model = vision_model
    
    def load_image_from_path(self, image_path):
        """Open an image file.

        Args:
            image_path: Path of the image file.

        Returns:
            The opened image, or the string ``"Error loading image: ..."``
            if it could not be opened.
        """
        try:
            image = Image.open(image_path)
            return image
        except Exception as e:
            return f"Error loading image: {str(e)}"
    
    def load_image_from_url(self, image_url, timeout=30):
        """Download an image and open it.

        Args:
            image_url: http(s) URL of the image.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the notebook indefinitely.

        Returns:
            The opened image, or the string
            ``"Error loading image from URL: ..."`` on any failure
            (network error, HTTP error status, undecodable content).
        """
        try:
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()
            image = Image.open(io.BytesIO(response.content))
            return image
        except Exception as e:
            return f"Error loading image from URL: {str(e)}"
    
    def analyze_image(self, image, prompt="Describe this image in detail"):
        """Send an image and a prompt to the model and return its answer.

        Args:
            image: An image object, a local file path, or an http(s) URL.
            prompt: Instruction sent to the model together with the image.

        Returns:
            The model's text answer. If the image cannot be loaded, the
            loader's error string is returned and the model is not called.
            Any other failure yields ``"Error analyzing image: ..."``.
        """
        try:
            if isinstance(image, str):
                if image.startswith(('http://', 'https://')):
                    image = self.load_image_from_url(image)
                else:
                    image = self.load_image_from_path(image)
                
                # The loaders signal failure by returning an error string.
                # Forwarding that string would make the model "describe" an
                # error message as if it were the image, so stop here.
                if isinstance(image, str):
                    return image
            
            response = self.model.generate_content([prompt, image])
            return response.text
        except Exception as e:
            return f"Error analyzing image: {str(e)}"
    
    def extract_text_from_image(self, image):
        """Ask the model to transcribe all text visible in the image.

        Args:
            image: Anything accepted by ``analyze_image``.

        Returns:
            The string returned by ``analyze_image`` for the OCR prompt.
        """
        prompt = "Extract and transcribe all visible text from this image. Just return the text, formatted properly."
        return self.analyze_image(image, prompt)
    
    def identify_objects(self, image):
        """Ask the model for the objects in the image as JSON.

        Args:
            image: Anything accepted by ``analyze_image``.

        Returns:
            The parsed JSON object from the model's answer. If the answer
            contains no parseable JSON object, returns
            ``{"objects": [], "raw_response": <answer text>}``.
        """
        prompt = """
        Identify all objects in this image. 
        Return the response as a JSON with the following format:
        {
            "objects": [
                {"name": "object name", "confidence": "high/medium/low"},
                ...
            ]
        }
        """
        result = self.analyze_image(image, prompt)
        
        # The model may wrap the JSON in prose or code fences, so take the
        # span from the first "{" to the last "}" and try to parse that.
        try:
            json_pattern = r'(\{[\s\S]*\})'
            match = re.search(json_pattern, result)
            
            if match:
                json_str = match.group(1)
                return json.loads(json_str)
            else:
                return {"objects": [], "raw_response": result}
        except Exception:
            # Best-effort parsing: keep the raw answer for the caller, but do
            # not swallow KeyboardInterrupt/SystemExit as a bare except would.
            return {"objects": [], "raw_response": result}

# 🎵 *Audio Analysis Module*

*The AudioProcessor class implements sophisticated audio content analysis:*

- ***Function-based Architecture**:    Modular design for easy extension*
- ***Sentiment Analysis**:    Evaluates emotional content*
- ***Speaker Identification**:    Distinguishes between different speakers*
- ***Contextual Understanding**:    Maintains conversation history for better analysis*

*This module demonstrates the practical application of function calling in AI systems.*

In [9]:
import tempfile
import os
from pydub import AudioSegment
from collections import deque
from typing import List, Dict, Any, Optional
import json

# Define schema for function calling
def transcribe_audio(audio_path: str) -> Dict[str, Any]:
    """
    Produce a simulated transcription for the audio file at the given path.

    The file is never opened: the module-level ``text_model`` is asked to
    invent a plausible transcript, and ``audio_path`` is only echoed back in
    the metadata.
    
    Args:
        audio_path: Path to the audio file to transcribe
        
    Returns:
        Dictionary with the generated text under 'transcription' and a
        'metadata' dictionary holding 'file_path' and 'status'
    """
    # Placeholder prompt - a real implementation would call a speech-to-text API
    simulation_request = """
    You are a helpful assistant that can simulate audio transcription. 
    For this simulation, pretend you're transcribing an audio file.
    Generate a realistic transcription text that could appear in an audio file.
    Include any background sounds or multiple speakers if appropriate.
    """
    
    generated = text_model.generate_content(simulation_request)
    transcript = generated.text
    
    file_details = {"file_path": audio_path, "status": "completed"}
    return {"transcription": transcript, "metadata": file_details}

def analyze_sentiment(text: str) -> Dict[str, Any]:
    """
    Analyzes the sentiment of the given text with the module-level ``text_model``.
    
    Args:
        text: Text to analyze for sentiment
        
    Returns:
        The JSON object parsed from the model's answer (requested keys:
        'sentiment', 'confidence', 'explanation'). If no JSON object can be
        extracted, a neutral fallback result with confidence 0.5 is returned.
    """
    prompt = f"""
    Analyze the sentiment of the following text. Return the result as a JSON object with 
    'sentiment' (positive, negative, or neutral), 'confidence' (0-1), and 'explanation'.
    
    Text: {text}
    """
    
    response = text_model.generate_content(prompt)
    
    # The model may wrap the JSON in prose or code fences, so take the span
    # from the first "{" to the last "}" and try to parse that.
    parsed = None
    try:
        json_pattern = r'(\{[\s\S]*\})'
        match = re.search(json_pattern, response.text)
        if match:
            parsed = json.loads(match.group(1))
    except Exception:
        # Best-effort parsing: fall back below. Unlike a bare ``except:``,
        # this lets KeyboardInterrupt/SystemExit propagate.
        parsed = None
    
    if parsed is not None:
        return parsed
    return {
        "sentiment": "neutral",
        "confidence": 0.5,
        "explanation": "Failed to extract proper sentiment analysis"
    }

def identify_speakers(transcription: str, num_speakers: Optional[int] = None) -> Dict[str, Any]:
    """
    Identifies different speakers in a transcription with the module-level ``text_model``.
    
    Args:
        transcription: Text transcription to analyze
        num_speakers: Optional hint about the number of speakers; omitted
            from the prompt when falsy
        
    Returns:
        ``{"speakers": <parsed JSON array>}`` on success. If no JSON array
        can be extracted from the answer, returns
        ``{"speakers": [], "raw_response": <answer text>}``.
    """
    speaker_hint = f'There are approximately {num_speakers} speakers.' if num_speakers else ''
    
    prompt = f"""
    Identify different speakers in the following transcription.
    {speaker_hint}
    Return the result as a JSON array where each element contains 'speaker_id' and 'text'.
    
    Transcription: {transcription}
    """
    
    response = text_model.generate_content(prompt)
    
    # The model may wrap the JSON in prose or code fences, so take the span
    # from the first "[" to the last "]" and try to parse that.
    try:
        json_pattern = r'(\[[\s\S]*\])'
        match = re.search(json_pattern, response.text)
        if match:
            return {"speakers": json.loads(match.group(1))}
        else:
            return {"speakers": [], "raw_response": response.text}
    except Exception:
        # Best-effort parsing: hand the raw answer back. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return {"speakers": [], "raw_response": response.text}

# Function-calling tool declarations: one entry per helper defined above
# (transcribe_audio, analyze_sentiment, identify_speakers), each giving the
# function name, a description, and a JSON-Schema-style description of its
# parameters with the required ones listed.
# NOTE(review): no use of this list is visible in this part of the notebook --
# confirm where (or whether) it is passed to a model.
audio_tools = [
    {
        "name": "transcribe_audio",
        "description": "Transcribes the audio file at the given path",
        "parameters": {
            "type": "object",
            "properties": {
                "audio_path": {
                    "type": "string",
                    "description": "Path to the audio file to transcribe"
                }
            },
            "required": ["audio_path"]
        }
    },
    {
        "name": "analyze_sentiment",
        "description": "Analyzes the sentiment of the given text",
        "parameters": {
            "type": "object",
            "properties": {
                "text": {
                    "type": "string",
                    "description": "Text to analyze for sentiment"
                }
            },
            "required": ["text"]
        }
    },
    {
        "name": "identify_speakers",
        "description": "Identifies different speakers in a transcription",
        "parameters": {
            "type": "object",
            "properties": {
                "transcription": {
                    "type": "string",
                    "description": "Text transcription to analyze"
                },
                # Not listed under "required": the hint is optional.
                "num_speakers": {
                    "type": "integer",
                    "description": "Optional hint about the number of speakers"
                }
            },
            "required": ["transcription"]
        }
    }
]

In [10]:
class AudioProcessor:
    """Simulated audio analysis driven by a text model.

    No audio is decoded: the model is asked to invent a transcription for the
    given path, and all further analysis runs on that generated text. The
    last 10 user/assistant turns are recorded in ``conversation_history``.
    """
    
    def __init__(self, text_model):
        """Store the model and start an empty, bounded conversation history.

        Args:
            text_model: Object exposing ``generate_content(prompt)`` whose
                result has a ``.text`` attribute.
        """
        self.model = text_model
        self.conversation_history = deque(maxlen=10)
        
    def simulate_transcription(self, audio_path):
        """Ask the model to invent a short transcription for ``audio_path``.

        Args:
            audio_path: Path used only as a label in the prompt and metadata;
                the file is not opened.

        Returns:
            Dictionary with the generated text under "transcription" and a
            "metadata" dictionary holding "file_path" and "status".
        """
        prompt = f"""
        Simulate transcribing an audio file at path: {audio_path}
        Generate a realistic transcription text that might appear in this audio file.
        Include any background sounds or multiple speakers if appropriate.
        Keep it brief (about 3-5 sentences).
        """
        
        response = self.model.generate_content(prompt)
        return {
            "transcription": response.text,
            "metadata": {
                "file_path": audio_path,
                "status": "completed"
            }
        }
    
    def analyze_sentiment(self, text):
        """Ask the model for the sentiment of ``text`` as JSON.

        Args:
            text: Text to analyze.

        Returns:
            The JSON object parsed from the model's answer, or a neutral
            fallback dictionary (confidence 0.5) when no JSON object can be
            extracted or parsed.
        """
        prompt = f"""
        Analyze the sentiment of the following text. Return the result as a JSON object with 
        'sentiment' (positive, negative, or neutral), 'confidence' (0-1), and 'explanation'.
        
        Text: {text}
        """
        
        response = self.model.generate_content(prompt)
        response_text = response.text
        
        # The model may wrap the JSON in prose or code fences, so take the
        # span from the first "{" to the last "}" and try to parse that.
        try:
            json_pattern = r'(\{[\s\S]*\})'
            match = re.search(json_pattern, response_text)
            if match:
                json_str = match.group(1)
                return json.loads(json_str)
            else:
                return {
                    "sentiment": "neutral",
                    "confidence": 0.5,
                    "explanation": "Failed to extract proper sentiment analysis"
                }
        except Exception as e:
            return {
                "sentiment": "neutral",
                "confidence": 0.5,
                "explanation": f"Error in sentiment analysis: {str(e)}"
            }
    
    def identify_speakers(self, transcription, num_speakers=None):
        """Ask the model to split ``transcription`` by speaker.

        Args:
            transcription: Transcript text to analyze.
            num_speakers: Optional hint; omitted from the prompt when falsy.

        Returns:
            ``{"speakers": <parsed JSON array>}`` on success; otherwise
            ``{"speakers": [], "raw_response": ...}``, with an additional
            "error" key when parsing raised.
        """
        speaker_hint = f"There are approximately {num_speakers} speakers." if num_speakers else ""
        
        prompt = f"""
        Identify different speakers in the following transcription.
        {speaker_hint}
        Return the result as a JSON array where each element contains 'speaker_id' and 'text'.
        
        Transcription: {transcription}
        """
        
        response = self.model.generate_content(prompt)
        response_text = response.text
        
        # Take the span from the first "[" to the last "]" and parse it.
        try:
            json_pattern = r'(\[[\s\S]*\])'
            match = re.search(json_pattern, response_text)
            if match:
                json_str = match.group(1)
                return {"speakers": json.loads(json_str)}
            else:
                return {"speakers": [], "raw_response": response_text}
        except Exception as e:
            return {"speakers": [], "error": str(e), "raw_response": response_text}
    
    def _answer_query(self, audio_path, query):
        """Run transcription, query-specific analysis and the final answer; may raise."""
        # First, simulate transcription
        transcription_result = self.simulate_transcription(audio_path)
        transcription = transcription_result["transcription"]
        
        query_lower = query.lower()
        
        # Pick the analysis from keywords in the query. "who" must be a whole
        # word: a plain substring test would also fire on "whole" or "whose".
        if "sentiment" in query_lower:
            sentiment_result = self.analyze_sentiment(transcription)
            analysis_result = f"Sentiment Analysis: {json.dumps(sentiment_result, indent=2)}"
        elif "speaker" in query_lower or re.search(r"\bwho\b", query_lower):
            speakers_result = self.identify_speakers(transcription)
            analysis_result = f"Speaker Identification: {json.dumps(speakers_result, indent=2)}"
        else:
            # General analysis of the transcription
            analysis_prompt = f"""
            The user has provided this audio transcription:
            
            {transcription}
            
            Their query is: {query}
            
            Please provide a helpful analysis of the transcription in response to their query.
            """
            
            analysis_response = self.model.generate_content(analysis_prompt)
            analysis_result = analysis_response.text
        
        # Combine results into a final response
        final_prompt = f"""
        Audio File: {audio_path}
        
        Transcription:
        {transcription}
        
        Analysis:
        {analysis_result}
        
        Please provide a concise, helpful response to the user's query: "{query}"
        Focus on answering their specific question about the audio.
        """
        
        final_response = self.model.generate_content(final_prompt)
        return final_response.text
    
    def process_audio(self, audio_path, query):
        """Answer ``query`` about the (simulated) audio at ``audio_path``.

        The query and the resulting answer are appended to
        ``conversation_history``. A failure in any step (transcription,
        analysis or final answer) is returned as an error message and
        recorded as the assistant turn, so the history never ends on an
        unanswered user turn.

        Args:
            audio_path: Path label of the audio file.
            query: The user's question about the audio.

        Returns:
            The model's answer text, or
            ``"Error processing audio query: ..."`` on failure.
        """
        # Add the query to conversation history
        self.conversation_history.append({"role": "user", "content": query})
        
        try:
            response_text = self._answer_query(audio_path, query)
        except Exception as e:
            response_text = f"Error processing audio query: {str(e)}"
        
        # Add the response (or the error message) to conversation history
        self.conversation_history.append({"role": "assistant", "content": response_text})
        return response_text

# 🎞 *Video Analysis Module*

*The VideoProcessor class implements a comprehensive video analysis system:*

- ***Multi-modal Integration**:    Combines visual and audio analysis*
- ***Frame Analysis**:    Processes key frames for visual understanding*
- ***Metadata Extraction**:    Retrieves and processes video information*
- ***Simulated Processing**:    Handles limitations in the Kaggle environment*

*The implementation demonstrates practical approaches to video content analysis in constrained environments.*

In [11]:
from pytube import YouTube
import os

class VideoProcessor:
    """Simulated video analysis built on canned metadata, frames and transcript.

    No video is downloaded or decoded: metadata and frame descriptions are
    fixed placeholder values. Only the audio summary (when an audio processor
    is supplied) and the final write-up are produced by a model.
    """
    
    # Model created for the final write-up when the audio processor does not
    # provide one.
    _FALLBACK_MODEL_NAME = 'gemini-2.0-flash'
    
    def __init__(self, image_processor, audio_processor):
        """Store the collaborating processors.

        Args:
            image_processor: Stored on the instance; not called by the
                simulated methods of this class.
            audio_processor: Object exposing ``process_audio(path, query)``,
                or a falsy value to use a canned audio summary instead. If it
                has a ``model`` attribute, that model also writes the final
                analysis.
        """
        self.image_processor = image_processor
        self.audio_processor = audio_processor
    
    @staticmethod
    def _extract_video_id(youtube_url):
        """Return the video id from a YouTube URL, ignoring extra query parameters."""
        from urllib.parse import parse_qs, urlparse
        
        try:
            parsed = urlparse(youtube_url)
            # watch-style URLs carry the id in the "v" query parameter.
            ids = parse_qs(parsed.query).get("v")
            if ids:
                return ids[0]
            # Short/embed URLs carry it as the last non-empty path segment.
            segments = [segment for segment in parsed.path.split("/") if segment]
            if segments:
                return segments[-1]
        except ValueError:
            # Not parseable as a URL; use the plain fallback below.
            pass
        return youtube_url.split("/")[-1]
    
    def simulate_video_metadata(self, youtube_url=None, video_path=None):
        """Return placeholder metadata for a YouTube URL or a local file.

        Args:
            youtube_url: URL of the video; takes precedence when given.
            video_path: Path of a local video file.

        Returns:
            A metadata dictionary with fixed placeholder values (the title is
            derived from the video id or the file name), or
            ``{"error": "No video source provided"}`` when neither source is
            given.
        """
        if youtube_url:
            video_id = self._extract_video_id(youtube_url)
            
            # Simulate metadata based on URL
            return {
                "title": f"Simulated Video {video_id}",
                "author": "Simulated Channel",
                "duration": "10:15",
                "views": "1,245,678",
                "upload_date": "2023-12-15",
                "description": "This is a simulated video description for demonstration purposes."
            }
        elif video_path:
            # Simulate metadata based on file path
            filename = os.path.basename(video_path)
            return {
                "title": filename,
                "author": "Local User",
                "duration": "08:30",
                "file_size": "245.6 MB",
                "resolution": "1920x1080",
                "format": "MP4",
                "created_date": "2024-01-20"
            }
        else:
            return {"error": "No video source provided"}
    
    def simulate_frame_analysis(self, num_frames=5):
        """Return canned analyses for up to five frames.

        Args:
            num_frames: How many frames to return; at most five are
                available, and values below one yield an empty list.

        Returns:
            List of dictionaries with "timestamp" (``m:ss``), "analysis"
            (description text) and "objects" (``{"objects": [...]}``).
        """
        # (timestamp in seconds, description, detected objects). Built inside
        # the method so every call returns fresh, independent lists.
        simulated_frames = [
            (
                30,
                "Introduction scene with the presenter standing in front of a blue background. The presenter is wearing a professional outfit and gesturing towards what appears to be a digital presentation screen.",
                [
                    {"name": "person", "confidence": "high"},
                    {"name": "presentation screen", "confidence": "medium"},
                    {"name": "microphone", "confidence": "high"}
                ],
            ),
            (
                120,
                "A graph showing an upward trend is displayed. The graph has multiple colored lines representing different metrics. There's a legend in the bottom right corner explaining each line.",
                [
                    {"name": "graph", "confidence": "high"},
                    {"name": "chart legend", "confidence": "high"},
                    {"name": "text labels", "confidence": "medium"}
                ],
            ),
            (
                210,
                "The presenter is now demonstrating a product. The product appears to be a small electronic device with a touchscreen. The presenter is holding it and pointing to various features.",
                [
                    {"name": "person", "confidence": "high"},
                    {"name": "electronic device", "confidence": "high"},
                    {"name": "touchscreen", "confidence": "medium"},
                    {"name": "hand gesture", "confidence": "high"}
                ],
            ),
            (
                300,
                "A comparison table is shown with competitors' products. The table has multiple rows and columns with checkmarks and X marks indicating feature availability.",
                [
                    {"name": "table", "confidence": "high"},
                    {"name": "checkmark", "confidence": "high"},
                    {"name": "text", "confidence": "high"},
                    {"name": "product icons", "confidence": "medium"}
                ],
            ),
            (
                390,
                "Closing scene with a call-to-action slide. Contact information and social media handles are displayed prominently, along with a company logo in the bottom right.",
                [
                    {"name": "text", "confidence": "high"},
                    {"name": "logo", "confidence": "high"},
                    {"name": "social media icons", "confidence": "medium"},
                    {"name": "email address", "confidence": "high"}
                ],
            ),
        ]
        
        frame_analyses = []
        for timestamp, description, objects in simulated_frames[:max(num_frames, 0)]:
            minutes, seconds = divmod(timestamp, 60)
            frame_analyses.append({
                "timestamp": f"{minutes}:{seconds:02d}",
                "analysis": description,
                "objects": {"objects": objects}
            })
        
        return frame_analyses
    
    def simulate_audio_transcription(self):
        """Return a fixed sample transcript of a product demonstration video."""
        return """
        [Upbeat music playing]
        
        Speaker: Welcome to our product demonstration video. Today, I'm excited to show you our latest innovation that's going to revolutionize how you interact with your smart home.
        
        [Music fades]
        
        Speaker: Our new SmartHub connects all your devices seamlessly, providing a unified control center for your entire home ecosystem. Let me show you some of the key features.
        
        [Brief pause]
        
        Speaker: As you can see from this graph, our solution offers 50% faster response times compared to leading competitors. This means your commands are executed almost instantly.
        
        [Sound of clicking]
        
        Speaker: The interface is intuitive and user-friendly. Even users with minimal technical knowledge can set up and control complex automation scenarios with just a few taps.
        
        [Demonstration sounds]
        
        Speaker: Let's look at how our product compares to others in the market. As this table shows, we offer more integration options, better security features, and longer battery life.
        
        [Brief pause]
        
        Speaker: To learn more about the SmartHub and how it can transform your home, visit our website or contact our sales team using the information on screen now.
        
        [Upbeat music returns]
        
        Speaker: Thank you for watching. Don't forget to subscribe for more product updates and demonstrations!
        
        [Music fades out]
        """
    
    def analyze_video(self, video_path=None, youtube_url=None):
        """Produce a simulated analysis of a video.

        Args:
            video_path: Path of a local video file.
            youtube_url: URL of a YouTube video; takes precedence when given.

        Returns:
            Dictionary with "video_info", "frame_analyses", "audio_analysis"
            and "final_analysis". If no source is given or any step fails, a
            dictionary with a single "error" key is returned instead.
        """
        try:
            # Get video metadata
            video_info = self.simulate_video_metadata(youtube_url, video_path)
            if "error" in video_info:
                return video_info
            
            # Simulate frame analysis
            frame_analyses = self.simulate_frame_analysis()
            
            # If audio processor exists, let it produce the audio summary
            # (it generates its own simulated transcription for the path).
            audio_analysis = ""
            if self.audio_processor:
                # The path is only a label for the simulation, so build it
                # without creating anything on disk; mkdtemp() would leave a
                # new empty directory behind on every call.
                temp_audio_path = os.path.join(tempfile.gettempdir(), "simulated_audio.wav")
                audio_analysis = self.audio_processor.process_audio(
                    temp_audio_path,
                    "Identify the main topics discussed in this audio and summarize the key points."
                )
            else:
                # Provide a simulated audio analysis
                audio_analysis = """
                Main topics discussed in the audio:
                1. Product introduction - A new SmartHub for smart home control
                2. Key features - Faster response times, intuitive interface
                3. Competitive advantages - More integration options, better security, longer battery life
                4. Call to action - Website visit, contact sales team, subscribe for updates
                
                The speaker presents a new smart home control product called SmartHub, highlighting its faster response times (50% faster than competitors), user-friendly interface, and superior features compared to market alternatives. The presentation follows a standard product demonstration format with introduction, feature showcase, competitive comparison, and call to action.
                """
            
            # Generate a comprehensive analysis based on all collected information
            title = video_info.get("title", "Untitled Video")
            author = video_info.get("author", "Unknown Author")
            
            # Frame descriptions are cut to 100 characters to keep the prompt short.
            frame_summaries = []
            for data in frame_analyses:
                description = data["analysis"]
                if len(description) > 100:
                    description = description[:100] + "..."
                frame_summaries.append({
                    "timestamp": data["timestamp"],
                    "description": description,
                    "objects": data["objects"]
                })
            frames_json = json.dumps(frame_summaries, indent=2)
            
            analysis_prompt = f"""
            Create a comprehensive analysis of a video with the following information:
            
            Title: {title}
            Author: {author}
            
            Frame analyses at different timestamps:
            {frames_json}
            
            Audio transcription and analysis:
            {audio_analysis}
            
            Provide a structured analysis including:
            1. Overall video summary
            2. Main visual elements and how they change over time
            3. Main topics discussed in the audio
            4. Overall mood/tone of the video
            """
            
            # Prefer the model already injected through the audio processor so
            # the write-up uses the same model as the rest of the pipeline;
            # only create one here when none is available.
            model = getattr(self.audio_processor, "model", None)
            if model is None:
                model = genai.GenerativeModel(model_name=self._FALLBACK_MODEL_NAME)
            final_analysis_response = model.generate_content(analysis_prompt)
            final_analysis = final_analysis_response.text
            
            return {
                "video_info": video_info,
                "frame_analyses": frame_analyses,
                "audio_analysis": audio_analysis,
                "final_analysis": final_analysis
            }
        except Exception as e:
            return {"error": f"Error in simulated video analysis: {str(e)}"}

# 🗜 *Multimodal Content Hub*

*The MultimodalContentHub class serves as the central integration point for all content analysis capabilities:*

- ***Unified Interface**:    Provides consistent access to all analysis features*
- ***Content Integration**:    Seamlessly combines different content types*
- ***Contextual Analysis**:    Maintains relationships between different content modalities*
- ***Error Handling**:    Implements robust error management across all operations*

*This class demonstrates the practical implementation of multimodal AI systems.*

In [12]:
class MultimodalContentHub:
    """Main application class that integrates all modules.

    The hub owns one text model and one vision model (both created from the
    same Gemini model name) and hands them to the document, image, audio and
    video processors. Every public method is a thin delegation to one of those
    processors, except ``analyze_mixed_content`` which fans out to several of
    them and optionally asks the text model for a combined answer.
    """

    def __init__(self, google_api_key):
        """Configure the Google AI client and build all processors.

        Args:
            google_api_key: API key forwarded to ``genai.configure``.
        """
        # Configure Google AI
        genai.configure(api_key=google_api_key)

        # Initialize models
        self.text_model = genai.GenerativeModel(model_name='gemini-2.0-flash')
        self.vision_model = genai.GenerativeModel(model_name='gemini-2.0-flash')

        # Initialize modules; the video processor reuses the image and audio
        # processors rather than getting models of its own.
        self.document_processor = DocumentProcessor(text_model=self.text_model)
        self.image_processor = ImageProcessor(self.vision_model)
        self.audio_processor = AudioProcessor(self.text_model)
        self.video_processor = VideoProcessor(self.image_processor, self.audio_processor)

    @staticmethod
    def _join_analyses(analyses):
        """Join per-item analyses into one space-separated string.

        Each item is passed through ``str`` first so that a non-string result
        (for example an error dict) cannot make ``str.join`` raise TypeError.
        For items that are already strings the output is unchanged.
        """
        return ' '.join(str(item) for item in analyses)

    def process_document(self, document_path):
        """Process a document (PDF or text).

        Paths ending in ``.pdf`` (case-insensitive) go to ``load_pdf``;
        everything else goes to ``load_text``. Returns whatever the document
        processor returns.
        """
        if document_path.lower().endswith('.pdf'):
            return self.document_processor.load_pdf(document_path)
        return self.document_processor.load_text(document_path)

    def query_document(self, query):
        """Query the document knowledge base via the document processor."""
        return self.document_processor.generate_rag_response(query)

    def analyze_image(self, image_path):
        """Analyze an image by delegating to the image processor."""
        return self.image_processor.analyze_image(image_path)

    def extract_text_from_image(self, image_path):
        """Extract text from an image by delegating to the image processor."""
        return self.image_processor.extract_text_from_image(image_path)

    def analyze_audio(self, audio_path, query):
        """Analyze audio content by delegating to the audio processor."""
        return self.audio_processor.process_audio(audio_path, query)

    def analyze_video(self, video_path=None, youtube_url=None):
        """Analyze video content from a local path or a YouTube URL."""
        return self.video_processor.analyze_video(video_path, youtube_url)

    def analyze_mixed_content(self, text=None, images=None, audio=None, video=None, query=None):
        """Analyze mixed content types and provide a unified analysis.

        Args:
            text: A string or list of strings to feed to the document processor.
            images: One image reference or a list of them.
            audio: One audio reference or a list of them.
            video: A video reference; strings starting with ``http://`` or
                ``https://`` are passed as ``youtube_url``, anything else as
                ``video_path``.
            query: Optional question. It drives the text (RAG) answer, is used
                as the audio prompt, and is required for the unified answer.

        Returns:
            A dict that may contain ``text_analysis``, ``image_analyses``,
            ``audio_analyses``, ``video_analysis`` and ``unified_analysis``,
            depending on which inputs were supplied.
        """
        results = {}
        context = []

        # Process text/documents if provided
        if text:
            text_items = text if isinstance(text, list) else [text]
            for text_item in text_items:
                self.document_processor.process_text_string(text_item)

            # Text is always ingested, but only answered when a query exists.
            if query:
                results["text_analysis"] = self.document_processor.generate_rag_response(query)
                context.append(f"Text Analysis: {results['text_analysis']}")

        # Process images if provided
        if images:
            if not isinstance(images, list):
                images = [images]

            image_analyses = [self.image_processor.analyze_image(img) for img in images]

            results["image_analyses"] = image_analyses
            # Stringify each entry: a processor may hand back a non-string
            # result, and a bare str.join would then raise TypeError.
            context.append(f"Image Analysis: {self._join_analyses(image_analyses)}")

        # Process audio if provided
        if audio:
            if not isinstance(audio, list):
                audio = [audio]

            audio_prompt = query if query else "Transcribe and analyze this audio."
            audio_analyses = [
                self.audio_processor.process_audio(audio_file, audio_prompt)
                for audio_file in audio
            ]

            results["audio_analyses"] = audio_analyses
            context.append(f"Audio Analysis: {self._join_analyses(audio_analyses)}")

        # Process video if provided
        if video:
            if isinstance(video, str) and (video.startswith('http://') or video.startswith('https://')):
                video_analysis = self.video_processor.analyze_video(youtube_url=video)
            else:
                video_analysis = self.video_processor.analyze_video(video_path=video)

            results["video_analysis"] = video_analysis
            # Only a result carrying a final analysis contributes to the context.
            if "final_analysis" in video_analysis:
                context.append(f"Video Analysis: {video_analysis['final_analysis']}")

        # Generate a unified analysis if query is provided and we have processed multiple types
        if query and len(context) > 1:
            unified_prompt = f"""
            Based on the following analyses of different content types, provide a unified response to this query: "{query}"
            
            {' '.join(context)}
            """

            results["unified_analysis"] = self.text_model.generate_content(unified_prompt).text

        return results

# ♻ *Practical Application Examples*

*This section demonstrates the practical application of our multimodal analysis system through five comprehensive examples:*

1. ***Document Processing**: Shows RAG implementation with climate change content*
2. ***Image Analysis**:    Demonstrates visual content understanding (simulated here from a text description, because no image file is supplied)*
3. ***Audio Processing**:    Illustrates speech and sound analysis*
4. ***Video Analysis**:    Shows combined visual and audio processing*
5. ***Mixed Content Analysis**:    Demonstrates integrated multimodal analysis*

*Each example is carefully chosen to showcase different aspects of the system's capabilities while providing practical insights into real-world applications.*

In [13]:
def run_example():
    """Walk through the five demo scenarios and print each result.

    Builds a MultimodalContentHub from GOOGLE_API_KEY, then exercises document
    RAG, a text-simulated image analysis, audio analysis, video analysis from
    a YouTube URL, and finally a mixed-content request.
    """
    hub = MultimodalContentHub(GOOGLE_API_KEY)

    # ---- Example 1: ingest a short document, then ask a question about it ----
    print("Example 1: Document processing and RAG")

    climate_notes = """
    # Climate Change: A Global Challenge
    
    Climate change refers to long-term shifts in temperatures and weather patterns. 
    These shifts may be natural, but since the 1800s, human activities have been 
    the main driver of climate change, primarily due to the burning of fossil fuels 
    like coal, oil, and gas, which produces heat-trapping gases.
    
    ## Key Facts
    
    1. The Earth's average temperature has increased by about 1°C since pre-industrial times.
    2. The past decade (2011-2020) was the warmest on record.
    3. Sea levels have risen by about 20 cm since 1900.
    4. The Arctic is warming twice as fast as the global average.
    
    ## Impacts
    
    Climate change affects every region of the world. The impacts include:
    
    - More frequent and intense droughts, storms, and heat waves
    - Rising sea levels
    - Melting ice caps and glaciers
    - Loss of biodiversity
    """

    hub.document_processor.process_text_string(climate_notes)

    doc_question = "What are the impacts of climate change?"
    doc_answer = hub.query_document(doc_question)
    print(f"Query: {doc_question}")
    print(f"Response:\n{doc_answer}\n")

    # ---- Example 2: no image file is used; a written description goes to the text model ----
    print("Example 2: Image understanding")

    street_scene = """
    This image shows a busy urban street scene with tall skyscrapers in the background.
    There are several pedestrians walking on the sidewalk, and cars and buses on the road.
    There's a traffic light showing red at an intersection, and some street vendors selling food.
    The sky is clear blue, suggesting it's daytime.
    """

    image_prompt = f"Analyze this image based on the description: {street_scene}"
    image_reply = hub.text_model.generate_content(image_prompt).text
    print(f"Simulated image analysis result:\n{image_reply}\n")

    # ---- Example 3: audio analysis with a placeholder file name ----
    print("Example 3: Audio understanding with function calling")

    audio_report = hub.analyze_audio(
        "simulated_audio.wav",
        "Transcribe this audio and tell me the main topics discussed",
    )
    print(f"Audio analysis result:\n{audio_report}\n")

    # ---- Example 4: video analysis driven by a YouTube URL ----
    print("Example 4: Video understanding")

    video_report = hub.video_processor.analyze_video(
        youtube_url="https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    )

    # The result dict signals failure through an "error" key.
    if "error" not in video_report:
        print(f"Video analysis result:\n{video_report['final_analysis']}\n")
    else:
        print(f"Error analyzing video: {video_report['error']}")

    # ---- Example 5: text + image + audio in a single request ----
    print("Example 5: Mixed content analysis")

    combined_report = hub.analyze_mixed_content(
        text="Climate change is a pressing issue that requires immediate action.",
        images=["An image showing a polar bear on a melting ice cap."],
        audio=["simulated_audio.wav"],
        query="Summarize the main points from the provided content.",
    )

    unified_text = combined_report.get('unified_analysis', 'No unified analysis generated')
    print(f"Mixed content analysis result:\n{unified_text}\n")

In [14]:
# Run the demo end to end: builds a MultimodalContentHub from GOOGLE_API_KEY and issues model calls.
run_example()

Example 1: Document processing and RAG
Query: What are the impacts of climate change?
Response:
The impacts of climate change include:

- More frequent and intense droughts, storms, and heat waves
- Rising sea levels
- Melting ice caps and glaciers
- Loss of biodiversity


Example 2: Image understanding
Simulated image analysis result:
Okay, based on that description, here's a breakdown of what I'm "seeing" (or, rather, what the image likely contains) and some implications:

**Key Elements and Their Implications:**

*   **Busy Urban Street Scene:** This immediately sets the tone. It suggests a place with high population density, commerce, and activity. This implies a modern city, likely a major metropolitan area.

*   **Tall Skyscrapers in the Background:** Skyscrapers are a hallmark of large, developed cities. Their presence signifies economic power, vertical space utilization, and architectural ambition. They emphasize the modern urban setting.

*   **Several Pedestrians Walking on t