# YouTube RAG System
## A Retrieval-Augmented Generation System for YouTube Video Transcripts

### Import Required Libraries

In [267]:
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaEmbeddings, ChatOllama
from test2 import open_router_model
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_classic.retrievers.self_query.base import SelfQueryRetriever
from langchain_classic.chains.query_constructor.base import AttributeInfo, load_query_constructor_runnable
from langchain_community.query_constructors.chroma import ChromaTranslator
from langchain_core.output_parsers import StrOutputParser, PydanticOutputParser
from dotenv import load_dotenv
import re
import math

### Load Environment Variables

In [268]:
load_dotenv()

True

### Initialize Language Models

In [269]:
# models
model_ep = HuggingFaceEndpoint(repo_id="XiaomiMiMo/MiMo-V2-Flash", temperature=0.5)
model_hf = ChatHuggingFace(llm=model_ep)
model_ollama = ChatOllama(model="llama3.2:3b", temperature=0.5)
model_google = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.4)

In [270]:
# response = model_hf.invoke("Hello, how are you?")
# print(response.content)


## Step 1a - Indexing (Document Ingestion)

### Define Supported Languages and Helper Functions

In [271]:
# Major languages supported by BGE-M3
SUPPORTED_LANGS = [
    'en', 'hi', 'es', 'fr', 'de', 'zh-Hans', 'zh-Hant', 
    'ja', 'ko', 'ru', 'pt', 'it', 'ar', 'tr', 'vi'
]

def extract_video_id(url):
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    match = re.search(pattern, url)
    return match.group(1) if match else url

### Extract YouTube Transcript

In [272]:
# user_link = input("Paste YouTube Link: ")
# video_id = extract_video_id(user_link)
video_id = "Gfr50f6ZBvo"

api = YouTubeTranscriptApi()
try:
    transcript_list = api.fetch(video_id, languages=SUPPORTED_LANGS)
except TranscriptsDisabled:
    print("No captions available for this video.")
except NoTranscriptFound:
    print("No transcript found for this video.")
except Exception as e:
    print(f"Error fetching transcript: {e}")

## Step 1b - Text Splitting

### Convert Transcript Fragments to Documents

In [273]:
# Convert each transcript fragment to a Document with its exact timestamp
fragment_docs = []
for fragment in transcript_list:
    fragment_docs.append(
        Document(
            page_content=fragment.text,
            metadata={
                "video_id": video_id,
                "start": int(fragment.start),
                "end": int(fragment.start + fragment.duration),
            }
        )
    )

### Merge Fragments While Preserving Timestamps

In [None]:
total_chars = sum(len(d.page_content) for d in fragment_docs)
print(f"Total Transcript Length: {total_chars}")

if not fragment_docs:
    chunks = []
else:
    # We never go below 600 (too fragmented) or above 1200 (too imprecise)
    # This keeps the "start" timestamp within ~45-80 seconds of the actual answer.
    target_size = max(600, min(1200, int(total_chars / 50))) 
    overlap = int(target_size * 0.15) # 15% overlap

    # --- STEP B: Smart Merging with Text Anchors ---
    raw_docs = []
    current_content = []
    current_start = 0
    current_len = 0

    for i, doc in enumerate(fragment_docs):
        if not current_content:
            current_start = doc.metadata.get("start", 0)
        
        # Add the fragment text
        current_content.append(doc.page_content)
        current_len += len(doc.page_content)
        current_end = doc.metadata.get("end", 0)

        # Merge until we hit the dynamic target_size
        if current_len >= target_size or i == len(fragment_docs) - 1:
            # Inject timestamp into text for LLM precision
            timestamp_label = f"[Timestamp: {int(current_start)}s] "
            
            raw_docs.append(
                Document(
                    page_content=timestamp_label + " ".join(current_content),
                    metadata={
                        "video_id": globals().get("video_id", None),
                        "start": int(current_start),
                        "end": int(current_end),
                    },
                )
            )
            current_content = []
            current_len = 0

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=target_size, 
        chunk_overlap=overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    chunks = splitter.split_documents(raw_docs)

    
    num_chunks = len(chunks)
    if num_chunks < 20:
        dynamic_k = min(num_chunks, 5) # Small video: show almost everything
    else:
        # Long video: Scale k between 5 and 10 based on video size
        dynamic_k = max(5, min(10, int(math.log2(num_chunks) * 1.5)))

    print(f"Chunks created: {num_chunks}")
    print(f"Chunk size: {target_size}")
    print(f"Dynamic k: {dynamic_k}")

Total Transcript Length: 130047
Chunks created: 214
Chunk size: 1200
Dynamic k: 10


In [289]:
page_cont = "/n/n".join(c.page_content for c in chunks)
len(page_cont)

154790

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

### Generate Embeddings and Create Vector Store

In [278]:
# embeddings
embeddings = OllamaEmbeddings(model="bge-m3")

# vector store
vector_store = Chroma.from_documents(
    chunks, 
    embeddings, 
    collection_name="youtube-transcript"
)

In [279]:
# generate a brief video summary for generic prompt usage
try:
    sample_text = " ".join([c.page_content for c in chunks[:10]])
    video_summary_obj = model_ollama.invoke(
        f"Summarize what this video is about in one short sentence based on this text: {sample_text}"
    )
    video_summary = video_summary_obj.content
except Exception:
    video_summary = "this video"

## Step 2 - Retrieval (Query Constructor) Using Self-Querying Retriever

### Define Metadata Field Information

In [280]:
metadata_field_info = [
    AttributeInfo(
        name="start",
        description="The start time of the video segment in seconds (integer). "
        "Rule 1: If user asks for 'at 12:00', use (start <= 720). "
        "Rule 2: If user asks for 'after 12:00', use (start >= 720). "
        "Rule 3: ALWAYS convert minutes:seconds to total seconds (min * 60 + sec). "
        "Rule 4: Remove time-related keywords (e.g., '12:00', 'minutes', 'seconds') from the semantic search query part.",
        type="integer",
    ),
    AttributeInfo(
        name="end",
        description="The end time of the video segment in seconds (integer). "
        "Rule 1: If user asks for 'at 12:00', use (end >= 720).",
        type="integer",
    ),
    AttributeInfo(
        name="video_id",
        description="The unique YouTube video identifier.",
        type="string",
    ),
]

document_content_description = "Transcript segments from a YouTube video"

### Initialize Query Constructor and Retriever

In [281]:
# Use the runnable with the custom prompt
query_constructor = load_query_constructor_runnable(
    llm=model_hf,
    document_contents=document_content_description,
    attribute_info=metadata_field_info,
)

# num_chunks = len(chunks)
# print(f"Number of chunks: {num_chunks}")
# # `dynamic k` which can be adjusted based on the number of chunks automatically
# dynamic_k = max(4, min(8, num_chunks // 10))
# print(f"Dynamic k: {dynamic_k}")

# Initialize Retriever
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vector_store,
    structured_query_translator=ChromaTranslator(),
    search_kwargs={"k": dynamic_k},
    verbose=True
)

## Step 3 - Augmentation (Formatting)

### Define Document Formatting Function

In [282]:
# formatting retrieved docs with timestamps so that llm can understand the context with time
def format_docs(retrieved_docs):
    context_entries = []
    for doc in retrieved_docs:
        s = doc.metadata['start']
        timestamp = f"{s // 60}:{s % 60:02d}"
        context_entries.append(f"[{timestamp}]: {doc.page_content}")
    return "\n\n".join(context_entries)

### Routing intent for summarization

In [283]:
from typing import Literal
from pydantic import BaseModel, Field

# Intent Router 
class Router(BaseModel):
    route: Literal["SUMMARY", "RAG"] = Field(
        description="The user's intent: 'SUMMARY' for broad overviews, 'RAG' for specific questions or greetings."
    )

In [284]:
def get_intent(query, history):
    # sending llm the last 2 messages
    recent_history = "\n".join(
        [f"{m.type}: {m.content}" for m in history.messages[-2:]]
    )

    parser = PydanticOutputParser(pydantic_object=Router)

    router_instruction = """
    You are an expert query router. Based on the conversation history and the new user request, 
    determine if the user wants a broad overview (SUMMARY) or a specific detail/follow-up (RAG).

    CONVERSATION HISTORY:
    {history}

    NEW REQUEST: 
    {query}

    Rules:
    - If the request is a follow-up to a previous specific point or asks for specific details, pick RAG.
    - If the request is a greeting like "hi" or "hello", pick RAG.
    - If the request asks for a general overview of the whole video, pick SUMMARY.

    {format_instructions}
    """

    prompt = PromptTemplate(
        template=router_instruction,
        input_variables=["history", "query"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    chain = prompt | model_hf | parser

    try:
        intent_obj = chain.invoke({"history": recent_history, "query": query})
        return intent_obj.route
    except Exception as e:
        print(f"Router parsing error: {e}. Defaulting to RAG.")
        return "RAG"

### Smart Summary Stuffing or Sampling
If the yt transcript is greater than 500,000 characters, it takes every 2nd or 3rd chunk to stay in the high-accuracy zone else it uses the full transcript

In [285]:
# Summary Function 
def get_universal_summary(chunks): 
    MAX_CHARS = 500000 
    # In tokens it is equivalent to MAX_CHARS / 4 (1 token = 4 characters)
    
    total_text = " ".join([c.page_content for c in chunks])
    
    if len(total_text) > MAX_CHARS:
        print(f"Video is massive ({len(total_text)} chars). Using Smart Sampling...")
        # it takes every 2nd or 3rd chunk to stay in the high-accuracy zone
        step = len(total_text) // MAX_CHARS + 1
        sampled_chunks = chunks[::step] 
        final_text = " ".join([c.page_content for c in sampled_chunks])
    else:
        print("Video is standard size. Using full transcript...")
        final_text = total_text

    res = model_hf.invoke(f"""
        Summarize this YouTube video professionally. 
        Provide a concise 4-5 sentence overview followed by key takeaways in bullet points and mention all the key topics covered in the video.
        Don't include the youtube source link in the summary.
        VIDEO CONTENT:
        {final_text}
    """)
    return res.content

### Create Prompt Template

In [286]:
prompt = PromptTemplate(
    template="""
    You are a helpful YouTube AI assistant. 
    PRIMARY TASK:
    - Your primary goal is to answer the [USER QUESTION] provided below.
    - Use the [CHAT HISTORY] ONLY for context (e.g., if the user refers to a previous point). 
    - DO NOT answer old questions from the chat history.

    VIDEO CONTENT:
    {context}

    CHAT HISTORY:
    {chat_history}

    USER QUESTION: 
    {question}

    INSTRUCTIONS:
    1. GENERAL CONVERSATION & GREETINGS:
       - Reply naturally and warmly.
       - Acknowledge that you are here to help with the video. You can mention that the video is about: {video_summary}
       - DO NOT include Source Links or timestamps for general chat.

    2. TIMESTAMP QUERIES:
       - If the user asks about a specific time (e.g., "at 54:00"), use the closest available segments in the [VIDEO CONTENT].
       - Answer based on that content naturally. Simply state what is being discussed in that portion of the video.
       - CRITICAL: DO NOT expose technical details like "the transcript only includes segments from X" or mention specific timestamp ranges you have access to.
       - If the content around that time isn't available, just say: "I couldn't find specific information about that timestamp in the video."

    3. VIDEO QUESTIONS (INFORMATION FOUND):
       - Answer using ONLY the [VIDEO CONTENT] provided.
       - You MUST append the source link at the end of your response: https://youtu.be/{video_id}?t={seconds}s
       - Use the 'seconds' variable provided to you for the link.

    4. VIDEO QUESTIONS (INFORMATION NOT FOUND):
       - If the user asks about something not in the video, politely say: "I couldn't find information about that in this video."
       - Briefly mention the general theme: "This video focuses on {video_summary}. Would you like to know about that instead?"
       - DO NOT expose internal details about what segments or timestamps you have access to.
       - DO NOT provide a source link if the answer is not found.

    5. PERSONAL OPINIONS:
       - If the user asks for YOUR view or opinion, start by saying: "As an AI assistant, I don't have personal opinions. However, based on the video content..." and then proceed to answer using the transcript content.

    6. FORMATTING:
       - Keep responses conversational, helpful, and grounded.
       - Never expose technical implementation details to the user.
       - Respond in the same language as the [USER QUESTION].
    """,
    input_variables=["context","question","video_id","seconds","chat_history","video_summary",],
)

## Step 4 - Generation

### Initialize Chat History and Run Query

In [287]:
from langchain_community.chat_message_histories import ChatMessageHistory

parser = StrOutputParser()
rag_chain = prompt | model_hf | parser

history = ChatMessageHistory()
summary_cache = None 

print("\nAI: Video processed. I'm ready!")

while True:
    query = input("\nUser: ")
    if query.lower() in ["exit", "quit"]:
         break

    intent = get_intent(query, history)
      
    if intent == "SUMMARY":
        if not summary_cache:
            summary_cache = get_universal_summary(chunks)
        context_text = summary_cache
        timestamp = 0
    else:
        # RAG Mode
        retrieved_docs = retriever.invoke(query)
        # Use the most relevant chunk's timestamp for the source link
        sorted_docs = sorted(retrieved_docs, key=lambda x: x.metadata.get("start", 0))
        context_text = "\n\n".join([f"[{d.metadata['start']}s]: {d.page_content}" for d in sorted_docs])
        timestamp = retrieved_docs[0].metadata["start"] if retrieved_docs else 0

    # CALL 3: Final Answer Generation (keeping last 6 messages in the chat history)
    history_str = "\n".join([f"{m.type}: {m.content}" for m in history.messages[-6:]])

    result = rag_chain.invoke({
        "context": context_text,
        "question": query,
        "video_id": video_id,
        "seconds": timestamp,
        "chat_history": history_str,
        "video_summary": video_summary 
    })

    history.add_user_message(query)
    history.add_ai_message(result)
    
    print("\nUser: " + query)
    print(f"\nAI ({intent} mode): {result}")


AI: Video processed. I'm ready!

User: hii how can u help me?

AI (RAG mode): Hi there! I'd be happy to help you!

I'm here to assist you with this video featuring Demis Hassabis, CEO and co-founder of DeepMind. In the video, he discusses the Turing Test, its limitations, and how he believes true AI capabilities will be demonstrated through generalizability across multiple tasks rather than just language-based communication.

How can I help you with the video? Do you have any questions about what's being discussed?

User: What game does Demis consider the most impressive example of reinforcement learning in a computer game, and what was its core mechanic?

AI (RAG mode): Hey there! I'm here to help you with the video featuring Demis Hassabis, CEO and co-founder of DeepMind. In the video, he discusses the Turing Test, its limitations, and how he believes true AI capabilities will be demonstrated through generalizability across multiple tasks rather than just language-based communicatio

#### Try some questions

In [288]:

# question = "is the topic of origin of life discussed in this video? if yes then what was discussed"
# question = "What does Demis say about aliens or life on other planets?"
# question = "What is the fundamental problem that AlphaFold solved, and why is it significant for biology and medicine?"
# question = "Who first articulated the protein folding problem, and when?"
# question = "Does Demis Hassabis believe we are living in a computer game-like simulation, and what is his alternative view on understanding the universe?"
# question = "What game does Demis consider the most impressive example of reinforcement learning in a computer game, and what was its core mechanic?"
# question = "What specific topic is Demis Hassabis discussing right at (34:00) in the video?"
# question = "Dr. Divyakirti ne evolutionary psychology ka use karke, men mein emotions suppress karne ke phenomenon ko kaise explain kiya hai?"
# question = "Video mein, Dr. Divyakirti ne poverty scar hypothesis ke baare mein kya bataya? Aur unka personal experience is hypothesis se kaise align karta hai ya differ karta hai? "
question = "Dr. Divyakirti ne fame aur popularity ko kaise dekha hai, especially jab negative incidents unse associate kiye jaate hain, even if it's a misunderstanding? Unhone is situation ko handle karne ke liye kaun sa analogy use kiya?"