# YouTube RAG System
## A Retrieval-Augmented Generation System for YouTube Video Transcripts

### Import Required Libraries

In [1]:
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)
from langchain_core.prompts import PromptTemplate
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import Chroma
from langchain_classic.retrievers.self_query.base import SelfQueryRetriever
from langchain_classic.chains.query_constructor.base import AttributeInfo, load_query_constructor_runnable
from langchain_community.query_constructors.chroma import ChromaTranslator
from langchain_core.output_parsers import StrOutputParser
from dotenv import load_dotenv
import re

### Load Environment Variables

In [2]:
load_dotenv()

True

### Initialize Language Models

In [3]:
# models
model_ep = HuggingFaceEndpoint(repo_id="google/gemma-2-2b-it", temperature=0.2)
model_hf = ChatHuggingFace(llm=model_ep)
model_ollama = ChatOllama(model="llama3.2:3b", temperature=0.5)
model_google = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.4)

  from .autonotebook import tqdm as notebook_tqdm


## Step 1a - Indexing (Document Ingestion)

### Define Supported Languages and Helper Functions

In [4]:
# Major languages supported by BGE-M3
SUPPORTED_LANGS = [
    'en', 'hi', 'es', 'fr', 'de', 'zh-Hans', 'zh-Hant', 
    'ja', 'ko', 'ru', 'pt', 'it', 'ar', 'tr', 'vi'
]

def extract_video_id(url):
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11}).*'
    match = re.search(pattern, url)
    return match.group(1) if match else url

### Extract YouTube Transcript

In [5]:
# user_link = input("Paste YouTube Link: ")
# video_id = extract_video_id(user_link)
video_id = "Gfr50f6ZBvo"

api = YouTubeTranscriptApi()
try:
    transcript_list = api.fetch(video_id, languages=SUPPORTED_LANGS)
except TranscriptsDisabled:
    print("No captions available for this video.")
except NoTranscriptFound:
    print("No transcript found for this video.")
except Exception as e:
    print(f"Error fetching transcript: {e}")

## Step 1b - Text Splitting

### Convert Transcript Fragments to Documents

In [6]:
# Convert each transcript fragment to a Document with its exact timestamp
fragment_docs = []
for fragment in transcript_list:
    fragment_docs.append(
        Document(
            page_content=fragment.text,
            metadata={
                "video_id": video_id,
                "start": int(fragment.start),
                "end": int(fragment.start + fragment.duration),
            }
        )
    )

### Merge Fragments While Preserving Timestamps

In [7]:
# Now merge fragments while preserving timestamps
raw_docs = []
current_chunk = ""
current_start = 0
current_end = 0

for doc in fragment_docs:
    # Start new chunk
    if not current_chunk:
        current_start = doc.metadata['start']
    
    current_chunk += doc.page_content + " "
    current_end = doc.metadata['end']
    
    if len(current_chunk) >= 750 or doc == fragment_docs[-1]:
        raw_docs.append(
            Document(
                page_content=current_chunk.strip(),
                metadata={
                    "video_id": video_id,
                    "start": current_start,
                    "end": current_end,
                }
            )
        )
        current_chunk = ""

### Apply Recursive Text Splitting

In [8]:
# Recursive Splitting 
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200,
    chunk_overlap=200,
)

chunks = splitter.split_documents(raw_docs)

In [9]:
page_cont = "/n/n".join(c.page_content for c in chunks)
len(page_cont)

134358

## Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

### Generate Embeddings and Create Vector Store

In [10]:
# embeddings
embeddings = OllamaEmbeddings(model="bge-m3")

# vector store
vector_store = Chroma.from_documents(
    chunks, 
    embeddings, 
    collection_name="youtube-transcript"
)

## Step 2 - Retrieval (Query Constructor) Using Self-Querying Retriever

### Define Metadata Field Information

In [11]:
metadata_field_info = [
    AttributeInfo(
        name="start",
        description="The start time of the video segment in seconds (integer). "
                    "Rule 1: If user asks for 'at 12:00', use (start <= 720). "
                    "Rule 2: If user asks for 'after 12:00', use (start >= 720). "
                    "Rule 3: ALWAYS convert minutes to seconds (min * 60).",
        type="integer", 
    ),
    AttributeInfo(
        name="end",
        description="The end time of the video segment in seconds (integer). "
                    "Rule 1: If user asks for 'at 12:00', use (end >= 720).",
        type="integer",
    ),
    AttributeInfo(
        name="video_id",
        description="The unique YouTube video identifier.",
        type="string",
    ),
]

document_content_description = "Transcript segments from a YouTube video"

### Initialize Query Constructor and Retriever

In [12]:
# Use the runnable with the custom prompt
query_constructor = load_query_constructor_runnable(
    llm=model_google,
    document_contents=document_content_description,
    attribute_info=metadata_field_info,
)

num_chunks = len(chunks)
print(f"Number of chunks: {num_chunks}")
# `dynamic k` which can be adjusted based on the number of chunks automatically
dynamic_k = max(4, min(12, num_chunks // 10))
print(f"Dynamic k: {dynamic_k}")

# Initialize Retriever
retriever = SelfQueryRetriever(
    query_constructor=query_constructor,
    vectorstore=vector_store,
    structured_query_translator=ChromaTranslator(),
    search_kwargs={"k": dynamic_k},
    verbose=True
)

Number of chunks: 175
Dynamic k: 12


## Step 3 - Augmentation (Formatting)

### Define Document Formatting Function

In [13]:
# formatting retrieved docs with timestamps so that llm can understand the context with time
def format_docs(retrieved_docs):
    context_entries = []
    for doc in retrieved_docs:
        s = doc.metadata['start']
        timestamp = f"{s // 60}:{s % 60:02d}"
        context_entries.append(f"[{timestamp}]: {doc.page_content}")
    return "\n\n".join(context_entries)

### Routing intent for summarization

In [14]:
from typing import Literal
from pydantic import BaseModel, Field

# Intent Router 
class Router(BaseModel):
    choice: Literal["SUMMARY", "RAG"] = Field(
        description="""
        SUMMARY: Choose this if the user wants an overview, key takeaways, 
        a list of main points, or a general description of the WHOLE video. 
        
        RAG: Choose this if the user is asking a specific question, looking 
        for a particular fact, person, timestamp, or details about a 
        specific sub-topic within the video.
        """
    )

router_chain = model_google.with_structured_output(Router)

### Smart Summary Stuffing or Sampling
If the yt transcript is greater than 500,000 characters, it takes every 2nd or 3rd chunk to stay in the high-accuracy zone else it uses the full transcript

In [15]:
# Summary Function 
def get_universal_summary(chunks): 
    MAX_CHARS = 500000 
    # In tokens it is equivalent to MAX_CHARS / 4 (1 token = 4 characters)
    
    total_text = " ".join([c.page_content for c in chunks])
    
    if len(total_text) > MAX_CHARS:
        print(f"Video is massive ({len(total_text)} chars). Using Smart Sampling...")
        # it takes every 2nd or 3rd chunk to stay in the high-accuracy zone
        step = len(total_text) // MAX_CHARS + 1
        sampled_chunks = chunks[::step] 
        final_text = " ".join([c.page_content for c in sampled_chunks])
    else:
        print("Video is standard size. Using full transcript...")
        final_text = total_text

    res = model_google.invoke(f"""
        Summarize this YouTube video professionally. 
        Provide a concise 4-5 sentence overview followed by key takeaways in bullet points.
        
        VIDEO CONTENT:
        {final_text}
    """)
    return res.content

### Create Prompt Template

In [16]:
prompt = PromptTemplate(
    template="""
    You are a helpful YouTube AI assistant. 
    Answer the question based on the video content and our conversation history.
    
    VIDEO CONTENT:
    {context}

    CHAT HISTORY:
    {chat_history}

    USER QUESTION: 
    {question}

    INSTRUCTIONS:
    1. Answer using ONLY the video content provided. 
    2. Refer to the content as "the video" or "the speaker," NEVER "the transcript" or "the context."
    3. If the information is NOT in the context, say: "This topic is not discussed in the provided transcript segments."
    4. If the answer IS present, conclude with the source link: https://youtu.be/{video_id}?t={seconds}s
    5. Do not provide a link if the information is not found.
    6. Respond in the same language as the USER QUESTION.
    """,
    input_variables=["context", "question", "video_id", "seconds", "chat_history"]
)

## Step 4 - Generation

### Initialize Chat History and Run Query

In [17]:
def get_intent(query, history):
    # We take the last 2 messages to give the Router context
    recent_history = "\n".join([f"{m.type}: {m.content}" for m in history.messages[-2:]])
    
    router_instruction = f"""
    You are an expert query router. Based on the conversation history and the new user request, 
    determine if the user wants a broad overview (SUMMARY) or a specific detail/follow-up (SPECIFIC_QUESTION).

    CONVERSATION HISTORY:
    {recent_history}

    NEW REQUEST: 
    {query}

    Rules:
    - If the request is a follow-up to a previous specific point, pick SPECIFIC_QUESTION.
    - If the request asks for a general overview of the whole video, pick SUMMARY.
    """
    
    # Using the structured output chain you already defined
    intent_obj = router_chain.invoke(router_instruction)
    return intent_obj.choice

In [None]:
from langchain_community.chat_message_histories import ChatMessageHistory

parser = StrOutputParser()
rag_chain = prompt | model_ollama | parser

history = ChatMessageHistory()
summary_cache = None 

print("\nAI: Video processed. I'm ready!")

while True:
    query = input("\nUser: ")
    if query.lower() in ["exit", "quit"]:
         break

    intent = get_intent(query, history)
    
    if intent == "SUMMARY":
        if not summary_cache:
            summary_cache = get_universal_summary(chunks)
        context_text = summary_cache
        timestamp = 0
    else:
        # RAG Mode (Specific Fact Finding)
        retrieved_docs = retriever.invoke(query)
        sorted_docs = sorted(retrieved_docs, key=lambda x: x.metadata.get('start', 0))
        context_text = "\n\n".join([f"[{d.metadata['start']}s]: {d.page_content}" for d in sorted_docs])
        timestamp = sorted_docs[0].metadata['start'] if sorted_docs else 0

    # CALL 3: Final Answer Generation (keeping last 6 messages in the chat history)
    history_str = "\n".join([f"{m.type}: {m.content}" for m in history.messages[-6:]])

    result = rag_chain.invoke({
        "context": context_text,
        "question": query,
        "video_id": video_id,
        "seconds": timestamp,
        "chat_history": history_str 
    })

    history.add_user_message(query)
    history.add_ai_message(result)

    print(f"\nAI ({intent} mode): {result}")


AI: Video processed. I'm ready!
Video is standard size. Using full transcript...

AI (SUMMARY mode): The main topics about AI discussed in this video are:

1. Rethinking Intelligence Benchmarks: The speaker argues that the Turing Test is an insufficient formal benchmark for AI and advocates for more rigorous, general tests that evaluate AI capabilities across a multitude of cognitive tasks to achieve true general intelligence.
2. Games as Foundational for AI: DeepMind's early success was rooted in using games like Go as efficient, well-defined environments to develop and scale reinforcement learning algorithms, demonstrating AI's capacity for self-play and surpassing human performance.
3. AlphaFold's Impact on Biology: The solution to the protein folding problem by AlphaFold 2 marks a significant scientific breakthrough, illustrating AI's transformative potential in biology, accelerating drug discovery, and paving the way for simulating complex biological systems like virtual cells.
4

: 

In [None]:
parser = StrOutputParser()

# question = "is the topic of origin of life discussed in this video? if yes then what was discussed"
# question = "What does Demis say about aliens or life on other planets?"
# question = "What is the fundamental problem that AlphaFold solved, and why is it significant for biology and medicine?"
# question = "Who first articulated the protein folding problem, and when?"
# question = "Does Demis Hassabis believe we are living in a computer game-like simulation, and what is his alternative view on understanding the universe?"
# question = "What game does Demis consider the most impressive example of reinforcement learning in a computer game, and what was its core mechanic?"
# question = "What specific topic is Demis Hassabis discussing right at (34:00) in the video?"
# question = "Dr. Divyakirti ne evolutionary psychology ka use karke, men mein emotions suppress karne ke phenomenon ko kaise explain kiya hai?"
# question = "Video mein, Dr. Divyakirti ne poverty scar hypothesis ke baare mein kya bataya? Aur unka personal experience is hypothesis se kaise align karta hai ya differ karta hai? "
question = "Dr. Divyakirti ne fame aur popularity ko kaise dekha hai, especially jab negative incidents unse associate kiye jaate hain, even if it's a misunderstanding? Unhone is situation ko handle karne ke liye kaun sa analogy use kiya?"

retrieved_docs = retriever.invoke(question) 
context_text = format_docs(retrieved_docs)
first_timestamp = retrieved_docs[0].metadata['start'] if retrieved_docs else 0

result = (prompt | model_google | parser).invoke({
    "context": context_text,
    "question": question,
    "video_id": video_id,
    "seconds": first_timestamp
})

print(result)