### Install Libraries

In [1]:
# !pip install -q youtube-transcript-api langchain-community langchain-openai faiss-cpu tiktoken python-dotenv google-api-python-client

In [48]:
# !pip install youtube-transcript-api==1.1.0

In [47]:
# !pip install tiktoken

In [49]:
# !pip show youtube-transcript-api

In [46]:
# !pip list

### Import Packages

In [1]:
import os
import re
import time
import json
import yt_dlp
import webvtt
from pprint import pprint
from urllib.parse import urlparse, parse_qs
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage, AIMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableMap, RunnableLambda
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI

### Setup API & Model

In [None]:
# Set up your API key
os.environ["GOOGLE_API_KEY"] = ""

# Set up Google Gemini LLM
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", temperature=0.5, max_tokens=500)
response = llm.invoke("Hi, How are you?")
print(response.content)
print(response.usage_metadata)

I am doing well, thank you for asking! How are you today?
{'input_tokens': 6, 'output_tokens': 16, 'total_tokens': 22, 'input_token_details': {'cache_read': 0}}


### Extracting YT Video Transcript

In [6]:
def extract_youtube_video_id(url: str) -> str:
    parsed = urlparse(url)
    
    if parsed.hostname in ("www.youtube.com", "youtube.com"):
        return parse_qs(parsed.query).get("v", [None])[0]
    
    if parsed.hostname == "youtu.be":
        return parsed.path.lstrip("/")
    
    match = re.search(r"(?:v=|\/|embed\/)([0-9A-Za-z_-]{11})", url)
    return match.group(1) if match else None
    
# Example usage
yt_url = "https://www.youtube.com/watch?v=67_aMPDk2zw&t=1s"
video_id = extract_youtube_video_id(yt_url)
print("🎯 Extracted Video ID:", video_id)

🎯 Extracted Video ID: 67_aMPDk2zw


In [7]:
# from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

# video_id = "67_aMPDk2zw" # only the ID, not full URL
# try:
#     # If you don’t care which language, this returns the “best” one
#     transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=["en"])

#     # Flatten it to plain text
#     transcript = " ".join(chunk["text"] for chunk in transcript_list)
#     print(transcript)

# except TranscriptsDisabled:
#     print("No captions available for this video.")

In [8]:
## Extracting Transcript

video_id = video_id
priority_lang = "en"
max_retries = 5
    

def retry_get_transcript(video_id, lang_code, retries=5):
    for attempt in range(retries):
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang_code])
            return transcript
        except Exception as e:
            print(f"🔁 Retrying {lang_code} ({attempt + 1}/{retries}) due to error: {e}")
            time.sleep(2)
    return None


def get_transcript_with_priority(video_id, priority_lang="en", retries=5):
    try:
        # Step 1: Get transcript list
        transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
        all_langs = [t.language_code for t in transcripts]
        print(f"🎯 Available languages: {all_langs}")

        # Step 3: Try preferred language with retries
        if priority_lang in all_langs:
            transcript_chunks = retry_get_transcript(video_id, priority_lang, retries)
            if transcript_chunks:
                print(f"✅ Found transcript in preferred language: {priority_lang}")
                transcript = " ".join(chunk["text"] for chunk in transcript_chunks)

            return transcript_chunks, transcript

        # Step 4: Fallback to first available language with retries
        if all_langs:
            fallback_lang = all_langs[0]
            transcript_chunks = retry_get_transcript(video_id, fallback_lang, retries)
            if transcript_chunks:
                print(f"⚠️ Preferred language not found. Falling back to: {fallback_lang}")
                transcript = " ".join(chunk["text"] for chunk in transcript_chunks)

            return transcript_chunks, transcript


    except TranscriptsDisabled:
        print("❌ Transcripts are disabled for this video.")
    except Exception as e:
        print(f"❗ Unexpected error: {e}")

    return None

# 🔽 Run the logic
transcript_list, transcript = get_transcript_with_priority(video_id, retries=5)
if transcript:
    print("\n📄 Transcript Preview:\n", transcript[:500])
    print("\n📄 Transcript Chunks Preview:\n", transcript_list[:500])
else:
    print("❗ Could not retrieve transcript after multiple attempts.")

🎯 Available languages: ['en']
✅ Found transcript in preferred language: en

📄 Transcript Preview:
 foreign [Music] has a curious parrot called buddy buddy has a great mimicking ability and a sharp memory buddy listens to all the conversations in Peter's home and can mimic them very accurately now when he hears feeling hungry I would like to have some for this case the probability of him saying Biryani cherries or food is much higher than the words such as bicycle or book but he doesn't understand the meaning of Biryani or food or cherries the way humans do all he is doing is using statistical

📄 Transcript Chunks Preview:
 [{'text': 'foreign', 'start': 0.24, 'duration': 3.14}, {'text': '[Music]', 'start': 0.88, 'duration': 5.72}, {'text': 'has a curious parrot called buddy buddy', 'start': 3.38, 'duration': 5.139}, {'text': 'has a great mimicking ability and a', 'start': 6.6, 'duration': 3.3}, {'text': 'sharp memory', 'start': 8.519, 'duration': 3.361}, {'text': 'buddy listens to all t

In [None]:
print(f"Transcript List: {transcript_list[:5]}")
print(f"Transcript: {transcript[:100]}")

In [65]:
## Extracting Subtitle

VIDEO_URL = yt_url
VTT_FILENAME = "transcript_temp.en.vtt"

def download_subtitles(video_url: str):
    options = {
        'skip_download': True,
        'writesubtitles': False,
        'writeautomaticsub': True,
        'subtitleslangs': ['en'],
        'subtitlesformat': 'vtt',
        'outtmpl': 'transcript_temp.%(ext)s',
        'noplaylist': True
    }
    with yt_dlp.YoutubeDL(options) as ydl:
        ydl.download([video_url])


def parse_and_clean_vtt(vtt_filename: str):
    transcript_list = []
    transcript_text = ""

    for caption in webvtt.read(vtt_filename):
        line = caption.text.strip()
        transcript_list.append({
            "start": caption.start,
            "end": caption.end,
            "text": line
        })
        transcript_text += line + "\n"

    lines = transcript_text.strip().splitlines()

    # Remove consecutive duplicates
    cleaned_lines = []
    previous_line = None
    for line in lines:
        if line != previous_line:
            cleaned_lines.append(line)
        previous_line = line

    cleaned_text = "\n".join(cleaned_lines)
    return cleaned_text, transcript_list


print("Downloading subtitles...")
download_subtitles(VIDEO_URL)

print("Processing subtitle file...")
cleaned_text, transcript_data = parse_and_clean_vtt(VTT_FILENAME)

Downloading subtitles...
[youtube] Extracting URL: https://www.youtube.com/watch?v=zYGDpG-pTho&t=456s
[youtube] zYGDpG-pTho: Downloading webpage
[youtube] zYGDpG-pTho: Downloading tv client config
[youtube] zYGDpG-pTho: Downloading tv player API JSON
[youtube] zYGDpG-pTho: Downloading ios player API JSON
[youtube] zYGDpG-pTho: Downloading player fc2a56a5-main


         player = https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js
         n = jopnKWHFxUulVqO ; player = https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = Id1dQbFg0AkRnKh ; player = https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U
         n = _7ffU55wsHyyjuw ; player = https://www.youtube.com/s/player/fc2a56a5/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


[youtube] zYGDpG-pTho: Downloading m3u8 information
[info] zYGDpG-pTho: Downloading subtitles: en
[info] zYGDpG-pTho: Downloading 1 format(s): 616+234
Deleting existing file transcript_temp.en.vtt
[info] Writing video subtitles to: transcript_temp.en.vtt
[download] Destination: transcript_temp.en.vtt
[download] 100% of   90.44KiB in 00:00:00 at 292.63KiB/s
Processing subtitle file...


In [41]:
cleaned_text, transcript_data = parse_and_clean_vtt(VTT_FILENAME)
pprint(cleaned_text[:100])
pprint(transcript_data[:2])

('Remember how back in the day people\n'
 'would Google themselves? You type your\n'
 'name into a search engine')
[{'end': '00:00:02.879',
  'start': '00:00:02.869',
  'text': 'Remember how back in the day people'},
 {'end': '00:00:04.950',
  'start': '00:00:02.879',
  'text': 'Remember how back in the day people\n'
          'would Google themselves? You type your'}]


### Save Transcript

In [43]:
BASE_DIR = "youtube_data"
TRANSCRIPT_DIR = os.path.join(BASE_DIR, "transcript")
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)


def save_transcript_to_txt(comments, filename="transcript.txt"):
    filepath = os.path.join(TRANSCRIPT_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        for i, c in enumerate(comments, start=1):
            f.write(f"{i}. {c['text']}\n")


def save_transcript_to_json(metadata, filename="transcript.json"):
    filepath = os.path.join(TRANSCRIPT_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)


def save_transcript(cleaned_text, transcript_data, txt_filename="transcript_subtitle.txt", json_filename="transcript_subtitle.json"):
    txt_path = os.path.join(TRANSCRIPT_DIR, txt_filename)
    json_path = os.path.join(TRANSCRIPT_DIR, json_filename)

    with open(txt_path, "w", encoding="utf-8") as txt_out:
        txt_out.write(cleaned_text)

    with open(json_path, "w", encoding="utf-8") as json_out:
        json.dump(transcript_data, json_out, indent=2, ensure_ascii=False)

# Saving all formats
# save_transcript_to_txt(transcript_list)
# save_transcript_to_json(transcript_list)
save_transcript(cleaned_text, transcript_data)

### Text Chunking

In [44]:
transcript = cleaned_text
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])
pprint(chunks[:4])
print("Chunk Size:", len(chunks))

[Document(metadata={}, page_content="Remember how back in the day people\nwould Google themselves? You type your\nname into a search engine and you see\nwhat it knows about you? Well, the\nmodern equivalent of that is to do the\nsame thing with a chatbot. So, when I\nask a large language model, who is\nMartin Keen? Well, the response varies\ngreatly depending upon which model I'm\nasking because different models, they\nhave different training data sets. They\nhave different knowledge cutoff dates.\nSo what a given model knows about me?\nWell, it differs greatly. But how could\nwe improve the model's answer? Well,\nthere's three ways. So let's start with\na model here. And we're going to see how\nwe can improve its responses. Well, the\nfirst thing it could do is it could go\nout and it could perform a search. a\nsearch for new data that either wasn't\nin its training data set or it was just\ndata that became available after the\nmodel finished training. And then it\ncould incorporate t

### Embedding Generation & Utilizing Vector Store

In [46]:
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector_store = FAISS.from_documents(chunks, embeddings)

In [47]:
print(vector_store.index_to_docstore_id[2])
print(vector_store.get_by_ids(['d905d677-7a93-4c13-adf6-bd4215f31491']))

a71330c9-af01-4835-a465-cb6e91597d2c
[]


### Retrieval Process

In [48]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})
pprint(retriever)

VectorStoreRetriever(tags=['FAISS', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x00000196C8095F30>, search_kwargs={'k': 4})


In [49]:
retriever_docs = retriever.invoke('What is in the video')
pprint(retriever_docs[0].page_content)

('in its training data set or it was just\n'
 'data that became available after the\n'
 'model finished training. And then it\n'
 'could incorporate those results from the\n'
 'search back into its answer. That is\n'
 'called rag or retrieval augmented\n'
 "generation. That's one method. Or we\n"
 'could pick a specialized model, a model\n'
 "that's been trained on, let's say,\n"
 'transcripts of these videos. That would\n'
 'be an example of something called fine\n'
 'tuning. Or we could ask the model a\n'
 "query that better specifies what we're\n"
 'looking for. So maybe the LLM already\n'
 'knows plenty about the Martin Keen of\n'
 "the world, but let's tell the model that\n"
 "we're referring to the Martin Keen who\n"
 'works at IBM rather than the Martin Keen\n'
 'that founded Keen Shoes. That is an\n'
 'example of prompt\n'
 'engineering. Three ways to get better\n'
 'outputs out of large language models,\n'
 'each with their pluses and\n'
 "minuses. Let's start with rag. So, le

### Augmentation Process

In [50]:
template = """ You are a helpful assistant. Answer ONLY from the provided transcript context. 
                 If the context is insufficient, just say you don't know.
                 Context: {context}
             """
prompt = ChatPromptTemplate.from_messages([MessagesPlaceholder(variable_name="chat_history"),
                                           ("system", template), 
                                           ("human", "question:{input}")
                                           ])

In [51]:
question = "What is this topic about?"
retrieved_docs = retriever.invoke(question)
pprint(retrieved_docs)

[Document(id='77b88199-0940-4a68-89fc-61bbf04c6d1c', metadata={}, page_content="we've got documents that need to be\nvector embeddings and we need to store\nthese vector embeddings in a database.\nAll of this adds to processing costs. It\nadds to infrastructure costs to make\nthis solution work. All right, next up,\nfine tuning. So remember how we\ndiscussed getting better answers about\nme by training a model specifically on\nlet's say my video transcripts? Well,\nthat is fine tuning in action. So what\nwe do with fine-tuning is we take a\nmodel but specifically an existing model\nand that existing model has broad\nknowledge and then we're going to give\nit additional specialized training on a\nfocused data set. So this is now\nspecialized to what we want to develop\nparticular expertise on. Now during\nfine-tuning, we're updating the model's\ninternal parameters through additional\ntraining. So the model starts out with\nsome weights here like\nthis. And those weights were optimized\

In [52]:
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
pprint(context_text[:500])

("we've got documents that need to be\n"
 'vector embeddings and we need to store\n'
 'these vector embeddings in a database.\n'
 'All of this adds to processing costs. It\n'
 'adds to infrastructure costs to make\n'
 'this solution work. All right, next up,\n'
 'fine tuning. So remember how we\n'
 'discussed getting better answers about\n'
 'me by training a model specifically on\n'
 "let's say my video transcripts? Well,\n"
 'that is fine tuning in action. So what\n'
 'we do with fine-tuning is we take a\n'
 'model but specifically an existing model\n'
 'and ')


In [53]:
chat_history = []
final_prompt = prompt.invoke({"context": context_text, "input": question, "chat_history":chat_history})
pprint(final_prompt)

ChatPromptValue(messages=[SystemMessage(content=" You are a helpful assistant. Answer ONLY from the provided transcript context. \n                 If the context is insufficient, just say you don't know.\n                 Context: we've got documents that need to be\nvector embeddings and we need to store\nthese vector embeddings in a database.\nAll of this adds to processing costs. It\nadds to infrastructure costs to make\nthis solution work. All right, next up,\nfine tuning. So remember how we\ndiscussed getting better answers about\nme by training a model specifically on\nlet's say my video transcripts? Well,\nthat is fine tuning in action. So what\nwe do with fine-tuning is we take a\nmodel but specifically an existing model\nand that existing model has broad\nknowledge and then we're going to give\nit additional specialized training on a\nfocused data set. So this is now\nspecialized to what we want to develop\nparticular expertise on. Now during\nfine-tuning, we're updating the 

### Generation Process

In [54]:
answer = llm.invoke(final_prompt)
pprint(answer.content)

('The topic is about fine-tuning, which involves taking an existing model with '
 'broad knowledge and giving it additional specialized training on a focused '
 'data set to develop particular expertise. It also discusses prompt '
 'engineering, which involves including specific elements in a prompt to '
 "direct the model's attention to relevant patterns learned during training.")


### Building End-to-End Chain

In [55]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [56]:
retriever_chain = RunnableMap({
    "context": lambda inputs: retriever.invoke(inputs["input"]),
    "input": lambda inputs: inputs["input"],
    "chat_history": lambda inputs: inputs["chat_history"],
})

format_chain = RunnableLambda(lambda x: {**x, "context": format_docs(x["context"])})

rag_chain = retriever_chain | format_chain | prompt | llm 

In [57]:
chat_history = []

question = "Can you suumarise the Video"

response = rag_chain.invoke({"input": question, "chat_history": chat_history})

chat_history.extend([HumanMessage(content=question), AIMessage(content=response.content)])

print(response.content)

The video discusses three ways to improve the outputs of large language models (LLMs): Retrieval Augmented Generation (RAG), fine-tuning, and prompt engineering. RAG involves searching for new data and incorporating it into the answer. Fine-tuning involves training a model on a specialized data set. Prompt engineering involves better specifying what we're looking for. The video also touches on the pluses and minuses of each approach.


In [58]:
question = "Please Elaborate"

response = rag_chain.invoke({
    "input": question,
    "chat_history": chat_history
})

chat_history.extend([
    HumanMessage(content=question),
    AIMessage(content=response.content)
])

print(response.content)

Okay, let's elaborate on each of the three methods for improving LLM outputs: Retrieval Augmented Generation (RAG), Fine-tuning, and Prompt Engineering, along with their pros and cons.

**1. Retrieval Augmented Generation (RAG)**

*   **What it is:** RAG is a technique that enhances LLMs by allowing them to access and incorporate information from external knowledge sources *before* generating a response. Think of it as giving the LLM access to a "library" or a database to research before answering your question.

*   **How it works:**

    1.  **User Query:** The user submits a question or prompt.
    2.  **Retrieval:** The system identifies relevant documents or information from an external knowledge base (e.g., a database, a collection of documents, a website). This is often done using semantic search, which looks for meaning rather than just keywords.
    3.  **Augmentation:** The retrieved information is combined with the original user query.
    4.  **Generation:** The augmented p

In [59]:
chat_history

[HumanMessage(content='Can you suumarise the Video', additional_kwargs={}, response_metadata={}),
 AIMessage(content="The video discusses three ways to improve the outputs of large language models (LLMs): Retrieval Augmented Generation (RAG), fine-tuning, and prompt engineering. RAG involves searching for new data and incorporating it into the answer. Fine-tuning involves training a model on a specialized data set. Prompt engineering involves better specifying what we're looking for. The video also touches on the pluses and minuses of each approach.", additional_kwargs={}, response_metadata={}),
 HumanMessage(content='Please Elaborate', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Okay, let\'s elaborate on each of the three methods for improving LLM outputs: Retrieval Augmented Generation (RAG), Fine-tuning, and Prompt Engineering, along with their pros and cons.\n\n**1. Retrieval Augmented Generation (RAG)**\n\n*   **What it is:** RAG is a technique that enhances L

In [60]:
for details in response:
    print(details)

('content', 'Okay, let\'s elaborate on each of the three methods for improving LLM outputs: Retrieval Augmented Generation (RAG), Fine-tuning, and Prompt Engineering, along with their pros and cons.\n\n**1. Retrieval Augmented Generation (RAG)**\n\n*   **What it is:** RAG is a technique that enhances LLMs by allowing them to access and incorporate information from external knowledge sources *before* generating a response. Think of it as giving the LLM access to a "library" or a database to research before answering your question.\n\n*   **How it works:**\n\n    1.  **User Query:** The user submits a question or prompt.\n    2.  **Retrieval:** The system identifies relevant documents or information from an external knowledge base (e.g., a database, a collection of documents, a website). This is often done using semantic search, which looks for meaning rather than just keywords.\n    3.  **Augmentation:** The retrieved information is combined with the original user query.\n    4.  **Gene

In [61]:
## Chain without History

#  parallel_chain = RunnableParallel({'context': retriever | RunnableLambda(format_docs),
#                                    'question': RunnablePassthrough()
#                                    })
# retriever_chain = parallel_chain.invoke('who is Demis')
# retriever_chain

# parser = StrOutputParser()
# rag_chain = parallel_chain | prompt | llm | parser
# response = rag_chain.invoke('Can you summarize the video')