<a href="https://colab.research.google.com/github/RoshniBanu/SDC_Codes_Colab/blob/main/Langchain%2BRAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

YOUTUBE VIDEO SUMMARIZER WITH LANGCHAIN AND RAG

In [11]:
!pip install langchain faiss-cpu youtube-transcript-api sentence-transformers transformers




In [22]:
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.embeddings.base import Embeddings
import re

# 1. Custom wrapper for SentenceTransformer to work with LangChain
class LocalEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode([text])[0]

# 2. Get transcript from YouTube video
def get_transcript(video_url):
    video_id = re.search(r"(?<=v=)[^&#]+", video_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id.group(0)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = " ".join([entry["text"] for entry in transcript])
    return text

# 3. Split long text into smaller chunks
def split_text_into_docs(text, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

# 4. Create FAISS vector store with local embeddings
def create_vectorstore(docs):
    embedding = LocalEmbedding()
    db = FAISS.from_documents(docs, embedding)
    return db

# 5. Retrieve relevant chunks and summarize
def summarize_retrieved_docs(vectorstore, query="Summarize the video"):
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    combined_text = " ".join([doc.page_content for doc in docs])

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)[0]["summary_text"]
    return summary

# 6. Complete pipeline
def summarize_youtube_video(video_url):
    print("📥 Getting transcript...")
    text = get_transcript(video_url)

    print("✂️ Splitting into chunks...")
    docs = split_text_into_docs(text)

    print("📚 Building vectorstore...")
    vectorstore = create_vectorstore(docs)

    print("🧠 Summarizing...")
    summary = summarize_retrieved_docs(vectorstore)

    return summary

# 7. Run it!
video_url = "https://www.youtube.com/watch?v=d4yCWBGFCEs"  # Replace with your video
summary = summarize_youtube_video(video_url)
print("\n📋 Summary:\n", summary)


📥 Getting transcript...
✂️ Splitting into chunks...
📚 Building vectorstore...
🧠 Summarizing...


Device set to use cpu



📋 Summary:
 This tool is ready now it's going to be very useful to my equity research analyst my peter Panda who is investing on aoki's behalf because now you don't have to read so many articles whatever question you have you can ask it to this news research tool it will not only give you the answer but it gives you the sources reference. By the way with llm boom many clients are building this kind of text to or video model you can just Google open as s you'll find the demo.


YOUTUBE VIDEO SUMMARIZER WITH URL INPUT WITH LANGCHAIN AND RAG

In [13]:
!pip install langchain faiss-cpu youtube-transcript-api sentence-transformers transformers ipywidgets


Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.2


In [23]:
import ipywidgets as widgets
from IPython.display import display
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain.vectorstores import FAISS
from transformers import pipeline
from langchain.embeddings.base import Embeddings
import re

# 1. Custom wrapper for SentenceTransformer to work with LangChain
class LocalEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        return self.model.encode(texts)

    def embed_query(self, text):
        return self.model.encode([text])[0]

# 2. Get transcript from YouTube video
def get_transcript(video_url):
    video_id = re.search(r"(?<=v=)[^&#]+", video_url)
    if not video_id:
        raise ValueError("Invalid YouTube URL")
    video_id = video_id.group(0)
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = " ".join([entry["text"] for entry in transcript])
    return text

# 3. Split long text into smaller chunks
def split_text_into_docs(text, chunk_size=1000, chunk_overlap=100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.create_documents([text])

# 4. Create FAISS vector store with local embeddings
def create_vectorstore(docs):
    embedding = LocalEmbedding()
    db = FAISS.from_documents(docs, embedding)
    return db

# 5. Retrieve relevant chunks and summarize
def summarize_retrieved_docs(vectorstore, query="Summarize the video"):
    retriever = vectorstore.as_retriever()
    docs = retriever.get_relevant_documents(query)
    combined_text = " ".join([doc.page_content for doc in docs])

    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    summary = summarizer(combined_text, max_length=300, min_length=50, do_sample=False)[0]["summary_text"]
    return summary

# 6. Complete pipeline
def summarize_youtube_video(video_url):
    print("📥 Getting transcript...")
    text = get_transcript(video_url)

    print("✂️ Splitting into chunks...")
    docs = split_text_into_docs(text)

    print("📚 Building vectorstore...")
    vectorstore = create_vectorstore(docs)

    print("🧠 Summarizing...")
    summary = summarize_retrieved_docs(vectorstore)

    return summary

# 7. Interactive Widgets for URL Input
def on_button_click(b):
    video_url = url_textbox.value
    if not video_url:
        print("Please provide a valid URL.")
        return
    print("⌛ Processing the video...")
    summary = summarize_youtube_video(video_url)
    print("\n📋 Summary:\n", summary)

# Create a URL input field
url_textbox = widgets.Text(
    description='YouTube URL:',
    placeholder='Enter YouTube video URL',
    layout=widgets.Layout(width='50%')
)

# Create a button to trigger the summarization
summarize_button = widgets.Button(description="Summarize Video", layout=widgets.Layout(width='20%'))

# Bind button click to the function
summarize_button.on_click(on_button_click)

# Display the input field and button
display(url_textbox, summarize_button)


Text(value='', description='YouTube URL:', layout=Layout(width='50%'), placeholder='Enter YouTube video URL')

Button(description='Summarize Video', layout=Layout(width='20%'), style=ButtonStyle())

⌛ Processing the video...
📥 Getting transcript...
✂️ Splitting into chunks...
📚 Building vectorstore...
🧠 Summarizing...


Device set to use cpu



📋 Summary:
 This tool is ready now it's going to be very useful to my equity research analyst my peter Panda who is investing on aoki's behalf because now you don't have to read so many articles whatever question you have you can ask it to this news research tool it will not only give you the answer but it gives you the sources reference. By the way with llm boom many clients are building this kind of text to or video model you can just Google open as s you'll find the demo.


MEDICAL q&a CHATBOT WITH PUBMEN

In [15]:
!pip install requests langchain openai


Collecting openai
  Using cached openai-1.74.0-py3-none-any.whl.metadata (25 kB)
Using cached openai-1.74.0-py3-none-any.whl (644 kB)
Installing collected packages: openai
Successfully installed openai-1.74.0


In [17]:
!pip install requests transformers bs4 sentence-transformers langchain faiss-cpu


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [25]:
import requests
from bs4 import BeautifulSoup
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 1. Function to search PubMed and retrieve relevant articles
def search_pubmed(query, max_results=5):
    # PubMed API endpoint
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmax': max_results,
        'retmode': 'xml',
        'sort': 'relevance',
    }

    response = requests.get(base_url, params=params)
    if response.status_code != 200:
        raise Exception("Error searching PubMed")

    # Parse the response and extract article IDs
    soup = BeautifulSoup(response.content, 'xml')
    article_ids = [id_tag.text for id_tag in soup.find_all('Id')]
    return article_ids

# 2. Function to fetch article details from PubMed
def fetch_article_details(article_ids):
    # PubMed article details API endpoint
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    ids = ','.join(article_ids)
    params = {
        'db': 'pubmed',
        'id': ids,
        'retmode': 'xml',
    }

    response = requests.get(base_url, params=params)
    if response.status_code != 200:
        raise Exception("Error fetching article details from PubMed")

    # Parse the response and extract article titles and abstracts
    soup = BeautifulSoup(response.content, 'xml')
    articles = []
    for article in soup.find_all('PubmedArticle'):
        title = article.find('ArticleTitle').text if article.find('ArticleTitle') else 'No title'
        abstract = article.find('AbstractText').text if article.find('AbstractText') else 'No abstract available'
        articles.append(f"Title: {title}\nAbstract: {abstract}")
    return articles

# 3. Function to create a medical Q&A chatbot
def medical_qa_chatbot(query):
    # Search PubMed for relevant articles
    print(f"🔍 Searching PubMed for: {query}")
    article_ids = search_pubmed(query, max_results=5)

    if not article_ids:
        return "Sorry, I couldn't find any relevant articles on PubMed."

    # Fetch article details (titles and abstracts)
    articles = fetch_article_details(article_ids)

    # Combine articles into one document for summarization
    documents = "\n\n".join(articles)

    # Use SentenceTransformer to create embeddings
    model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode([documents])

    # Create FAISS index to store and retrieve the embeddings
    index = faiss.IndexFlatL2(embeddings.shape[1])  # L2 distance (Euclidean distance)
    index.add(np.array([embeddings[0]]).astype(np.float32))

    # Use HuggingFace model for question answering
    qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")

    # Retrieve relevant documents from FAISS index
    retrieved_text = documents  # In this case, we're directly using the full text

    # Use the question answering model to answer based on the retrieved text
    answer = qa_model(question=query, context=retrieved_text)

    return answer['answer']

# 4. Example usage
query = input("Enter your medical question: ")
response = medical_qa_chatbot(query)
print("\n🔑 Answer:", response)



Enter your medical question: symptoms of flu
🔍 Searching PubMed for: symptoms of flu


Device set to use cpu



🔑 Answer: fever, fatigue, cough, and body aches


In [26]:
!pip install pydub transformers torch SpeechRecognition moviepy langchain


Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting SpeechRecognition
  Downloading speechrecognition-3.14.2-py3-none-any.whl.metadata (30 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Downloading speechrecognition-3.14.2-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydub, SpeechRecognition
Successfully installed SpeechRecognition-3.14.2 pydub-0.25.1


In [30]:
!pip install pytube moviepy SpeechRecognition transformers torch pydub
!pip install yt-dlp





Collecting yt-dlp
  Downloading yt_dlp-2025.3.31-py3-none-any.whl.metadata (172 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.2/172.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.3.31-py3-none-any.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.3.31


In [32]:
!pip install yt-dlp moviepy SpeechRecognition transformers torch
!apt-get install ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 30 not upgraded.


In [33]:
import os
import subprocess
import speech_recognition as sr
from transformers import pipeline
import moviepy.editor as mp
from yt_dlp import YoutubeDL

# Step 1: Download audio from YouTube using yt-dlp and convert to WAV
def download_youtube_audio(youtube_url, output_filename="downloaded_audio.wav"):
    print("🎬 Downloading audio using yt-dlp...")

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'quiet': False
    }

    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    return "temp_audio.wav"

# Step 2: Transcribe audio to text using SpeechRecognition
def audio_to_text(audio_path):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        print("🗣️ Transcribing...")
        audio_data = recognizer.record(source)
        try:
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Could not understand the audio."
        except sr.RequestError:
            return "Error with the transcription service."

# Step 3: Answer user questions using HuggingFace QA model
def answer_question(text, question):
    print("❓ Answering question...")
    qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    return qa(question=question, context=text)["answer"]

# Step 4: Combine everything in one function
def handle_youtube_qa(youtube_url, user_question):
    try:
        audio_file = download_youtube_audio(youtube_url)
        transcript = audio_to_text(audio_file)

        print("\n📜 Transcript preview:\n", transcript[:300], "...\n")
        answer = answer_question(transcript, user_question)
        print("✅ Answer:", answer)
    except Exception as e:
        print("❌ Error:", str(e))

# 🔹 Interactive usage
youtube_url = input("Paste the YouTube video URL: ")
user_question = input("What do you want to ask about the video? ")

handle_youtube_qa(youtube_url, user_question)


Paste the YouTube video URL: https://www.youtube.com/watch?v=uhiBAQmkdb0
What do you want to ask about the video? what is the video about?
🎬 Downloading audio using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=uhiBAQmkdb0
[youtube] uhiBAQmkdb0: Downloading webpage
[youtube] uhiBAQmkdb0: Downloading tv client config
[youtube] uhiBAQmkdb0: Downloading player 64be519f-main
[youtube] uhiBAQmkdb0: Downloading tv player API JSON
[youtube] uhiBAQmkdb0: Downloading ios player API JSON
[youtube] uhiBAQmkdb0: Downloading m3u8 information
[info] uhiBAQmkdb0: Downloading 1 format(s): 251
[download] Destination: temp_audio.webm
[download] 100% of    6.21MiB in 00:00:00 at 14.46MiB/s  
[ExtractAudio] Destination: temp_audio.wav
Deleting original file temp_audio.webm (pass -k to keep)
🗣️ Transcribing...

📜 Transcript preview:
 Error with the transcription service. ...

❓ Answering question...


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu


✅ Answer: Error with the transcription service


In [34]:
!pip install openai-whisper


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20240930-py3-none-any.whl size=803405 sha256=6e859f85f2b57b2220b20af5f792691b3506898a301273e23b0a64964d9e55ee
  Stored in directory: /root/.cache/pip/wheels/2f/f2/ce/6eb23db4091d026238ce76703bd66da60b969d70bcc81d5d3a
Successfully built openai-whisper
Installing collected packages: openai-whisper
Successfully installed openai-whisper-20240930


In [36]:
import os
import subprocess
import whisper
from transformers import pipeline
import yt_dlp

# Step 1: Download audio from YouTube using yt-dlp and convert to WAV
def download_youtube_audio(youtube_url, output_filename="downloaded_audio.wav"):
    print("🎬 Downloading audio using yt-dlp...")

    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'temp_audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'quiet': False
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])

    return "temp_audio.wav"

# Step 2: Transcribe audio to text using Whisper (more robust than SpeechRecognition)
def audio_to_text(audio_path):
    print("🗣️ Transcribing with Whisper...")
    model = whisper.load_model("base")  # You can try 'small' or 'large' for better quality
    result = model.transcribe(audio_path)
    return result['text']

# Step 3: Answer user questions using HuggingFace QA model
def answer_question(text, question):
    print("❓ Answering question...")
    qa = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    return qa(question=question, context=text)["answer"]

# Step 4: Combine everything in one function
def handle_youtube_qa(youtube_url, user_question):
    try:
        audio_file = download_youtube_audio(youtube_url)
        transcript = audio_to_text(audio_file)

        print("\n📜 Transcript preview:\n", transcript[:300], "...\n")
        answer = answer_question(transcript, user_question)
        print("✅ Answer:", answer)
    except Exception as e:
        print("❌ Error:", str(e))

# 🔹 Interactive usage
youtube_url = input("Paste the YouTube video URL: ")
user_question = input("What do you want to ask about the video? ")

handle_youtube_qa(youtube_url, user_question)


Paste the YouTube video URL: https://www.youtube.com/watch?v=uhiBAQmkdb0
What do you want to ask about the video? explain the first quiz answer?
🎬 Downloading audio using yt-dlp...
[youtube] Extracting URL: https://www.youtube.com/watch?v=uhiBAQmkdb0
[youtube] uhiBAQmkdb0: Downloading webpage
[youtube] uhiBAQmkdb0: Downloading tv client config
[youtube] uhiBAQmkdb0: Downloading player 64be519f-main
[youtube] uhiBAQmkdb0: Downloading tv player API JSON
[youtube] uhiBAQmkdb0: Downloading ios player API JSON
[youtube] uhiBAQmkdb0: Downloading m3u8 information
[info] uhiBAQmkdb0: Downloading 1 format(s): 251
[download] Destination: temp_audio.webm
[download] 100% of    6.21MiB in 00:00:00 at 6.28MiB/s   
[ExtractAudio] Destination: temp_audio.wav
Deleting original file temp_audio.webm (pass -k to keep)
🗣️ Transcribing with Whisper...



Device set to use cpu



📜 Transcript preview:
  Number 10 Number 9 Number 8 Number 7 Number 6 Number 5 Number 4 Number 4 Number 3 Number 2 Number 1 Number 7 Number 7 ...

❓ Answering question...
✅ Answer: 7
