<a href="https://colab.research.google.com/github/Nikhitaa2329/genAI1/blob/main/RAG_ytsummariser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# ✅ Step 1: Install Dependencies
!pip install langchain langchain-community youtube-transcript-api google-generativeai faiss-cpu langchain-google-genai

# ✅ Step 2: Import Libraries
import os
import re
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import google.generativeai as genai

# ✅ Step 3: Set Your Gemini API Key
# SECURITY: never hard-code a real key in a shared notebook. The key is taken
# from the GOOGLE_API_KEY environment variable when it is already set;
# otherwise it is requested with a non-echoing prompt so it is not stored in
# the notebook source or output.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and revoked.
from getpass import getpass

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
if not GOOGLE_API_KEY:
    raise ValueError("A Gemini API key is required (set GOOGLE_API_KEY or enter it when prompted).")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)

# ✅ Step 4: Extract YouTube Video ID (supports youtube.com and youtu.be)
def extract_video_id(url):
    """Return the video ID embedded in a YouTube URL, or ``None``.

    The patterns are searched for anywhere in the string, so the scheme and
    any sub-domain prefix are optional. Recognised shapes:

    * ``youtube.com/watch?v=<id>`` - ``v`` may appear anywhere in the query
    * ``youtube.com/shorts/<id>``, ``youtube.com/embed/<id>``,
      ``youtube.com/live/<id>``
    * ``youtu.be/<id>``

    Args:
        url: URL text as typed or pasted by the user.

    Returns:
        The ID without trailing query parameters, ``#fragment``, slashes or
        whitespace; ``None`` when ``url`` is not a string or nothing matches.
    """
    if not isinstance(url, str):
        return None

    # Each capture group stops at the first delimiter so "&t=30", "#t=30",
    # a trailing "/" or pasted whitespace never leak into the returned ID.
    patterns = (
        # "(?:[^#\s]*&)?" skips any query parameters that precede "v=".
        r"youtube\.com/watch\?(?:[^#\s]*&)?v=([^&#/\s]+)",
        r"youtube\.com/(?:shorts|embed|live)/([^?&#/\s]+)",
        r"youtu\.be/([^?&#/\s]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

# ✅ Step 5: Get Transcript from YouTube
def get_transcript(youtube_url):
    """Fetch a video's transcript and flatten it into a single string.

    Args:
        youtube_url: YouTube URL; its video ID is obtained with
            ``extract_video_id``.

    Returns:
        * the caption texts joined by single spaces on success;
        * ``None`` when no video ID can be extracted, when the library raises
          ``TranscriptsDisabled`` or ``NoTranscriptFound``, or when the
          transcript contains no text;
        * a message starting with "❌" for any other failure - callers detect
          this case with ``startswith("❌")``.
    """
    try:
        video_id = extract_video_id(youtube_url)
        if not video_id:
            return None

        # Newer youtube-transcript-api releases replace the static
        # get_transcript() with an instance-level fetch(); use whichever the
        # installed version provides, since the install is not pinned.
        legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
        if legacy_fetch is not None:
            entries = legacy_fetch(video_id)
        else:
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        full_text = " ".join(entry["text"] for entry in entries).strip()
        # A transcript made only of whitespace gives nothing to summarize.
        return full_text or None
    except (TranscriptsDisabled, NoTranscriptFound):
        return None
    except Exception as e:
        # Deliberate best-effort: surface the failure as a "❌" message
        # instead of raising, because callers display it to the user.
        return f"❌ Error while fetching transcript: {e}"

# ✅ Step 6: Split transcript into chunks
def split_text_into_docs(text):
    """Split ``text`` into LangChain documents.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=100`` and returns whatever its
    ``create_documents`` call produces for the single input string.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
    ).create_documents([text])

# ✅ Step 7: Build RAG pipeline
def build_rag_chain(docs, model_name="models/gemini-1.5-pro-latest", temperature=0.2):
    """Build a RetrievalQA chain over ``docs`` answered by a Gemini chat model.

    Args:
        docs: Documents to embed and index in an in-memory FAISS store.
        model_name: Chat model identifier passed to ``ChatGoogleGenerativeAI``.
            Exposed as a parameter so the model can be changed without
            editing this function.
        temperature: Temperature passed to the chat model.

    Returns:
        A ``RetrievalQA`` chain of type "stuff" that does not return source
        documents.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectorstore = FAISS.from_documents(docs, embeddings)
    # NOTE(review): the retriever is created with its default search
    # settings, so presumably only the best-matching chunks reach the LLM and
    # a long transcript may be summarized only in part - confirm.
    retriever = vectorstore.as_retriever()

    llm = ChatGoogleGenerativeAI(model=model_name, temperature=temperature)

    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=False,
    )

# ✅ Step 8: Main Summary Function
def summarize_youtube_video(youtube_url):
    """Summarize a YouTube video from its transcript.

    Args:
        youtube_url: URL of the video to summarize.

    Returns:
        The summary text produced by the RAG chain, or a message starting
        with "❌" when the transcript is unavailable or could not be fetched.
    """
    transcript = get_transcript(youtube_url)
    if not transcript:
        return "❌ Transcript not available or invalid video URL."
    if transcript.startswith("❌"):
        # get_transcript reports unexpected failures as a "❌" message.
        return transcript

    docs = split_text_into_docs(transcript)
    if not docs:
        # Nothing to index (e.g. a whitespace-only transcript); building the
        # vector store from an empty list would fail.
        return "❌ Transcript not available or invalid video URL."
    rag_chain = build_rag_chain(docs)

    query = "Give a clear and concise summary of this video."
    # Chain.run() is deprecated in LangChain. invoke() takes the chain's input
    # mapping ("query" for RetrievalQA) and returns a mapping whose "result"
    # entry holds the answer text.
    response = rag_chain.invoke({"query": query})
    return response["result"]

# ✅ Step 9: Get user input and run the summarizer
# Prompt for the video URL, then summarize it.
video_url = input("Enter the YouTube video URL: ")
summary = summarize_youtube_video(video_url)

# Heading and summary in one call; sep="\n" reproduces the line break that
# separates the heading from the summary text.
print("\n🎯 Video Summary:\n", summary, sep="\n")


Enter the YouTube video URL: https://www.youtube.com/watch?v=dQw4w9WgXcQ

🎯 Video Summary:

The provided text seems to be a garbled portion of the lyrics to Rick Astley's song "Never Gonna Give You Up".  It describes a relationship where the singer is reassuring the other person that they will always be there for them.


In [4]:
# ✅ Step 1: Install Required Libraries
!pip install langchain langchain-community youtube-transcript-api google-generativeai gradio faiss-cpu langchain-google-genai

# ✅ Step 2: Import Libraries
import os
import re
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
import google.generativeai as genai

# ✅ Step 3: Setup Gemini API Key
# SECURITY: never hard-code a real key in a shared notebook. The key is taken
# from the GOOGLE_API_KEY environment variable when it is already set;
# otherwise it is requested with a non-echoing prompt so it is not stored in
# the notebook source or output.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and revoked.
from getpass import getpass

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY") or getpass("Enter your Gemini API key: ")
if not GOOGLE_API_KEY:
    raise ValueError("A Gemini API key is required (set GOOGLE_API_KEY or enter it when prompted).")
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
genai.configure(api_key=GOOGLE_API_KEY)

# ✅ Step 4: Helper - Extract YouTube Video ID from various formats
def extract_video_id(url):
    """Return the video ID embedded in a YouTube URL, or ``None``.

    The patterns are searched for anywhere in the string, so the scheme and
    any sub-domain prefix are optional. Recognised shapes:

    * ``youtube.com/watch?v=<id>`` - ``v`` may appear anywhere in the query
    * ``youtube.com/shorts/<id>``, ``youtube.com/embed/<id>``,
      ``youtube.com/live/<id>``
    * ``youtu.be/<id>``

    Args:
        url: URL text as typed or pasted by the user.

    Returns:
        The ID without trailing query parameters, ``#fragment``, slashes or
        whitespace; ``None`` when ``url`` is not a string or nothing matches.
    """
    if not isinstance(url, str):
        return None

    # Each capture group stops at the first delimiter so "&t=30", "#t=30",
    # a trailing "/" or pasted whitespace never leak into the returned ID.
    patterns = (
        # "(?:[^#\s]*&)?" skips any query parameters that precede "v=".
        r"youtube\.com/watch\?(?:[^#\s]*&)?v=([^&#/\s]+)",
        r"youtube\.com/(?:shorts|embed|live)/([^?&#/\s]+)",
        r"youtu\.be/([^?&#/\s]+)",
    )
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

# ✅ Step 5: Fetch YouTube Transcript using video ID
def get_transcript(youtube_url):
    """Fetch a video's transcript and flatten it into a single string.

    Args:
        youtube_url: YouTube URL; its video ID is obtained with
            ``extract_video_id``.

    Returns:
        * the caption texts joined by single spaces on success;
        * ``None`` when no video ID can be extracted, when the library raises
          ``TranscriptsDisabled`` or ``NoTranscriptFound``, or when the
          transcript contains no text;
        * a message starting with "❌" for any other failure - callers detect
          this case with ``startswith("❌")``.
    """
    try:
        video_id = extract_video_id(youtube_url)
        if not video_id:
            return None

        # Newer youtube-transcript-api releases replace the static
        # get_transcript() with an instance-level fetch(); use whichever the
        # installed version provides, since the install is not pinned.
        legacy_fetch = getattr(YouTubeTranscriptApi, "get_transcript", None)
        if legacy_fetch is not None:
            entries = legacy_fetch(video_id)
        else:
            entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

        full_text = " ".join(entry["text"] for entry in entries).strip()
        # A transcript made only of whitespace gives nothing to summarize.
        return full_text or None
    except (TranscriptsDisabled, NoTranscriptFound):
        return None
    except Exception as e:
        # Deliberate best-effort: surface the failure as a "❌" message
        # instead of raising, because callers display it to the user.
        return f"❌ Error while fetching transcript: {e}"

# ✅ Step 6: Split transcript into documents
def split_text_into_docs(text):
    """Turn the transcript ``text`` into a list of LangChain documents.

    The splitter is a ``RecursiveCharacterTextSplitter`` created with
    ``chunk_size=1000`` and ``chunk_overlap=100``; the result of its
    ``create_documents`` call on the one-element list ``[text]`` is returned.
    """
    chunker = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=1000)
    return chunker.create_documents([text])

# ✅ Step 7: Build RAG chain with LangChain + Gemini
def build_rag_chain(docs, model_name="models/gemini-1.5-pro-latest", temperature=0.2):
    """Build a RetrievalQA chain over ``docs`` answered by a Gemini chat model.

    The default model matches the Gemini 1.5 model that this app's UI
    description advertises, replacing the older "gemini-pro" identifier.

    Args:
        docs: Documents to embed and index in an in-memory FAISS store.
        model_name: Chat model identifier passed to ``ChatGoogleGenerativeAI``.
            Exposed as a parameter so the model can be changed without
            editing this function.
        temperature: Temperature passed to the chat model.

    Returns:
        A ``RetrievalQA`` chain of type "stuff" that does not return source
        documents.
    """
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    vectorstore = FAISS.from_documents(docs, embeddings)
    # NOTE(review): the retriever is created with its default search
    # settings, so presumably only the best-matching chunks reach the LLM and
    # a long transcript may be summarized only in part - confirm.
    retriever = vectorstore.as_retriever()

    llm = ChatGoogleGenerativeAI(model=model_name, temperature=temperature)

    return RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=False,
    )

# ✅ Step 8: Main function for summarization
def summarize_video(youtube_url):
    """Summarize a YouTube video from its transcript (Gradio callback).

    Args:
        youtube_url: URL of the video to summarize.

    Returns:
        The summary text produced by the RAG chain, or a message starting
        with "❌" when the transcript is unavailable or could not be fetched.
    """
    transcript = get_transcript(youtube_url)
    if not transcript:
        return "❌ Transcript not available or invalid YouTube URL."
    if transcript.startswith("❌"):
        # get_transcript reports unexpected failures as a "❌" message.
        return transcript

    docs = split_text_into_docs(transcript)
    if not docs:
        # Nothing to index (e.g. a whitespace-only transcript); building the
        # vector store from an empty list would fail.
        return "❌ Transcript not available or invalid YouTube URL."
    rag_chain = build_rag_chain(docs)

    question = "Give a concise summary of this video."
    # Chain.run() is deprecated in LangChain. invoke() takes the chain's input
    # mapping ("query" for RetrievalQA) and returns a mapping whose "result"
    # entry holds the answer text.
    response = rag_chain.invoke({"query": question})
    return response["result"]

# ✅ Step 9: Gradio UI for the app
gr.Interface(
    # Page text and look.
    title="🎬 YouTube Video Summarizer using Gemini + RAG",
    description="Summarize any YouTube video using LangChain RAG and Gemini 1.5. Works best for videos with transcripts.",
    theme="soft",
    # One text box in (the video URL), one text box out (the summary),
    # connected by summarize_video.
    fn=summarize_video,
    inputs=gr.Textbox(label="Enter YouTube Video URL"),
    outputs=gr.Textbox(label="Video Summary"),
).launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://11f9077eab9f9f6d3a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


