<a href="https://colab.research.google.com/github/Sundar0207/SDC/blob/main/YoutubeSummarization2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ STEP 1: Install Required Packages
!pip install youtube-transcript-api langchain faiss-cpu sentence-transformers transformers accelerate

# ✅ STEP 2: Import Libraries
import re
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# ✅ STEP 3: Extract YouTube Transcript
def get_transcript(video_url):
    """Fetch a YouTube video's transcript and return it as a single string.

    Args:
        video_url: A YouTube URL that contains the 11-character video ID,
            either after ``v=`` or directly after a ``/`` (as in
            ``youtu.be/<id>``).

    Returns:
        The text of every transcript segment, joined with single spaces.

    Raises:
        ValueError: If no 11-character video ID can be found in ``video_url``.
    """
    # The trailing negative lookahead requires the ID to END after exactly 11
    # characters; without it, any longer path segment (e.g. a 12+ character
    # channel or page name) would be truncated into a bogus "video ID".
    match = re.search(
        r"(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])", video_url
    )
    if not match:
        raise ValueError("Could not extract video ID from URL")
    video_id = match.group(1)
    print(f"Extracted Video ID: {video_id}")

    # Older youtube-transcript-api releases expose a static get_transcript()
    # returning a list of dicts; newer releases dropped it in favour of an
    # instance-level fetch() whose snippets carry a ``.text`` attribute.
    # Support both so an unpinned ``pip install`` keeps working.
    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(segment["text"] for segment in segments)

    fetched = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(snippet.text for snippet in fetched)

# ✅ STEP 4: Chunk Text
def chunk_text(text, chunk_size=1000, overlap=100):
    """Split ``text`` into overlapping pieces.

    Args:
        text: The string to split.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap: Passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_text`` returns for
        ``text`` (the pipeline treats it as a list of strings).
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_text(text)

# ✅ STEP 5: Create FAISS Index with SentenceTransformer Embeddings
# Loaded once at module level: both create_faiss_index() and
# retrieve_relevant_chunks() read this global, so chunks and queries are
# encoded by the same model instance.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

def create_faiss_index(chunks):
    """Embed ``chunks`` and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty sequence of text chunks to embed.

    Returns:
        A ``(index, embeddings)`` tuple: the populated ``faiss.IndexFlatL2``
        and the embedding array produced by the module-level
        ``embedding_model``.

    Raises:
        ValueError: If ``chunks`` is empty.
    """
    # Fail early with a clear message: with no chunks there is no embedding
    # dimension to read from ``embeddings.shape[1]`` below.
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks")

    embeddings = embedding_model.encode(chunks)
    # The index dimensionality is taken from the embedding width.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings

# ✅ STEP 6: Load Free LLM from Hugging Face
def load_llm():
    """Build a Hugging Face summarization pipeline for ``google/flan-t5-base``.

    Returns:
        The object returned by ``transformers.pipeline("summarization", ...)``
        configured with the checkpoint's tokenizer and seq2seq model.
    """
    checkpoint = "google/flan-t5-base"
    # Keyword arguments are evaluated left to right, so the tokenizer is still
    # loaded before the model weights.
    return pipeline(
        "summarization",
        tokenizer=AutoTokenizer.from_pretrained(checkpoint),
        model=AutoModelForSeq2SeqLM.from_pretrained(checkpoint),
    )

# ✅ STEP 7: Retrieve Relevant Chunks
def retrieve_relevant_chunks(query, chunks, embeddings, index, top_k=5):
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Text to embed and search for.
        chunks: The text chunks, in the same order they were added to ``index``.
        embeddings: Unused; kept so existing callers keep working.
        index: A FAISS index built from the embeddings of ``chunks``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks in the order returned by ``index.search``.
        Returns an empty list when ``chunks`` is empty or ``top_k`` is not
        positive.
    """
    # Never ask for more neighbours than there are chunks: FAISS pads missing
    # results with the id -1, which ``chunks[-1]`` would silently turn into
    # repeated copies of the LAST chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_vec = embedding_model.encode([query])
    _, neighbor_ids = index.search(query_vec, k)
    # Defensive filter in case the index holds fewer vectors than ``chunks``.
    return [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]

# ✅ STEP 8: Summarize Chunks
def summarize_chunks(chunks, summarizer, max_chars=1000):
    """Join ``chunks`` and summarize the (truncated) result.

    Args:
        chunks: Iterable of text chunks to summarize together.
        summarizer: Callable invoked as
            ``summarizer(text, max_length=150, min_length=50, do_sample=False)``
            and returning a list whose first item has a ``'summary_text'`` key.
        max_chars: Character budget for the joined text; anything beyond it is
            dropped before summarizing. Defaults to the previous hard-coded
            limit of 1000.

    Returns:
        The summary text, or an empty string when there is nothing to
        summarize (no chunks, or only whitespace).
    """
    # Character cap to keep the input small enough for the model; slicing is a
    # no-op when the text is already shorter.
    text = " ".join(chunks)[:max_chars]
    # Don't invoke the model on empty input: there is nothing to summarize.
    if not text.strip():
        return ""
    summary = summarizer(text, max_length=150, min_length=50, do_sample=False)
    return summary[0]['summary_text']

# ✅ STEP 9: Full Pipeline
def summarize_youtube_video_free_llm(video_url):
    """Run the whole pipeline for one video and return its summary.

    Steps, each announced with a progress message: fetch the transcript,
    split it into chunks, index the chunk embeddings, load the summarizer,
    retrieve the chunks closest to the fixed query "Summarize the video",
    and summarize those chunks.

    Args:
        video_url: URL of the YouTube video to summarize.

    Returns:
        The value returned by ``summarize_chunks`` for the retrieved chunks.
    """
    print("🔍 Getting transcript...")
    full_text = get_transcript(video_url)

    print("✂️ Chunking...")
    pieces = chunk_text(full_text)

    print("📦 Creating FAISS index...")
    faiss_index, vectors = create_faiss_index(pieces)

    print("⚙️ Loading LLM...")
    llm = load_llm()

    print("🔎 Retrieving relevant chunks...")
    best_matches = retrieve_relevant_chunks(
        "Summarize the video", pieces, vectors, faiss_index
    )

    print("📝 Generating summary...")
    final_summary = summarize_chunks(best_matches, llm)
    return final_summary

# ✅ STEP 10: Run the Summarizer
# Example video to summarize; get_transcript() extracts the 11-character ID
# that follows the last "/" of this short link.
video_url = "https://youtu.be/ENrzD9HAZK4?si=Kgq__9irkwVjAnvc"
# Runs the full pipeline at cell execution time (network access and model
# download happen here).
summary = summarize_youtube_video_free_llm(video_url)
print("\n📄 Summary:\n", summary)


🔍 Getting transcript...
Extracted Video ID: ENrzD9HAZK4
✂️ Chunking...
📦 Creating FAISS index...
⚙️ Loading LLM...


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu


🔎 Retrieving relevant chunks...
📝 Generating summary...

📄 Summary:
 if this video helped you please like And subscribe and if you really want to dive deep into node.js and express consider becoming a pro member at fireship iio I have a whole bunch of advanced content covering real world use cases with these Technologies thanks for watching and I will see you in the next one [Music]
