In [1]:
pip install nltk youtube-transcript-api transformers torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import re
import heapq
import nltk
import torch
from collections import defaultdict
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5Tokenizer, T5ForConditionalGeneration

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
def extract_video_id(youtube_url):
    match = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11}).*", youtube_url)
    return match.group(1) if match else None

# Fetch transcript from YouTube video
def get_youtube_transcript(youtube_url):
    video_id = extract_video_id(youtube_url)
    if not video_id:
        return "Invalid YouTube URL"
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join([t['text'] for t in transcript])
    except Exception as e:
        return f"Error: {e}"

In [5]:
def textrank_summarize(text, summary_length=1):
    sentences = sent_tokenize(text)
    stop_words = set(stopwords.words("english"))
    word_frequencies = defaultdict(int)
    
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word not in stop_words and word.isalnum():
                word_frequencies[word] += 1
    
    max_freq = max(word_frequencies.values(), default=1)
    for word in word_frequencies:
        word_frequencies[word] /= max_freq
    
    sentence_scores = defaultdict(int)
    for sentence in sentences:
        for word in word_tokenize(sentence.lower()):
            if word in word_frequencies:
                sentence_scores[sentence] += word_frequencies[word]
    
    summary_sentences = heapq.nlargest(summary_length, sentence_scores, key=sentence_scores.get)
    return " ".join(summary_sentences)


In [6]:
def summarize_text(tokenizer, model, text):
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [7]:
pegasus_checkpoint = "google/pegasus-large"
bart_checkpoint = "sshleifer/distilbart-cnn-12-6"
pegasus_tokenizer = AutoTokenizer.from_pretrained(pegasus_checkpoint)
pegasus_model = AutoModelForSeq2SeqLM.from_pretrained(pegasus_checkpoint)
bart_tokenizer = AutoTokenizer.from_pretrained(bart_checkpoint)
bart_model = AutoModelForSeq2SeqLM.from_pretrained(bart_checkpoint)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def chunk_text(text, max_tokens=450):
    words = text.split()
    chunks, current_chunk = [], []
    
    for word in words:
        current_chunk.append(word)
        if len(T5Tokenizer.from_pretrained("t5-small").encode(" ".join(current_chunk), add_special_tokens=False)) > max_tokens:
            current_chunk.pop()
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
    
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


In [9]:
def t5_summarize(text, max_length=100):
    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    model = T5ForConditionalGeneration.from_pretrained("t5-small").to("cuda" if torch.cuda.is_available() else "cpu")
    
    chunks = chunk_text(text, max_tokens=450)
    summaries = []
    
    for chunk in chunks:
        input_text = "summarize: " + chunk
        input_ids = tokenizer.encode(input_text, return_tensors="pt", truncation=True, legacy= False).to(model.device)
        summary_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
    
    return " ".join(summaries)

In [10]:
youtube_url = "https://www.youtube.com/watch?v=_T73DugW_D8"
transcript = get_youtube_transcript(youtube_url)

if not transcript.startswith("Error"):
    print("\n🔹 NLTK TextRank Summary:")
    print(textrank_summarize(transcript))
    
    print("\n🔹 Pegasus Summary:")
    print(summarize_text(pegasus_tokenizer, pegasus_model, transcript))
    
    print("\n🔹 BART Summary:")
    print(summarize_text(bart_tokenizer, bart_model, transcript))
    
    print("\n🔹 T5 Summary:")
    print(t5_summarize(transcript))
else:
    print(transcript)



🔹 NLTK TextRank Summary:
mossdale cave incident it was June 24th 1967 and 10 cavers were about to enter mossdale Cavern to help clear some of the deeper passages and map them out the two leaders of the group had already explored this cave before which reassured everyone else this particular Cavern was known for its tight spaces and the very few areas where a person could fully stand after a few hours inside four of the cavers realized how difficult the cave actually was and decided to return to the surface they noticed a cloudy sky but nothing seemed out of the ordinary the remaining cavers continued their descent until they reached the Far Marathon passage which measured only 18 in x 12 in with a small stream of water flowing along the bottom everything seemed to be going well and the group on the surface soon noticed that it had started raining so they took cover initially the rain did not seem concerning but the cavers inside began to notice Rising water levels when the water becam

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Keyword arguments {'legacy': False} not recognized.
Keyword arguments {'legacy': False} not recognized.
Keyword arguments {'legacy': False} not recognized.
Keyword arguments {'legacy': False} not recognized.
Keyword arguments {'legacy': False} not recognized.


10 cavers were about to enter mossdale Cavern to help clear some of the deeper passages and map them out. the cave was known for its tight spaces and the very few areas where a person could fully stand after a few hours inside four of the cavers realized how difficult the cave actually was. the surface group noticed that it had started raining so they decided to end the day's work and head back even though they were still 3 hours four divers pled to go to the western section of the cave aiming for a depth of 250 ft which would have been a new record. in 1973 divers were still using regular air cylinders rather than mixed gas tanks for this reason they had to be more cautious as they went downwards because nitrogen Narcosis might have settled in as they went down the pressure would have pushed the nitrogen into the bloodstream producing a state of consciousness. cave diving was just 2 ft in diam and 2 years prior two people died in it because they managed to get through a rock slide fil