In [2]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
import re
from nltk.corpus import stopwords
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [3]:
link = "https://www.youtube.com/watch?v=Y8Tko2YC5hA" 
# link = "https://www.youtube.com/watch?v=iOdFUJiB0Zc"
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)  
subtitle = " ".join([x['text'] for x in sub])

In [4]:
subtitle

'In this video, I\'m going to answer the top 3 questions my students ask me about Python. What is Python? What  can you do with it? And why is it so popular? In other words, what does it do that other programming languages don\'t? Python is the  world\'s fastest growing and most popular programming language, not just  amongst software engineers, but also amongst mathematicians,  data analysts, scientists, accountants, networking engineers, and even kids! Because it\'s a very beginner friendly programming  language. So people from different disciplines use Python for a variety of different tasks, such as data analysis and visualization,  artificial intelligence and machine learning, automation  in fact this is one of the big uses of Python amongst people who are not software developers. If you constantly have to do boring, repetitive  tasks, such as copying files and folders around, renaming them,  uploading them to a server, you can easily write a Python script to automate all that and

In [5]:
from nltk.tokenize import sent_tokenize

In [6]:
subtitle = subtitle.replace("\n","")
sentences = sent_tokenize(subtitle)

In [7]:
sentences

["In this video, I'm going to answer the top 3 questions my students ask me about Python.",
 'What is Python?',
 'What  can you do with it?',
 'And why is it so popular?',
 "In other words, what does it do that other programming languages don't?",
 "Python is the  world's fastest growing and most popular programming language, not just  amongst software engineers, but also amongst mathematicians,  data analysts, scientists, accountants, networking engineers, and even kids!",
 "Because it's a very beginner friendly programming  language.",
 'So people from different disciplines use Python for a variety of different tasks, such as data analysis and visualization,  artificial intelligence and machine learning, automation  in fact this is one of the big uses of Python amongst people who are not software developers.',
 'If you constantly have to do boring, repetitive  tasks, such as copying files and folders around, renaming them,  uploading them to a server, you can easily write a Python sc

In [39]:
organized_sent = {k:v for v,k in enumerate(sentences)}

In [40]:
# tf_idf = TfidfVectorizer(min_df=2, 
#                                     strip_accents='unicode',
#                                     max_features=None,
#                                     lowercase = True,
#                                     token_pattern=r'w{1,}',
#                                     ngram_range=(1, 3), 
#                                     use_idf=1,
#                                     smooth_idf=1,
#                                     sublinear_tf=1,
#                                     stop_words = 'english')
tf_idf = TfidfVectorizer(min_df=2, 
                         strip_accents='unicode',
                         max_features=None,
                         lowercase=True,
                         token_pattern=r'\w{1,}',
                         ngram_range=(1, 3), 
                         use_idf=True,
                         smooth_idf=True,  # Changed to True
                         sublinear_tf=True,
                         stop_words='english')

In [21]:
sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()

In [22]:
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]

In [None]:
# mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence,organized_sent[sentence]) for sentence in top_n_sentences]
# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key = lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]
# joining the ordered sentence
summary = " ".join(ordered_sentences)
summary

# Using Transformers

In [8]:
# import transformers
# from transformers import BartTokenizer, BartForConditionalGeneration
# # Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("VortexKnight7/Video-Summ")
model = AutoModelForCausalLM.from_pretrained("VortexKnight7/Video-Summ")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards: 100%|██████████| 2/2 [28:32<00:00, 856.28s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [00:51<00:00, 25.89s/it]


In [45]:
# tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
# model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

In [9]:
input_tensor = tokenizer.encode( subtitle, return_tensors="pt", max_length=512)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
outputs_tensor = model.generate(input_tensor,max_new_tokens =150, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
# outputs_tensor = model.generate(input_tensor,max_new_tokens =150, max_length=160, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
outputs_tensor

In [None]:
print(tokenizer.decode(outputs_tensor[0]))

# Using Pipeline

In [30]:
from transformers import pipeline

In [None]:
summarizer = pipeline('summarization', model="facebook/bart-large-cnn")
pipeline_summary = summarizer(subtitle, max_length=180, min_length=30, truncation=True)[0]['summary_text']
print("\nPipeline Summary:\n", pipeline_summary)

In [None]:
# summary = summarizer(subtitle, max_length = 180, min_length =  30)

In [None]:
import youtube_transcript_api
from youtube_transcript_api import YouTubeTranscriptApi
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline

# Fetch the transcript
link = "https://www.youtube.com/watch?v=Y8Tko2YC5hA" 
unique_id = link.split("=")[-1]
sub = YouTubeTranscriptApi.get_transcript(unique_id)
subtitle = " ".join([x['text'] for x in sub])

# Sentence tokenization
subtitle = subtitle.replace("\n", "")
sentences = sent_tokenize(subtitle)
organized_sent = {k: v for v, k in enumerate(sentences)}

# TF-IDF Vectorization
tf_idf = TfidfVectorizer(min_df=2, 
                         strip_accents='unicode',
                         max_features=None,
                         lowercase=True,
                         token_pattern=r'\w{1,}',
                         ngram_range=(1, 3), 
                         use_idf=True,
                         smooth_idf=True,  # Changed to True
                         sublinear_tf=True,
                         stop_words='english')

sentence_vectors = tf_idf.fit_transform(sentences)
sent_scores = np.array(sentence_vectors.sum(axis=1)).ravel()

# Top-N sentences
N = 3
top_n_sentences = [sentences[index] for index in np.argsort(sent_scores, axis=0)[::-1][:N]]

# Mapping the scored sentences with their indexes as in the subtitle
mapped_sentences = [(sentence, organized_sent[sentence]) for sentence in top_n_sentences]

# Ordering the top-n sentences in their original order
mapped_sentences = sorted(mapped_sentences, key=lambda x: x[1])
ordered_sentences = [element[0] for element in mapped_sentences]

# Joining the ordered sentences to form the summary
summary = " ".join(ordered_sentences)
print("TF-IDF Summary:\n", summary)

# BART Model Summarization
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')

# Encoding and generating summary
input_tensor = tokenizer.encode(subtitle, return_tensors="pt", max_length=1024, truncation=True)
outputs_tensor = model.generate(input_tensor, max_length=160, min_length=120, length_penalty=2.0, num_beams=4, early_stopping=True)
bart_summary = tokenizer.decode(outputs_tensor[0], skip_special_tokens=True)
print("\nBART Model Summary:\n", bart_summary)

# Using the Hugging Face pipeline for summarization
summarizer = pipeline('summarization', model="facebook/bart-large-cnn")
pipeline_summary = summarizer(subtitle, max_length=180, min_length=30, truncation=True)[0]['summary_text']
print("\nPipeline Summary:\n", pipeline_summary)