In [4]:
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

import nltk
nltk.download('stopwords')


# Function to fetch transcript from YouTube video URL
def fetch_transcript(video_url):
    video_id = video_url.split("=")[-1]
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = ' '.join([t['text'] for t in transcript])
    return text

# Function to preprocess text
def preprocess_text(text):
    sentences = sent_tokenize(text)
    cleaned_sentences = [sentence.lower() for sentence in sentences]
    return cleaned_sentences

# Function to calculate sentence similarity using cosine similarity
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
 
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
 
    all_words = list(set(sent1 + sent2))
 
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
 
    # build the vector for the first sentence
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
 
    # build the vector for the second sentence
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
 
    return 1 - cosine_distance(vector1, vector2)

# Function to create similarity matrix
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2: # ignore if both are same sentences
                continue 
            similarity_matrix[idx1][idx2] = sentence_similarity(sentences[idx1], sentences[idx2], stop_words)

    return similarity_matrix

# Function to generate summary
# Function to generate summary
def generate_summary(text, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []

    # Step 1 - Read text and tokenize
    sentences = preprocess_text(text)

    # Step 2 - Generate similarity matrix across sentences
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)

    # Step 3 - Rank sentences in similarity matrix
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # Step 4 - Sort the rank and pick top sentences
    ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)    

    # Check if there are enough sentences to summarize
    if len(ranked_sentence) < top_n:
        top_n = len(ranked_sentence)

    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))

    # Step 5 - Output the summarized text
    return ". ".join(summarize_text)

# Example usage
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=FkQWpQd9Zdo"
    text = fetch_transcript(video_url)
    summary = generate_summary(text)
    print(summary)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shruthika/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


t h i s   i s   e a r t h   t h i s   i s   w h e r e   y o u   l i v e   a n d   t h i s   i s   w h e r e   y o u   l i v e   i n   y o u r   n e i g h b o r h o o d   t h e   s o l a r   s y s t e m   [ a p p l a u s e ]   h e r e ' s   t h e   d i s t a n c e   b e t w e e n   t h e   e a r t h   a n d   t h e   m o o n   d o e s n ' t   l o o k   t o o   f a r   d o e s   i t   t h i n k   a g a i n   a t   t h e i r   f a r t h e s t   p o i n t   t h e   e a r t h   a n d   t h e   m o o n   a r e   t w o   h u n d r e d   a n d   f i f t y   t w o   t h o u s a n d   e i g h t y   e i g h t   m i l e s   a w a y   i n s i d e   t h a t   d i s t a n c e   y o u   c o u l d   f i t   e v e r y   p l a n e t   i n   o u r   e n t i r e   s o l a r   s y s t e m   b u t   l e t ' s   t a l k   a b o u t   p l a n e t s   t h e   g r e a t   r e d   s p o t   o n   j u p i t e r   t h a t ' s   a b o u t   t w o   t i m e s   a s   b i g   a s   e a r t h   a n d   s a t u r n   i 