In [1]:
### All imports

import os
import re
import subprocess
import sys
import tkinter.font as tkFont
from tkinter import *
from tkinter import filedialog
from tkinter import messagebox
from urllib.parse import parse_qs, urlparse

import nltk
import numpy as np
import sklearn
import youtube_transcript_api
from gensim.summarization import summarize
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from youtube_transcript_api import YouTubeTranscriptApi

import warnings
warnings.filterwarnings("ignore")

In [2]:
def _extract_video_id(url):
    """Return the video id contained in a YouTube URL, or "" if none is found."""
    raw = url.strip()
    # urlparse only recognises the host when a scheme is present.
    candidate = raw if "://" in raw else "https://" + raw
    parsed = urlparse(candidate)
    host = (parsed.hostname or "").lower()

    # Short links: https://youtu.be/<id>?t=5
    if host == "youtu.be" or host.endswith(".youtu.be"):
        return parsed.path.lstrip("/").split("/")[0]

    # Standard links: https://www.youtube.com/watch?v=<id>&t=10s
    from_query = parse_qs(parsed.query).get("v")
    if from_query:
        return from_query[0]

    # Path based links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    parts = [part for part in parsed.path.split("/") if part]
    if len(parts) >= 2 and parts[0] in ("embed", "shorts", "live", "v"):
        return parts[1]

    # Legacy behaviour: everything after the first "=".
    return "=".join(raw.split("=")[1:])


def get_transcript(url):
    """
    PARAMETER
    url: Valid YouTube link with valid transcript. Accepts watch?v=, youtu.be,
         /embed/, /shorts/ and /live/ style links; extra query parameters
         such as "&t=10s" are ignored.
    
    RETURN
    transcript: String of whole video transcript
    sentences: Transcript divided as list of sentence tokens.
    
    On failure (no video id in the URL, transcript unavailable, tokenizer
    error) a message is printed and ("", []) is returned, so callers can
    detect the problem with a simple length check instead of an exception.
    
    SIDE EFFECT
    Stores the extracted video id in the module-level name `unique_id`.
    
    """
    global unique_id
    unique_id = _extract_video_id(url)
    if not unique_id:
        print("Try with a valid YouTube URL")
        return "", []

    try:
        sub = YouTubeTranscriptApi.get_transcript(unique_id)
        transcript = " ".join(x['text'] for x in sub)

        # Caption entries may contain line breaks in the middle of a phrase;
        # replace them with a space so the neighbouring words stay separated.
        transcript = transcript.replace("\n", " ")
        sentences = sent_tokenize(transcript)
    except Exception as error:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate.
        print("Try with a valid YouTube URL")
        print("Reason: {}".format(error))
        return "", []
    return transcript, sentences

In [3]:
def tf_idf_based_summary(sentences, fraction):
    """
    PARAMETER
    sentences: Transcript divided as list of sentence tokens.
    fraction: Decimal value of desired length of the summary.
    
    RETURN
    tf_idf_summary: Summary generated based on the Tf-Idf method. The
                    highest scoring sentences are kept and joined in their
                    original transcript order. An empty string is returned
                    when `sentences` is empty.
    
    RAISES
    ValueError: propagated from TfidfVectorizer when no term survives the
                min_df / stop-word filtering.
    
    """
    if not sentences:
        return ""

    tf_idf = TfidfVectorizer(min_df=2,
                             strip_accents='unicode',
                             max_features=None,
                             lowercase=True,
                             # \w{1,} = one or more word characters. Without
                             # the backslash the pattern only matches runs of
                             # the literal letter "w".
                             token_pattern=r'\w{1,}',
                             ngram_range=(1, 3),
                             use_idf=True,
                             smooth_idf=True,
                             sublinear_tf=True,
                             stop_words='english')
    sentence_vectors = tf_idf.fit_transform(sentences)

    # Score of a sentence = sum of the Tf-Idf weights of its terms.
    sent_scores = np.asarray(sentence_vectors.sum(axis=1)).ravel()

    # Number of sentences to keep, derived from the "fraction" decimal value
    # and clamped so an out-of-range fraction cannot produce a negative
    # slice bound.
    num_sent = int(np.ceil(len(sentences) * fraction))
    num_sent = max(0, min(num_sent, len(sentences)))

    top_indices = np.argsort(sent_scores, axis=0)[::-1][:num_sent]

    # Sort the selected positions (not the sentence text) so that repeated
    # sentences keep their own place in the transcript.
    ordered_sentences = [sentences[index] for index in sorted(top_indices)]

    tf_idf_summary = " ".join(ordered_sentences)

    return tf_idf_summary

In [4]:
def transcript_summary(transcript, sentences, option, fraction):
    """
    Dispatch to the summarisation method selected by `option`.
    
    PARAMETERS
    transcript: String of whole video transcript
    sentences: Transcript divided as list of sentence tokens.
    option: Method to be used for the summary generation
            ("TfIdf" or "Gensim").
    fraction: Decimal value of desired length of the summary.
    
    RETURN
    Summary generated based on the selected method and fraction.
    None when there are fewer than two sentences (a message is printed)
    or when `option` is not one of the known methods.
    
    """
    # Guard clause: a summary needs at least two sentences to choose from.
    if len(sentences) <= 1:
        print("Transcript invalid")
        return None

    if option == "TfIdf":
        return tf_idf_based_summary(sentences, fraction)
    elif option == "Gensim":
        condensed = summarize(text=transcript, ratio=fraction, split=False)
        return condensed.replace("\n", " ")

    return None

In [5]:
### GUI BLOCK
# Build the fixed-size main window. The heading is laid out with grid();
# every other widget is positioned with place() at absolute coordinates.
BG_COLOR = '#F0F0F8'  # background shared by the window and all labels

root = Tk(baseName="YouTube Video Summarizer")
root.title("Transcript Based Video Summarizer")
root.configure(background=BG_COLOR)
root.geometry("700x500+500+400")
root.resizable(0, 0)

# Create the label first and lay it out on a separate line: grid() returns
# None, so chaining it would bind `title` to None instead of the widget.
title = Label(root, text="YouTube Video Summarizer", font="bold 26",
              bg=BG_COLOR, padx=140, pady=10)
title.grid(row=0, column=0)

# Field captions (left column).
url_label = Label(root, text="YouTube URL:", font="bold",
                  bg=BG_COLOR, justify="right", bd=1)
url_label.place(height=50, x=40, y=70)

model_label = Label(root, text="Method:", font="bold",
                    bg=BG_COLOR, justify="right", bd=1)
model_label.place(height=50, x=85, y=135)

fraction_label = Label(root, text="Fraction:", font="bold",
                       bg=BG_COLOR, justify="right", bd=1)
fraction_label.place(height=50, x=80, y=210)

folder_label = Label(root, text="Destination:", font="bold",
                     bg=BG_COLOR, justify="right", bd=1)
folder_label.place(height=50, x=60, y=280)

# Input widgets (right column).
get_url = Entry(root, width=40)
get_url.place(width=300, height=30, x=150, y=80)

options = ["TfIdf", "Gensim"]  # summarisation methods offered in the drop-down

default_option = StringVar(root)
default_option.set(options[0])
drop = OptionMenu(root, default_option, *options)
drop.place(width=200, x=150, y=145)

get_fraction = Entry(root, width=40)
get_fraction.place(width=300, height=30, x=150, y=220)

get_folder = Entry(root, width=40)
get_folder.place(width=300, height=30, x=150, y=290)

folder = StringVar(root)


def browse():
    """
    Ask the user for a destination directory and show it in the
    "Destination" entry.

    The previous content of the entry is replaced (not prepended to), and
    cancelling the dialog leaves both the entry and the module-level
    `folder` untouched.
    """
    global folder
    selected = filedialog.askdirectory(initialdir='/')
    if not selected:
        # The dialog returns an empty value when it is cancelled.
        return
    folder = selected
    get_folder.delete(0, END)
    get_folder.insert(0, folder)


# Use a separate name for the widget so the `browse` callback above is not
# shadowed by the button object.
browse_button = Button(root, text="Browse", command=browse)
browse_button.place(height=30, x=475, y=290)


def on_clear():
    """Reset the form: restore the first method option and empty every text entry."""
    default_option.set(options[0])
    for entry in (get_url, get_folder, get_fraction):
        entry.delete(0, END)


# "Clear" button wired to the reset callback above.
clear = Button(root, command=on_clear, text="Clear")
clear.place(x=240, y=350, width=50)

openpath = None  # the single "Open Folder" button, created after the first successful run


def _open_folder(path):
    """Open `path` in the platform's file manager."""
    if hasattr(os, "startfile"):
        # os.startfile only exists on Windows.
        os.startfile(path)
    elif sys.platform == "darwin":
        subprocess.Popen(["open", path])
    else:
        subprocess.Popen(["xdg-open", path])


def on_submit():
    """
    Read the form, fetch the transcript and write two files into the
    destination folder: "transcript.txt" (full transcript) and
    "<video id> <method>.txt" (summary).

    Invalid input (non-numeric or out-of-range fraction, missing folder,
    unusable transcript, failed summary) is reported through an error
    dialog and nothing further is written. Files are addressed by full
    path, so the process working directory is never changed.

    SIDE EFFECT
    Updates the module-level names url, choice, fraction, current and
    folder, and creates the "Open Folder" button on first success.
    """
    global url, choice, fraction, current, folder, openpath
    url = get_url.get().strip()
    choice = default_option.get()
    current = os.getcwd()  # kept for compatibility; the cwd is no longer changed
    folder = get_folder.get().strip()

    try:
        fraction = float(get_fraction.get())
    except ValueError:
        messagebox.showerror("Warning!", "Fraction must be a number between 0 and 1")
        return
    if not 0 < fraction <= 1:
        messagebox.showerror("Warning!", "Fraction must be a number between 0 and 1")
        return
    if not os.path.isdir(folder):
        messagebox.showerror("Warning!", "Destination folder does not exist")
        return

    print(url,choice,fraction,folder)

    # Treat any failure while fetching the transcript like an empty one so
    # the user gets the dialog below instead of a traceback on the console.
    try:
        transcript, sentences = get_transcript(url)
    except Exception:
        transcript, sentences = "", []
    if len(sentences) <= 1:
        messagebox.showerror("Warning!", "Transcript invalid")
        return

    # Explicit UTF-8: transcripts routinely contain characters that the
    # platform default encoding cannot represent.
    with open(os.path.join(folder, "transcript.txt"), 'w', encoding='utf-8') as c:
        print(transcript,file=c)

    try:
        summary = transcript_summary(transcript, sentences, choice, fraction)
    except Exception as error:
        messagebox.showerror("Warning!", "Could not generate summary: {}".format(error))
        return
    if summary is None:
        messagebox.showerror("Warning!", "Could not generate summary")
        return

    filename = unique_id+" "+choice+'.txt'
    # Replace every character that is not allowed in a Windows file name,
    # including backslash and double quote.
    filename = re.sub(r'[\\/:*?"<>|]', ' ', filename)
    with open(os.path.join(folder, filename), 'w', encoding='utf-8') as f:
        print(summary, file=f)

    # The button reads the entry when clicked, so one instance is enough;
    # creating a new one per submit would stack widgets on top of each other.
    if openpath is None:
        openpath = Button(root, text="Open Folder",
                          command=lambda: _open_folder(get_folder.get()))
        openpath.place(x=360, y=350)

submit = Button(root, text="Submit", command=on_submit)
submit.place(width=50, x=300, y=350)

# Hand control to the Tk event loop so the window reacts to clicks and typing.
# NOTE(review): this call presumably keeps the notebook cell busy until the
# window is closed - confirm when running inside Jupyter.
root.mainloop()