In [8]:
from transformers import pipeline
from transformers import BartTokenizer, BartForConditionalGeneration
from youtube_transcript_api import YouTubeTranscriptApi
from tqdm.auto import tqdm
import gradio as gr
import torch

import warnings
warnings.filterwarnings("ignore")

In [9]:
# Display the installed Gradio version as the cell's output.
gr.__version__

'4.38.1'

MVP concept

<img src="figure/figure2.png" alt="drawing" width="1000"/>

In [2]:
# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cpu" if not torch.cuda.is_available() else "cuda"

# Load the tokenizer and the BART summarization checkpoint, then move the model to `device`.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
model = model.to(device)

link = "https://www.youtube.com/watch?v=z4fai9N8HtQ"  # link to the video

# Report the model's position limit and its total parameter count.
num_params = sum(weight.numel() for weight in model.parameters())
print(f"Max number of tokens: {model.config.max_position_embeddings}")
print(f"Number of parameters: {num_params}")

Max number of tokens: 1024
Number of parameters: 406290432


<img src="figure/figure1.png" alt="drawing" width="1000"/>

In [3]:
def summarize(tokens, model, max_summary_length=512, device="cpu"):
    """
    tokens: input token ids for one chunk (tensor of token ids)
    model: generation model exposing `.to(device)` and `.generate(...)`
    max_summary_length: maximum number of tokens in summary (int; floats are truncated)
    device: device to run model (str)

    Generates a summary with beam search (4 beams) and decodes it with the
    notebook-level `tokenizer`. The requested length is capped at the model's
    `config.max_position_embeddings` when that attribute is available, and the
    minimum length is one fifth of the (capped) maximum.

    return summarize input text using model (str)
    """
    token_input = tokens.to(device)
    model = model.to(device)

    # UI sliders may hand over floats; generate() expects integer lengths.
    max_length = int(max_summary_length)

    # Never ask the model to generate past the number of positions it supports.
    position_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None)
    if position_limit is not None:
        max_length = min(max_length, int(position_limit))

    summary_ids = model.generate(
        token_input,
        min_length=max_length // 5,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    # Only the first (and only) sequence of the batch is decoded.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def split_token_chunks(text, tokenizer, max_tokens=1024, overlap=0.2, device="cpu"):
    """
    text: input text (str)
    tokenizer: callable returning a mapping with a 2-D "input_ids" tensor (batch, sequence)
    max_tokens: maximum number of tokens per chunk (int)
    overlap: fraction of `max_tokens` shared between consecutive chunks (float, must be < 1)
    device: device to place the token tensor on (str)

    create overlapping token chunks from input; chunking stops as soon as a
    chunk reaches the end of the sequence, so no chunk is a pure subset of
    the previous one

    raises ValueError when `max_tokens` or the resulting stride is not positive

    return: list of token chunks (list of tensor)
    """
    # Distance between the starts of two consecutive chunks.
    stride = max_tokens - int(max_tokens * overlap)
    if max_tokens <= 0 or stride <= 0:
        raise ValueError(
            f"max_tokens must be positive and overlap must be < 1 "
            f"(got max_tokens={max_tokens}, overlap={overlap})"
        )

    # NOTE(review): the "summarize: " prefix is kept from the original code;
    # confirm it is actually wanted for this checkpoint.
    tokens = tokenizer("summarize: " + text, return_tensors="pt")["input_ids"].to(device)
    total = tokens.shape[1]

    token_chunks = []
    for start in range(0, total, stride):
        token_chunks.append(tokens[:, start:start + max_tokens])
        # This chunk already covers the tail; a further chunk would only repeat it.
        if start + max_tokens >= total:
            break
    return token_chunks
    
def summarize_long_text(text, max_summary_length=512, level=0, max_token_length=1024, device="cpu"):
    """
    text: input text (str)
    max_summary_length: maximum number of tokens in summary (int)
    level: level of recursion (int)
    max_token_length: maximum number of tokens per chunk (int)
    device: device to run model (str)

    recursively summarize long text by splitting into chunks: each chunk is
    summarized with a third of the summary budget, the chunk summaries are
    joined, and the joined text is either summarized once more (when it fits
    in `max_token_length` tokens) or fed back into this function. After more
    than 100 levels the joined chunk summaries are returned as-is instead of
    recursing further.

    Uses the notebook-level `tokenizer` and `model`.

    return summarize input text using model (str)
    """
    level = level + 1
    print(f"Level: {level}")

    # Chunk size follows max_token_length so the two limits cannot drift apart.
    token_chunks = split_token_chunks(text, tokenizer, max_tokens=max_token_length, device=device)

    chunk_budget = int(max_summary_length // 3)
    summary_ls = []
    for token_chunk in tqdm(token_chunks):
        summary_ls.append(summarize(token_chunk, model, max_summary_length=chunk_budget, device=device))

    summary_concat = " ".join(summary_ls)
    tokens_summary_concat = tokenizer(summary_concat, return_tensors="pt")["input_ids"]

    # Joined summaries fit in one pass: produce the final summary.
    if tokens_summary_concat.shape[1] <= max_token_length:
        return summarize(tokens_summary_concat, model, max_summary_length=max_summary_length, device=device)

    # Still too long. Check the depth guard BEFORE recursing, otherwise it can never fire.
    if level > 100:
        print("Level > 100, return summary_concat")
        return summary_concat

    return summarize_long_text(
        summary_concat,
        max_summary_length=max_summary_length,
        level=level,
        max_token_length=max_token_length,
        device=device,
    )

def _extract_video_id(video_link):
    """Return the video id found in a YouTube link, or raise ValueError if there is none."""
    from urllib.parse import parse_qs, urlparse

    link = video_link.strip()
    # urlparse only fills in the host when a scheme is present.
    if "://" not in link:
        link = "https://" + link
    parsed = urlparse(link)
    host = (parsed.hostname or "").lower()

    # Standard watch URLs carry the id in the `v` query parameter; parsing the
    # query drops any trailing parameters such as `&t=` or `&list=`.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        video_id = query_ids[0]
    else:
        segments = [segment for segment in parsed.path.split("/") if segment]
        if host in ("youtu.be", "www.youtu.be") and segments:
            # Short links: https://youtu.be/<id>
            video_id = segments[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
            video_id = segments[1]
        else:
            video_id = ""

    if not video_id:
        raise ValueError(f"Could not find a video id in link: {video_link!r}")
    return video_id


def get_transcript(video_link):
    """
    video_link: youtube video link (str); watch, youtu.be, embed, shorts and live URLs are accepted

    raises ValueError when no video id can be found in the link; errors from
    the transcript API call propagate unchanged

    return transcript of the video (str)
    """
    video_id = _extract_video_id(video_link)
    eng_transcript = YouTubeTranscriptApi.get_transcript(video_id)
    # Each transcript entry exposes its caption under the 'text' key; join them into one string.
    transcript = " ".join([line['text'] for line in eng_transcript])
    return transcript

def get_video_summary(video_link, max_summary_length=512, device="cpu"):
    """
    video_link: youtube video link (str)
    max_summary_length: maximum number of tokens in summary (int)
    device: device to run model (str)

    fetch the transcript of the video and summarize it; if fetching the
    transcript fails for any reason the error is printed and a fixed error
    message is returned instead

    return summary of the video, or an error message (str)
    """
    try:
        video_text = get_transcript(video_link)
    except Exception as err:
        # Best effort: report the problem and hand a readable message back to the caller.
        print(f"Error: {err}")
        return "Error: No transcript found"
    else:
        # Summarization errors are deliberately not caught here.
        return summarize_long_text(video_text, max_summary_length=max_summary_length, device=device)


In [4]:
# Run the full pipeline on the example link: fetch the transcript, then summarize it.
video_summary = get_video_summary(link, max_summary_length=512, device=device)

Token indices sequence length is longer than the specified maximum sequence length for this model (2847 > 1024). Running this sequence through the model will result in indexing errors


Level: 1


  0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Show the summary text produced for the example video.
print(video_summary)

 AI video boom was all started thanks to the release of cing published by a Chinese social media company called quiso which is similar to Tik Tok. Luma dream machine which was released 5 days after cling was another big but pleasant surprise too. These models are roughly equally good and equally bad at the same time they are all still terrible at Anatomy well it's still expected as AI image Generations still can't handle the anatomy well either. These AI video generators are still far from being World simulators but creative wise it still has some pretty interesting touch it can provide.


In [7]:
# Build the Gradio UI: a text input for the YouTube URL and a slider for the
# summary length, both forwarded to get_video_summary. The lambda fixes
# `device` to the notebook-level value so the UI only exposes those two inputs.
# NOTE(review): the slider goes up to 5000 while the model cell reports a
# maximum of 1024 tokens — confirm values above that limit behave as intended.
demo = gr.Interface(
    fn=lambda text, max_length: get_video_summary(text, max_length, device=device), 
    inputs=["text", 
            gr.Slider(100, 5000, value=512, label="Max Summary Length", step=100)],
    outputs="text",
    title="Video Summarization", 
    description="Paste youtube url below")

# Start the local web server for the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


