In [1]:
# Notebook dependencies, installed into the runtime via shell commands.
# No versions are pinned, so each fresh session picks up whatever is current.
#   - whisper (from its GitHub repository): speech-to-text model
#   - pytube: YouTube download
#   - transformers: summarisation pipeline
#   - unstructured, ffmpeg-python: not imported directly by the cells shown here
# NOTE(review): nltk is imported later but never installed explicitly --
# presumably it arrives as a transitive dependency; confirm.
!pip install git+https://github.com/openai/whisper.git
!pip install pytube
!pip install transformers
!pip install unstructured
!pip install ffmpeg-python

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-jl867nkx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-jl867nkx
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20231117-py3-none-an

In [2]:
# Refresh the apt package index and install the ffmpeg binary system-wide.
# NOTE(review): presumably required by Whisper to decode the downloaded audio -- confirm.
!sudo apt update && sudo apt install -y ffmpeg

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [1[0m[33m0% [Connecting to archive.ubuntu.com (185.125.190.39)] [Waiting for headers] [C[0m                                                                               Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,063 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1,398 kB]
Get:8 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InR

In [3]:
import whisper
from pytube import YouTube
from transformers import pipeline
import os
from typing import List
import logging
# Send log records to demo.log.
# `force=True` replaces handlers the hosting environment may already have
# attached to the root logger; without it `basicConfig` silently does nothing
# in that situation and demo.log is never written.
# The ERROR threshold means `logging.info(...)` / `logging.debug(...)` calls
# made elsewhere in the notebook are not recorded.
logging.basicConfig(
    filename="demo.log",
    encoding="utf-8",
    level=logging.ERROR,
    force=True,
)

# Video to transcribe and summarise.
URL = "https://www.youtube.com/watch?v=62DxELjuRec"
# Base name (no extension) shared by the generated files:
# "<name>.mp3", "<name>.txt" and "short_summary_<name>.txt".
VIDEO_NAME = "demo"

In [4]:
def download_audio_from_youtube(url: str, video_name: str) -> str:
    """Download the first audio-only stream of a YouTube video.

    Args:
        url: Address of the YouTube video.
        video_name: Base name (without extension) for the saved file.

    Returns:
        The file name the audio was saved under: ``video_name + ".mp3"``.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    audio_stream = YouTube(url).streams.filter(only_audio=True).first()
    # `.first()` yields None when the filter matches nothing; fail with a clear
    # message instead of an AttributeError on `None.download`.
    if audio_stream is None:
        raise ValueError(f"No audio-only stream available for {url}")

    # NOTE(review): the ".mp3" suffix is applied regardless of the container the
    # stream actually uses; kept because callers rely on the returned name.
    filename = video_name + ".mp3"
    audio_stream.download(filename=filename)
    return filename

In [5]:
def load_whisper_model(model_name: str = "medium"):
    """Load a Whisper model by name.

    Args:
        model_name: Name handed to ``whisper.load_model``; defaults to "medium".

    Returns:
        Whatever ``whisper.load_model`` returns for that name.
    """
    loaded_model = whisper.load_model(model_name)
    return loaded_model

def transcribe_audio_to_text(model, audio_path: str, language: str = "English"):
    """Transcribe an audio file with the given model.

    Args:
        model: Object exposing ``transcribe(path, **options)``.
        audio_path: Path of the audio file to transcribe.
        language: Language passed through to the model; defaults to "English".

    Returns:
        The value returned by ``model.transcribe`` (the caller in this notebook
        reads its "text" entry).
    """
    # fp16 is always switched off; the language is the only tunable option.
    transcribe_options = {"fp16": False, "language": language}
    return model.transcribe(audio_path, **transcribe_options)

def save_text_to_file(text: str, file_name: str):
    """Write ``text`` to ``file_name``, replacing any existing content.

    The write is best-effort: an I/O failure is logged at ERROR level and
    swallowed rather than propagated.

    Args:
        text: Content to write.
        file_name: Destination path, including extension.
    """
    try:
        # Explicit UTF-8 so the file round-trips through `read_file`, which
        # opens it with encoding="utf8", whatever the platform default is.
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(text)
    except OSError as e:
        # OSError already covers IOError, FileNotFoundError and PermissionError.
        # Logged as an error (not debug) so the failure is not filtered out by
        # the notebook's ERROR-level logging configuration.
        logging.error(f"Error in file operation: {e}")

def get_text(url: str, video_name: str) -> None:
    """Download a video's audio, transcribe it and store the transcript.

    The transcript is written to ``video_name + ".txt"``.

    Args:
        url: Address of the YouTube video.
        video_name: Base name used for both the audio and the transcript file.
    """
    whisper_model = load_whisper_model()
    downloaded_audio = download_audio_from_youtube(url, video_name)
    transcription = transcribe_audio_to_text(whisper_model, downloaded_audio)
    transcript_text = transcription["text"]
    save_text_to_file(transcript_text, video_name + ".txt")

# Run the download -> transcribe -> save pipeline for the configured video.
# With VIDEO_NAME = "demo" this produces demo.mp3 and demo.txt.
get_text(url=URL, video_name=VIDEO_NAME)

100%|█████████████████████████████████████| 1.42G/1.42G [00:15<00:00, 97.7MiB/s]


In [6]:
import nltk
# Sentence-tokenizer data for `nltk.sent_tokenize`.
# Newer NLTK releases load the "punkt_tab" resource instead of the legacy
# "punkt" pickles; fetching both keeps the notebook working whichever NLTK
# version the unpinned install resolves to. A missing resource would otherwise
# surface only as an empty chunk list, because `split_text_into_chunks` logs
# and swallows the LookupError.
for tokenizer_resource in ("punkt", "punkt_tab"):
    nltk.download(tokenizer_resource)

def read_file(file_name: str) -> str:
    """Return the UTF-8 text stored in ``file_name + ".txt"``.

    Args:
        file_name: Path of the file without its ".txt" extension.

    Returns:
        The file content, or an empty string if the file is missing or cannot
        be read (the problem is logged in both cases).
    """
    contents = ""
    try:
        with open(file_name + ".txt", encoding="utf8") as source:
            contents = source.read()
    except FileNotFoundError as missing:
        logging.error(f"{missing}: File '{file_name}.txt' not found.")
    except Exception as failure:
        logging.error(f"Error reading file: {failure}")
    return contents

def split_text_into_chunks(document: str, max_tokens: int) -> List[str]:
    """Group the sentences of ``document`` into size-bounded chunks.

    Sentences produced by ``nltk.sent_tokenize`` are packed greedily, in order:
    a sentence joins the running chunk while the accumulated length stays below
    ``max_tokens``; otherwise the running chunk is closed and the sentence
    starts a new one.

    Note that, despite the parameter name, the budget is measured in
    characters (``len(sentence)``), and the spaces used to join sentences are
    not counted. A single sentence at or above the budget becomes a chunk of
    its own and may therefore exceed ``max_tokens``.

    Args:
        document: Text to split.
        max_tokens: Size budget per chunk, in characters.

    Returns:
        The list of chunks; an empty list for an empty document or when
        splitting fails (the error is logged).
    """
    if not document:
        return []

    chunks, current_chunk, current_length = [], [], 0

    try:
        for sentence in nltk.sent_tokenize(document):
            sentence_length = len(sentence)

            if current_length + sentence_length < max_tokens:
                current_chunk.append(sentence)
                current_length += sentence_length
                continue

            # Budget reached: close the running chunk, but only when it holds
            # something. Without this guard an oversized sentence arriving
            # while the chunk is empty emitted a spurious "" chunk.
            if current_chunk:
                chunks.append(" ".join(current_chunk))
            current_chunk, current_length = [sentence], sentence_length

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks
    except Exception as e:
        logging.error(f"Error splitting text into chunks: {e}")
        return []

# Load the saved transcript and split it into chunks for the summariser.
long_text = read_file(VIDEO_NAME)

# Bind `text_chunks` unconditionally so later cells never meet an undefined
# name when the transcript is missing or empty.
text_chunks = []
if long_text:
    # 4000 is a character budget per chunk (the splitter measures len(sentence)).
    text_chunks = split_text_into_chunks(long_text, max_tokens=4000)
    logging.info(f"Text chunks: {text_chunks}")
else:
    logging.error("Error: Unable to process the text.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
import logging
from transformers import pipeline
from typing import Callable, List, Dict

# Keyword arguments forwarded to the summarisation pipeline for every chunk.
# NOTE(review): with max_length=5000 the pipeline warns that max_length exceeds
# the input length (see this cell's recorded output); consider lowering it.
bart_params = dict(
    max_length=5000,
    min_length=100,
    do_sample=False,
    truncation=True,
    repetition_penalty=2.0,
)

def create_summarizer(model: str) -> Callable:
    """Build a summarisation pipeline for the given model.

    Args:
        model: Model identifier passed to ``pipeline`` as ``model=``.

    Returns:
        The object returned by ``pipeline("summarization", model=model)``.
    """
    return pipeline("summarization", model=model)

def get_summary_bart(
    list_chunks: List[str], summarizer: Callable, summarization_params: Dict[str, int]
) -> str:
    """Summarise each chunk and join the partial summaries with spaces.

    Args:
        list_chunks: Text chunks to summarise, in order.
        summarizer: Callable invoked as ``summarizer(chunk, **params)``; the
            "summary_text" entry of its first result is used.
        summarization_params: Keyword arguments forwarded to ``summarizer``.

    Returns:
        The joined summaries, or an empty string if any chunk fails (the error
        is logged and all partial results are discarded).
    """
    try:
        partial_summaries = []
        for piece in list_chunks:
            outputs = summarizer(piece, **summarization_params)
            partial_summaries.append(outputs[0]["summary_text"])
        return " ".join(partial_summaries)
    except Exception as e:
        logging.error(f"Error generating summaries: {e}")
        return ""


def save_summary_to_file(summary: str, file_name: str) -> None:
    """Write ``summary`` to ``<file_name>.txt``, replacing any existing content.

    The write is best-effort: any failure is logged at ERROR level and
    swallowed rather than propagated.

    Args:
        summary: Text to store.
        file_name: Destination path without the ".txt" extension.
    """
    try:
        # Overwrite rather than append so re-running the notebook does not
        # stack duplicate summaries in the same file; explicit UTF-8 keeps the
        # output independent of the platform's default encoding.
        with open(f"{file_name}.txt", "w", encoding="utf-8") as fp:
            fp.write(summary)
    except Exception as e:
        logging.error(f"Error saving summary to file: {e}")


# Requires `text_chunks` (the transcript chunks) from the chunking cell.
summarizer = create_summarizer("facebook/bart-large-cnn")

# First pass: summarise every transcript chunk and join the partial summaries.
# The guard also covers `text_chunks` being undefined if the chunking cell did
# not produce it.
try:
    summary = get_summary_bart(text_chunks, summarizer, bart_params)
except Exception as e:
    logging.error(f"Error generating summary: {e}")
    summary = ""

# Second pass: if the joined summary is still long (more than 5000 characters),
# split it again into smaller pieces and summarise those.
# The pieces get their own name so `text_chunks` keeps holding the transcript
# chunks and this cell gives the same result when re-run.
if len(summary) > 5000:
    summary_chunks = split_text_into_chunks(summary, max_tokens=1000)
    short_summary = get_summary_bart(summary_chunks, summarizer, bart_params)
else:
    short_summary = summary

# Save only a real summary; an empty result means an earlier step failed, so
# report that instead of writing an empty file and claiming success.
if short_summary:
    save_summary_to_file(short_summary, f"short_summary_{VIDEO_NAME}")
    logging.info("Summary saved to file.")
else:
    logging.error("No summary was produced; nothing was saved.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Your max_length is set to 5000, but your input_length is only 868. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=434)
Your max_length is set to 5000, but your input_length is only 613. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=306)
