In [1]:
# Notebook parameters. The `#@param` trailers render these as form fields in
# Google Colab; outside Colab they are ordinary comments and the values can be
# edited directly.

# Source of the audio/video to be transcribed: "youtube" downloads the audio
# stream of a YouTube video, "local" uses a file that already exists on disk
input_format = "youtube" #@param ["youtube", "local"]

# URL of the YouTube video (input_format "youtube") or path of the audio file
# (input_format "local") to be transcribed
file = "https://www.youtube.com/watch?v=o3Kn5OGeokU" #@param {type:"string"}

# Whether to save the transcription as a plain-text (.txt) file
plain = True #@param {type:"boolean"}

# Whether to save the transcription as an SRT file
srt = True #@param {type:"boolean"}

# Whether to save the transcription as a VTT file
vtt = True #@param {type:"boolean"}

# Whether to save the transcription as a TSV file
tsv = True #@param {type:"boolean"}

In [2]:
# Dependencies

import os, re
import torch
from pathlib import Path
from pytube import YouTube

import whisper
from whisper.utils import get_writer

In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the English-only "medium" checkpoint straight onto the chosen device.
# `load_model` accepts the target device itself, so a follow-up `.to(DEVICE)`
# is not needed.
model = whisper.load_model("medium.en", device = DEVICE)

In [4]:
def to_snake_case(name):
    """
    Convert a title into a lowercase, underscore-separated name

    Parameters
    ----------
    name: str
        The text to convert (e.g. a video title).

    Returns
    -------
    The lowercased text in which every run of spaces, colons and underscores
    has been collapsed into a single underscore. Other characters are kept.
    """
    # One regex pass collapses separator runs of any length; a single
    # `.replace("__", "_")` left doubled underscores behind for inputs such
    # as "a:  b".
    return re.sub(r"[ :_]+", "_", name.lower())

def download_youtube_audio(url,  file_name = None, out_dir = "."):
    """
    Download the audio from a YouTube video

    Parameters
    ----------
    url: str
        The URL of the YouTube video.

    file_name: str or Path, optional
        The name of the downloaded file. When omitted, a snake_case version of
        the video title with an ".mp4" extension is used.

    out_dir: str or Path
        The directory the file is saved in. It is created if it does not exist.

    Returns
    -------
    The value returned by the stream's `download` call, which the rest of this
    notebook uses as the path of the saved file.

    Raises
    ------
    ValueError
        If the video exposes no audio-only MP4 stream.
    """
    video = YouTube(url)
    if file_name is None:
        # Drop path separators and characters that are illegal in file names on
        # common platforms, so the title cannot escape `out_dir` or fail to open.
        safe_title = re.sub(r'[\\/*?"<>|]', "", to_snake_case(video.title))
        # Append the extension instead of using Path.with_suffix, which would
        # cut a title such as "v2.0 demo" at its last dot.
        file_name = f"{safe_title}.mp4"

    # Join here (rather than only when deriving the name) so that `out_dir` is
    # honoured even when the caller supplies `file_name`.
    target = Path(out_dir, file_name)
    target.parent.mkdir(parents = True, exist_ok = True)

    # Pick the audio-only MP4 stream with the highest bitrate
    best_audio = (video.streams
                    .filter(only_audio = True, file_extension = "mp4")
                    .order_by("abr")
                    .desc()
                    .first())
    if best_audio is None:
        raise ValueError(f"No audio-only MP4 stream found for {url}")

    return best_audio.download(filename = str(target))

In [5]:
def transcribe_file(model, file, plain, srt, vtt, tsv, language = "en"):
    """
    Runs Whisper on an audio file and saves the requested output files
    next to it

    Parameters
    ----------
    model: Whisper
        The Whisper model instance.

    file: str
        The file path of the file to be transcribed.

    plain: bool
        Whether to save the transcription as a text file or not.

    srt: bool
        Whether to save the transcription as an SRT file or not.

    vtt: bool
        Whether to save the transcription as a VTT file or not.

    tsv: bool
        Whether to save the transcription as a TSV file or not.

    language: str or None
        The language code forwarded to `model.transcribe`. Defaults to "en".
        None is forwarded unchanged; Whisper documents that as a request to
        detect the spoken language automatically.

    Returns
    -------
    The dictionary returned by `model.transcribe`. Its "text" entry is what is
    written to the text file; Whisper also documents "segments" and "language"
    entries.
    """
    file_path = Path(file)
    print(f"Transcribing file: {file_path}\n")

    # All output files are written into the directory of the input file
    output_directory = file_path.parent

    # Run Whisper
    result = model.transcribe(file, verbose = False, language = language)

    if plain:
        txt_path = file_path.with_suffix(".txt")
        print("\nCreating text file")

        with open(txt_path, "w", encoding="utf-8") as txt:
            txt.write(result["text"])

    # SRT, VTT and TSV all go through the same writer API, so one loop replaces
    # three copy-pasted branches (order kept: SRT, VTT, TSV)
    for enabled, fmt in ((srt, "srt"), (vtt, "vtt"), (tsv, "tsv")):
        if enabled:
            print(f"\nCreating {fmt.upper()} file")
            writer = get_writer(fmt, output_directory)
            writer(result, file_path.stem)

    return result

In [None]:
if input_format == "youtube":
    # Download the audio stream of the YouTube video
    audio = download_youtube_audio(file)
    # Reported after the call because the saved path is only known once it returns
    print(f"Downloaded audio stream: {audio}")

    # Run Whisper on the audio stream
    result = transcribe_file(model, audio, plain, srt, vtt, tsv)

elif input_format == "local":
    # Run Whisper on the specified file
    result = transcribe_file(model, file, plain, srt, vtt, tsv)

else:
    # Fail loudly instead of silently leaving `result` undefined
    raise ValueError(
        f"Unsupported input_format: {input_format!r} (expected 'youtube' or 'local')"
    )