In [2]:
import whisper
import ffmpeg
import tempfile
import os
import warnings

In [3]:
def get_audio(paths) -> dict:
    """Extract a mono 16 kHz WAV track from each input video.

    Every input is converted with ffmpeg into ``<temp dir>/<video stem>.wav``
    (codec ``pcm_s16le``, one channel, sample rate ``16k``). An existing file
    at that location is overwritten.

    Args:
        paths: Iterable of video file paths. A single ``str`` or
            ``os.PathLike`` is also accepted and treated as a one-item list;
            iterating a bare string would otherwise yield single characters.

    Returns:
        Dict mapping each input path (as given) to the path of its WAV file.

    Raises:
        Whatever ``ffmpeg...run()`` raises when a conversion fails; nothing
        is caught here.
    """
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]

    temp_dir = tempfile.gettempdir()
    audio_paths = {}
    for path in paths:
        video_name = os.path.basename(path)
        # splitext drops only the final extension, so "talk.v2.mp4" becomes
        # "talk.v2.wav" instead of being cut at the first dot.
        stem = os.path.splitext(video_name)[0]
        print(f"Extracting audio from {video_name}...")
        # NOTE(review): the output name depends only on the basename, so two
        # videos with the same file name in different folders share one WAV.
        output_path = os.path.join(temp_dir, f"{stem}.wav")

        ffmpeg.input(path).output(
            output_path,
            acodec="pcm_s16le", ac=1, ar="16k"
        ).run(quiet=True, overwrite_output=True)
        audio_paths[path] = output_path
    return audio_paths

In [4]:
def write_transcript(audio_path, text_path, transcribe: callable):
    """Transcribe one audio file and save the recognized text to disk.

    Args:
        audio_path: Path of the audio file; passed as the only argument to
            ``transcribe``.
        text_path: Destination file. It is created or overwritten and
            written as UTF-8.
        transcribe: Callable that takes the audio path and returns a mapping
            containing at least a ``"text"`` entry.

    Returns:
        The object returned by ``transcribe``, unchanged.
    """
    print(f"Generating transcript for {os.path.basename(audio_path)} audio... This might take a while.")

    # Silence warnings only for the duration of the transcription.
    # catch_warnings() puts the caller's filter configuration back on exit,
    # including when transcribe() raises, instead of overwriting the global
    # filters with "default" afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        result = transcribe(audio_path)

    with open(text_path, "w", encoding="utf-8") as f:
        f.write(result["text"])
    return result

In [5]:
def get_transcript(audio_paths: dict, output_text: bool, output_dir: str, transcribe: callable):
    """Transcribe every extracted audio file and write one ``.txt`` per video.

    Args:
        audio_paths: Mapping of original video path to extracted audio path
            (the function iterates ``audio_paths.items()``).
        output_text: When true, transcripts are written into ``output_dir``;
            otherwise they go to the system temp directory.
        output_dir: Directory for the transcript files when ``output_text``
            is true.
        transcribe: Callable forwarded to ``write_transcript`` for each
            audio file.

    Returns:
        The result of the last ``write_transcript`` call, or ``None`` when
        ``audio_paths`` is empty.
    """
    # Keep the directory in its own variable: reusing one name for both the
    # directory and the per-file path nests every later transcript under the
    # previous file's path (e.g. "out/a.txt/b.txt").
    target_dir = output_dir if output_text else tempfile.gettempdir()

    result = None
    for path, audio_path in audio_paths.items():
        # Strip only the final extension so "talk.v1.mp4" and "talk.v2.mp4"
        # do not both collapse to "talk.txt".
        stem = os.path.splitext(os.path.basename(path))[0]
        text_path = os.path.join(target_dir, f"{stem}.txt")

        result = write_transcript(audio_path, text_path, transcribe)
    return result

In [6]:
def initiate_stt(video_path:str, model:str, output_dir:str, srt:bool, verbose:bool):
    """Run the speech-to-text pipeline: extract audio, then transcribe it.

    Args:
        video_path: One video path, or a list of video paths.
        model: Whisper model name passed to ``whisper.load_model``.
        output_dir: Directory that is created if missing and handed to
            ``get_transcript``.
        srt: Forwarded to ``get_transcript`` as its ``output_text`` flag.
            Despite the name, this function itself produces no SRT file.
        verbose: Forwarded to the model's ``transcribe`` call.

    Returns:
        Whatever ``get_transcript`` returns for the given videos.
    """
    os.makedirs(output_dir, exist_ok=True)
    if model.endswith(".en"):
        print(f"{model} is a English model")

    # A bare path would be iterated character by character by the helpers,
    # so wrap it; lists (and other iterables) are passed through untouched.
    if isinstance(video_path, (str, os.PathLike)):
        video_paths = [video_path]
    else:
        video_paths = video_path

    # Separate name for the loaded model so the `model` argument keeps
    # meaning "model name" throughout the function.
    stt_model = whisper.load_model(model)

    def transcribe(audio_path):
        return stt_model.transcribe(audio_path, verbose=verbose, task='transcribe')

    audio = get_audio(video_paths)
    subtitle = get_transcript(audio, srt, output_dir, transcribe)
    return subtitle

In [8]:
# Videos to transcribe, given as a list of paths.
# NOTE(review): this is a machine-specific absolute path; adjust it before
# running the notebook elsewhere.
video_path = [
    r"D:\MLProject6\Video_Summarizer\uploads\sample.mp4",
]

# Transcribe with the 'tiny' Whisper model; the result is kept in
# `transcript` for the summarization cells below.
transcript = initiate_stt(
    video_path=video_path,
    model='tiny',
    output_dir='transcript_text',
    srt=True,
    verbose=False,
)

  checkpoint = torch.load(fp, map_location=device)


Extracting audio from sample.mp4...
Generating transcript for sample.wav audio... This might take a while.
Detected language: English


100%|██████████| 15257/15257 [00:09<00:00, 1666.25frames/s]


In [9]:
# Display the "text" entry of the transcription result as the cell output.
transcript['text']

" If we have a one minute speech to deliver, the main challenge that we face is how to be very consolidated so much important information into just one minute. And more importantly, how do we communicate this in a clear manner? That's where this new clicking communication thing work called, brek, comes into the picture. This framework will not only help us speak in a concise manner, why making sure all the important points are in place, but the best thing about it is that if we have one minute speeches or short speeches to give in almost any occasion, this framework can be applied there. So it also improves our ability to speak on this part, even when we're not prepared. So what does this framework stand for? brek, it's an abbreviation with stands for point, reason, example point. We basically start off with the main point or message of our speech, then we want to give the reason for that main point. Then we give an example and finally end it again with our main point. Let's look at an

In [11]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint identifier used for both the tokenizer and the seq2seq model,
# named once so the two cannot drift apart.
SUMMARIZER_CHECKPOINT = "philschmid/bart-large-cnn-samsum"

tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [12]:
# Tokenize the transcript into PyTorch tensors. With truncation enabled and
# max_length=512, anything past 512 tokens is dropped, so a longer transcript
# is only partially summarized.
inputs = tokenizer(
    transcript['text'],
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

In [13]:
# Generate the summary token ids. The attention mask produced by the
# tokenizer is passed explicitly so generate() does not have to infer it
# from the input ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=512,
)
# Decode the first (only) sequence of the batch; the bare expression is the
# cell's displayed output.
tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

'In brek, the main point or message of a speech is followed by the reason for that main point, followed by an example, and then the last point. The framework can be applied to a variety of topics, such as the topic of favorite music or short speeches.'