In [None]:
# ! pip install git+https://github.com/openai/whisper.git 

In [None]:
# !yt-dlp -f 'ba' -x --audio-format mp3 <URL>

In [None]:
import datetime
import whisper
import pathlib
import os

# CODE FROM: https://www.codingforentrepreneurs.com/blog/getting-started-with-openai-whisper/
def timedelta_to_videotime(delta):
  """
  Format the string form of a datetime.timedelta as a WebVTT timecode.

  Args:
    delta: A string shaped like the output of str(datetime.timedelta(...)),
      i.e. "H:MM:SS" or "H:MM:SS.ffffff".

  Returns:
    The timecode as "HH:MM:SS.mmm": a single-digit hours field is
    zero-padded to two digits and the fractional part is exactly three
    digits (milliseconds), which is what WebVTT timestamps require.
  """
  clock, _, fraction = delta.partition(".")
  parts = clock.split(":")
  # str(timedelta) prints hours without a leading zero ("0:00:01").
  parts[0] = parts[0].zfill(2)
  # Keep millisecond precision only: truncate the microseconds and pad short
  # or missing fractions so there are always exactly three digits.
  millis = fraction[:3].ljust(3, "0")
  return f"{':'.join(parts)}.{millis}"


def whisper_segments_to_vtt_data(result_segments):
  """
  Build a WebVTT document from a sequence of Whisper segments.

  Each segment is read through .get() for its 'start' and 'end' values
  (passed to datetime.timedelta as seconds) and its 'text'. Every segment
  becomes one cue: a 1-based cue number, a "start --> end" timecode line,
  and the stripped text, followed by a blank line.

  Returns the whole document as a single string, starting with the
  "WEBVTT" header.
  """
  chunks = ["WEBVTT\n\n"]
  for cue_number, seg in enumerate(result_segments, start=1):
    begin = timedelta_to_videotime(
      str(datetime.timedelta(seconds=seg.get('start')))
    )
    finish = timedelta_to_videotime(
      str(datetime.timedelta(seconds=seg.get('end')))
    )
    caption = seg.get('text').strip()
    chunks.append(f"{cue_number}\n{begin} --> {finish}\n{caption}\n\n")
  return "".join(chunks)


# Load the "large" Whisper model once at module level; the transcription
# loop further down reuses this single instance for every audio file.
whisper_model = whisper.load_model("large")

In [None]:
!rm -rf .ipynb_checkpoints
AUDIO_FILES_PATH = "./"

# This notebook is meant to live inside the folder that holds the audio
# files, so the two output folders below sit one directory up, next to
# that audio folder.
path_to_where_full_text_goes = "../full_text_output/"
path_to_where_captions_go = "../caption_output/"

# open(..., "w") does not create missing parent folders, so make sure both
# output folders exist before writing anything.
os.makedirs(path_to_where_full_text_goes, exist_ok=True)
os.makedirs(path_to_where_captions_go, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort for a predictable run.
for file in sorted(os.listdir(AUDIO_FILES_PATH)):
    # Only process audio files.
    if file.endswith(('.webm', '.mp3')):
        # Strip only the final extension so titles containing dots
        # (e.g. "talk.part1.mp3") are not cut at the first dot.
        audio_title = os.path.splitext(file)[0]
        print(audio_title)

        # Transcribe audio. Join with AUDIO_FILES_PATH so this keeps working
        # when the audio folder is not the current working directory.
        transcription = whisper_model.transcribe(os.path.join(AUDIO_FILES_PATH, file))
        full_text_path = os.path.join(path_to_where_full_text_goes, audio_title + ".txt")
        with open(full_text_path, "w", encoding="utf-8") as f:
            f.write(transcription["text"])

        # Captions: convert the timed segments to WebVTT and save them.
        caption_data = whisper_segments_to_vtt_data(transcription['segments'])
        captions_path = os.path.join(path_to_where_captions_go, audio_title + ".vtt")
        with open(captions_path, "w", encoding="utf-8") as f:
            f.write(caption_data)

        print("")
