### Using Whisper

In [8]:
import whisper
import torch
import os

def transcribe_audio(audio_path, model_size='base', language='en'):

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = whisper.load_model(model_size).to(device)
    result = model.transcribe(audio_path, language=language)
    return result['text']

audio_file = "output.mp3"
transcribed_text = transcribe_audio(audio_file)

print(transcribed_text)

 After reading tons of productivity books, I came across so many rules. Like the two-year rule, the five-minute rule, the five-second rule. No, not that five-second rule. The problem is that these rules are meant for companies or entrepreneurs. But I was able to adapt them to my studies during med school and drastically cut down on my procrastination. So I'm going to share with you two different two-minute rules for the next two minutes. The first two-minute rule comes from getting things done by David Allen. He says, if it takes two minutes to do, get it done right now. For example, if I need to take out the trash today, it takes two minutes to do. So if I'm thinking about it now, I might as well just do it now. Instead of writing it down on a to-do list or probably forgetting about it or having to come back to it later, which takes more than two minutes. That's how I see it. So here's a list of things that might take two minutes throughout the day, like organizing your desk or wateri

In [None]:
import whisper
import torch
import os

def transcribe_audio_with_subtitle_timestamps(audio_path, model_size='base', language='en', words_per_subtitle=10):
    """Transcribes audio with subtitles (groups of words) and timestamps for all segments."""

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = whisper.load_model(model_size).to(device)
    result = model.transcribe(audio_path, language=language, word_timestamps=True)
    segments = result['segments']  # Get all segments

    all_subtitles = []

    for segment in segments:  # Iterate through each segment
        words = segment['words']
        subtitles = []
        current_subtitle = []
        current_start = None
        last_end = None

        for word_info in words:
            word = word_info['word']
            start = word_info['start']
            end = word_info['end']

            if current_start is None:
                current_start = start

            current_subtitle.append(word)

            if len(current_subtitle) >= words_per_subtitle:
                subtitle_text = " ".join(current_subtitle)
                subtitles.append({'start': current_start, 'end': end, 'text': subtitle_text})
                current_subtitle = []
                current_start = None

            last_end = end

        # Handle remaining words in the segment
        if current_subtitle:
            subtitle_text = " ".join(current_subtitle)
            subtitles.append({'start': current_start, 'end': last_end, 'text': subtitle_text})

        all_subtitles.extend(subtitles) # Add the subtitles from this segment to the complete list.

    return all_subtitles

audio_file = "test.mp3"  # Replace with your audio file
subtitles = transcribe_audio_with_subtitle_timestamps(audio_file)

if subtitles:
    for subtitle in subtitles:
        print(f"[{subtitle['start']:.2f} - {subtitle['end']:.2f}] {subtitle['text']}")
else:
    print("Transcription failed.")

print(subtitles)

[0.00 - 2.16]  If  you  have  trouble  speaking  on  the  spot,
[2.34 - 4.74]  you  have  nothing  prepared,  try  this  format.
[5.12 - 6.52]  It's  called  the  prep  format.
[6.92 - 9.52]  You  make  your  point,  you  provide  a  reason
[9.52 - 11.96]  why  you  chose  that,  you  give  an  example,
[11.96 - 13.50]  and  then  you  make  a  point  again.
[14.04 - 15.82]  So  here's  an  example  of  this.
[16.16 - 18.80]  Let's  say  the  question  was,  what  is  your  favorite  fruit?
[19.34 - 20.88]  You're  not  prepared  to  answer  that  question,
[20.98 - 22.48]  but  if  you  were  to  do  it,  you  could  say
[22.48 - 22.72]  something
[22.72 - 25.14]  like  my  favorite  fruit  is  the  banana.
[25.78 - 28.32]  The  reason  I  like  it  is  because  it  tastes  really
[28.32 - 28.48]  good
[28.48 - 29.54]  in  my  protein  shakes.
[30.04 - 33.30]  For  example,  when  I  came  back  from  the  gym  yesterday,
[33.64 - 35.94]  I  want  something  to  eat  or  drink  quickl

### Subtitle Macker

In [1]:
!pip install openai-whisper



In [3]:
!pip install moviepy

Collecting moviepy
  Downloading moviepy-2.1.2-py3-none-any.whl.metadata (6.9 kB)
Collecting imageio_ffmpeg>=0.2.0 (from moviepy)
  Downloading imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting proglog<=1.0.0 (from moviepy)
  Downloading proglog-0.1.11-py3-none-any.whl.metadata (794 bytes)
Downloading moviepy-2.1.2-py3-none-any.whl (126 kB)
Downloading imageio_ffmpeg-0.6.0-py3-none-win_amd64.whl (31.2 MB)
   ---------------------------------------- 0.0/31.2 MB ? eta -:--:--
   ----- ---------------------------------- 4.5/31.2 MB 22.4 MB/s eta 0:00:02
   --------------- ------------------------ 11.8/31.2 MB 29.6 MB/s eta 0:00:01
   --------------- ------------------------ 12.3/31.2 MB 29.7 MB/s eta 0:00:01
   ---------------- ----------------------- 13.1/31.2 MB 16.1 MB/s eta 0:00:02
   ------------------- -------------------- 15.5/31.2 MB 14.5 MB/s eta 0:00:02
   ------------------------ --------------- 18.9/31.2 MB 14.7 MB/s eta 0:00:01
   ----------------------

In [5]:
# Video to Audio COnverter

from moviepy.editor import VideoFileClip

def convert_video_to_audio(video_path, audio_path):
  try:
    video_clip = VideoFileClip(video_path)
    audio_clip = video_clip.audio
    audio_clip.write_audiofile(audio_path)
    video_clip.close()
    audio_clip.close()
    print(f"Succesfully converted {video_path} to {audio_path}")
  except Exception as e:
    print(f"Error: {e}")

video_path = 'vtest.mp4'
audio_path = 'output.mp3'
convert_video_to_audio(video_path, audio_path)

ModuleNotFoundError: No module named 'moviepy.editor'

In [22]:
import whisper
import torch
import os

def subtitle_and_timestamps(audio_path, model_size='base', language='en', words_per_subtitle=10):

  # Using GPU
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = whisper.load_model(model_size).to(device)
  result = model.transcribe(audio_path, language=language, word_timestamps=True)
  segments = result['segments']

  # print(segments)

  all_subtitles = []

  for segment in segments:
    words = segment['words']
    subtitles = []
    current_subtitle = []
    current_start = None
    last_end = None

    for word_info in words:
      word = word_info['word']
      start = word_info['start']
      end = word_info['end']

      if current_start is None:
        current_start = start

      current_subtitle.append(word)

      if len(current_subtitle) >= words_per_subtitle:
        subtitle_text = ' '.join(current_subtitle)
        subtitles.append({'start':current_start, 'end':end, 'text':subtitle_text})
        current_subtitle = []
        current_start = None

      last_end = end

    if current_subtitle:
      subtitle_text = ' '.join(current_subtitle)
      subtitles.append({'start': current_start, 'end': last_end, 'text': subtitle_text})

    all_subtitles.extend(subtitles)
    # print(all_subtitles)

  return all_subtitles

audio_path = 'output.mp3'
subtitles = subtitle_and_timestamps(audio_path)

print(subtitles)

if subtitles:
    for subtitle in subtitles:
        print(f"[{subtitle['start']:.2f} - {subtitle['end']:.2f}] {subtitle['text']}")
else:
    print("Transcription failed.")


[{'start': np.float64(0.0), 'end': np.float64(2.7), 'text': ' After  reading  tons  of  productivity  books,  I  came  across  so'}, {'start': np.float64(2.7), 'end': np.float64(3.38), 'text': ' many  rules.'}, {'start': np.float64(3.8), 'end': np.float64(6.36), 'text': ' Like  the  two -year  rule,  the  five -minute  rule,  the'}, {'start': np.float64(6.36), 'end': np.float64(7.48), 'text': ' five -second  rule.'}, {'start': np.float64(7.68), 'end': np.float64(9.22), 'text': ' No,  not  that  five -second  rule.'}, {'start': np.float64(9.34), 'end': np.float64(11.6), 'text': ' The  problem  is  that  these  rules  are  meant  for  companies'}, {'start': np.float64(11.6), 'end': np.float64(12.66), 'text': ' or  entrepreneurs.'}, {'start': np.float64(13.16), 'end': np.float64(15.38), 'text': ' But  I  was  able  to  adapt  them  to  my  studies'}, {'start': np.float64(15.38), 'end': np.float64(16.32), 'text': ' during  med  school'}, {'start': np.float64(16.32), 'end': np.float64(18.6)