<a href="https://colab.research.google.com/github/Splurth/Colabs/blob/main/Transcribe_mp3_OpenAI_whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenAI Whisper - subtitles and transcription generation from MP3

Inspired by the [Towards Data Science](https://towardsdatascience.com/transcribe-audio-files-with-openais-whisper-e973ae348aa7) article.

Before running, go to **Runtime > Change runtime type**

and select **GPU**.

In [None]:
#@title Config

# Folder searched for "*.mp3" files by the batch-processing cell (Option 2).
gdriveMp3Path = '/content/gdrive/MyDrive/audio' #@param {type:"string"}
# Folder passed to process() as the destination of the batch outputs.
gdriveOutputPath ='/content/gdrive/MyDrive/transcripts/' #@param {type:"string"}

# Model name handed to whisper.load_model().
modelSize = "large" #@param ["tiny", "base","small", "medium", "large"]


# Preparation

In [None]:
#@title install ffmpeg {display-mode: "form"}

%%capture
!sudo apt update && sudo apt install ffmpeg

In [None]:
#@title install whisper, pip deps {display-mode: "form"}

%%capture
%pip install git+https://github.com/openai/whisper.git
%pip install pydub

In [None]:
#@title Mp3 preprocessing definition

from pydub import AudioSegment

def mp3toWav(source):
  """Convert an mp3 file to a mono, 16 kHz wav file.

  The wav file is written to the current working directory and is named
  after the source file with ".wav" appended (e.g. "talk.mp3.wav").

  Args:
    source: path of the mp3 file to convert.

  Returns:
    The path of the exported wav file, relative to the working directory.
  """
  # Imported locally so the function does not depend on another cell having
  # imported os before it is called.
  import os

  sound = AudioSegment.from_mp3(source) # load source
  sound = sound.set_channels(1) # mono
  sound = sound.set_frame_rate(16000) # 16000Hz
  output_path = os.path.basename(source)+".wav"
  sound.export(output_path, format="wav")
  return output_path



In [None]:
#@title subtitles SRT output format

from datetime import timedelta
import os

def _srtTimestamp(seconds):
    """Format a time offset in seconds as an SRT timestamp (HH:MM:SS,mmm)."""
    totalMs = max(0, int(round(float(seconds) * 1000)))
    hours, remainder = divmod(totalMs, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def writeStrFile(transcribe, file):
    """Write the segments of a transcription result to an SRT subtitle file.

    Args:
        transcribe: mapping with a 'segments' list; each segment provides
            'id' (0-based index), 'start' and 'end' (seconds) and 'text'.
        file: path of the SRT file to create (overwritten if it exists).

    Each segment becomes one SRT cue numbered ``id + 1``. One leading space,
    if present, is removed from the segment text.
    """
    # Open the file once: reopening it in 'w' mode for every segment would
    # truncate it each time and keep only the last cue.
    with open(file, 'w', encoding='utf-8') as srtFile:
        for segment in transcribe['segments']:
            text = segment['text']
            # startswith() is also safe for empty text, unlike text[0].
            if text.startswith(' '):
                text = text[1:]
            srtFile.write(
                f"{segment['id'] + 1}\n"
                f"{_srtTimestamp(segment['start'])} --> {_srtTimestamp(segment['end'])}\n"
                f"{text}\n\n"
            )


In [None]:
#@title process definition : inference + generating output

import os

def process(file, destination):
  """Transcribe one mp3 file and save the result as plain text and SRT.

  Files whose name does not end in "mp3" (case-insensitive) are ignored.
  Files whose "<filename>.txt" already exists in ``destination`` are skipped,
  so a batch can be re-run without redoing finished work.

  Relies on the notebook-level ``model`` object and on the ``mp3toWav`` and
  ``writeStrFile`` helpers.

  Args:
    file: path of the mp3 file to transcribe.
    destination: directory receiving "<filename>.txt" and "<filename>.srt".
  """
  if not file.lower().endswith("mp3"):
    return

  filename = os.path.basename(file)
  txtFile = os.path.join(destination,filename+'.txt')
  srtFile = os.path.join(destination,filename+'.srt')

  if os.path.exists(txtFile):
    print("already processed "+filename)
    return

  print("processing "+filename)

  # The output folder may not exist yet; create it instead of failing on write.
  if destination:
    os.makedirs(destination, exist_ok=True)

  #convert mp3 to wav
  wavFile = mp3toWav(file)
  try:
    #transcribe
    result = model.transcribe(wavFile)
  finally:
    # The wav file is only an intermediate; remove it so long batches do not
    # accumulate converted audio on disk.
    if os.path.exists(wavFile):
      os.remove(wavFile)
  print(result["text"])

  #save srt first: the .txt file doubles as the "already processed" marker,
  #so it is written last, once every other output exists.
  writeStrFile(result,srtFile)

  #save plain text
  with open(txtFile, 'w', encoding='utf-8') as writefile:
    writefile.write(result["text"].replace(".", ".\r\n"))



In [None]:
#@title model loading & initialization

# whisper is imported in this cell so it runs on a fresh runtime without
# depending on the later processing cells having been executed first.
import whisper

# modelSize is set in the Config cell.
model = whisper.load_model(modelSize)

# Option 1: upload an MP3 to transcribe

In [None]:
#@title Upload file

from google.colab import files

uploaded = files.upload()
# A cancelled upload yields no entries; fail with a clear message instead of
# an IndexError on the lookup below.
if not uploaded:
  raise RuntimeError("No file was uploaded - re-run this cell and choose an mp3 file.")
# Only the first uploaded file is used by the following cells.
uploadedFilename = list(uploaded)[0]

In [None]:
#@title Process upload

import whisper

# Transcribe the uploaded file; its outputs are written to /content/.
process(file=uploadedFilename, destination='/content/')


In [None]:
#@title download output

from google.colab import files

# Download both generated files: subtitles first, then the plain-text transcript.
for outputExtension in ('.srt', '.txt'):
  files.download('/content/'+uploadedFilename+outputExtension)


# Option 2: batch-transcribe files from Google Drive

In [None]:
#@title connect drive

from google.colab import drive
# Mount Google Drive under /content/gdrive, the prefix used by the paths in
# the Config cell; force_remount=True requests a fresh mount on every run.
drive.mount('/content/gdrive', force_remount=True)


In [None]:
#@title Process Gdrive files (long running task...)

import whisper
import glob

# Transcribe every mp3 found directly inside the configured Drive folder;
# process() itself skips files that already have a transcript.
mp3Pattern = gdriveMp3Path+"/*.mp3"
for mp3File in glob.glob(mp3Pattern):
  process(mp3File, gdriveOutputPath)


JS snippet to keep a long-running Colab session connected (run it in the browser's developer console):

```
// Clicks Colab's connect button; scheduled below to run once per minute.
function ConnectButton(){
    console.log("Connect pushed");
    // The clickable element lives inside the shadow DOM of the toolbar's connect widget.
    const connectHost = document.querySelector("#top-toolbar > colab-connect-button");
    const connectControl = connectHost.shadowRoot.querySelector("#connect");
    connectControl.click();
}
setInterval(ConnectButton,60000);
```

