[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/biodatlab/whisper-th-demo/blob/main/whisper_th_demo.ipynb)


# **Thonburian Whisper**

Automatic Speech Recognition (ASR) model for Thai

<img src="https://raw.githubusercontent.com/biodatlab/whisper-th-demo/main/assets/Thonburian-Whisper-1.jpg" width="800"/>

---



> By the teams at Looloo Technology and Mahidol University




## **Install Dependencies** ⚙

In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!sudo apt install ffmpeg
!pip install torchaudio ipywebrtc notebook
!pip install -q gradio
!pip install pytube
!jupyter nbextension enable --py widgetsnbextension

## **Load and Set Up Thonburian Whisper 🤗**


In [None]:
import os
import torch
from transformers import pipeline

# Hugging Face Hub checkpoint to load, and the language code handed to the tokenizer.
MODEL_NAME = "biodatlab/whisper-th-medium-combined"
lang = "th"

# Use CUDA device 0 when torch can see a GPU; otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Speech-recognition pipeline for the checkpoint above, configured with 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Put the same decoder prompt (language + "transcribe" task) on the model config first and
# then on the generation config.
# NOTE: the generation-config settings are a hack to make the model work with the new
# generation API, which returns timestamps.
for _cfg in (pipe.model.config, pipe.model.generation_config):
    _cfg.forced_decoder_ids = pipe.tokenizer.get_decoder_prompt_ids(language=lang, task="transcribe")
del _cfg  # do not leave the loop helper behind in the notebook namespace

# NOTE(review): 50363 is presumably the "<|notimestamps|>" token id of this tokenizer -- confirm.
pipe.model.generation_config.no_timestamps_token_id = 50363
pipe.model.generation_config.max_initial_timestamp_index = None

## **Try it with your own voice** 🎥

### Record your own audio here in the notebook!

In [None]:
# Widgets used by the recording cells below: a browser media stream and an audio recorder.
from ipywebrtc import AudioRecorder, CameraStream
# `google.colab` is only available inside the Google Colab runtime.
from google.colab import output
# Turn on Colab's custom widget manager.
# NOTE(review): presumably required for the ipywebrtc widgets to render in Colab -- confirm.
output.enable_custom_widget_manager()

In [None]:
# Request an audio-only media stream (video capture is switched off).
camera = CameraStream(constraints=dict(audio=True, video=False))
# Attach a recorder to that stream; as the cell's last expression it is displayed as a widget.
recorder = AudioRecorder(stream=camera)
recorder

In [None]:
# Save the recorded audio.
# Run this only after a recording has been made with the widget above. The file is written
# as "audio.mp3" in the working directory, the same path the transcription cell reads.
# NOTE(review): the recorder may capture in a non-MP3 container (e.g. WebM), in which case
# only the file extension says "mp3" -- confirm the downstream decoder copes with that.
recorder.save("audio.mp3")

### Now let our *Thonburian Whisper* do the work!!

In [None]:
# Transcribe the saved recording as Thai and keep only the "text" entry of the result.
transcriptions = pipe(
    "audio.mp3",
    generate_kwargs={
        "language": "<|th|>",
        "task": "transcribe",
        "repetition_penalty": 1.2,
    },
    return_timestamps=True,
    batch_size=16,
)["text"]
print(transcriptions)


## **Transcribe a YouTube Video**

> [![Watch the video](https://img.youtube.com/vi/jwBqoBIDv3o/default.jpg)](https://www.youtube.com/watch?v=jwBqoBIDv3o)




In [None]:
import pytube as pt


def yt_transcribe(yt_url: str):
    """Download the audio of a YouTube video and run the ASR pipeline on it.

    The first audio-only stream listed by pytube is downloaded under the file
    name ``audio.mp3`` and that file is then handed to the notebook-level ``pipe``
    with Thai transcription settings and timestamps enabled.

    Args:
        yt_url: URL of the YouTube video to transcribe.

    Returns:
        The object returned by ``pipe`` for the downloaded file (not just a
        string); a later cell reads its ``"chunks"`` entry.
    """
    audio_path = "audio.mp3"

    # Indexing with [0] takes the first audio-only stream; it fails if there is none.
    audio_streams = pt.YouTube(yt_url).streams.filter(only_audio=True)
    audio_streams[0].download(filename=audio_path)

    decoding_options = {"language": "<|th|>", "task": "transcribe", "repetition_penalty": 1.2}
    return pipe(
        audio_path,
        generate_kwargs=decoding_options,
        return_timestamps=True,
        batch_size=16,
    )

In [None]:
# This may take some time depending on the length of the video.
url = "https://www.youtube.com/watch?v=jwBqoBIDv3o"

# Keep the full result of yt_transcribe (not only its text): the SRT cell below reads its
# "chunks" entry. This rebinds `transcriptions`, which the recording section bound to the
# "text" entry alone.
transcriptions = yt_transcribe(url)
print(transcriptions)

In [None]:
# Create an SRT subtitle file from the chunk timestamps of the transcription.
from datetime import timedelta

srt_filename = "output.srt"


def _format_srt_time(seconds):
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Non-negative offset from the start of the audio, in seconds
            (int or float).

    Returns:
        The offset as a zero-padded ``HH:MM:SS,mmm`` string, keeping millisecond
        precision instead of truncating to whole seconds.
    """
    # Dividing two timedeltas yields whole milliseconds as an int, which avoids
    # float artefacts such as 2.34 * 1000 == 2339.9999999999995.
    total_ms = timedelta(seconds=float(seconds)) // timedelta(milliseconds=1)
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1_000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


# Open the file once, in write mode: re-running this cell replaces the subtitles instead of
# appending a second copy of every cue to an existing output.srt.
with open(srt_filename, "w", encoding="utf-8") as srtfile:
    # SRT cue numbers start at 1.
    for segment_id, chunk in enumerate(transcriptions["chunks"], start=1):
        start, end = chunk["timestamp"]
        if end is None:
            # A chunk can come back without an end timestamp (typically the last one);
            # fall back to its start time rather than crashing on None.
            end = start
        text = chunk["text"]
        text = "inaudible" if text.strip() == '' else text
        srtfile.write(
            f"{segment_id}\n{_format_srt_time(start)} --> {_format_srt_time(end)}\n{text}\n\n"
        )