<a href="https://colab.research.google.com/github/VishnuKunchur/konnakol-to-text/blob/main/model_tests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load code and data directory mounted on Google Drive
# (Colab-only: `google.colab` is not importable outside a Colab runtime)
from google.colab import drive
drive.mount('/content/drive')
# project root on the mounted drive; NOTE the trailing '/' -- any path built from
# this value must account for it to avoid a doubled separator
gdrive_base_path = '/content/drive/MyDrive/konnakol-to-text/'

from time import time
import numpy as np
import pandas as pd
import torch
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install -r /content/drive/MyDrive/konnakol-to-text/requirements.txt
! pip install torchaudio ipywebrtc

Collecting ipywebrtc
  Using cached ipywebrtc-0.6.0-py2.py3-none-any.whl (260 kB)
Installing collected packages: ipywebrtc
Successfully installed ipywebrtc-0.6.0


FUNCTIONS FOR KONNAKOL TRANSCRIPTION

In [None]:
def transcribe_konnakol_audio(whisper_model: str, audio_array: np.ndarray, WHISPER_SAMPLING_RATE=16_000):
  """
  transcribe a konnakol audio clip to text with a (fine-tuned) Whisper checkpoint

  The processor and the model are loaded from `whisper_model` on every call, so each
  call pays the full load cost in addition to inference.

  whisper_model: str, path or hub id of the whisper(-konnakol) checkpoint
  audio_array: np.ndarray, waveform already sampled at WHISPER_SAMPLING_RATE;
               samples must lie on the LAST axis (1-D `(samples,)` or 2-D `(batch, samples)`)
  WHISPER_SAMPLING_RATE: int, sampling rate of `audio_array` in Hz

  returns: str, decoded transcription of the first sequence in the batch
  raises: ValueError if `audio_array` contains no samples
  """
  # check if cuda device is available
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
  processor = WhisperProcessor.from_pretrained(whisper_model)
  # load model
  model = WhisperForConditionalGeneration.from_pretrained(whisper_model)

  # enforce english transcription, i.e. prevent output language auto-detection
  model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language='english', task='transcribe')
  model = model.to(device)
  print(f'{whisper_model} load complete on {device}')

  audio_array = np.asarray(audio_array)
  if audio_array.size == 0:
    raise ValueError('audio_array is empty: nothing to transcribe')
  # count samples on the last axis: `len()` would return the channel/batch count for a
  # 2-D array and wrongly route long recordings through the 30s short-form path
  num_samples = audio_array.shape[-1]

  generate_kwargs = {'language': 'en'}
  # Whisper expects 30s of audio for shortform transcription
  if num_samples < 30 * WHISPER_SAMPLING_RATE:
    # shortform transcription
    inputs = processor(audio_array, sampling_rate=WHISPER_SAMPLING_RATE, return_tensors='pt')
  else:
    # longform transcription: keep the full-length features and hand the attention mask
    # to `generate` instead of discarding it
    inputs = processor(audio_array, sampling_rate=WHISPER_SAMPLING_RATE, return_tensors='pt',
                       truncation=False, padding='longest', return_attention_mask=True)
    generate_kwargs['attention_mask'] = inputs['attention_mask'].to(device)

  input_features = inputs['input_features'].to(device)
  # GENERATION (MODEL INFERENCE)
  sta = time()
  # generate token ids
  print(f'{whisper_model} inference in progress..')
  predicted_ids = model.generate(input_features, **generate_kwargs)
  # decode tokens to text
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
  # stop the clock before printing so console I/O is not counted as inference time
  end = time()
  print(transcription)
  print(f'inference time: {round(end-sta, 3)}s')
  return transcription[0]

In [None]:
def print_konnakol_sequence(transcription: str, type: str, words_per_line: int = 5,
                            model_name=None, audio_filepath=None, metadata_df=None):
  """
  pretty-print a konnakol token sequence, `words_per_line` tokens per line

  transcription: str, whitespace-separated konnakol token sequence
                 (ignored when type == 'groundtruth': the reference text is read from the metadata)
  type: str, one of 'prediction' or 'groundtruth'
  words_per_line: int, number of tokens printed on each line
  model_name: str or None, model label for the header; None falls back to the
              notebook-level `whisper_model`
  audio_filepath: str or None, path of the transcribed audio; None falls back to the
                  notebook-level `lesson_audio_filepath`
  metadata_df: pd.DataFrame or None, table with 'file_name' and 'transcription' columns;
               None falls back to the notebook-level `metadata`

  raises: AssertionError for an unknown `type`, ValueError for words_per_line < 1,
          IndexError when no metadata row matches the audio file
  """
  assert type in ['prediction', 'groundtruth'], "only ['prediction', 'groundtruth'] are accepted values for 'type'"
  if words_per_line < 1:
    raise ValueError(f'words_per_line must be >= 1, got {words_per_line}')

  if audio_filepath is None:
    audio_filepath = lesson_audio_filepath

  if type == 'prediction':
    if model_name is None:
      model_name = whisper_model
    print(f'MODEL: {model_name}')
    print(f'{audio_filepath} TRANSCRIPTION:\n\n**')

  elif type == 'groundtruth':
    if metadata_df is None:
      metadata_df = metadata
    # metadata rows are keyed by '<parent dir>/<file name>', e.g. 'test/ch1_l9.wav'
    file_key = '/'.join(audio_filepath.split('/')[-2:])
    matches = metadata_df.loc[metadata_df['file_name'] == file_key, 'transcription']
    if matches.empty:
      raise IndexError(f'no metadata row with file_name == {file_key!r}')
    print(f'{audio_filepath} GROUND TRUTH:\n\n**')
    transcription = matches.values[0]

  # `split()` (no argument) drops leading/trailing/repeated whitespace, so a transcription
  # such as ' Taki Tata' does not produce an empty first token
  line_words = []
  for word in transcription.split():
    line_words.append(word)
    # break once the line is full (the original `idx % 5 == 0` test fired on the very
    # first word, leaving a one-word first line)
    if len(line_words) == words_per_line:
      print(' '.join(line_words) + ' \n')
      line_words = []
  # remaining tokens that did not fill a whole line
  if line_words:
    print(' '.join(line_words) + ' ')

LOAD AND INFER SAMPLES FROM CHAPTER LESSON RECORDINGS

In [None]:
# baseline model:
# USE ONLY FOR TESTING. REMOVE IN PRODUCTION.
# whisper_model = 'openai/whisper-medium'

In [None]:
# load konnakol audio to be transcribed: train/test (Chapter-Lesson recordings)
# `gdrive_base_path` ends with '/', so strip it before joining to avoid '//' in the paths
whisper_model = f"{gdrive_base_path.rstrip('/')}/models/whisper-medium-konnakol-test"
lesson_audio_filepath = f"{gdrive_base_path.rstrip('/')}/data/test/ch1_l9.wav"
WHISPER_SAMPLING_RATE = 16_000

# sr=None keeps the file's native sampling rate, so the rate is read from the file instead
# of being assumed; the audio is then resampled exactly once to the rate Whisper expects
audio_native, RECORDED_SAMPLING_RATE = librosa.load(lesson_audio_filepath, sr=None)
audio_array = librosa.resample(audio_native, orig_sr=RECORDED_SAMPLING_RATE, target_sr=WHISPER_SAMPLING_RATE)

In [None]:
# run the fine-tuned checkpoint on the lesson recording loaded above
transcription = transcribe_konnakol_audio(
    whisper_model=whisper_model,
    audio_array=audio_array,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/content/drive/MyDrive/konnakol-to-text//models/whisper-medium-konnakol-test load complete on cpu
/content/drive/MyDrive/konnakol-to-text//models/whisper-medium-konnakol-test inference in progress..
['THA-TAH DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-LAM DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-KA-DHOM THA-MTHA-KA THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-KA-DHIN THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA TA-DHOM KI-TA-THA-KA THA-MTHA-KA DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-RI KI-TA-THA-KA THA-MTHA-KA THA-KA THA-RI KI-TA-THA-KA THA-RI KI-TA-THA-KA THA-LAM DHOM KI-TA-KI-TA-THA-KA THA-LAM DHOM KI-TA-KI-TA-THA-KA THA-KA-DHOM KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-KI-TA-THA-KATHA-LAM-KA-DHOM KITA-KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA']
inference time: 176.474s


In [None]:
# reference table used for ground-truth lookups ('file_name' -> 'transcription')
metadata_csv_path = f'{gdrive_base_path}data/metadata.csv'
metadata = pd.read_csv(metadata_csv_path)

In [None]:
# show the model output; the header is built from the notebook-level
# `whisper_model` and `lesson_audio_filepath` variables
print_konnakol_sequence(transcription, type='prediction')

MODEL: /content/drive/MyDrive/konnakol-to-text//models/whisper-medium-konnakol-test
/content/drive/MyDrive/konnakol-to-text//data/test/ch1_l9.wav TRANSCRIPTION:

**
THA-TAH 

DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-LAM 

DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-KA-DHOM 

THA-MTHA-KA THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-KA-DHIN 

THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-KA-THA-RI KI-TA-THA-KA 

THA-MTHA-KA TA-DHOM KI-TA-THA-KA THA-MTHA-KA DHOM 

KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-MTHA-KA THA-RI 

KI-TA-THA-KA THA-MTHA-KA THA-KA THA-RI KI-TA-THA-KA 

THA-RI KI-TA-THA-KA THA-LAM DHOM KI-TA-KI-TA-THA-KA 

THA-LAM DHOM KI-TA-KI-TA-THA-KA THA-KA-DHOM KI-TA-THA-KA 

THA-LAM DHOM THA-LAM KI-TA-THA-KA THA-LAM 

DHOM THA-LAM KI-TA-KI-TA-THA-KA THA-LAM DHOM 

THA-LAM KI-TA-KI-TA-THA-KA THA-LAM DHOM THA-LAM 

KI-TA-KI-TA-THA-KA THA-LAM DHOM THA-LAM KI-TA-KI-TA-THA-KATHA-LAM-KA-DHOM 

KITA-KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA 


In [None]:
# show the reference text for the same file; with type='groundtruth' the `transcription`
# argument is replaced by the row looked up in `metadata` for `lesson_audio_filepath`
print_konnakol_sequence(transcription, type='groundtruth')

/content/drive/MyDrive/konnakol-to-text//data/test/ch1_l9.wav GROUND TRUTH:

**
TA 

DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA 

LAM DHOM KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA 

THA KA DHOM THAM THA 

KA THA-KA-THA-RI KI-TA-THA-KA THAM THA 

KA THA-KA-DIN-NA THA-KA-THA-RI KI-TA-THA-KA THAM 

THA KA TA DHOM KI-TA-THA-KA 

THAM THA KA TA DHOM 

KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THAM THA 

KA THA-KA-THA-RI KI-TA-THA-KA THAM THA 

KA THA-KA-THA-RI KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA 

THA-LAM KA-DHOM KI TA KI-TA-THA-KA 

THA LAM KA DHOM KI 

TA KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA THA-LAM 

KA-DHOM THA-LAM KA-DHOM THA-LAM KA-DHOM 

KI TA KI-TA-THA-KA THA-KA-THA-RI KI-TA-THA-KA 




**LIVE RECORDING TRANSCRIPTION**

In [None]:
# dependencies for in-browser recording (installed by the pip cell near the top)
from ipywebrtc import AudioRecorder, CameraStream
import torchaudio
from IPython.display import Audio
from google.colab import output
# NOTE(review): presumably required so Colab renders the third-party ipywebrtc widget -- confirm
output.enable_custom_widget_manager()

In [None]:
# audio-only media stream (video disabled) feeding the recorder widget
camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)
# last expression of the cell: displays the recorder widget
recorder

AudioRecorder(audio=Audio(value=b'', format='webm'), stream=CameraStream(constraints={'audio': True, 'video': …

In [None]:
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)
!ffmpeg -i recording.webm -ac 1 -f wav file.wav -y -hide_banner -loglevel panic
sig, sr = torchaudio.load("file.wav")
Audio(data=sig, rate=sr)
audio_array = sig.detach().numpy()

In [None]:
# inference model (fine-tuned)
# `gdrive_base_path` ends with '/', so strip it before joining to avoid '//' in the path
whisper_model = f"{gdrive_base_path.rstrip('/')}/models/whisper-medium-konnakol-test"
RECORDED_SAMPLING_RATE = sr
WHISPER_SAMPLING_RATE = 16_000
# resample from the untouched recording tensor `sig` (channel axis collapsed to mono)
# rather than from `audio_array` itself: re-running this cell then always starts from
# audio that really is at RECORDED_SAMPLING_RATE instead of resampling it a second time
audio_array = librosa.resample(sig.mean(dim=0).numpy(), orig_sr=RECORDED_SAMPLING_RATE, target_sr=WHISPER_SAMPLING_RATE)

In [None]:
# run the fine-tuned checkpoint on the resampled live recording
transcription = transcribe_konnakol_audio(
    whisper_model=whisper_model,
    audio_array=audio_array,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


/content/drive/MyDrive/konnakol-to-text//models/whisper-medium-konnakol-test load complete on cpu
/content/drive/MyDrive/konnakol-to-text//models/whisper-medium-konnakol-test inference in progress..
['THA KI-TA-THA-KA THA KI-TA THA KI-TA-THA-KA THA-TI-KI-TA-THA THA KI-TA-THA-KA THA KI-TA THA KI-TA-THA-KA THA-TI-KI-TA-THA-MA']
inference time: 43.429s


In [None]:
# comparison with base openai model:
whisper_model_name = 'openai/whisper-medium'
# no manual model/processor setup is needed here: `transcribe_konnakol_audio` loads both
# from the given name and sets the forced decoder ids itself
# NOTE: this overwrites `transcription` from the fine-tuned run above
transcription = transcribe_konnakol_audio(whisper_model=whisper_model_name, audio_array=audio_array)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


openai/whisper-medium load complete on cpu
openai/whisper-medium inference in progress..
[' Taki Tata Taka Taki Tata Taki Tata Tiki Tata Taki Tata Taka Taki Tata Tiki Tata']
inference time: 32.698s
