In [34]:
# Mount Google Drive: the acoustic model, lexicon, token list and language
# model configured below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dependencies

In [35]:
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was last run with.
# flashlight-text: presumably the backend needed by torchaudio's ctc_decoder — confirm.
!pip install flashlight-text
# kenlm: presumably needed to read the n-gram language model file (lm.bin) — confirm.
!pip install kenlm
# jiwer: provides the `wer` / `cer` metrics used in the evaluation step.
!pip install jiwer



# Config

In [53]:
# TorchScript acoustic model; loaded with torch.jit.load further down.
ACOUSTIC_MODEL_FILE = '/content/drive/MyDrive/speech_recognition/demo_model.pt'

# File the demo records into and then feeds to the acoustic model.
TEMP_AUDIO_FILE = 'temp.wav'
# Sample rate (Hz) that audio is resampled to before it reaches the model.
SAMPLE_RATE = 32000

# CTC beam search decoder
LM_WEIGHT = 3.23
WORD_SCORE = -0.26
LEXICON_FILE = '/content/drive/MyDrive/speech_recognition/lexicon.txt'
TOKENS_FILE = '/content/drive/MyDrive/speech_recognition/tokens.txt'
LANGUAGE_MODEL_FILE = '/content/drive/MyDrive/speech_recognition/lm.bin'
# Backward-compatible alias: the original, misspelled name is still what the
# decoder cell references, so it must keep resolving to the same path.
LANGUAGE_MOREL_FILE = LANGUAGE_MODEL_FILE
# Presumably the order of the n-gram language model in lm.bin — TODO confirm.
N_GRAMS = 3
BEAM_SIZE = 1500

# Imports

In [57]:
from IPython.display import Javascript
import IPython
from google.colab import output
from base64 import b64decode

import librosa
import librosa.display
from scipy.io.wavfile import write

import time
from typing import List

import torch
import torchaudio

from torchaudio.models.decoder import ctc_decoder
from torchaudio.utils import download_asset

from jiwer import wer, cer

import warnings
warnings.filterwarnings('ignore')

# Google Colab audio recording

In [38]:
def record_and_save(seconds=3, temp_file="audio_temp.wav"):
    """Record microphone audio in the browser and store it as a WAV in ``temp_file``.

    The recording happens client-side through injected JavaScript; the result
    comes back as a base64 data URL, is written to ``temp_file`` in whatever
    container the browser's MediaRecorder produced, and is then decoded with
    librosa and re-written to the same path as a WAV file.

    Args:
        seconds: Requested recording length in seconds.
        temp_file: Path the final WAV is written to. It is also used as the
            scratch location for the raw browser recording.

    Note:
        ``librosa.load`` is called without ``sr``, so the saved WAV carries
        librosa's default sample rate rather than the microphone's native one.
    """
    RECORD = """
    const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
    const b2text = blob => new Promise(resolve => {
    const reader = new FileReader()
    reader.onloadend = e => resolve(e.srcElement.result)
    reader.readAsDataURL(blob)
    })
    var record = time => new Promise(async resolve => {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true })
    recorder = new MediaRecorder(stream)
    chunks = []
    recorder.ondataavailable = e => chunks.push(e.data)
    recorder.start()
    await sleep(time)
    recorder.onstop = async ()=>{
        blob = new Blob(chunks)
        text = await b2text(blob)
        resolve(text)
    }
    recorder.stop()
    })
    """

    print(f"Wait 3 seconds and speak to your microphone for {seconds} seconds...")

    # Define `record` in the browser, then block until its promise resolves.
    display(Javascript(RECORD))
    # The extra 1500 ms is padding on top of the requested duration
    # (presumably to cover microphone start-up — confirm).
    data_url = output.eval_js('record(%d)' % (seconds*1000 + 1500))

    # A data URL looks like "data:<mime>;base64,<payload>"; keep the payload only.
    raw_audio = b64decode(data_url.split(',')[1])
    with open(temp_file, 'wb') as raw_file:
        raw_file.write(raw_audio)
    print("Done recording!")

    # Decode the browser recording and overwrite it with a real WAV at the
    # path the caller asked for. Writing to a fixed file name here would leave
    # a caller-supplied `temp_file` holding the undecoded browser blob.
    speech, rate = librosa.load(temp_file)
    write(temp_file, rate, speech)

# Loading acoustic model

In [51]:
# Load the TorchScript acoustic model once at cell level; model_predict below
# calls this global for every recording.
acoustic_model = torch.jit.load(ACOUSTIC_MODEL_FILE)

def model_predict(audio_file):
    """Run the acoustic model on an audio file and return its first output.

    The audio is loaded with torchaudio and resampled to ``SAMPLE_RATE`` when
    the file's own rate differs.

    Args:
        audio_file: Path (or file-like object accepted by ``torchaudio.load``)
            of the recording to transcribe.

    Returns:
        The first element of the tuple returned by ``acoustic_model``
        (presumably the per-frame token emissions — confirm against the model).

    Raises:
        ValueError: If ``audio_file`` is ``None``.
    """
    if audio_file is None:
        # Fail here with a clear message instead of printing and then
        # crashing inside torchaudio.load(None).
        raise ValueError("[Error] No audio file provided")

    waveform, sample_rate = torchaudio.load(audio_file)

    if sample_rate != SAMPLE_RATE:
        waveform = torchaudio.functional.resample(waveform, sample_rate, SAMPLE_RATE)

    # Inference only: no autograd graph is needed for decoding.
    with torch.no_grad():
        model_output, _ = acoustic_model(waveform)
    return model_output

# Decoders

## Beam Search Decoder





In [49]:
# Lexicon-constrained CTC beam search decoder with language-model rescoring.
# `nbest` is the number of hypotheses returned per utterance; it is unrelated
# to the language model's n-gram order (N_GRAMS). Only the top hypothesis is
# read downstream (get_beam_search_result takes [0][0]), so one is enough.
beam_search_decoder = ctc_decoder(
    lexicon=LEXICON_FILE,
    tokens=TOKENS_FILE,
    lm=LANGUAGE_MOREL_FILE,
    nbest=1,
    beam_size=BEAM_SIZE,
    lm_weight=LM_WEIGHT,
    word_score=WORD_SCORE,
)

## Greedy Decoder




In [48]:
# Token vocabulary for the greedy decoder, one entry per line of TOKENS_FILE.
# Splitting on '\n' means a file ending in a newline yields a final empty entry.
with open(TOKENS_FILE, 'r') as vocab_file:
    tokens = vocab_file.read().split('\n')

class GreedyCTCDecoder(torch.nn.Module):
    """Best-path CTC decoder: take the top label per frame, collapse, drop blanks.

    Args:
        labels: Sequence mapping a label index to its text token.
        blank: Index of the CTC blank label (default 0).
    """

    def __init__(self, labels, blank=0):
        super().__init__()
        self.blank = blank
        self.labels = labels

    def forward(self, emission: torch.Tensor) -> List[str]:
        """Decode one utterance into a list of words.

        Args:
            emission: Per-frame label scores; the last axis is arg-maxed
                (assumes shape (frames, num_labels) — TODO confirm with caller).

        Returns:
            The words of the transcript, using "|" as the word separator.
        """
        best_path = torch.argmax(emission, dim=-1)
        # Merge runs of the same label before removing blanks (CTC collapse).
        collapsed = torch.unique_consecutive(best_path, dim=-1)

        pieces = []
        for label_id in collapsed.tolist():
            if label_id == self.blank:
                continue
            pieces.append(self.labels[label_id])

        transcript = "".join(pieces)
        return transcript.replace("|", " ").strip().split()


# Uses the default blank=0, i.e. assumes the first entry of TOKENS_FILE is the
# CTC blank — TODO confirm against the token file.
greedy_decoder = GreedyCTCDecoder(tokens)

## Decoding result

In [47]:
def get_greedy_result(model_output):
    """Greedy-decode the first item of ``model_output`` into a sentence string."""
    first_emission = model_output[0]
    decoded_words = greedy_decoder(first_emission)
    return " ".join(decoded_words)

def get_beam_search_result(model_output):
    """Beam-search-decode ``model_output`` and return the top hypothesis as text."""
    hypotheses = beam_search_decoder(model_output)
    # First batch item, best-scoring hypothesis.
    best_hypothesis = hypotheses[0][0]
    sentence = " ".join(best_hypothesis.words)
    return sentence.strip()

# Demo main

In [85]:
def record_predict_and_evaluate(recording_time=5):
    """Record speech, transcribe it with both decoders, and optionally score them.

    After printing the greedy and beam-search transcripts, the user is asked
    to type the sentence they actually said. If they answer, the word error
    rate (WER) and character error rate (CER) of each transcript are printed;
    an empty or whitespace-only answer skips the evaluation.

    Args:
        recording_time: Length of the recording in seconds.
    """
    record_and_save(seconds=recording_time, temp_file=TEMP_AUDIO_FILE)

    model_output = model_predict(TEMP_AUDIO_FILE)

    greedy_result = get_greedy_result(model_output)
    beam_search_result = get_beam_search_result(model_output)
    print('\n\n=========================== PREDICTION RESULTS ===========================')
    print(f'\nGREEDY RESULT:\n{greedy_result}')
    print(f'\nBEAM SEARCH RESULT:\n{beam_search_result}')

    print('\n\n============================== EVALUATION ===============================')
    # Strip so that a whitespace-only answer counts as "no reference" instead
    # of reaching the metrics as an empty reference sentence.
    actual_sentence = input('\nENTER ACTUAL SENTANCE:\n').lower().strip()

    if not actual_sentence:
        print('\n=========================================================================')
        return

    decoder_results = (
        ('GREEDY', greedy_result),
        ('BEAM SEARCH', beam_search_result),
    )
    for decoder_name, hypothesis in decoder_results:
        word_error_rate = wer(actual_sentence, hypothesis)
        char_error_rate = cer(actual_sentence, hypothesis)
        print(f'\n{decoder_name}:\n   WER: {word_error_rate:.2f}\n   CER: {char_error_rate:.2f}')
    print('\n=========================================================================')

# Run demo


In [88]:
# One interactive round with the default 5-second recording; needs microphone
# access in the browser and a typed reference sentence for the metrics.
record_predict_and_evaluate()

Wait 3 seconds and speak to your microphone for 5 seconds...


<IPython.core.display.Javascript object>

Done recording!



GREEDY RESULT:
ther praviost solifene was to god

BEAM SEARCH RESULT:
the previous solution was too good



ENTER ACTUAL SENTANCE:
the previous solution wat too good

GREEDY:
   WER: 1.00
   CER: 0.35

BEAM SEARCH:
   WER: 0.17
   CER: 0.03

