# Speech recognition with word-level timestamps using Vosk

## Import Libraries

In [1]:
import os
import sys
import wave
import json

from vosk import Model, KaldiRecognizer, SetLogLevel
# !pip install vosk
import Word

# Configure vosk's logging verbosity before any model is loaded.
# NOTE(review): 0 is presumably the default level; vosk examples pass -1
# to silence the log output — confirm against the vosk documentation.
SetLogLevel(0)

## Loading a vosk model

In [2]:
# Location of the unpacked vosk model. Models can be downloaded from
# https://alphacephei.com/vosk/models
model_path = "../models/vosk-model-en-us-0.21"

# Stop early with a download hint when nothing exists at the model path.
model_is_available = os.path.exists(model_path)
if not model_is_available:
    print(f"Please download the model from https://alphacephei.com/vosk/models and unpack as {model_path}")
    sys.exit()

# Load the model, reporting progress before and after.
print(f"Reading your vosk model '{model_path}'...")
model = Model(model_path)
print(f"'{model_path}' model was successfully read")

Reading your vosk model '../models/vosk-model-en-us-0.21'...
'../models/vosk-model-en-us-0.21' model was successfully read


## Specify the file name to recognize

In [3]:
# Input: the audio file that will be passed to the recognizer.
audio_filename = "../audio/speech_recognition_systems.wav"

# Output: the text file that receives the recognized text.
text_filename = "../audio/speech_recognition_systems_vosk_with_timestamps.txt"

## Reading a file

In [4]:
# Stop early when the audio file is missing.
audio_file_exists = os.path.exists(audio_filename)
if not audio_file_exists:
    print(f"File '{audio_filename}' doesn't exist")
    sys.exit()

# Open the audio file for reading with the standard-library wave module.
print(f"Reading your file '{audio_filename}'...")
wf = wave.open(audio_filename, "rb")
print(f"'{audio_filename}' file was successfully read")

Reading your file '../audio/speech_recognition_systems.wav'...
'../audio/speech_recognition_systems.wav' file was successfully read


In [5]:
# The audio must have a single channel, a sample width of 2 bytes and
# no compression (compression type "NONE"); otherwise stop here.
is_mono_pcm = (
    wf.getnchannels() == 1
    and wf.getsampwidth() == 2
    and wf.getcomptype() == "NONE"
)
if not is_mono_pcm:
    print("Audio file must be WAV format mono PCM.")
    sys.exit()

## Recognize

In [6]:
# Build a recognizer for the loaded model at the audio file's frame rate.
sample_rate = wf.getframerate()
rec = KaldiRecognizer(model, sample_rate)

# Enable word output on the recognizer.
# NOTE(review): presumably this is what adds the per-word 'result' entries
# (start, end, conf) to each recognition result — confirm in the vosk docs.
rec.SetWords(True)

In [8]:
# Number of audio frames handed to the recognizer per iteration.
FRAMES_PER_CHUNK = 4000

results = []

# Recognize speech using the vosk model.
# The wave reader `wf` is closed once the audio has been consumed (or if
# recognition fails), so the underlying file handle is not leaked. To run
# this cell again, re-run the cell that opens the audio file first.
try:
    while True:
        data = wf.readframes(FRAMES_PER_CHUNK)
        if not data:
            # readframes() returned no data: the end of the audio was reached
            break
        if rec.AcceptWaveform(data):
            # the recognizer reports a result for the audio fed so far;
            # it is returned as a JSON string, so parse it before storing
            results.append(json.loads(rec.Result()))

    # collect the result the recognizer returns after the last chunk
    results.append(json.loads(rec.FinalResult()))
finally:
    wf.close()

`results` is a list of JSON dictionaries, each with the following structure:

```
{'result': [
  # first word in a sentence
  {'conf': 0.84, # confidence
   'end': 4.5, # end time
   'start': 4.05, # start time
   'word': 'test'},
  # then, same parameters for 
  # the second word in a sentence
  {'conf': 0.87, 
   'end': 5.7, 
   'start': 5.1, 
   'word': 'library'},
  ... ], # and so on 
 # and a full text of the sentence
 'text': 'test library ...'}
 ```

In [9]:
# Display the raw list of recognition results as the cell output.
results

[{'result': [{'conf': 1.0, 'end': 1.92, 'start': 1.47, 'word': 'some'},
   {'conf': 1.0, 'end': 2.4, 'start': 1.92, 'word': 'speech'},
   {'conf': 1.0, 'end': 3.09, 'start': 2.4, 'word': 'recognition'},
   {'conf': 1.0, 'end': 4.02, 'start': 3.09, 'word': 'systems'},
   {'conf': 1.0, 'end': 4.8, 'start': 4.08, 'word': 'require'},
   {'conf': 1.0, 'end': 5.67, 'start': 4.92, 'word': 'training'},
   {'conf': 1.0, 'end': 6.9, 'start': 6.03, 'word': 'alphago'},
   {'conf': 0.720008, 'end': 7.95, 'start': 6.93, 'word': 'enrollment'},
   {'conf': 0.739668, 'end': 8.7, 'start': 8.34, 'word': 'burn'},
   {'conf': 1.0, 'end': 9.42, 'start': 8.7, 'word': 'individual'},
   {'conf': 1.0, 'end': 10.17, 'start': 9.42, 'word': 'speaker'},
   {'conf': 0.670045, 'end': 10.68, 'start': 10.23, 'word': 'reads'},
   {'conf': 1.0, 'end': 11.34, 'start': 10.689899, 'word': 'text'},
   {'conf': 0.987625, 'end': 11.7, 'start': 11.37, 'word': 'or'},
   {'conf': 1.0, 'end': 12.48, 'start': 11.73, 'word': 'isolat

In [11]:
# Convert the list of JSON dictionaries into a flat list of 'Word' objects.
#
# Uses the `Word` module imported at the top of the notebook.
# NOTE(review): assumes that module exposes a `Word` class whose constructor
# takes one word dictionary — confirm against Word.py.

list_of_words = []
for sentence in results:
    # Sometimes recognition returns a dictionary without any words,
    # e.g. {'text': ''}; such entries have no 'result' key and are skipped.
    for obj in sentence.get('result', []):
        w = Word.Word(obj)  # create custom Word object
        list_of_words.append(w)  # and add it to the list

In [12]:
# Print one line per recognized word, using each Word object's own
# string representation.
for recognized_word in list_of_words:
    line = recognized_word.to_string()
    print(line)

some                 from 1.47 sec to 1.92 sec, confidence is 100.00%
speech               from 1.92 sec to 2.40 sec, confidence is 100.00%
recognition          from 2.40 sec to 3.09 sec, confidence is 100.00%
systems              from 3.09 sec to 4.02 sec, confidence is 100.00%
require              from 4.08 sec to 4.80 sec, confidence is 100.00%
training             from 4.92 sec to 5.67 sec, confidence is 100.00%
alphago              from 6.03 sec to 6.90 sec, confidence is 100.00%
enrollment           from 6.93 sec to 7.95 sec, confidence is 72.00%
burn                 from 8.34 sec to 8.70 sec, confidence is 73.97%
individual           from 8.70 sec to 9.42 sec, confidence is 100.00%
speaker              from 9.42 sec to 10.17 sec, confidence is 100.00%
reads                from 10.23 sec to 10.68 sec, confidence is 67.00%
text                 from 10.69 sec to 11.34 sec, confidence is 100.00%
or                   from 11.37 sec to 11.70 sec, confidence is 98.76%
isolated         

In [13]:
# Build the final string: the text of every result, each followed by a
# single space (so the string ends with a trailing space).
text = ''.join(chunk['text'] + ' ' for chunk in results)

print("\tVosk thinks you said:\n")
print(text)

	Vosk thinks you said:

some speech recognition systems require training alphago enrollment burn individual speaker reads text or isolated vocabulary into the system the system analyzes the person specific voice and use it to fine tune the recognition of the person's speech resulting in increased accuracy systems that do not use training are called speaker independent systems systems that use training are called speaker dependent 


In [14]:
print(f"Saving text to '{text_filename}'...")
# Write with an explicit encoding so the saved file does not depend on the
# platform's default locale encoding.
with open(text_filename, "w", encoding="utf-8") as text_file:
    text_file.write(text)
print("Text successfully saved")

Saving text to '../audio/speech_recognition_systems_vosk_with_timestamps.txt'...
Text successfully saved
