# Speech recognition with word-level timestamps using Vosk

## Import Libraries

In [22]:
import os
import sys
import time
import wave
import json

from vosk import Model, KaldiRecognizer, SetLogLevel
# !pip install vosk

import Word

SetLogLevel(0)

## Loading a vosk model

In [23]:
# Location of the unpacked Vosk model directory; models can be downloaded from
# https://alphacephei.com/vosk/models
model_path = "../models/vosk-model-en-us-0.21"

# Stop early with a download hint when the model directory is missing.
model_is_available = os.path.exists(model_path)
if not model_is_available:
    print(f"Please download the model from https://alphacephei.com/vosk/models and unpack as {model_path}")
    sys.exit()

# Load the model, reporting progress before and after.
print(f"Reading your vosk model '{model_path}'...")
model = Model(model_path)
print(f"'{model_path}' model was successfully read")

Reading your vosk model '../models/vosk-model-en-us-0.21'...
'../models/vosk-model-en-us-0.21' model was successfully read


## Specify the file name to recognize

In [24]:
# Input: the audio file to transcribe (WAV preferred).
audio_filename = "../audio/speech_recognition_systems.wav"
# Output: the text file that receives the recognized transcript.
text_filename = (
    "../audio/speech_recognition_systems_vosk_with_timestamps.txt"
)

## Reading a file

In [25]:
# Abort before opening anything if the input audio cannot be found.
audio_is_available = os.path.exists(audio_filename)
if not audio_is_available:
    print(f"File '{audio_filename}' doesn't exist")
    sys.exit()

print(f"Reading your file '{audio_filename}'...")
# Open in binary read mode; `wf` is the handle later cells read frames from.
wf = wave.open(audio_filename, mode="rb")
print(f"'{audio_filename}' file was successfully read")

Reading your file '../audio/speech_recognition_systems.wav'...
'../audio/speech_recognition_systems.wav' file was successfully read


In [26]:
# Check that the audio is a mono, 2-bytes-per-sample, uncompressed WAV.
# `wf` is already open from the previous cell, so rewind that handle to the
# first frame instead of opening the file a second time (re-opening rebinds
# `wf` and leaves the first handle unclosed).
wf.rewind()
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    print("Audio file must be WAV format mono PCM.")
    wf.close()  # release the file handle before stopping
    sys.exit()

## Recognize

In [27]:
# Build a recognizer for the loaded model at the audio file's frame rate.
sample_rate = wf.getframerate()
rec = KaldiRecognizer(model, sample_rate)
# Ask for word-level entries in the results (used for the timestamps below).
rec.SetWords(True)

In [28]:
# Announce the start, then record the wall-clock reference used later for the
# elapsed-time report.
print("Start converting to text. It may take some time...")
start_time = time.time()

Start converting to text. It may take some time...


In [29]:
# Recognize speech using the vosk model: feed the audio to the recognizer in
# fixed-size chunks and keep every result it reports along the way.
results = []
frames_per_read = 4000

# readframes() returns empty bytes once the file is exhausted, which ends the loop.
for chunk in iter(lambda: wf.readframes(frames_per_read), b""):
    if not rec.AcceptWaveform(chunk):
        continue
    results.append(json.loads(rec.Result()))

# Collect whatever the recognizer still holds after the last chunk.
results.append(json.loads(rec.FinalResult()))

`results` is a list of JSON dictionaries, each with the following structure:

```
{'result': [{'conf': 0.849133, # confidence
             'end': 4.5, # end time
             'start': 4.05, # start time
             'word': 'test'}], # recognized word
 'text': 'test'}
 ```

In [52]:
# Convert the list of JSON dictionaries to a flat list of custom Word objects.
#
# The notebook does `import Word`, so the bare name `Word` is the module and
# calling it raises "TypeError: 'module' object is not callable". The class is
# reached as `Word.Word` (assumes Word.py defines a class named `Word` — confirm
# against that file).

timestamps = []
for sentence in results:
    # A result with no recognized words (e.g. a silent stretch) may contain
    # only 'text' and no 'result' key, so fall back to an empty list.
    for obj in sentence.get('result', []):
        w = Word.Word(obj)  # create custom Word object
        timestamps.append(w)  # and add it to list

In [53]:
# Print one formatted line per recognized word.
for word_entry in timestamps:
    line = word_entry.to_string()
    print(line)

some                 from 1.47 sec to 1.92 sec, confidence is 100.00%
speech               from 1.92 sec to 2.40 sec, confidence is 100.00%
recognition          from 2.40 sec to 3.09 sec, confidence is 100.00%
systems              from 3.09 sec to 4.02 sec, confidence is 100.00%
require              from 4.08 sec to 4.80 sec, confidence is 100.00%
training             from 4.92 sec to 5.70 sec, confidence is 100.00%
alphago              from 6.03 sec to 6.90 sec, confidence is 100.00%
enrollment           from 6.93 sec to 7.95 sec, confidence is 72.03%
burn                 from 8.34 sec to 8.70 sec, confidence is 73.90%
individual           from 8.70 sec to 9.42 sec, confidence is 100.00%
speaker              from 9.42 sec to 10.17 sec, confidence is 100.00%
reads                from 10.23 sec to 10.68 sec, confidence is 66.77%
text                 from 10.69 sec to 11.34 sec, confidence is 100.00%
or                   from 11.37 sec to 11.70 sec, confidence is 98.81%
isolated         

In [36]:
# Debug inspection: confidence of the last word dict left in `obj` by the
# conversion loop above (relies on that loop variable still being defined).
obj["conf"]

1.0

In [18]:
# Display the raw list of recognizer result dictionaries.
results

[{'result': [{'conf': 1.0, 'end': 1.92, 'start': 1.5, 'word': 'сам'},
   {'conf': 0.734768, 'end': 2.52, 'start': 1.92, 'word': 'спеша'},
   {'conf': 0.984898, 'end': 2.70002, 'start': 2.528013, 'word': 'как'},
   {'conf': 0.664435, 'end': 3.0, 'start': 2.70002, 'word': 'знаешь'},
   {'conf': 0.700186, 'end': 3.723733, 'start': 3.0, 'word': 'инцестом'},
   {'conf': 0.914712, 'end': 3.931224, 'start': 3.723733, 'word': 'с'},
   {'conf': 0.886431, 'end': 4.77, 'start': 4.023877, 'word': 'прокла'},
   {'conf': 0.99399, 'end': 5.61, 'start': 4.92, 'word': 'джейн'},
   {'conf': 0.51425, 'end': 6.45, 'start': 6.09, 'word': 'алсо'},
   {'conf': 1.0, 'end': 6.9, 'start': 6.48, 'word': 'кол'},
   {'conf': 0.778538, 'end': 7.17, 'start': 6.99, 'word': 'энн'},
   {'conf': 0.935332, 'end': 7.92, 'start': 7.173516, 'word': 'роланд'},
   {'conf': 1.0, 'end': 9.0, 'start': 8.34, 'word': 'брэнды'},
   {'conf': 0.333007, 'end': 9.39, 'start': 9.0, 'word': 'ввиду'},
   {'conf': 0.990838, 'end': 10.14, '

In [16]:
# Build the final transcript: every result's 'text' followed by a single space
# (so the string ends with a trailing space).
text = ''.join(part['text'] + ' ' for part in results)

In [17]:
# Report the wall-clock time since `start_time`, formatted as HH:MM:SS.
elapsed_seconds = time.time() - start_time
time_elapsed = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
print(f'Done! Elapsed time = {time_elapsed}')

Done! Elapsed time = 00:00:07


In [18]:
# Show the header, a blank line, and then the full transcript.
print("\tVosk thinks you said:\n", text, sep="\n")

	Vosk thinks you said:

some speech recognition systems require training alphago enrollment burn individual speaker reads text or isolated vocabulary into the system the system analyzes the person specific voice and use it to fine tune the recognition of the person's speech resulting in increased accuracy systems that do not use training are called speaker independent systems systems that use training are called speaker dependent 


In [19]:
# Write the transcript to disk.
# UTF-8 is set explicitly so the output does not depend on the platform's
# default encoding, which can raise UnicodeEncodeError for non-ASCII
# transcripts (e.g. when a non-English model is used).
print(f"Saving text to '{text_filename}'...")
with open(text_filename, "w", encoding="utf-8") as text_file:
    text_file.write(text)
print("Text successfully saved")

Saving text to 'audio/speech_recognition_systems_vosk.txt'...
Text successfully saved
