## Import Libraries

In [1]:
import os
import sys
import time
import wave
import json
from vosk import Model, KaldiRecognizer, SetLogLevel
# Requires the third-party `vosk` package. If it is missing, install it into
# the kernel's environment first (e.g. `%pip install vosk` in its own cell).

# Set the Vosk/Kaldi log verbosity before any model is loaded.
# NOTE(review): per the Vosk API docs, 0 is the default level (info and error
# messages, no debug) and negative values silence info output -- confirm
# against the installed version.
SetLogLevel(0)

## Specify the file name and the path to the model

In [2]:
# --- Input ---------------------------------------------------------------
# Recording to transcribe (WAV is the preferred format).
audio_filename = "audio/test.wav"

# Location of the unpacked Vosk model; models are available from
# https://alphacephei.com/vosk/models
model_path = "models/vosk-model-en-us-0.21"

# --- Output --------------------------------------------------------------
# Plain-text file that will receive the recognized text.
text_filename = "audio/test.txt"

## Reading a file and a model

In [3]:
# Open the input recording and make sure it is in a format the recognizer
# can consume. Leaves the open reader in `wf` for the following cells.
#
# Raises:
#   FileNotFoundError -- the configured audio file does not exist.
#   ValueError        -- the file is not mono 16-bit PCM WAV.

# Fail with a real exception instead of a bare sys.exit(): in a notebook that
# shows up as an unexplained SystemExit, and in a script it exits with status 0.
if not os.path.exists(audio_filename):
    raise FileNotFoundError(f"File '{audio_filename}' doesn't exist")

print(f"Reading your file '{audio_filename}'...")
wf = wave.open(audio_filename, "rb")

# The official Vosk examples require mono, 16-bit, uncompressed PCM input.
# Reject anything else up front so unsupported audio never reaches the
# recognizer.
if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
    wf.close()  # do not leak the handle on the error path
    raise ValueError(
        f"'{audio_filename}' must be a mono 16-bit PCM WAV file "
        f"(got channels={wf.getnchannels()}, sample width={wf.getsampwidth()} bytes, "
        f"compression={wf.getcomptype()})"
    )

print(f"'{audio_filename}' file was successfully read")

Reading your file 'audio/test.wav'...
'audio/test.wav' file was successfully read


In [4]:
# Load the Vosk model and build a recognizer for the opened audio file.
# Produces `model` and `rec` for the recognition cell.
#
# Raises:
#   FileNotFoundError -- nothing exists at `model_path`.

# Raise instead of calling a bare sys.exit(): the failure then carries its own
# message and cannot be mistaken for a successful exit (status 0).
if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Please download the model from https://alphacephei.com/vosk/models and unpack as {model_path}"
    )

print(f"Reading your vosk model '{model_path}'...")
model = Model(model_path)

# The recognizer is created with the sample rate reported by the WAV header.
rec = KaldiRecognizer(model, wf.getframerate())

# Ask for per-word entries in the results; the recorded output of this
# notebook shows each with 'conf', 'start', 'end' and 'word'.
rec.SetWords(True)
print(f"'{model_path}' model was successfully read")

Reading your vosk model 'models/vosk-model-en-us-0.21'...
'models/vosk-model-en-us-0.21' model was successfully read


## Recognize

In [5]:
# Tell the user work is starting, then record the wall-clock start time;
# the timing cell further down subtracts this value to report the duration.
print("Start converting to text. It may take some time...")
start_time = time.time()

Start converting to text. It may take some time...


In [6]:
# Feed the audio to the recognizer chunk by chunk and collect every decoded
# result (parsed from JSON) in `results`.

# Number of audio frames handed to the recognizer per iteration.
FRAMES_PER_CHUNK = 4000

results = []

try:
    # recognize speech using vosk model
    while True:
        data = wf.readframes(FRAMES_PER_CHUNK)
        if not data:  # empty bytes: end of the audio stream
            break
        # A truthy return value means a result is ready to be fetched.
        if rec.AcceptWaveform(data):
            part_result = json.loads(rec.Result())
            results.append(part_result)

    # Fetch whatever the recognizer still holds after the last chunk.
    part_result = json.loads(rec.FinalResult())
    results.append(part_result)
finally:
    # The reader has been fully consumed (or decoding failed); release the
    # file handle either way. Re-running this cell therefore requires
    # re-running the cell that opens the audio file first.
    wf.close()

`results` is a list of dictionaries (parsed from the recognizer's JSON output), each of which has the following structure:

```
{'result': [{'conf': 0.849133, # confidence
             'end': 4.5, # end time
             'start': 4.05, # start time
             'word': 'test'}], # recognized word
 'text': 'test'}
 ```

In [7]:
# Bare last expression: the notebook renders the collected recognizer results.
results

[{'result': [{'conf': 1.0, 'end': 2.55, 'start': 1.98, 'word': 'deus'},
   {'conf': 1.0, 'end': 3.33, 'start': 2.64, 'word': 'vos'},
   {'conf': 1.0, 'end': 4.44, 'start': 3.75, 'word': 'speech'},
   {'conf': 1.0, 'end': 5.25, 'start': 4.44, 'word': 'recognition'},
   {'conf': 1.0, 'end': 6.03, 'start': 5.25, 'word': 'library'}],
  'text': 'deus vos speech recognition library'}]

In [8]:
# Form the final transcript from the 'text' field of every result.
# Empty segments are skipped and the pieces are joined with single spaces, so
# the transcript has no doubled or trailing spaces.
segments = (r.get('text', '').strip() for r in results)
text = ' '.join(segment for segment in segments if segment)

In [9]:
# Seconds elapsed since the timer was started, rendered as HH:MM:SS.
elapsed_seconds = time.time() - start_time
time_elapsed = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
print(f'Done! Elapsed time = {time_elapsed}\n')

Done! Elapsed time = 00:00:01



In [10]:
# Show the heading, a blank line, then the transcript. A single print call
# with a newline separator emits exactly the same characters as two calls.
print("\tVosk thinks you said:\n", text, sep="\n")

	Vosk thinks you said:

deus vos speech recognition library 


In [11]:
# Write the transcript to the configured text file.
print(f"\nSaving text to '{text_filename}'...")
# Pin the encoding: without it open() uses the platform default, which can
# fail on or garble non-ASCII transcripts (e.g. from non-English models).
with open(text_filename, "w", encoding="utf-8") as text_file:
    text_file.write(text)
print("Text successfully saved")


Saving text to 'audio/test.txt'...
Text successfully saved
