In [9]:
import numpy as np
from scipy.io import loadmat, wavfile
from scipy import signal
import matplotlib.pyplot as plt
import matplotlib as mpl
from Signal_Analysis.features.signal import get_F_0, get_HNR
from importlib import reload
import wave
import json
from vosk import Model, KaldiRecognizer,SetLogLevel
import os.path
import math
from folderFunctions import*


# The local `tools` module is still work in progress: import it, force a
# reload so that edits made on disk are picked up without restarting the
# kernel, and only then star-import so the names bound in this namespace come
# from the reloaded module. The order of the next three statements matters.
import tools
reload(tools)
from tools import *

# Global plotting defaults: dark theme and thin lines.
plt.style.use("dark_background")
mpl.rcParams["lines.linewidth"] = 0.5
# Close any figures left open by a previous run of the notebook.
plt.close("all")



In [10]:
# Alternative models (uncomment exactly one `model_path` assignment):
#model_path = "models/vosk-model-en-us-0.22"
#model_path = "models/vosk-model-small-en-us-0.15"
model_path = "models/vosk-model-small-sv-rhasspy-0.15/"

# Constructing the model is skipped when this cell is re-run with an unchanged
# path. The path of the model that was actually loaded is remembered, so that
# editing `model_path` and re-running the cell really switches models instead
# of silently continuing with the previously loaded one.
if "model" not in globals() or globals().get("_loaded_model_path") != model_path:
    model = Model(model_path)
    _loaded_model_path = model_path


In [11]:

class Word:
    ''' A single recognized word taken from the JSON output of the vosk speech recognition API '''

    def __init__(self, dict):
        '''
        Copy the four expected fields of a word entry onto the instance.

        Parameters:
          dict (dict) one word entry from the recognizer JSON, with keys:
            conf (float): recognition confidence, from 0 to 1
            end (float): time at which the word ends, in seconds
            start (float): time at which the word starts, in seconds
            word (str): the recognized word

        Raises:
          KeyError: if any of the four keys is missing.
        '''
        # NOTE: the parameter name shadows the builtin `dict`; it is kept
        # unchanged so that keyword callers keep working.
        for field in ("conf", "end", "start", "word"):
            setattr(self, field, dict[field])

    def to_string(self):
        ''' Returns a one-line, human readable description of this word '''
        template = "{:20} from {:.2f} sec to {:.2f} sec, confidence is {:.2f}%"
        # confidence is stored as a fraction; display it as a percentage
        confidence_percent = self.conf*100
        return template.format(self.word, self.start, self.end, confidence_percent)

In [12]:
# Recording to analyse (uncomment exactly one `audio_filename` assignment):
#audio_filename = "wav_examples/kaviar_sv.wav"
audio_filename = "wav_examples/digitala_resurser_sv.wav"
#audio_filename = "wav_examples/sammarbete_sv.wav"

# Sanity check: print the number of frames and the sample rate of the file.
# The context manager closes the file handle when the block ends, so no open
# handle is left behind; `wf` stays bound, but to a closed reader.
with wave.open(audio_filename, "rb") as wf:
    print(wf.getnframes())
    print(wf.getframerate())

192384
32000


In [13]:
# Run the vosk recognizer over the whole recording and collect word-level
# results. The file is opened here (rather than reusing an earlier handle) so
# that reading always starts at the first frame, and the context manager
# closes it once recognition is done; `wf` is a closed reader afterwards.
with wave.open(audio_filename, "rb") as wf:
    sample_rate = wf.getframerate()
    rec = KaldiRecognizer(model, sample_rate)
    rec.SetWords(True)  # ask for per-word entries in the result

    # get the list of JSON dictionaries
    results = []
    while True:
        # `sample_rate` frames per read, i.e. one second of audio per chunk
        data = wf.readframes(sample_rate)
        if len(data) == 0:
            break
        # A truthy return value means a result is ready to be collected.
        if rec.AcceptWaveform(data):
            results.append(json.loads(rec.Result()))
    # Collect whatever is still buffered in the recognizer.
    results.append(json.loads(rec.FinalResult()))

# Convert the list of JSON dictionaries to a flat list of 'Word' objects.
# Sometimes recognition returns a dictionary without word entries, e.g.
# {'text': ''}; looking the key up with a default skips those without
# depending on how many other keys the dictionary happens to have.
list_of_words = [
    Word(obj)
    for sentence in results
    for obj in sentence.get('result', [])
]

# output to the screen
for word in list_of_words:
    print(word.to_string())


på                   from 0.66 sec to 0.84 sec, confidence is 100.00%
många                from 0.84 sec to 1.14 sec, confidence is 100.00%
skolor               from 1.14 sec to 1.77 sec, confidence is 100.00%
saknas               from 1.80 sec to 2.52 sec, confidence is 100.00%
såväl                from 2.52 sec to 2.88 sec, confidence is 100.00%
digitala             from 2.88 sec to 3.60 sec, confidence is 100.00%
som                  from 3.81 sec to 3.99 sec, confidence is 100.00%
analoga              from 3.99 sec to 4.59 sec, confidence is 100.00%
resurser             from 4.59 sec to 5.25 sec, confidence is 100.00%


## Test timestamps

In [14]:
def HNR_peaks(audio, Fs, frame_duration=0.05, silence_threshold=0.5, min_height_ratio=0.25):
    '''
    Split a signal into short frames and find the peaks of the per-frame HNR.

    Parameters:
      audio: samples of the signal to analyse (passed straight to split_frames)
      Fs (int): sample rate in Hz
      frame_duration (float): frame length in seconds (default 0.05)
      silence_threshold (float): forwarded to get_HNR (default 0.5)
      min_height_ratio (float): a peak must reach at least this fraction of
        the largest per-frame HNR value to be reported (default 0.25)

    Returns:
      (frames, peaks_prop, peaks)
        frames: the frames returned by split_frames
        peaks_prop (dict): properties returned by scipy.signal.find_peaks
          (includes 'peak_heights' because a height bound is given)
        peaks: indices into `frames` of the detected HNR peaks
    '''
    fl = int(frame_duration * Fs)  # frame length in samples
    # split_frames also returns the start index of every frame; it is not
    # needed here.
    frames, _ = split_frames(audio, fl, Fs, overlap=int(fl / 8))

    # No frames means nothing to analyse; return empty results of the usual
    # shape instead of letting max() fail on an empty sequence.
    if len(frames) == 0:
        return frames, {"peak_heights": np.array([])}, np.array([], dtype=int)

    hnr_frames = [get_HNR(f, Fs, silence_threshold=silence_threshold) for f in frames]

    # Only peaks that reach a fraction of the strongest frame are kept.
    min_h = max(hnr_frames) * min_height_ratio
    peaks, peaks_prop = signal.find_peaks(hnr_frames, height=min_h)
    return frames, peaks_prop, peaks

def checkVowels(word, vowels):
    '''
    Return the letters of `word` that are contained in `vowels`.

    The letters are returned as a list, in the order they occur in `word`;
    a vowel that occurs several times is listed several times.
    '''
    matches = []
    for character in word:
        if character in vowels:
            matches.append(character)
    return matches


In [15]:
language = 'Swedish'
vowels_sv = ['a', 'e', 'i','o','u','y','å','ä','ö']
createLanguageFolder(language, vowels_sv)
Fs, audio = wavfile.read(audio_filename)

# Folder for the per-word clips written below (for listening). Create it up
# front so that wavfile.write does not fail when the folder is missing.
output_dir = 'test_timestamps'
os.makedirs(output_dir, exist_ok=True)

# `segments[k]` and `vowels[k]` describe the same word: both lists are only
# appended to together.
segments = []
vowels = []
for word in list_of_words:
    # Only words recognized with full confidence are used.
    if word.conf == 1:
        vowels.append(checkVowels(word.word.lower(),vowels_sv))
        start = round(word.start*Fs) #start of word, in samples
        # end of word plus 0.1 s of padding. Might need some fine-tuning.
        # Slicing past the end of `audio` is harmless, the slice is clipped.
        end = math.ceil(word.end*Fs + Fs/10)
        segments.append(audio[start:end]) #adding word to the list
        # A word that occurs more than once overwrites its earlier clip.
        path = os.path.join(output_dir, word.word + ".wav") #path to save
        wavfile.write(path, Fs, segments[-1]) #saving file in order to listen

print(vowels)

[['å'], ['å', 'a'], ['o', 'o'], ['a', 'a'], ['å', 'ä'], ['i', 'i', 'a', 'a'], ['o'], ['a', 'a', 'o', 'a'], ['e', 'u', 'e']]


In [16]:

# For every extracted word, locate the HNR peaks and, when there are exactly
# as many peaks as vowel letters in the word, store the frame at each peak
# under the corresponding vowel. Words where the counts differ are skipped.
for word_idx, segment in enumerate(segments):
    word_vowels = vowels[word_idx]
    frames, peaks_prop, peaks = HNR_peaks(segment, Fs)

    if len(peaks) != len(word_vowels):
        print("Did not manage to identify all vowels from HNR")
        continue

    for vowel_idx, peak in enumerate(peaks):
        vowel = word_vowels[vowel_idx]
        # name passed on: vowel letter + word index + vowel index
        sample_name = vowel + str(word_idx) + str(vowel_idx)
        updateFolder(language, frames[peak], vowel, sample_name, Fs)

frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 6
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 9
Did not manage to identify all vowels from HNR
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 16
Did not manage to identify all vowels from HNR
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 18
Did not manage to identify all vowels from HNR
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 10
Did not manage to identify all vowels from HNR
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 18
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 6
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 15
frame length    : 1600 samples
frame length    : 0.05 seconds
number of frames: 17
