In [1]:
import os
import re
import json
import librosa
from vosk import Model, KaldiRecognizer
import numpy as np

In [2]:
# Setting sampling rate default = 22050
sr = 22050

In [3]:
# Setting Model Vosk
vosk_model_path = "vosk-model-small-en-us-0.15" 
model = Model(vosk_model_path)
recognizer = KaldiRecognizer(model, sr)

In [4]:
# Function Recognizer
def recognizer_audio(pcm_data):
    recognizer.AcceptWaveform(pcm_data)
    result = recognizer.FinalResult()
    result_dict = json.loads(result)
    print(result_dict['text'])

In [5]:
# Setting Detecting Unvoice Frame by Librosa
custom_ref = 0.2
top_db = 20
def split_on_silence(y, sr):
    intervals = librosa.effects.split(y, top_db=top_db, ref=custom_ref)
    return intervals

In [6]:
# Setting Path_Folder
folder_path = 'audio_files'
files = os.listdir(folder_path)

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower() for text in re.split('([0-9]+)', s)]

files.sort(key=natural_sort_key)


In [7]:
array_temps = []

for i, filename in enumerate(files):
    file_path = os.path.join(folder_path, filename)
    y, sr = librosa.load(file_path) 
    intervals_array = split_on_silence(y, sr)

    # Case 1: Audio has all unvoice frame
    if intervals_array.size == 0: 
        pcm_data = b''.join([(temp * 32767).astype(np.int16).tobytes() for temp in array_temps])
        recognizer_audio(pcm_data)
        array_temps = []
    else:
        # Case 2: Audio has the unvoice at the first, has the voice at the end
        if (np.subtract(intervals_array[0][0], 0) < 500) and (np.subtract(sr, intervals_array[-1][1]) < 2000):
            pcm_data = b''.join([(temp * 32767).astype(np.int16).tobytes() for temp in array_temps])
            recognizer_audio(pcm_data)
            array_temps = []
            array_temps.append(y)
        # Case 3: Audio has the unvoice at the end, has the voice at the first
        elif (np.subtract(intervals_array[0][0], 0) > 500) and (np.subtract(sr, intervals_array[-1][1]) > 2000):
            array_temps.append(y)
            pcm_data = b''.join([(temp * 32767).astype(np.int16).tobytes() for temp in array_temps])
            recognizer_audio(pcm_data)
            array_temps = []
        # Case 4: Audio has all voice frame    
        else:
            array_temps.append(y)

# Case 5: Last Audio
if array_temps:
    pcm_data = b''.join([(temp * 32767).astype(np.int16).tobytes() for temp in array_temps])
    recognizer_audio(pcm_data)

the birds canoes lid on the smooth plane
glue the see to the doctor background
it was easy to tell the death of a well

the sega mega they were dish
right and often served and round bowls
good use of lemons makes find punch
the boxes on the side the punch
the hunter said shop coin and garbage

for our the steady work sisters
a large times
even hard to sell


In [6]:
import librosa

In [7]:
y, sr = librosa.load("audio/upload_14.wav")
len(y)

21168

In [9]:
pcm_data = (y * 32767).astype(np.int16).tobytes()
len(pcm_data)

42336

In [18]:
recognizer.AcceptWaveform(pcm_data)
result = recognizer.FinalResult()
json_str = json.dumps(result)
print(type(result))
print(json_str)



{'text': 'arriving from'}
arriving from
