In [2]:
df = pd.read_json('ESC_DATASET_v1.2\\annotation\\luga.json')
df

Unnamed: 0,audio_filepath,id,text,label,attribute
0,21_11_2023/2023_11_21__09_54_58.wav,2023_11_21__09_54_58,назад с башмака,17,-1
1,15_11_2023/2023_11_15__11_38_51.wav,2023_11_15__11_38_51,прекратить зарядку тормозной магистрали,20,-1
2,02_11_2023/2023_11_02__10_41_09.wav,2023_11_02__10_41_09,осадить на двадцать восемь вагонов,4,28
3,15_11_2023/2023_11_15__11_38_41.wav,2023_11_15__11_38_41,зарядка тормозной магистрали,6,-1
4,15_11_2023/2023_11_15__09_41_25.wav,2023_11_15__09_41_25,вышел из межвагонного пространства,7,-1
...,...,...,...,...,...
605,02_11_2023/2023_11_02__10_49_36.wav,2023_11_02__10_49_36,протянуть на восемнадцать вагонов,10,18
606,03_07_2023/3faf9502-846e-11ee-8635-c09bf4619c0...,3faf9502-846e-11ee-8635-c09bf4619c03,осадить на десять вагонов,4,10
607,21_11_2023/2023_11_21__10_38_03.wav,2023_11_21__10_38_03,осадить на восемнадцать вагонов,4,18
608,15_11_2023/2023_11_15__09_45_41.wav,2023_11_15__09_45_41,начать осаживание,3,-1


In [6]:
import json
import pandas as pd
import numpy as np
import time
import psutil
from vosk import Model, KaldiRecognizer

In [7]:
def calculate_wer(reference, hypothesis):
    r = reference.split()
    h = hypothesis.split()
    d = np.zeros((len(r) + 1, len(h) + 1))

    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j

    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,      # deletion
                          d[i][j - 1] + 1,      # insertion
                          d[i - 1][j - 1] + cost)  # substitution

    wer = d[len(r)][len(h)] / len(r) if len(r) > 0 else float('inf')
    return wer

In [8]:

def calculate_f1(true_labels, pred_labels):
    # Пример простой реализации F1
    true_positive = sum(1 for true, pred in zip(true_labels, pred_labels) if true == pred)
    precision = true_positive / len(pred_labels) if len(pred_labels) > 0 else 0
    recall = true_positive / len(true_labels) if len(true_labels) > 0 else 0
    if precision + recall == 0:
        return 0.0
    f1 = 2 * (precision * recall) / (precision + recall)
    return f1

In [9]:
# Загрузка модели
model = Model("vosk-model-small-ru-0.22")
recognizer = KaldiRecognizer(model, 16000)

In [10]:
# Проверка на исходных данных
with open('luga.json', 'r', encoding='utf-8') as f:
    data = pd.read_json(f)

wer_results = []
f1_results = []
latency_results = []
peak_ram_results = []

for index, row in data.iterrows():
    audio_filepath = row['audio_filepath']
    reference_text = row['text']

    # Замер Latency (CPU)
    start_time = time.time()
    process = psutil.Process()
    peak_ram_start = process.memory_info().rss

    # Открытие аудиофайла и распознавание
    with open(audio_filepath, 'rb') as audio_file:
        while True:
            data = audio_file.read(4000)
            if len(data) == 0:
                break
            if recognizer.AcceptWaveform(data):
                result = recognizer.Result()
            else:
                recognizer.PartialResult()

    result_text = recognizer.FinalResult()
    result_dict = json.loads(result_text)
    recognized_text = result_dict.get('text', '')

    # Расчет WER и F1
    wer = calculate_wer(reference_text, recognized_text)
    f1 = calculate_f1(reference_text.split(), recognized_text.split())

    # Замер Latency и Peak RAM
    latency = time.time() - start_time
    peak_ram_end = process.memory_info().rss

    wer_results.append(wer)
    f1_results.append(f1)
    latency_results.append(latency)
    peak_ram_results.append(peak_ram_end - peak_ram_start)

In [11]:
# Вывод результатов
print(f"Average WER: {np.mean(wer_results)}")
print(f"Average F1: {np.mean(f1_results)}")
print(f"Average Latency: {np.mean(latency_results)} seconds")
print(f"Average Peak RAM: {np.mean(peak_ram_results)} bytes")

Average WER: 0.9760382513661202
Average F1: 0.023440257374683603
Average Latency: 0.14879364654666088 seconds
Average Peak RAM: 350019.9868852459 bytes


In [14]:
# Замер по данным с примененным шумоподавлением
with open('luga_denoised.json', 'r', encoding='utf-8') as f:
    data = pd.read_json(f)

wer_results = []
f1_results = []
latency_results = []
peak_ram_results = []

for index, row in data.iterrows():
    audio_filepath = 'D:\\CodesNodes\\SZFO\\wavAudio\\VAL\\' + row['id'] + '.wav'
    reference_text = row['text']

    # Замер Latency (CPU)
    start_time = time.time()
    process = psutil.Process()
    peak_ram_start = process.memory_info().rss

    # Открытие аудиофайла и распознавание
    with open(audio_filepath, 'rb') as audio_file:
        while True:
            data = audio_file.read(4000)
            if len(data) == 0:
                break
            if recognizer.AcceptWaveform(data):
                result = recognizer.Result()
            else:
                recognizer.PartialResult()

    result_text = recognizer.FinalResult()
    result_dict = json.loads(result_text)
    recognized_text = result_dict.get('text', '')

    # Расчет WER и F1
    wer = calculate_wer(reference_text, recognized_text)
    f1 = calculate_f1(reference_text.split(), recognized_text.split())

    # Замер Latency и Peak RAM
    latency = time.time() - start_time
    peak_ram_end = process.memory_info().rss

    wer_results.append(wer)
    f1_results.append(f1)
    latency_results.append(latency)
    peak_ram_results.append(peak_ram_end - peak_ram_start)

In [15]:
# Вывод результатов
print(f"Average WER: {np.mean(wer_results)}")
print(f"Average F1: {np.mean(f1_results)}")
print(f"Average Latency: {np.mean(latency_results)} seconds")
print(f"Average Peak RAM: {np.mean(peak_ram_results)} bytes")

Average WER: 0.269831223628692
Average F1: 0.7036738999397228
Average Latency: 0.06980058845085434 seconds
Average Peak RAM: 14911.513924050632 bytes
