In [1]:
# Switch the working directory to the parent folder (the recorded output shows
# the repository root) — presumably so the `asr_eval` package imports resolve.
# NOTE: the path is relative, so re-running this cell moves up one more level.
%cd ..

/home/oleg/asr-eval


In [2]:
# type: ignore

from typing import Any

from jiwer import wer
from datasets import Dataset, load_dataset, Audio
import numpy as np
import pandas as pd

from asr_eval.streaming.sender import StreamingAudioSender
from asr_eval.streaming.caller import transribe_parallel
from asr_eval.streaming.models.vosk import VoskStreaming
from asr_eval.streaming.transcription import PartialTranscription

In [3]:
# type: ignore

# Take the first 10 examples of the test split and cast the audio column to a
# 16 kHz `Audio` feature, then materialise the examples as DataFrame rows.
dataset: Dataset = (
    load_dataset('bond005/podlodka_speech')['test']
    .take(10)
    .cast_column("audio", Audio(sampling_rate=16_000))
)
df = pd.DataFrame([example for example in dataset])

def get_id(audio: dict[str, Any]) -> str:
    """Return the identifier of an audio record, i.e. the value stored under its `path` key."""
    audio_path = audio['path']
    return audio_path

def get_streaming_sender(id: str, audio: dict[str, Any]) -> "StreamingAudioSender":
    """Build a streaming sender that feeds one audio record as 16-bit PCM bytes.

    Args:
        id: Identifier forwarded to the sender as its `id`.
        audio: Audio record with an `array` of float samples (presumably in
            [-1.0, 1.0] — TODO confirm) and a `sampling_rate` entry.

    Returns:
        A `StreamingAudioSender` configured for 16 kHz audio with history
        tracking enabled.

    Raises:
        AssertionError: If the record is not sampled at 16 kHz.
    """
    assert audio['sampling_rate'] == 16_000
    # Scale to the int16 range and clip before casting: a full-scale sample of
    # +1.0 maps to 32768, which does not fit in int16 and would otherwise
    # overflow into a large negative value (an audible click).
    pcm = np.clip(np.asarray(audio['array']) * 32768, -32768, 32767).astype(np.int16)
    return StreamingAudioSender(
        audio=pcm.tobytes(),
        id=id,
        sampling_rate=16_000,
        # The recorded history shows 1 s chunks put roughly 0.04 s apart, which
        # matches a 1/25 s interval at 25x speed — presumably that is what these
        # two settings express; confirm against StreamingAudioSender.
        real_time_interval_sec=1 / 25,
        speed_multiplier=25,
        track_history=True,
    )

# Give every row an identifier taken from its audio record, then build one
# streaming sender per row from that identifier and the audio record.
df['id'] = df['audio'].map(get_id)
df['sender'] = [
    get_streaming_sender(sample_id, sample_audio)
    for sample_id, sample_audio in zip(df['id'], df['audio'])
]

In [4]:
# type: ignore

# Create the streaming Vosk recognizer for the `vosk-model-ru-0.42` model and
# start its thread before any audio is sent to it.
asr = VoskStreaming(model_name='vosk-model-ru-0.42')
asr.start_thread()

# Run all prepared senders through the recognizer. `n_threads=8` is presumably
# the number of recordings processed concurrently (the log shows 8 files
# starting before the first one finishes) — confirm against transribe_parallel.
# `results` is a mapping: it is later read via `.values()` and indexed by a
# file-name key.
results = transribe_parallel(asr=asr, senders=df.sender, n_threads=8)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/oleg/.cache/vosk/vosk-model-ru-0.42/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from /home/oleg/.cache/vosk/vosk-model-ru-0.42/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from /home/oleg/.cache/vosk/vosk-model-ru-0.42/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:30

Transcribing sound_test_0001.wavTranscribing sound_test_0002.wav

Transcribing sound_test_0003.wav
Transcribing sound_test_0004.wav
Transcribing sound_test_0005.wav
Transcribing sound_test_0006.wav
Transcribing sound_test_0007.wav
Transcribing sound_test_0008.wav
Transcribed sound_test_0001.wav: и поэтому использовать их в повседневности не получается мы вынуждены отступать
Transcribing sound_test_0009.wav
Transcribed sound_test_0007.wav: неужто не может быть какое-то количество дискретных столбиков где каждый [...]
Transcribing sound_test_0010.wav
Transcribed sound_test_0002.wav: максимально ухудшить идеальную систему в воде туда какие-то элементы или [...]
Transcribed sound_test_0006.wav: и мне кажется абсолютно все замечали что детские крики раздражают там и ты [...]
Transcribed sound_test_0004.wav: да это отсутствие долго живущие бранчей другими словами отсутствие какому- [...]
Transcribed sound_test_0003.wav: человек недавно ставший тимлидом и который сказал знаешь заходит в кабин

In [6]:
# Look each row's output chunks up by its id instead of relying on the order of
# `results.values()`: the transcription ran in parallel and recordings finish
# out of order, so positional pairing can attach a prediction to the wrong
# reference. Assumes `results` is keyed by the sender id (same value as
# `df['id']`) — a mismatch raises KeyError rather than silently misaligning.
df['preds'] = [
    PartialTranscription.join([chunk.data for chunk in results[sample_id]])
    for sample_id in df['id']
]

In [7]:
# type: ignore

# Corpus-level word error rate of the predictions against the reference
# transcriptions. The result gets its own name so the imported `wer` function
# is not overwritten by a float; otherwise re-running this cell fails with
# "TypeError: 'float' object is not callable".
wer_score = wer(reference=df['transcription'].tolist(), hypothesis=df['preds'].tolist())
print(wer_score)

0.5010845986984815


In [8]:
# Strip the raw audio payload from every sender's history; the history shown
# afterwards has empty `data` fields but keeps its timing metadata.
for sender in df['sender']:
    sender.remove_waveforms_from_history()

In [9]:
# Inspect the first sender's history: one entry per sent chunk with its audio
# time span and put/get timestamps.
df.sender.iloc[0].history

[InputChunk(data=b'', start_time=0.0, end_time=1.0, put_timestamp=1747819996.9244716, get_timestamp=1747819996.9677382),
 InputChunk(data=b'', start_time=1.0, end_time=2.0, put_timestamp=1747819996.9647076, get_timestamp=1747819997.063811),
 InputChunk(data=b'', start_time=2.0, end_time=3.0, put_timestamp=1747819997.0048919, get_timestamp=1747819997.0672882),
 InputChunk(data=b'', start_time=3.0, end_time=4.0, put_timestamp=1747819997.0451238, get_timestamp=1747819997.300374),
 InputChunk(data=b'', start_time=4.0, end_time=5.0, put_timestamp=1747819997.08531, get_timestamp=1747819998.0748181),
 InputChunk(data=b'', start_time=5.0, end_time=6.0, put_timestamp=1747819997.1255112, get_timestamp=1747819998.500396),
 InputChunk(data=b'', start_time=6.0, end_time=7.0, put_timestamp=1747819997.1657193, get_timestamp=1747819998.9501324),
 InputChunk(data=b'', start_time=7.0, end_time=8.0, put_timestamp=1747819997.2058167, get_timestamp=1747819999.4919872),
 InputChunk(data=b'', start_time=8.0,

In [10]:
# Inspect the sequence of output chunks (partial transcriptions with put/get
# timestamps) recorded for one recording.
results['sound_test_0001.wav']

[OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747819996.983468, get_timestamp=1747819996.9836197),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747819997.064245, get_timestamp=1747819997.064325),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747819997.0677187, get_timestamp=1747819997.0677469),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747819997.412227, get_timestamp=1747819997.4124024),
 OutputChunk(data=PartialTranscription(id='__latest__', text='и поэтому', final=False), put_timestamp=1747819998.1340957, get_timestamp=1747819998.1342552),
 OutputChunk(data=PartialTranscription(id='__latest__', text='и поэтому', final=False), put_timestamp=1747819998.5535474, get_timestamp=1747819998.5536666),
 OutputChunk(data=PartialTranscription(id='__latest__', text='и поэтому использовать их в', final=