In [None]:
# type: ignore

from typing import Any

from jiwer import wer
from datasets import Dataset, load_dataset, Audio
import numpy as np
import pandas as pd

from asr_eval.streaming.sender import StreamingAudioSender
from asr_eval.streaming.caller import transribe_parallel
from asr_eval.streaming.models.vosk import VoskStreaming
from asr_eval.streaming.transcription import PartialTranscription

In [2]:
# type: ignore

# Work on a small slice: the first 10 examples of the test split, with the
# "audio" column cast to a 16 kHz Audio feature.
test_split = load_dataset('bond005/podlodka_speech')['test']
dataset: Dataset = test_split.take(10).cast_column("audio", Audio(sampling_rate=16_000))

# One DataFrame row per dataset example.
df = pd.DataFrame([example for example in dataset])

def get_id(audio: dict[str, Any]) -> str:
    """Return the ``'path'`` entry of an audio record, used as its identifier.

    Raises:
        KeyError: If the record has no ``'path'`` key.
    """
    audio_path = audio['path']
    return audio_path

def get_streaming_sender(id: str, audio: dict[str, Any]) -> "StreamingAudioSender":
    """Build a ``StreamingAudioSender`` that streams one recording as 16-bit PCM bytes.

    Args:
        id: Identifier passed through to the sender.
        audio: Audio record providing ``'array'`` (the samples) and
            ``'sampling_rate'``. The samples are scaled by 32768, so they are
            presumably floats in [-1.0, 1.0] -- TODO confirm against the dataset.

    Returns:
        A sender configured for 16 kHz audio with a 1/25 s real-time interval
        and a 25x speed multiplier.

    Raises:
        AssertionError: If the recording's sampling rate is not 16000.
    """
    assert audio['sampling_rate'] == 16_000
    # Scale to the int16 range and clip BEFORE casting: a full-scale sample of
    # +1.0 becomes 32768, which does not fit into int16 (max 32767) and would
    # otherwise wrap around / be an invalid cast.
    scaled = np.asarray(audio['array']) * 32768
    pcm16 = np.clip(scaled, -32768, 32767).astype(np.int16)
    return StreamingAudioSender(
        audio=pcm16.tobytes(),
        id=id,
        sampling_rate=16_000,
        real_time_interval_sec=1 / 25,
        speed_multiplier=25,
    )

# Derive a per-recording identifier from each audio record, then build one
# streaming sender per row from that identifier and the audio record.
df['id'] = df['audio'].map(get_id)
df['sender'] = df.apply(
    lambda row: get_streaming_sender(row['id'], row['audio']),
    axis=1,
)

In [None]:
# type: ignore

# Create the streaming Vosk recognizer and start its worker thread.
asr = VoskStreaming(model_name='vosk-model-ru-0.42')
asr.start_thread()

# Turn on history keeping for both buffers; the `.history` of each buffer is
# read in a later cell.
asr.input_buffer.keep_history()
asr.output_buffer.keep_history()

# Feed all senders through the recognizer using 8 threads.
results = transribe_parallel(asr=asr, senders=df['sender'], n_threads=8)

# Join the partial transcriptions of each result into one prediction.
# NOTE(review): this relies on `results.values()` being ordered like the rows
# of `df` -- confirm against `transribe_parallel`.
df['preds'] = list(map(PartialTranscription.join, results.values()))

In [4]:
# type: ignore

# Word error rate of the predictions against the reference transcriptions.
# The result is stored under its own name so that the imported `wer` function
# is not overwritten, which would make this cell fail when run a second time.
wer_score = wer(
    reference=df['transcription'].tolist(),
    hypothesis=df['preds'].tolist(),
)
print(wer_score)

0.5010845986984815


In [9]:
# type: ignore

# Snapshot the histories recorded by the two buffers.
input_chunks = asr.input_buffer.history.copy()
output_chunks = asr.output_buffer.history.copy()

# Clear the payload of every input chunk; only the timing fields are inspected
# afterwards. NOTE(review): if `history` is a plain list, `.copy()` is shallow,
# so this also clears `data` on the chunk objects still held by
# `asr.input_buffer.history` -- confirm this is intended.
# The second tuple element is unused here; it is deliberately not named `id`
# so the builtin `id` is not overwritten in the notebook namespace.
for chunk, _chunk_id in input_chunks:
    chunk.data = None

In [17]:
# Show the first ten (chunk, id) entries of the recorded input history.
input_chunks[:10]

[(InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.6469786, get_timestamp=1747772420.6480033),
  'sound_test_0001.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.647268, get_timestamp=1747772420.6689687),
  'sound_test_0002.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.6477287, get_timestamp=1747772420.6852953),
  'sound_test_0004.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.6479168, get_timestamp=1747772420.701765),
  'sound_test_0003.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.6486137, get_timestamp=1747772420.7174416),
  'sound_test_0005.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.648904, get_timestamp=1747772420.7333894),
  'sound_test_0006.wav'),
 (InputChunk(data=None, start_time=0.0, end_time=1.0, put_timestamp=1747772420.6491137, get_timestamp=1747772

In [16]:
# Show the first ten (chunk, id) entries of the recorded output history.
output_chunks[:10]

[(OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.6689398, get_timestamp=1747772420.6691282),
  'sound_test_0001.wav'),
 (OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.685273, get_timestamp=1747772420.68535),
  'sound_test_0002.wav'),
 (OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.701743, get_timestamp=1747772420.7020895),
  'sound_test_0004.wav'),
 (OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.7174172, get_timestamp=1747772420.7178807),
  'sound_test_0003.wav'),
 (OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.7333694, get_timestamp=1747772420.7337346),
  'sound_test_0005.wav'),
 (OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1747772420.7491434, get_timestamp=1