In [1]:
from typing import TypedDict
import numpy as np
import pytest

from asr_eval.streaming.caller import receive_full_transcription
from asr_eval.streaming.model import DummyASR, Signal
from asr_eval.streaming.sender import StreamingAudioSender

In [2]:
# Create the dummy ASR model and start its thread before any audio is sent;
# it is stopped again with `asr.stop_thread()` at the end of this cell.
# NOTE(review): presumably a background worker that consumes `asr.input_buffer`
# — confirm against DummyASR.
asr = DummyASR()
asr.start_thread()

class Sample(TypedDict):
    """One test case: an audio sender paired with its expected transcription."""
    # Sender that streams this sample's audio; its `id` is used to fetch the result.
    input: StreamingAudioSender
    # Expected text of each transcribed chunk, in order (the FINISH signal is excluded).
    output: list[str]

# Number of audio samples per second of dummy audio.
# NOTE(review): inferred from the recorded run, where 16_000 * 5 zeros are
# transcribed as the five tokens "0".."4" — confirm against DummyASR.
SAMPLE_RATE = 16_000


def make_sample(sender_id: int, seconds: int, speed_multiplier: float) -> Sample:
    """Build one test case from a single duration value.

    The audio length and the expected output are both derived from `seconds`,
    so the two cannot drift apart when a case is edited.

    Args:
        sender_id: Stream id passed to `StreamingAudioSender`.
        seconds: Length of the silent (all-zero) audio, in units of `SAMPLE_RATE` samples.
        speed_multiplier: Forwarded unchanged to `StreamingAudioSender`.

    Returns:
        A `Sample` whose expected output is "0", "1", ..., str(seconds - 1).
    """
    return {
        'input': StreamingAudioSender(
            id=sender_id,
            audio=np.zeros(SAMPLE_RATE * seconds),
            speed_multiplier=speed_multiplier,
        ),
        'output': [str(x) for x in range(seconds)],
    }


# Two streams of different length and speed, sent concurrently below.
samples: list[Sample] = [
    make_sample(sender_id=0, seconds=5, speed_multiplier=27),
    make_sample(sender_id=1, seconds=10, speed_multiplier=20),
]

# Senders that were successfully started and therefore must be joined on exit.
started_senders: list[StreamingAudioSender] = []
try:
    # Start every sender before reading any result so that all streams feed the
    # same ASR input buffer at once.
    for sample in samples:
        sample['input'].start_sending(send_to=asr.input_buffer)
        started_senders.append(sample['input'])

    # Fetch the full transcription of each stream by id and compare the chunk
    # texts (ignoring the FINISH signal) with the expected output.
    for sample in samples:
        chunks = receive_full_transcription(asr=asr, id=sample['input'].id)
        texts = [x.data.text for x in chunks if x.data is not Signal.FINISH]
        assert texts == sample['output'], (
            f"id={sample['input'].id}: expected {sample['output']}, got {texts}"
        )
finally:
    # Always clean up, even when a check above fails: otherwise the sender and
    # ASR threads would stay alive in the kernel and disturb later cells.
    # NOTE(review): assumes `join()` returns once a sender has pushed all of its
    # audio, regardless of whether the result was consumed — confirm.
    try:
        for sender in started_senders:
            sender.join()
    finally:
        asr.stop_thread()

PUT: id=0, finish=False
PUT: id=1, finish=False
PUT: id=0, finish=False
PUT: id=1, finish=False
PUT: id=0, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=0, finish=False
PUT: id=1, finish=False
PUT: id=0, finish=False
PUT: id=0, finish=True
Transcribed 0: 0 1 2 3 4
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=False
PUT: id=1, finish=True
Transcribed 1: 0 1 2 3 4 5 6 7 8 9
