In [1]:
%cd ..

/home/oleg/asr-eval


In [15]:
# type: ignore

import typing
from typing import Any

import gigaam # pyright: ignore[reportMissingTypeStubs]
from gigaam.model import GigaAMASR # pyright: ignore[reportMissingTypeStubs]
import jiwer
from datasets import Dataset, load_dataset, Audio
import numpy as np
import pandas as pd
from termcolor import colored

from asr_eval.streaming.sender import StreamingAudioSender
from asr_eval.streaming.caller import transсribe_parallel
from asr_eval.streaming.models.vosk import VoskStreaming
from asr_eval.streaming.model import PartialTranscription
from asr_eval.streaming.evaluation import get_word_timings, words_count
from asr_eval.align.recursive import align
from asr_eval.align.parsing import parse_string
from asr_eval.align.data import Match

In [3]:
# type: ignore

dataset: Dataset = load_dataset('bond005/podlodka_speech')['test'].take(10)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
df = pd.DataFrame(list(dataset))

def get_id(audio: dict[str, Any]) -> str:
    return audio['path']

def get_streaming_sender(id: str, audio: dict[str, Any]) -> dict[str, Any]:
    assert audio['sampling_rate'] == 16_000
    return StreamingAudioSender(
        audio=np.int16(audio['array'] * 32768).tobytes(),
        id=id,
        sampling_rate=16_000,
        real_time_interval_sec=1 / 10,
        speed_multiplier=1,
        track_history=True,
    )

df['id'] = df.audio.apply(get_id)
df['sender'] = df.apply(lambda row: get_streaming_sender(row.id, row.audio), axis=1)

df = df.iloc[3:4]
df

Unnamed: 0,audio,transcription,episode,title,id,sender
3,"{'path': 'sound_test_0004.wav', 'array': [-0.0...","Да, это отсутствие долго живущих бранчей. Друг...",132,Дисфункции организаций,sound_test_0004.wav,"StreamingAudioSender(audio=b',\xff""\xfe\xc1\xf..."


In [4]:
# type: ignore

asr = VoskStreaming(model_name='vosk-model-ru-0.42')
asr.start_thread()

results = transсribe_parallel(asr=asr, senders=df.sender, n_threads=8)

LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 1 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 2 orphan components.
LOG (VoskAPI:Collapse():nnet-utils.cc:1488) Added 1 components, removed 2
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/oleg/.cache/vosk/vosk-model-ru-0.42/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from /home/oleg/.cache/vosk/vosk-model-ru-0.42/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from /home/oleg/.cache/vosk/vosk-model-ru-0.42/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:30

Transcribing sound_test_0004.wav
Transcribed sound_test_0004.wav: да это отсутствие долго живущие бранчей другими словами отсутствие какому- [...]


In [5]:
df['preds'] = [PartialTranscription.join(x) for x in results.values()]

In [6]:
# type: ignore

wer = jiwer.wer(reference=df['transcription'].tolist(), hypothesis=df['preds'].tolist())
print(wer)

0.5625


In [7]:
df['preds'].tolist()

['да это отсутствие долго живущие бранчей другими словами отсутствие какому-то кода но кот сам по себе даже если он не проинтегрированы это ведь не такая уж проблема казалось бы ну да ну ещё немножко пылиться на полке но чуть позже этом юзер получит']

In [8]:
df.iloc[0].sender.history

[InputChunk(data=b'', start_time=0.0, end_time=0.1, put_timestamp=1749150601.1671393, get_timestamp=1749150601.1672778),
 InputChunk(data=b'', start_time=0.1, end_time=0.2, put_timestamp=1749150601.2673914, get_timestamp=1749150601.2675493),
 InputChunk(data=b'', start_time=0.2, end_time=0.30000000000000004, put_timestamp=1749150601.3682144, get_timestamp=1749150601.368714),
 InputChunk(data=b'', start_time=0.30000000000000004, end_time=0.4, put_timestamp=1749150601.468634, get_timestamp=1749150601.4687707),
 InputChunk(data=b'', start_time=0.4, end_time=0.5, put_timestamp=1749150601.568772, get_timestamp=1749150601.5689025),
 InputChunk(data=b'', start_time=0.5, end_time=0.6000000000000001, put_timestamp=1749150601.6690173, get_timestamp=1749150601.6691484),
 InputChunk(data=b'', start_time=0.6000000000000001, end_time=0.7000000000000001, put_timestamp=1749150601.7692723, get_timestamp=1749150601.7694201),
 InputChunk(data=b'', start_time=0.7000000000000001, end_time=0.8, put_timestam

In [9]:
results[df.iloc[0].id]

[OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.1845343, get_timestamp=1749150601.184599),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.2679334, get_timestamp=1749150601.2680762),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.3691862, get_timestamp=1749150601.3692784),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.4691355, get_timestamp=1749150601.4692903),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.5692947, get_timestamp=1749150601.5694356),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.6697032, get_timestamp=1749150601.6697786),
 OutputChunk(data=PartialTranscription(id='__latest__', text='', final=False), put_timestamp=1749150601.7697976, 

In [10]:
gigaam_model = typing.cast(GigaAMASR, gigaam.load_model('ctc', device='cuda'))

  checkpoint = torch.load(model_path, map_location="cpu")


In [11]:
sample = df.iloc[0]

text = sample['transcription']
text = text.lower().replace('ё', 'е').replace('-', ' ')
for char in ('.', ',', '!', '?', ';', ':', '"', '(', ')'):
    text = text.replace(char, '')

word_timings = get_word_timings(gigaam_model, sample['audio']['array'], text)

In [20]:
def colorize_predicted_word(match: Match) -> str:
    status = match.get_status()
    true_text = ' '.join([str(x.value) for x in match.true])
    pred_text = ' '.join([str(x.value) for x in match.pred])
    if status == 'correct':
        return pred_text
    elif status == 'insertion':
        return colored(pred_text, 'red')
    elif status == 'deletion':
        return colored(pred_text, 'yellow')
    else:
        return colored(pred_text, 'red')

output_chunks = results[sample['id']]

prev_text = ''
for i, output_chunk in enumerate(output_chunks):
    text: str = PartialTranscription.join(output_chunks[:i + 1])

    if text == prev_text:
        continue
    prev_text = text

    seconds_sent = 0
    for input_chunk in sample.sender.history:
        if input_chunk.put_timestamp < output_chunk.put_timestamp:
            seconds_sent = input_chunk.end_time
        else:
            break

    n_true_words, in_true_word = words_count(word_timings, seconds_sent)

    options = [' '.join([word for word, _, _ in word_timings[:n_true_words]])]
    if in_true_word:
        options.append(' '.join([word for word, _, _ in word_timings[:n_true_words + 1]]))
    
    alignments = [
        align(parse_string(true_text), parse_string(text)) # type: ignore
        for true_text in options
    ]

    best_index = np.argmin([a.total_n_errs for a in alignments])
    alignment = alignments[best_index]
    true_text = options[best_index]

    print(' '.join(colorize_predicted_word(match) for match in alignment.matches))

да [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие [31mдолга[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие долго [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие долго [31mживущие[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие долго [31mживущие[0m бранчей [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m [33m[0m
да это отсутствие долго 