In [None]:
%pip install faster-whisper

In [None]:
from faster_whisper import WhisperModel
# Checkpoint name shared by every WhisperModel created in this notebook.
model_size = "large-v3"

# Load the model on the GPU with FP16 weights.
# Alternative configurations:
#   GPU, INT8: WhisperModel(model_size, device="cuda", compute_type="int8_float16")
#   CPU, INT8: WhisperModel(model_size, device="cpu", compute_type="int8")
# NOTE(review): despite its name, `gaam_model` holds a Whisper model here;
# the same name is rebound to a different model later in the file.
gaam_model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

segments, info = gaam_model.transcribe("audio.mp3", beam_size=5)

print(f"Detected language '{info.language}' with probability {info.language_probability:f}")

# One output line per segment: start time, end time, recognised text.
for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

In [None]:
%pip install vosk soundfile

In [None]:
import time
import subprocess
import os

def transcribe_vosk(audio_path):
    """Transcribe one audio file with the ``vosk-transcriber`` command-line tool.

    The tool is invoked with ``-l ru`` and told to write its result to a text
    file inside a throw-away temporary directory, so nothing is left behind in
    the working directory and a stale file from an earlier run can never be
    mistaken for the current result.

    Args:
        audio_path: Path of the audio file passed to ``vosk-transcriber -i``.

    Returns:
        A ``(text, elapsed_seconds)`` tuple. ``text`` is the stripped content
        of the output file, or ``""`` when the tool produced no output file.
        ``elapsed_seconds`` is the wall-clock duration of the subprocess call.
        The tuple shape is the same on success and on failure.
    """
    import tempfile  # local import: not part of the file's import cells

    with tempfile.TemporaryDirectory() as work_dir:
        output_file = os.path.join(work_dir, "temp_vosk_output.txt")

        # Only the external tool run is timed, not the file read-back.
        start_time = time.time()
        result = subprocess.run([
            'vosk-transcriber',
            '-l', 'ru',
            '-i', audio_path,
            '-o', output_file
        ], capture_output=True, text=True)
        elapsed = time.time() - start_time

        try:
            with open(output_file, 'r', encoding='utf-8') as f:
                text = f.read().strip()
        except FileNotFoundError:
            print(f"Vosk output file not found for {audio_path}")
            # Surface the tool's own diagnostics instead of discarding them.
            if result.stderr:
                print(result.stderr.strip())
            text = ""

    return text, elapsed

# Smoke test on dataset/1.mp3; as the last expression of the cell, the
# returned value is what the notebook displays.
transcribe_vosk('dataset/1.mp3')

In [None]:
%pip install transformers

In [None]:
from transformers import AutoModel

# Revision of the model repository that selects the GigaAM-v3 variant to load.
revision = "e2e_rnnt"  # can be any v3 model: ssl, ctc, rnnt, e2e_ctc, e2e_rnnt

# NOTE(review): trust_remote_code=True runs Python code downloaded from the
# model repository; only use it with a repository and revision you trust.
gaam_model = AutoModel.from_pretrained(
    "ai-sage/GigaAM-v3",
    revision=revision,
    trust_remote_code=True,
)

transcription = gaam_model.transcribe("example.wav")

# The result is the recognised text (the benchmark below lowercases it), so it
# is displayed as the cell's last expression; it is not callable.
transcription

In [None]:
%pip install pydub jiwer

In [None]:
import os
import time
import wave
import json
from pydub import AudioSegment
import io
from jiwer import wer, cer
import string

# Whisper model used by transcribe_whisper(); needs WhisperModel and
# model_size from the earlier Whisper cell to be defined already.
whisper_model = WhisperModel(
    model_size,
    device="cuda",
    compute_type="float16",
)

# Model used by transcribe_gigaam(): an alias of whatever `gaam_model`
# currently refers to when this cell runs.
gigaam_model = gaam_model

def preprocess_text(text):
    """Normalise a transcript before WER/CER scoring.

    Steps:
      1. Lowercase the text.
      2. Drop every character from ``string.punctuation`` and every character
         whose Unicode category is punctuation (``P*``), so typographic marks
         common in Russian text such as « », — and … are removed as well.
      3. Collapse every run of whitespace to a single space and trim the ends.
         Removing a free-standing mark (for example a dash between words)
         would otherwise leave two adjacent spaces that count as character
         errors.

    Args:
        text: Raw reference or hypothesis string.

    Returns:
        The normalised string; ``""`` if nothing but punctuation and
        whitespace was present.
    """
    import unicodedata  # local import: not part of the file's import cells

    ascii_punct = set(string.punctuation)
    kept = [
        ch for ch in text.lower()
        if ch not in ascii_punct and not unicodedata.category(ch).startswith("P")
    ]
    # str.split() with no argument splits on any whitespace run and drops
    # empty pieces, which gives both the collapse and the strip.
    return " ".join("".join(kept).split())

def transcribe_whisper(audio_paths, prompt=None):
    """Transcribe each file with the module-level ``whisper_model`` and time it.

    Args:
        audio_paths: Iterable of audio file paths.
        prompt: Optional initial prompt. When truthy it is passed as
            ``initial_prompt`` and ``condition_on_previous_text`` is set to
            False; otherwise only ``beam_size=5`` is passed.

    Returns:
        ``(transcripts, times)``: two lists parallel to ``audio_paths``, with
        the segment texts joined by a single space and the wall-clock seconds
        spent on each file.
    """
    decode_options = {"beam_size": 5}
    if prompt:
        decode_options["initial_prompt"] = prompt
        decode_options["condition_on_previous_text"] = False

    transcripts, times = [], []
    for audio_path in audio_paths:
        started = time.time()
        segments, _info = whisper_model.transcribe(audio_path, **decode_options)
        # The join happens inside the timed region, so any work done while
        # iterating `segments` is included in the measurement.
        joined = ' '.join(segment.text for segment in segments)
        elapsed = time.time() - started

        transcripts.append(joined)
        times.append(elapsed)

    return transcripts, times

def transcribe_vosk(audio_paths):
    """Transcribe each file with the ``vosk-transcriber`` CLI and time it.

    This definition takes a list of paths and replaces any earlier
    single-file ``transcribe_vosk`` defined in the same session.

    The tool is invoked with ``-l ru`` once per file and writes to a text file
    inside a throw-away temporary directory. The file is deleted after every
    successful read so a later failure cannot pick up a previous result.

    Args:
        audio_paths: Iterable of audio file paths.

    Returns:
        ``(transcripts, times)``: two lists parallel to ``audio_paths``. A
        file for which the tool produced no output contributes ``""`` to
        ``transcripts`` and still contributes its elapsed time, so the lists
        always line up and the result can always be unpacked into two values.
    """
    import tempfile  # local import: not part of the file's import cells

    transcripts = []
    times = []

    with tempfile.TemporaryDirectory() as work_dir:
        output_file = os.path.join(work_dir, "temp_vosk_output.txt")

        for audio_path in audio_paths:
            # Only the external tool run is timed, not the file read-back.
            start_time = time.time()
            result = subprocess.run([
                'vosk-transcriber',
                '-l', 'ru',
                '-i', audio_path,
                '-o', output_file
            ], capture_output=True, text=True)
            elapsed = time.time() - start_time

            try:
                with open(output_file, 'r', encoding='utf-8') as f:
                    text = f.read().strip()
                os.remove(output_file)
            except FileNotFoundError:
                print(f"Vosk output file not found for {audio_path}")
                # Surface the tool's own diagnostics instead of discarding them.
                if result.stderr:
                    print(result.stderr.strip())
                text = ""

            transcripts.append(text)
            times.append(elapsed)

    return transcripts, times

def transcribe_gigaam(audio_paths):
    """Transcribe each file with the module-level ``gigaam_model`` and time it.

    Args:
        audio_paths: Iterable of audio file paths.

    Returns:
        ``(transcripts, times)``: two lists parallel to ``audio_paths``, with
        whatever ``gigaam_model.transcribe`` returned for each file and the
        wall-clock seconds that call took.
    """
    def _timed(path):
        # Time exactly one transcribe() call.
        started = time.time()
        output = gigaam_model.transcribe(path)
        return output, time.time() - started

    measured = [_timed(path) for path in audio_paths]
    transcripts = [output for output, _ in measured]
    times = [elapsed for _, elapsed in measured]
    return transcripts, times

def get_audio_duration_sec(audio_path):
    """Return the duration of an audio file in seconds.

    The file is loaded with ``AudioSegment.from_file`` and its ``len()``
    (pydub reports segment length in milliseconds) is divided by 1000.
    """
    duration_ms = len(AudioSegment.from_file(audio_path))
    return duration_ms / 1000.0

def evaluate_models(dataset_dir="dataset", num_files=10):
    """Run Whisper, Vosk and GigaAM over a dataset and score each of them.

    The dataset is expected to contain ``{i}.mp3`` audio files and ``{i}.txt``
    reference transcripts for ``i = 1 .. num_files`` inside ``dataset_dir``.

    Args:
        dataset_dir: Directory holding the numbered audio/reference pairs.
            Defaults to ``"dataset"``.
        num_files: Number of pairs to evaluate, starting at index 1.
            Defaults to 10.

    Returns:
        ``(results, references)`` where ``references`` is the list of stripped
        reference transcripts and ``results`` maps ``'whisper'``, ``'vosk'``
        and ``'gigaam'`` (in that order) to a dict with:
          * ``'wer'`` / ``'cer'``: value returned by ``wer`` / ``cer`` for the
            preprocessed references and hypotheses of all files together;
          * ``'rtf'``: per-file list of processing time divided by audio
            duration;
          * ``'transcriptions'``: the transcripts as returned by the model
            wrapper, before preprocessing.
    """
    indices = range(1, num_files + 1)
    audio_files = [f"{dataset_dir}/{i}.mp3" for i in indices]

    references = []
    for i in indices:
        with open(f"{dataset_dir}/{i}.txt", "r", encoding="utf-8") as f:
            references.append(f.read().strip())

    # Audio durations are the denominators of the real-time factor.
    durations = [get_audio_duration_sec(path) for path in audio_files]
    references_clean = [preprocess_text(ref) for ref in references]

    # (key in `results`, label used in the progress message, wrapper function)
    models = [
        ("whisper", "Whisper", transcribe_whisper),
        ("vosk", "Vosk", transcribe_vosk),
        ("gigaam", "GigaAM", transcribe_gigaam),
    ]

    results = {}
    for key, label, transcribe in models:
        print(f"Транскрипция {label}...")
        transcripts, times = transcribe(audio_files)

        hypotheses_clean = [preprocess_text(text) for text in transcripts]
        results[key] = {
            'wer': wer(references_clean, hypotheses_clean),
            'cer': cer(references_clean, hypotheses_clean),
            'rtf': [t / d for t, d in zip(times, durations)],
            'transcriptions': transcripts,
        }

    return results, references

# Run the evaluation
if __name__ == "__main__":
    results, references = evaluate_models()

    # Per-model report: aggregate metrics first, then reference/prediction pairs.
    for model_name, metrics in results.items():
        avg_rtf = sum(metrics['rtf']) / len(metrics['rtf'])
        print(
            f"\n{model_name.upper()}:",
            f"WER: {metrics['wer']:.3f}",
            f"CER: {metrics['cer']:.3f}",
            f"RTF: {avg_rtf:.3f}",
            "Примеры транскрипций:",
            sep="\n",
        )

        # At most the first ten pairs are shown.
        for i, (ref, hyp) in enumerate(zip(references[:10], metrics['transcriptions'][:10])):
            print(f"  {i+1}. Reference: {ref}")
            print(f"     Pred: {hyp}")