# wav2vec2-mbart50-ru

##### https://huggingface.co/bond005/wav2vec2-mbart50-ru

In [2]:
import os
import warnings

import torch
from datasets import load_dataset, load_from_disk
from datasets.features import Audio
from transformers import SpeechEncoderDecoderModel, Wav2Vec2Processor

import torchaudio

In [3]:
# Language code; passed as the second argument to load_dataset when the Common Voice split is fetched.
LANG_ID = "ru"
# Hugging Face Hub identifier used when the model is not yet available locally.
MODEL_ID = "bond005/wav2vec2-mbart50-ru"
# Local directory the processor and model are loaded from / saved to.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH_MODEL = '/home/redalexdad/recognition_speech/wav2vec2-mbart50-ru'
# Number of texts to predict (used as the size of the test-split slice)
SAMPLES = 10

In [4]:
# Number of worker processes handed to the decoding step.
# os.cpu_count() may return None when the CPU count cannot be determined;
# max(1, None) would raise TypeError, so fall back to a single process instead.
num_processes = os.cpu_count() or 1

In [5]:
%%time
# Decide once whether a local copy exists; the answer selects both the source
# to load from and whether the loaded objects must be written to PATH_MODEL.
has_local_copy = os.path.exists(PATH_MODEL)
pretrained_source = PATH_MODEL if has_local_copy else MODEL_ID

# Processor first (loaded from the local path or from the Hub) ...
processor = Wav2Vec2Processor.from_pretrained(pretrained_source)
if not has_local_copy:
    processor.save_pretrained(PATH_MODEL)

# ... then the model, saved locally as well when it came from the Hub.
model = SpeechEncoderDecoderModel.from_pretrained(pretrained_source)
if not has_local_copy:
    model.save_pretrained(PATH_MODEL)

if has_local_copy:
    print('Успешно модель загружена')
else:
    print(f'Успешно модель скачана и сохранена в пути {PATH_MODEL}')

Успешно модель загружена
CPU times: user 14.3 s, sys: 6.13 s, total: 20.4 s
Wall time: 45.1 s


In [6]:
%%time
# Fetch only the first SAMPLES rows of the Common Voice 11.0 test split for LANG_ID.
# trust_remote_code=True is passed through to load_dataset unchanged.
test_dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    LANG_ID,
    split=f"test[:{SAMPLES}]",
    trust_remote_code=True,
)

CPU times: user 430 ms, sys: 81.6 ms, total: 512 ms
Wall time: 5.42 s


In [7]:
# Path to the audio file
# audio_file_path = '/home/redalexdad/recognition_speech/RUSLAN/000000_RUSLAN.wav'

# Audio sample to transcribe and the text file that shares its base name
# (read later as the reference transcript).
# NOTE(review): absolute, machine-specific paths — consider deriving them from a configurable data directory.
audio_file_path = '/home/redalexdad/recognition_speech/buriy_audiobooks_2_val/0/00/47d11d73f818.wav'
text_file_path = '/home/redalexdad/recognition_speech/buriy_audiobooks_2_val/0/00/47d11d73f818.txt'

In [8]:
# Read the reference transcript stored next to the audio sample.
with open(text_file_path, 'r', encoding='utf-8') as file:
    # The file ends with a newline; strip surrounding whitespace so the
    # reference prints on a single line and can be compared with the
    # prediction without a trailing line break.
    content = file.read().strip()

print(content)

мы благодарны вам господин ренд



In [9]:
%%time
# Load the audio file with torchaudio; the call returns the samples
# together with the sampling rate of the file.
waveform, sample_rate = torchaudio.load(
    audio_file_path,
    normalize=True,
)

CPU times: user 0 ns, sys: 3.03 ms, total: 3.03 ms
Wall time: 98.9 ms


In [13]:
%%time
# Resample when the file's sampling rate differs from the 16000 Hz that the
# processor is later told the audio has.
TARGET_SAMPLE_RATE = 16000
if sample_rate != TARGET_SAMPLE_RATE:
    waveform = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)(waveform)
    # Keep sample_rate in sync with the data. Without this update, re-running
    # the cell would resample the already-converted waveform a second time.
    sample_rate = TARGET_SAMPLE_RATE

CPU times: user 23.6 ms, sys: 0 ns, total: 23.6 ms
Wall time: 17 ms


In [10]:
%%time
# Preprocess the audio with Wav2Vec2Processor.
# torchaudio.load returns a (channels, frames) tensor, while the feature
# extractor expects mono audio: a 2-D array would be read as a batch of
# separate utterances, one per channel. Average the channels into a single
# 1-D signal first; for a mono file this leaves the samples unchanged.
mono_waveform = waveform.mean(dim=0)
processed = processor(mono_waveform.numpy(), sampling_rate=16000, return_tensors="pt", padding='longest')

2024-03-08 21:11:15.463600: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 21:11:15.463673: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 21:11:15.630446: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 21:11:15.978401: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


CPU times: user 1.95 s, sys: 528 ms, total: 2.48 s
Wall time: 5.91 s


In [11]:
%%time
# Run generation with gradient tracking disabled (inference only).
with torch.no_grad():
    # The processor output is unpacked into keyword arguments for generate().
    predicted_ids = model.generate(**processed)

CPU times: user 37 s, sys: 16.7 ms, total: 37 s
Wall time: 5.08 s


In [12]:
# Turn the generated token ids back into text, dropping special tokens.
decoded_batch = processor.batch_decode(
    predicted_ids, num_processes=num_processes, skip_special_tokens=True
)
# Keep only the first decoded entry.
predicted_sentences = decoded_batch[0]

In [13]:
# Show the reference transcript and the model prediction between two rules.
separator = "-" * 100
print(separator)
print("Reference:", content)
print("Prediction:", predicted_sentences)
print(separator)

----------------------------------------------------------------------------------------------------
Reference: мы благодарны вам господин ренд

Prediction: Мы благодарны Вам, господин Ле.
----------------------------------------------------------------------------------------------------
