# Wav2Vec2-Large-Ru-Golos

##### https://huggingface.co/bond005/wav2vec2-large-ru-golos

Модель Wav2Vec2 основана на `facebook/wav2vec2-large-xlsr-53`, доработана на русском языке с помощью `Sberdevices Gloss` с дополнениями для звука, такими как сдвиг высоты тона, ускорение / замедление звука, реверберация и т.д.

При использовании этой модели убедитесь, что частота дискретизации речевого ввода составляет 16 кГц.

# Usage

In [1]:
import os
import warnings

import torch
from datasets import load_dataset, load_from_disk
from datasets.features import Audio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

import torchaudio

In [2]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the device name
    device = torch.cuda.current_device()
    print(f"Code is connected to CUDA. Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("CUDA is not available. Running on CPU.")

Code is connected to CUDA. Using GPU: NVIDIA GeForce GTX 1650 Ti


In [3]:
# Все равно у меня не хватит памяти для выполнения(
device = 'cpu'

In [4]:
LANG_ID = "ru"
PATH_MODEL = '/home/redalexdad/recognition_speech/wav2vec2-large-ru-golos'
# Кол-во текстов для предсказания
SAMPLES = 10

In [5]:
# Проверка наличия модели в локальном пути
if not os.path.exists(PATH_MODEL):
    processor = Wav2Vec2Processor.from_pretrained(PATH_MODEL)
    model = Wav2Vec2ForCTC.from_pretrained(PATH_MODEL).to(device)
    print('Успешно модель загружена')
else:
    # Загрузка процессора из сети
    processor = Wav2Vec2Processor.from_pretrained("bond005/wav2vec2-large-ru-golos")
    processor.save_pretrained(PATH_MODEL)
    
    # Загрузка модели из сети
    model = Wav2Vec2ForCTC.from_pretrained("bond005/wav2vec2-large-ru-golos").to(device)
    model.save_pretrained(PATH_MODEL)
    
    print(f'Успешно процессор и модель скачаны и сохранены в пути {PATH_MODEL}')

Успешно процессор и модель скачаны и сохранены в пути /home/redalexdad/recognition_speech/wav2vec2-large-ru-golos


In [6]:
# Загрузка тестовую часть набора данных Golos и чтение первого звукового файла
ds = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test")

In [7]:
# Токенизация
# Batch size 1
processed = processor(ds[0]["audio"]["array"], return_tensors="pt", padding="longest").to(device)

It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.
2024-03-08 22:47:58.382376: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-08 22:47:58.382413: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-08 22:47:58.383096: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-03-08 22:47:58.388120: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the follow

In [8]:
# Освобождение памяти на GPU
torch.cuda.empty_cache()

In [9]:
# Извлекаем логиты
logits = model(processed.input_values, attention_mask=processed.attention_mask).logits

In [10]:
# Транскрибация
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]
print(transcription)

шестьдесят тысяч тенге сколько будет стоить


# Evaluation

In [11]:
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
from jiwer import wer, cer  # we need word error rate (WER) and character error rate (CER)

In [12]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Get the device name
    device = torch.cuda.current_device()
    print(f"Code is connected to CUDA. Using GPU: {torch.cuda.get_device_name(device)}")
else:
    print("CUDA is not available. Running on CPU.")

Code is connected to CUDA. Using GPU: NVIDIA GeForce GTX 1650 Ti


In [13]:
%%time
# load the test part of Golos Crowd and remove samples with empty "true" transcriptions
golos_crowd_test = load_dataset("bond005/sberdevices_golos_10h_crowd", split="test")
golos_crowd_test = golos_crowd_test.filter(
    lambda it1: (it1["transcription"] is not None) and (len(it1["transcription"].strip()) > 0)
)

CPU times: user 173 ms, sys: 14.8 ms, total: 188 ms
Wall time: 1.95 s


In [14]:
%%time
# load the test part of Golos Farfield and remove sampels with empty "true" transcriptions
golos_farfield_test = load_dataset("bond005/sberdevices_golos_100h_farfield", split="test")
golos_farfield_test = golos_farfield_test.filter(
    lambda it2: (it2["transcription"] is not None) and (len(it2["transcription"].strip()) > 0)
)

CPU times: user 198 ms, sys: 1.17 ms, total: 199 ms
Wall time: 2.49 s


In [15]:
PATH_MODEL = '/home/redalexdad/recognition_speech/wav2vec2-base-960h'

In [16]:
%%time
# Проверка наличия модели в локальном пути
if not os.path.exists(PATH_MODEL):
    processor = Wav2Vec2Processor.from_pretrained(PATH_MODEL)
    model = Wav2Vec2ForCTC.from_pretrained(PATH_MODEL).to(device)
    print('Успешно модель загружена')
else:
    # Загрузка процессора из сети
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    processor.save_pretrained(PATH_MODEL)
    
    # Загрузка модели из сети
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)
    model.save_pretrained(PATH_MODEL)
    
    print(f'Успешно процессор и модель скачаны и сохранены в пути {PATH_MODEL}')

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Успешно процессор и модель скачаны и сохранены в пути /home/redalexdad/recognition_speech/wav2vec2-base-960h
CPU times: user 2.29 s, sys: 1.37 s, total: 3.66 s
Wall time: 4.88 s


In [17]:
%%time
# recognize one sound
def map_to_pred(batch):
    # tokenize and vectorize
    processed = processor(
        batch["audio"]["array"], sampling_rate=batch["audio"]["sampling_rate"],
        return_tensors="pt", padding="longest"
    )
    input_values = processed.input_values.to("cuda")
    
    # Ensure attention_mask is present
    if "attention_mask" in processed:
        attention_mask = processed.attention_mask.to("cuda")
    else:
        attention_mask = None

    # recognize
    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=-1)

    # decode
    transcription = processor.batch_decode(predicted_ids)
    batch["text"] = transcription[0]
    return batch

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 12.4 µs


In [18]:
%%time
# calculate WER and CER on the crowd domain
crowd_result = golos_crowd_test.map(map_to_pred, remove_columns=["audio"])
crowd_wer = wer(crowd_result["transcription"], crowd_result["text"])
crowd_cer = cer(crowd_result["transcription"], crowd_result["text"])
print("Word error rate on the Crowd domain:", crowd_wer)
print("Character error rate on the Crowd domain:", crowd_cer)

Map:   0%|          | 0/9896 [00:00<?, ? examples/s]

Word error rate on the Crowd domain: 1.2374885899114973
Character error rate on the Crowd domain: 1.0096483782964534
CPU times: user 16min 33s, sys: 2.03 s, total: 16min 35s
Wall time: 6min 50s


In [19]:
%%time
# calculate WER and CER on the farfield domain
farfield_result = golos_farfield_test.map(map_to_pred, remove_columns=["audio"])
farfield_wer = wer(farfield_result["transcription"], farfield_result["text"])
farfield_cer = cer(farfield_result["transcription"], farfield_result["text"])
print("Word error rate on the Farfield domain:", farfield_wer)
print("Character error rate on the Farfield domain:", farfield_cer)

Map:   0%|          | 0/1915 [00:00<?, ? examples/s]

Word error rate on the Farfield domain: 1.4665950847053209
Character error rate on the Farfield domain: 1.0606908047041392
CPU times: user 2min 15s, sys: 426 ms, total: 2min 16s
Wall time: 59.4 s
