# Install dependencies

In [3]:
!pip install -U git+https://github.com/huggingface/accelerate.git

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-kdil40d0
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-req-build-kdil40d0
  Resolved https://github.com/huggingface/accelerate.git to commit 3086e26db9ea7033fcceff5e35b035787420d873
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: accelerate
  Building wheel for accelerate (pyproject.toml) ... [?25ldone
[?25h  Created wheel for accelerate: filename=accelerate-0.32.0.dev0-py3-none-any.whl size=313975 sha256=a1dce9d1b4baabe95efad8478c1eef997671c8ec926946a13c963923399fccd8
  Stored in directory: /tmp/pip-ephem-wheel-cache-9bfj6tpt/wheels/9c/a3/1e/47368f9b657

In [5]:
!python3 -m pip install --upgrade pip

Defaulting to user installation because normal site-packages is not writeable
Collecting pip
  Downloading pip-24.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.1.1-py3-none-any.whl (1.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
aniemore 1.2.2 requires pip<24.0.0,>=23.0.1, but you have pip 24.1.1 which is incompatible.[0m[31m
[0mSuccessfully installed pip-24.1.1


# Imports

In [6]:
import os
import numpy as np
import pandas as pd

import torch
import torchaudio
from torch.utils.data import Dataset
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers import Trainer, TrainingArguments

from sklearn.model_selection import train_test_split

import librosa # for audio reading

from tqdm import tqdm

# Constants

WHISPER_MODEL examples:

* 1) openai/whisper-small (suitable for testing functionality, not very accurate on sentences, but capable of recognizing individual words or phrases. Requires low computational resources)
* 2) openai/whisper-medium (recommended medium model)
* 3) openai/whisper-large (sufficiently accurate on large sentences, but requires significant computational resources)
* 4) openai/whisper-large-v2 (sufficiently accurate on large sentences, but requires significant computational resources)
* 5) lorenzoncina/whisper-medium-ru (a model finetuned on the Russian language - recommended for training on Russian)

In [10]:
os.environ['WANDB_DISABLED'] = 'true' # disable logging of wandb

WHISPER_MODEL = 'openai/whisper-small'

VAL_PERCENT, TEST_PERCENT = 0.05, 0.2 # dataset is divided into train, validation and test according to these values multiplied by 100%. TRAIN_PERCENT = 1 - (VAL_PERCENT + TEST_PERCENT)
TRAINING_ARGS = TrainingArguments(
    output_dir='./whisper', # the directory to save checkpoints
    overwrite_output_dir=True, # overwrite output directory if exists
    num_train_epochs=10, # number of epochs. One epoch is a single pass through the entire dataset. The number of epochs to use depends on the size of the dataset. Too many epochs can lead to overfitting, which can be detected by monitoring the validation loss during training. Too few epochs can result in underfitting, which can be identified by a consistently "sharp" decrease in the loss.
    per_device_train_batch_size=2, # The batch size per iteration on one GPU. It is ideally in the form of a power of two (2, 4, 8), but should not exceed 64 (using larger batch sizes can lead to worse results from the optimizer)
    save_steps=500, # save checkpoint each X iterations
    save_total_limit=2, # maximum number of checkpoints in a folder, where older checkpoints are deleted when new ones are saved
    do_train=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Whisper initializing

In [11]:
processor = AutoProcessor.from_pretrained(WHISPER_MODEL)
# model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL).to('cuda')
device = 'cpu'
model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL).to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# setting the model's language and defining the task of transcription
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="russian", task="transcribe")

NameError: name 'processor' is not defined

# Data initializing

In [1]:
import pandas as pd
import os

In [2]:
DATASET_DIR = ''
df = pd.read_csv(os.path.join(DATASET_DIR, 'speech_noise_text.csv'))[['name', 'text']]
df.head()

Unnamed: 0,name,text
0,../dataset/RESD_csv/train/sadness_anger_39/39_...,Он уже не дышит. Что мне делать?
1,../dataset/RESD_csv/train/sadness_happiness_49...,"Я сейчас помою и отдам тебе, если тебя что-то ..."
2,../dataset/RESD_csv/train/02_anger_sadness/02_...,Неужели дольше? Анна Ивановна 70 лет. Она дела...
3,../dataset/RESD_csv/train/sadness_disgust_33/3...,Мало ли что ты захотел
4,../dataset/RESD_csv/train/05_neutral_fear/05_n...,"Да, они всегда вечером плохо ходят."


In [3]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import soundfile as sf
import torch
import pandas as pd
import jiwer
import scipy.signal as sps

# Загрузка модели и процессора
processor = WhisperProcessor.from_pretrained("./whisper_large", language='ru')
processor.get_decoder_prompt_ids( language='ru')
model = WhisperForConditionalGeneration.from_pretrained("./whisper_large")

# Убедитесь, что устройство поддерживает CUDA, если это нужно
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
dirToOpen = "/home/user/Melkumyan/outputWav/"
# Подготовка данных: wav файл и его расшифровка
# Предполагается, что у вас есть список аудиофайлов и их текстовых транскрипций
# df = pd.read_csv('../DataSet/speech_to_text.csv', index_col=0)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
audio_input, sr = sf.read(df.name.values[1])
len(audio_input), type(audio_input), audio_input.shape, audio_input.dtype

(253697, numpy.ndarray, (253697,), dtype('float64'))

In [5]:
audio_input1, sr1 = sf.read('../../РАЗМЕТКА2024/диалоги ДСП и ТЧМ/ДСП/ДСП12.wav')
len(audio_input1), type(audio_input1), audio_input1.shape, audio_input1.dtype

(24588, numpy.ndarray, (24588, 2), dtype('float64'))

In [9]:
audio_input1[:, 1].shape

(24588,)

In [48]:
def transcribe_audio(audio_path):
    # Загрузка аудио
    audio = denoiser.read_wav(audio_path)
    denoised_audio = denoiser.filter(audio)
    denoiser.write_wav('test_denoised.wav', denoised_audio)
    audio_input, sr = sf.read('test_denoised.wav')
    # audio_input, sr = sf.read(audio_path)
    
    audio_input = audio_input.reshape((audio_input.shape[0], -1))[:, 0]
    # audio_input = sps.resample(audio_input, 16000)
    print(sr)
    
    print(len(audio_input)/sr)
    # Подготовка входных данных для модели
    inputs = processor(audio_input, sampling_rate=16000, return_tensors="pt",language='ru').input_features
    inputs = inputs.to(device)

    # Генерация текста
    predicted_ids = model.generate(inputs)
    
    # Декодирование текста
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True, language='ru')[0]
    return transcription

In [49]:
from IPython.display import Audio

In [50]:
wer_values = []
# Пример использования
for i in range(5, 7): #(df.shape[0]):
    # audio_file = dirToOpen+f'output_{i}.wav'
    audio_file = df.name.values[i]
    print(f'Запись {i}')
    transcription = transcribe_audio(audio_file)
    print("Транскрипция:", transcription)
    manual_transcription = df.text.values[i]
    print("Ручная транскрипция: ", manual_transcription)
    # Вычисление WER
    wer = jiwer.wer(manual_transcription.lower(), transcription.lower())
    print("WER:", wer)
    wer_values.append(wer)
    
    display(Audio(audio_file))
# Вычисление среднего WER
average_wer = sum(wer_values) / len(wer_values)
print("Среднее WER:", average_wer)

Запись 5
44100
6.23
Транскрипция:  Нет Принял мальчишок у вас иииииииииииииииииииииииии
Ручная транскрипция:  Нет! Принимайте заказ и идите готовьте! Я есть хочу!
WER: 1.0


Запись 6
44100
3.05
Транскрипция:  Вещь ботночна поддвешимся до колечного  
Ручная транскрипция:  А может быть мы с ней подружимся и будем лучшими друзьями?
WER: 1.0


Среднее WER: 1.0


In [32]:
from rnnoise_wrapper import RNNoise

denoiser = RNNoise(f_name_lib='../../RNNoise_Wrapper/rnnoise_wrapper/libs/librnnoise_5h_ru_500k.so.0.4.1')

audio = denoiser.read_wav( glob.glob('../../РАЗМЕТКА2024/ПРС КВ/*.wav')[3])
denoised_audio = denoiser.filter(audio)
denoiser.write_wav('test_denoised.wav', denoised_audio)

In [33]:
audio

In [34]:
denoised_audio

In [26]:
display(Audio( glob.glob('../../РАЗМЕТКА2024/ПРС КВ/*.wav')[3]))

In [25]:
display(Audio('test_denoised.wav'))

In [51]:
import glob
wer_values = []
# Пример использования
i = 0
for audio_file in glob.glob('../../РАЗМЕТКА2024/ПРС КВ/*.wav')[:5]: #(df.shape[0]):
    # audio_file = dirToOpen+f'output_{i}.wav'
    
    print(f'Запись {i}')
    transcription = transcribe_audio(audio_file)
    print("Транскрипция:", transcription)

    
    # print("Ручная транскрипция: ", manual_transcription)
    # # Вычисление WER
    # wer = jiwer.wer(manual_transcription.lower(), transcription.lower())
    # print("WER:", wer)
    # wer_values.append(wer)
    i += 1
    display(Audio(audio_file))
# Вычисление среднего WER
# average_wer = sum(wer_values) / len(wer_values)
# print("Среднее WER:", average_wer)

Запись 0
8013
32.91002121552477
Транскрипция:  бииииииииииииииииииииии


Запись 1
8013
7.420067390490453
Транскрипция:  BING


Запись 2
8013
4.220017471608636
Транскрипция:  Восейская Вороподинтадин в двадцать пятый поезд 


Запись 3
8013
32.72007987021091
Транскрипция:  Поехали 


Запись 4
8013
4.030076126294771
Транскрипция:  Peep peep peep peep peep peep peep peep


Training dataset. When indexed, it returns a list containing:

* 1) filepath - path to the audio
* 2) text - transcribed text by annotators
* 3) input_features - audio features for prediction
* 4) labels - transcribed text by annotators converted into tokens
* 5) attention mask - an attention mask where each element indicates whether the model should pay attention to the token corresponding to the same index in the labels list.

In [3]:
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer


  _torch_pytree._register_pytree_node(
2024-09-17 15:57:19.982913: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-17 15:57:19.990666: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-17 15:57:20.002653: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-17 15:57:20.002677: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-17 15:57:20.010330: I t

In [2]:
class WhisperDataset(Dataset):
    def __init__(self, df, processor):
        self.df = df
        self.processor = processor

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        filepath, text = self.df.iloc[idx]
        
        # this is the sample rate, which represents audio frequency. Whisper models are pretrained on a sample rate of 16000, so it's recommended not to change this value.
        audio, _ = librosa.load(filepath, sr=16000)
        
        tokenized = self.processor.tokenizer(text, return_tensors='pt', padding='max_length', return_attention_mask=True, max_length=model.config.max_length)
        labels, attention_mask = tokenized['input_ids'][0], tokenized['attention_mask'][0]
        
        input_features = self.processor(audio, return_tensors="pt", sampling_rate=16000).input_features[0]
        
        return [filepath, text, input_features, labels, attention_mask]

NameError: name 'Dataset' is not defined

In [45]:
# split data on train/val/test
VAL_PERCENT, TEST_PERCENT =  0.2, 0.1
train_df, nontrain_df = train_test_split(df, test_size=VAL_PERCENT+TEST_PERCENT, random_state=42)
eval_df, test_df = train_test_split(nontrain_df, test_size=VAL_PERCENT/(VAL_PERCENT+TEST_PERCENT), random_state=42)

In [46]:
# create train/val/test datasets
model_name_preprocess = "Shirali/whisper-small-ru"

processor =  WhisperProcessor.from_pretrained(model_name_preprocess)

train_dataset = WhisperDataset(train_df, processor)
eval_dataset = WhisperDataset(eval_df, processor)
test_dataset = WhisperDataset(test_df, processor)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [47]:
len(train_dataset), len(eval_dataset), len(test_dataset)

(125, 18, 37)

In [98]:
model = AutoModelForSpeechSeq2Seq.from_pretrained("Shirali/whisper-small-ru")
model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (f

# Utils

In [49]:
# only used for making predictions on WhisperDataset. It takes as input the model and the dataset, and returns predictions in the form of a pandas.DataFrame (i.e. table)
def predict(model, dataset: WhisperDataset, device=device) -> pd.DataFrame:
    predicted_df = pd.DataFrame([], columns=['filename', 'pred', 'gt'])
    for filepath, text, input_features, _, attention_mask in test_dataset:
        filename = filepath.replace('\\', '/').split('/')[-1]
    
        input_features = torch.stack([input_features]).to(device)
        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
        predicted_df.loc[len(predicted_df)] = [filename, transcription, text]
    return predicted_df

# Calculating current metric (not trained model)

In [66]:
def calculate_wer(reference, hypothesis):
    ref_words = reference.split()
    hyp_words = hypothesis.split()
    # Counting the number of substitutions, deletions, and insertions
    substitutions = sum(1 for ref, hyp in zip(ref_words, hyp_words) if ref != hyp)
    deletions = len(ref_words) - len(hyp_words)
    insertions = len(hyp_words) - len(ref_words)
    # Total number of words in the reference text
    total_words = len(ref_words)
    if total_words>0 :
        wer = (substitutions + deletions + insertions) / total_words
    else:
        wer = 1
    return wer

In [64]:
predicted_df = predict(model, test_dataset)

In [51]:
predicted_df

Unnamed: 0,filename,pred,gt
0,29_happiness_fear_f_110.wav,"Но что, мерстить мне у им не надо, ну пожалуйс...","Ну что вы меня ртите? Мне лендл не нужен, пожа..."
1,30_fear_happiness_h_120.wav,Why are you afraid? Why are you worried? Ever...,Ну чего вы боитесь? Ну чего вы переживаете? Ну...
2,15_enthusiasm_sadness_s_010.wav,здравствуйте. Я хотел бы записаться на курсы а...,Здравствуйте! Я хотел бы записаться на курсы а...
3,12_sadness_disgust_d_110.wav,"На гитаре, не ну я понимаю ещё Фортепиана там ...","На гитаре? Не, ну я понимаю, ещё фортепиано та..."
4,33_sadness_disgust_s_110.wav,внешность обманчивает,Внешность обманчива.
5,29_happiness_fear_f_110.wav,"Но что, мерстить мне у им не надо, ну пожалуйс...","Ну что вы меня ртите? Мне лендл не нужен, пожа..."
6,48_fear_disgust_d_050.wav,"а игра актеоров посмотри, да там вообще ну нег...","А игра актёров, посмотри. Да там вообще, ну не..."
7,39_sadness_anger_s_020.wav,"Ну он уже не дышит, всё что мне делать.",Он уже не дышит. Что мне делать?
8,03_disgust_neutral d_052.wav,Но вы как директор школы должны прям вот быть ...,"Но вы, как директор школы, должны прям вот быт..."
9,47_enthusiasm_sadness_e_070.wav,"Нужно в кредит, хотите в кредит.",Нужно в кредит. Хотите в кредит.


In [67]:
acc = sum([calculate_wer(predicted_df['pred'].values[i] , predicted_df['gt'].values[i]) for i in range(len(predicted_df))]) / len(predicted_df)
print(f'{acc * 100}%')

50.82015115746267%


In [None]:
model.proj_out

In [90]:
!pip install peft

Defaulting to user installation because normal site-packages is not writeable
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m36m0:00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0


# Training

In [122]:
# function that transforms data after extracting it from the dataset. Here, the data_collate_fn simply reshapes the data.
def data_collate_fn(data_list):
    batch = len(data_list)
    data_numpy = np.array(data_list)
    input_features, labels, attention_mask = data_numpy[:, -3], data_numpy[:, -2], data_numpy[:, -1]
    return {'input_features': torch.stack(input_features.tolist()),
            'labels': torch.stack(labels.tolist()),
            'attention_mask': torch.stack(attention_mask.tolist())}

In [101]:
from peft import LoraConfig, TaskType

peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, target_modules=[
        
        "proj_out",
       
    ],)

from peft import get_peft_model

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 421,064 || all params: 242,155,976 || trainable%: 0.1738813168913907


In [110]:
from dataclasses import dataclass
from typing import Any, Dict, List, Union

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lengths and need different padding methods
        # first treat the audio inputs by simply returning torch tensors
        input_features = [{"input_features": feature["name"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # get the tokenized label sequences
        label_features = [{"input_ids": self.processor.tokenizer.truncate_sequences(feature["text"])[0]}
                          for feature in features]
        # pad the labels to max length
        
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt",)

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # if bos token is appended in previous tokenization step,
        # cut bos token here as it's append later anyways
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels

        return batch
        

In [124]:
Training_args = TrainingArguments(
    output_dir="mt0-large-lora",
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    tokenizer=processor,
    data_collator=data_collate_fn,
    compute_metrics=compute_metrics,
)

trainer.train()

# trainer = Trainer(
#     model=model,
#     train_dataset=train_dataset,
#     data_collator=DataCollatorSpeechSeq2SeqWithPadding(df),
#     args=training_args,
# )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


NameError: name 'compute_metrics' is not defined

In [88]:
metrics = torch.nn.CrossEntropyLoss()
CRITERIY = torch.nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.proj_out.parameters(), lr = 0.0001)

for ds in train_dataset:
    
    #filepath, text, input_features, labels, attention_mask
    input_features = ds[2]
    labels = ds[3]
    attention_mask =  ds[4]
    input_features, labels, attention_mask
    input_features = torch.stack([input_features]).to(device)
    generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask)
    loss = CRITERIY(generated_ids, labels[:len(generated_ids)])
    
    break
    
loss

RuntimeError: "log_softmax_lastdim_kernel_impl" not implemented for 'Long'

In [82]:
attention_mask.shape, labels.shape, generated_ids.shape

(torch.Size([448]), torch.Size([448]), torch.Size([1, 33]))

In [86]:
labels[:33], attention_mask[:34]

(tensor([50258, 50363,   859, 28340, 48375, 22801, 35768,  4970, 13790, 12502,
            11, 32518,    11, 12846,  1725, 11813, 30802,  8531, 48140, 15599,
          1006,  4766,  3634, 30802,   776,  9971,  4165, 38521,  2345,    11,
         16977,    30, 50257]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 0]))

In [69]:
trainer = Trainer(
    model=model,
    args=TRAINING_ARGS,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collate_fn
)

TypeError: Descriptors cannot be created directly.
If this call came from a _pb2.py file, your generated code is out of date and must be regenerated with protoc >= 3.19.0.
If you cannot immediately regenerate your protos, some other possible workarounds are:
 1. Downgrade the protobuf package to 3.20.x or lower.
 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python parsing and will be much slower).

More information: https://developers.google.com/protocol-buffers/docs/news/2022-05-06#python-updates

In [None]:
trainer.train()

# Testing

In [None]:
# getting folder of the newest checkpoint
checkpoint_path = max(os.listdir(TRAINING_ARGS.output_dir), key=lambda x: int(x.split('-')[-1]) if 'checkpoint-' in x else 0)
checkpoint_path = os.path.join(TRAINING_ARGS.output_dir, checkpoint_path)

In [None]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint_path).to('cuda')

## Test dataset test

In [None]:
predicted_df = predict(model, test_dataset)

In [None]:
predicted_df

In [None]:
# metric: accuracy
acc = sum((predicted_df['pred'] == predicted_df['gt'])) / len(predicted_df)
print(f'{acc * 100}%')

## Custom sample test:

In [None]:
audio, sample_rate = librosa.load(os.path.join(DATASET_DIR, 'sample/eu.0124f456-13b8-4765-936a-36bfd483683e.wav'), sr=16000)

In [None]:
inputs = processor(audio, return_tensors='pt', sampling_rate=sample_rate)
input_features = inputs.input_features.to('cuda')

In [None]:
generated_ids = model.generate(inputs=input_features)

In [None]:
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)