In [None]:
import pandas as pd
# Read the spreadsheet with the reference transcripts.
file = pd.read_excel('russian_speech.xlsx')
# Collect the transcript column as a plain Python list.
# NOTE(review): the column label is itself a sentence, which suggests the sheet
# has no header row and the first transcript was consumed as the header —
# confirm, otherwise `y` is shifted by one relative to the recordings.
y = list(file['Как пройти до корпуса?'])

In [None]:
import os
import librosa

dir_name = "abnormal_voice/"
# os.listdir() returns entries in arbitrary order, but the recordings are later
# paired with the transcripts in `y` purely by position, so the listing is
# sorted to make that pairing deterministic. Only regular files are kept:
# a sub-directory in the folder would make librosa.load() fail.
# NOTE(review): lexicographic order must match the spreadsheet row order
# (e.g. zero-padded file names) — confirm against the data set.
files_in_dir = sorted(
    f for f in os.listdir(dir_name)
    if os.path.isfile(os.path.join(dir_name, f))
)

# Load every recording resampled to 16 kHz; librosa.load() returns
# (waveform, sample_rate) and only the waveform is kept.
X = []
for i, e in enumerate(files_in_dir, start=1):
    X.append(librosa.load(os.path.join(dir_name, e), sr=16000)[0])
    if i % 100 == 0:
        print(i)  # coarse progress indicator, one line per 100 files

In [None]:
# The first 700 examples are used for training and the remainder for
# evaluation; audio and transcripts are cut at the same index so that the
# positional pairing between them is preserved.
split_at = 700
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

In [None]:
import re
# Character class of punctuation / control characters stripped from the
# transcripts. Written as a raw string so the backslashes reach the regex
# engine unchanged instead of relying on Python keeping invalid string escapes
# (which newer Python versions warn about).
chars_to_ignore_regex = r'[\,\?\.\!\-\;\:"\“\%\‘\”\�\xad\n\–]'

# Spoken forms substituted for the digits / codes that occur in the transcripts.
# Applied in this order, after lower-casing.
_SPOKEN_FORMS = (
    ('4', 'четыре'),
    ('р220', 'р двести двадцать'),
    ('6', 'шесть'),
)


def remove_special_characters(sentence):
    """Normalise one transcript for CTC training.

    Removes every character matched by ``chars_to_ignore_regex``, lower-cases
    the text, appends a single trailing space and spells out the digits/codes
    listed in ``_SPOKEN_FORMS``.

    Args:
        sentence (str): raw transcript.

    Returns:
        str: the normalised transcript, always ending with a space.
    """
    sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "
    for written, spoken in _SPOKEN_FORMS:
        sentence = sentence.replace(written, spoken)
    return sentence

# Normalise the full transcript list and both splits with the same cleaner.
# Each list is rebuilt from its current contents, so running this cell again
# cleans already-cleaned text (and appends another trailing space).
y = [remove_special_characters(sentence) for sentence in y]
y_train = [remove_special_characters(sentence) for sentence in y_train]
y_test = [remove_special_characters(sentence) for sentence in y_test]

In [None]:
!pip install transformers
from transformers import AutoModelForCTC, Wav2Vec2Processor


In [None]:
# The CTC model and its processor are loaded from the same checkpoint id.
checkpoint = "UrukHan/wav2vec2-russian"
model = AutoModelForCTC.from_pretrained(checkpoint)
processor = Wav2Vec2Processor.from_pretrained(checkpoint)

In [None]:
!pip install datasets
from datasets import Dataset
def p_d(audioX, textY):
    """Turn paired waveforms and transcripts into a ``datasets.Dataset``.

    Args:
        audioX: sequence of waveforms, passed to the processor with a sampling
            rate of 16000.
        textY: sequence of transcripts, indexed in parallel with ``audioX``.

    Returns:
        Dataset with three columns: ``input_values`` (processor output for the
        audio), ``input_length`` (length of each ``input_values`` entry) and
        ``labels`` (ids the processor produces for the transcript in target
        mode).
    """
    columns = {'input_values': [], 'input_length': [], 'labels': []}
    for idx, waveform in enumerate(audioX):
        values = processor(waveform, sampling_rate=16000).input_values[0]
        columns['input_values'].append(values)
        columns['input_length'].append(len(values))
        # Inside this context the processor is applied to the text side.
        with processor.as_target_processor():
            columns['labels'].append(processor(textY[idx]).input_ids)

    return Dataset.from_dict(columns)

In [None]:
# Convert both splits into datasets with p_d(); each call runs the processor
# over every waveform and transcript of the split.
train_dataset = p_d(X_train, y_train)
test_dataset = p_d(X_test, y_test)

In [None]:
import torch

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union


class DataCollatorCTCWithPadding:
    """
    Data collator that dynamically pads the inputs and labels of a batch.

    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if
              only a single sequence is provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
              different lengths).
    """

    def __init__(self, processor, padding: Union[bool, str] = True):
        # `padding` now defaults to True, as the class documentation states;
        # existing callers that pass it explicitly are unaffected.
        self.processor = processor
        self.padding = padding

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            features: examples, each providing ``"input_values"`` and ``"labels"``.

        Returns:
            The padded input batch with an added ``"labels"`` entry in which
            padded label positions are set to -100.
        """
        # split inputs and labels since they have to be of different lengths and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            return_tensors="pt",
        )
        # NOTE(review): `as_target_processor` is reported as deprecated by newer
        # transformers releases — confirm against the installed version.
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                return_tensors="pt",
            )

        # replace padding with -100 to ignore loss correctly
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        batch["labels"] = labels

        return batch

In [None]:
# Collator instance later handed to the Trainer; padding=True selects
# padding to the longest sequence in each batch (see the class docstring).
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [None]:
# Word error rate metric used by compute_metrics().
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm it still exists in the
# (unpinned) version installed by this notebook.
from datasets import load_metric
wer_metric = load_metric("wer")

In [None]:
def compute_metrics(pred):
    """Compute the word error rate (WER) for a set of evaluation predictions.

    Args:
        pred: object exposing ``predictions`` (per-frame logits) and
            ``label_ids`` (reference ids, with -100 at positions to ignore).

    Returns:
        dict: ``{"wer": <value returned by wer_metric.compute>}``.
    """
    # Imported locally because the notebook only imports numpy in a later
    # cell; without this the function raises NameError on a fresh kernel if
    # evaluation runs before that cell.
    import numpy as np

    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks ignored label positions; map them to the pad token so the
    # labels can be decoded. np.where builds a new array instead of
    # overwriting pred.label_ids in place.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for fine-tuning.
# NOTE(review): with at most 700 training examples and a batch size of 16 on a
# single device, 10 epochs amount to roughly 440 optimizer steps — fewer than
# warmup_steps (1000) and eval/save/logging_steps (500). Confirm these values
# are intended, together with the very small learning_rate.
# NOTE(review): fp16=True presumably requires a GPU runtime — confirm.
training_args = TrainingArguments(
    # The output directory must not be named like the hub checkpoint id
    # ("UrukHan/wav2vec2-russian"): a local directory with that name would be
    # picked up by from_pretrained() instead of the hub model on a re-run.
    output_dir="wav2vec2-russian-finetuned",
    group_by_length=True,
    per_device_train_batch_size=16,
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    num_train_epochs=10,
    fp16=True,
    gradient_checkpointing=True,
    save_steps=500,
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-7,
    weight_decay=0.005,
    warmup_steps=1000,
    save_total_limit=2,
    push_to_hub=False,
)

In [None]:
from transformers import Trainer

# Assemble the Trainer from the pieces built in the previous cells.
trainer = Trainer(
    # what to train and how
    model=model,
    args=training_args,
    # data: training split, held-out split and the padding collator
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    # evaluation metric (WER)
    compute_metrics=compute_metrics,
    # the processor's feature extractor is passed in the `tokenizer` slot
    tokenizer=processor.feature_extractor,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# NOTE(review): numpy is imported only here, late in the notebook; consider
# moving it to a single imports cell at the top.
import numpy as np
# Evaluate on the eval_dataset given to the Trainer (metrics come from
# compute_metrics).
trainer.evaluate()

In [None]:
# NOTE(review): `model_dir` is assigned but not used by the save call below,
# which writes to a path relative to the current working directory.
model_dir = '/content/gdrive/MyDrive/'
# Save the fine-tuned model under team_3_4_united_model/model.
trainer.save_model('team_3_4_united_model/model')

In [None]:
from google.colab import files

In [None]:
# 'team_3_4_united_model' is the directory written by trainer.save_model(),
# while files.download() expects the path of a single file, so the directory
# is packed into a zip archive first and the archive is downloaded.
import shutil

archive_path = shutil.make_archive(
    'team_3_4_united_model',  # archive name without extension
    'zip',
    root_dir='.',
    base_dir='team_3_4_united_model',
)
files.download(archive_path)

In [None]:
# Mount Google Drive at /content/gdrive (the prefix used by `model_dir`).
# NOTE(review): this runs after the model has already been saved to a
# relative path, so nothing in the notebook writes to the mounted drive.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# NOTE(review): second Google Drive mount, at /content/drive instead of the
# /content/gdrive mount point used by the cell above; no visible code reads
# from or writes to either location — confirm which one is needed.
from google.colab import drive
drive.mount('/content/drive')