In [7]:
import pandas as pd
file = pd.read_excel('russian_speech.xlsx')
y = [sentence for sentence in file['Русская речь']]

In [3]:
import os
import torchaudio

dir_name = "abnormal_voice/"
files_in_dir = [f for f in os.listdir(dir_name)]

samp = 0
X = []
for i in files_in_dir:
    speech_array, sampling_rate = torchaudio.load(dir_name + i)
    X.append(speech_array)
    samp = sampling_rate

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size=0.7, 
                                                    random_state=42)

In [8]:
import re
chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�\\xad\\n\–]'

def remove_special_characters(sentence):
    sentence = re.sub(chars_to_ignore_regex, '', sentence).lower() + " "
    sentence = sentence.replace('4', 'четыре')
    sentence = sentence.replace('р220', 'эр двести двадцать')
    sentence = sentence.replace('6', 'шесть')
    return sentence

y = list(map(remove_special_characters, y))
y_train = list(map(remove_special_characters, y_train))
y_test = list(map(remove_special_characters, y_test))

In [9]:
fullY = ''.join(y)
fullY = fullY.split()
fullY = ' '.join(fullY)

In [10]:
vocab_dict = {}
for x in fullY:
    for t in x:
        if t in vocab_dict:
            vocab_dict[t] += 1
        else:
            vocab_dict[t] = 1

vocab_dict['|'] = vocab_dict[' ']
del vocab_dict[' ']
print(vocab_dict)

{'к': 677, 'а': 1501, 'п': 503, 'р': 825, 'о': 1716, 'й': 287, 'т': 1140, 'и': 1079, 'д': 523, 'у': 458, 'с': 906, 'г': 280, 'е': 1445, 'в': 661, 'з': 313, 'я': 391, 'ь': 399, 'н': 1102, 'л': 744, 'б': 320, 'м': 519, 'э': 43, 'ю': 99, 'ц': 111, 'х': 103, 'ч': 247, 'ё': 59, 'ы': 334, 'ш': 151, 'ф': 44, 'ж': 158, 'щ': 50, '́': 107, 'ъ': 4, '|': 3056}


In [11]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

37

In [12]:
import json
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [20]:
from transformers import Wav2Vec2CTCTokenizer

tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
def sampl_conver(speech_array):
    resampler = torchaudio.transforms.Resample(sampling_rate, 16_000)
    sentc = resampler(speech_array).squeeze().numpy()
    return sentc

X_train = list(map(sampl_conver, X_train))
X_test = list(map(sampl_conver, X_test))

In [22]:
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=True)

In [23]:
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [24]:
def prepare_dataset(audioX, textY):
    data_list = []
    for i in range(len(audioX)):
        d = {}
        d['input_values'] = processor(audioX[i], sampling_rate=16000).input_values[0]
        d['input_length'] = len(d["input_values"])
        with processor.as_target_processor():
            d["labels"] = processor(textY[i]).input_ids
        data_list.append(d)
    return data_list

train_list = prepare_dataset(X_train, y_train)
test_list = prepare_dataset(X_test, y_test)

