# **Модуль Б**. Разработка модели машинного обучения

## Импортирование библиотек

In [8]:
# модули для работы с моделью
from transformers import AutoProcessor, AutoModel

# для работы с датасетом
from datasets import Dataset
import torch

import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from IPython.display import Audio

## Загрузка данных

In [55]:
# путь к обработанным данным
dataset_path = '../Module1/Dataset/dataset.parquet'
# загружаем датасета
dataset = Dataset.from_parquet(dataset_path)
dataset

Dataset({
    features: ['path', 'sentence', 'record_duration', 'sampling_rate', 'array', 'input_features'],
    num_rows: 2678
})

Загруэаем предобученную модель

In [56]:
import torch
from transformers import AutoProcessor, AutoModel, TrainingArguments, Trainer

model_id = "facebook/mms-tts-rus"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)


# Текст для синтеза
text = "Привет! Как дела?"
inputs = processor(text=text, return_tensors="pt")



In [27]:
with torch.no_grad():
    output = model(**inputs).waveform  # Получаем аудиосигнал


In [28]:
import scipy.io.wavfile

sampling_rate = model.config.sampling_rate  # Получаем частоту дискретизации
print(sampling_rate)
speech_numpy = output.squeeze().cpu().numpy()  # Переводим в numpy

scipy.io.wavfile.write("output.wav", rate=sampling_rate, data=speech_numpy)


16000


In [29]:
from IPython.display import Audio

Audio(output, rate=model.config.sampling_rate)


In [57]:
dataset

Dataset({
    features: ['path', 'sentence', 'record_duration', 'sampling_rate', 'array', 'input_features'],
    num_rows: 2678
})

In [69]:
def prepare_data(batch):
    inputs = processor(text=batch["sentence"], return_tensors="pt", padding=True)
    print(inputs.input_ids.squeeze(0))
    batch["input_ids"] = inputs.input_ids.squeeze(0)  # Входные токены
    
    batch["labels"] = inputs.input_ids.squeeze(0)  # Выходные токены (для обучения)
    
    return batch

dataset_prep = dataset.map(prepare_data, remove_columns=["sentence", "array", "sampling_rate"])


Map:   0%|          | 0/2678 [00:00<?, ? examples/s]

tensor([ 0,  8,  0, 38,  0,  8,  0,  5,  0,  1,  0, 38,  0, 13,  0, 15,  0,  1,
         0, 14,  0, 37,  0,  1,  0, 40,  0,  9,  0, 15,  0,  3,  0, 35,  0,  9,
         0,  1,  0, 40,  0,  8,  0, 15,  0,  8,  0, 14,  0, 40,  0, 15,  0, 33,
         0, 41,  0, 13,  0, 43,  0,  1,  0, 22,  0,  9,  0, 25,  0, 40,  0,  9,
         0, 43,  0,  1,  0, 14,  0,  9,  0,  1,  0, 33,  0,  1,  0,  7,  0,  0,
         0, 37,  0, 14,  0, 13,  0, 43,  0])
tensor([ 0, 42,  0, 33,  0, 38,  0, 35,  0,  8,  0,  4,  0,  1,  0, 35,  0,  9,
         0, 18,  0, 35,  0,  8,  0,  1,  0, 15,  0, 37,  0, 42,  0, 35,  0,  9,
         0,  1,  0, 41,  0, 19,  0, 38,  0, 33,  0, 26,  0,  8,  0, 37,  0, 40,
         0,  6,  0,  4,  0,  1,  0, 14,  0,  8,  0,  1,  0, 41,  0, 13,  0,  6,
         0,  9,  0, 35,  0,  9,  0, 37,  0,  1,  0,  2,  0, 37,  0, 26,  0, 37,
         0, 41,  0,  9,  0])
tensor([ 0, 43,  0, 13,  0,  1,  0, 41,  0, 13,  0,  6,  0, 40,  0,  7,  0, 22,
         0,  8,  0, 37,  0, 43,  0,  1,  0, 19

In [71]:
dataset_prep

Dataset({
    features: ['path', 'record_duration', 'input_features', 'input_ids', 'labels'],
    num_rows: 2678
})

In [73]:
print(dataset[0])

{'path': 'common_voice_ru_41910911.mp3', 'sentence': 'Абай был не только талантливым поэтом, но и ученым.', 'record_duration': 5.04, 'sampling_rate': 16000, 'array': [-1.418811734765768e-10, 4.3655745685100555e-11, -8.003553375601768e-11, 8.731149137020111e-11, -1.4551915228366852e-10, 1.1641532182693481e-10, 1.4551915228366852e-10, 4.729372449219227e-11, -7.275957614183426e-11, 6.548361852765083e-11, -1.2732925824820995e-10, 3.637978807091713e-11, -6.912159733474255e-11, -3.7834979593753815e-10, 1.7462298274040222e-10, -8.731149137020111e-11, 4.656612873077393e-10, 0.0, 9.38598532229662e-10, -5.748006515204906e-10, 1.229636836796999e-09, -2.3283064365386963e-09, -8.076312951743603e-10, -8.36735125631094e-11, -3.943569026887417e-09, 1.7462298274040222e-10, -2.5029294192790985e-09, -3.863533493131399e-09, 2.7794158086180687e-09, -8.003553375601768e-10, 4.132743924856186e-09, 3.7034624256193638e-09, 3.6670826375484467e-09, 6.868503987789154e-09, -7.421476766467094e-10, -1.367880031466484

In [63]:
sample_text = dataset[0]["sentence"]
inputs = processor(sample_text, padding=True, return_tensors="pt")
print(inputs)


{'input_ids': tensor([[ 0,  8,  0, 38,  0,  8,  0,  5,  0,  1,  0, 38,  0, 13,  0, 15,  0,  1,
          0, 14,  0, 37,  0,  1,  0, 40,  0,  9,  0, 15,  0,  3,  0, 35,  0,  9,
          0,  1,  0, 40,  0,  8,  0, 15,  0,  8,  0, 14,  0, 40,  0, 15,  0, 33,
          0, 41,  0, 13,  0, 43,  0,  1,  0, 22,  0,  9,  0, 25,  0, 40,  0,  9,
          0, 43,  0,  1,  0, 14,  0,  9,  0,  1,  0, 33,  0,  1,  0,  7,  0,  0,
          0, 37,  0, 14,  0, 13,  0, 43,  0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]])}


In [72]:
dataset_prep[0]['input_ids']

[0.0,
 8.0,
 0.0,
 38.0,
 0.0,
 8.0,
 0.0,
 5.0,
 0.0,
 1.0,
 0.0,
 38.0,
 0.0,
 13.0,
 0.0,
 15.0,
 0.0,
 1.0,
 0.0,
 14.0,
 0.0,
 37.0,
 0.0,
 1.0,
 0.0,
 40.0,
 0.0,
 9.0,
 0.0,
 15.0,
 0.0,
 3.0,
 0.0,
 35.0,
 0.0,
 9.0,
 0.0,
 1.0,
 0.0,
 40.0,
 0.0,
 8.0,
 0.0,
 15.0,
 0.0,
 8.0,
 0.0,
 14.0,
 0.0,
 40.0,
 0.0,
 15.0,
 0.0,
 33.0,
 0.0,
 41.0,
 0.0,
 13.0,
 0.0,
 43.0,
 0.0,
 1.0,
 0.0,
 22.0,
 0.0,
 9.0,
 0.0,
 25.0,
 0.0,
 40.0,
 0.0,
 9.0,
 0.0,
 43.0,
 0.0,
 1.0,
 0.0,
 14.0,
 0.0,
 9.0,
 0.0,
 1.0,
 0.0,
 33.0,
 0.0,
 1.0,
 0.0,
 7.0,
 0.0,
 0.0,
 0.0,
 37.0,
 0.0,
 14.0,
 0.0,
 13.0,
 0.0,
 43.0,
 0.0]

In [65]:
training_args = TrainingArguments(
    output_dir="./mms_tts_finetuned",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_strategy="no",
    save_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_prep,
    processing_class=processor,
)




In [66]:
trainer.train()

ValueError: type of 0.0 unknown: <class 'float'>. Should be one of a python, numpy, pytorch or tensorflow object.