# **Модуль Б**. Разработка модели машинного обучения

## Импортирование библиотек

In [1]:
# модули для работы с моделью
from transformers import VitsModel, AutoTokenizer
import torch

# для работы с датасетом
from datasets import Dataset
import torch

import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

from IPython.display import Audio

## Загрузка данных

In [2]:
# путь к обработанным данным
dataset_path = '../Module1/Dataset/dataset.parquet'
# загружаем датасета
dataset = Dataset.from_parquet(dataset_path)
dataset

Dataset({
    features: ['path', 'sentence', 'record_duration', 'sampling_rate', 'array', 'input_features'],
    num_rows: 2678
})

In [3]:
text_example = dataset[42]['sentence']
text_example

'разворачивается эпическая история о добре и зле'

Загруэаем предобученную модель

In [4]:
model = VitsModel.from_pretrained("facebook/mms-tts-rus")
tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")

In [5]:
inputs = tokenizer(text_example, return_tensors="pt")

with torch.no_grad():
    output = model(**inputs).waveform

In [6]:
import scipy.io.wavfile

sampling_rate = model.config.sampling_rate  # Получаем частоту дискретизации
print(sampling_rate)
speech_numpy = output.squeeze().cpu().numpy()  # Переводим в numpy

scipy.io.wavfile.write("output.wav", rate=sampling_rate, data=speech_numpy)


16000


In [7]:
from IPython.display import Audio

Audio(output, rate=model.config.sampling_rate)


In [8]:
from datasets import load_dataset, Audio

dataset = dataset.cast_column("path", Audio(sampling_rate=16000))  # Приведение к нужной частоте
dataset 


Dataset({
    features: ['path', 'sentence', 'record_duration', 'sampling_rate', 'array', 'input_features'],
    num_rows: 2678
})

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-rus")

def preprocess_function(examples):
    return tokenizer(examples["sentence"], return_tensors="pt", padding=True, truncation=True)

dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/2678 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
dataset

Dataset({
    features: ['path', 'sentence', 'record_duration', 'sampling_rate', 'array', 'input_features', 'input_ids', 'attention_mask'],
    num_rows: 2678
})

In [11]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [12]:
from transformers import Trainer, TrainingArguments, VitsModel

training_args = TrainingArguments(
    output_dir="./tts_model",
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    logging_dir="./logs",  # Директория для логов
    logging_strategy="steps",  # Логирование каждые logging_steps
    logging_steps=10,  # Логировать каждые 10 шагов
    save_steps=100,  # Сохранять модель каждые 100 шагов
    save_total_limit=2,
    eval_strategy="no",  # Оценка после каждой эпохи
    do_train=True,
    do_eval=False
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
)




In [13]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: waveform,sequence_lengths,spectrogram. For reference, the inputs it received are input_ids,attention_mask.

In [None]:
model.save_pretrained("./finetuned_tts_model")
tokenizer.save_pretrained("./finetuned_tts_model")


In [None]:
inputs = tokenizer("Привет, как дела?", return_tensors="pt")

with torch.no_grad():
    output = model(**inputs).waveform

from scipy.io.wavfile import write

write("output_finetuned.wav", rate=model.config.sampling_rate, data=output.squeeze().cpu().numpy())
