# **Модуль Б**. Разработка модели машинного обучения

## Импортирование библиотек

In [3]:
# модули для работы с моделью
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd

from IPython.display import Audio

from Model.tokenizer import Tokenizer

import numpy as np

from IPython.display import Audio

import librosa
import librosa.display
import matplotlib.pyplot as plt

import math

from tqdm import tqdm  # Импортируем tqdm


## Загрузка данных

In [4]:
# путь к обработанным данным
data_path = '../Module1/Dataset/dataset.h5'
# загружаем датасета
df = pd.read_hdf(data_path, key='df')
df.head()

Unnamed: 0,token_ids,mel_spec
0,"[25, 8, 25, 4, 3, 8, 5, 13, 3, 15, 10, 3, 11, ...","[[-56.18891, -54.528297, -43.84198, -44.330162..."
1,"[21, 9, 8, 12, 25, 20, 3, 12, 2, 24, 12, 25, 3...","[[-55.704872, -55.704872, -55.704872, -55.7048..."
2,"[22, 28, 10, 13, 25, 11, 0, 3, 15, 23, 18, 15,...","[[-57.897354, -57.897354, -51.308334, -54.8423..."
3,"[14, 11, 9, 3, 15, 2, 11, 5, 3, 22, 13, 9, 19,...","[[-59.127876, -59.127876, -55.668625, -55.1438..."
4,"[12, 19, 25, 28, 6, 25, 11, 3, 21, 9, 32, 2, 1...","[[-50.631554, -50.631554, -50.631554, -50.6315..."


## **Класс с датасетом**

In [5]:
# класс для датасета
class TTSDataset(Dataset):
    # конструктор класса
    def __init__(self, data_path):
        '''data_path - путь к данным в формате h5'''
        # получаем датафрейм
        self.data = pd.read_hdf(data_path, key='df')
        # получаем списки с токенами и мел-спектрограммами
        self.mels = self.data['mel_spec']
        self.token_ids = self.data['token_ids']

    # метод для получения длины датасета
    def __len__(self):
        return len(self.token_ids)
    
    # метод получения элемента
    def __getitem__(self, idx):
        # возвращаем тензоры
        token_ids = torch.LongTensor(self.token_ids[idx])
        mel = torch.FloatTensor(self.mels[idx])

        return token_ids, mel

In [6]:
dataset = TTSDataset(data_path)
dataset[0]

(tensor([25,  8, 25,  4,  3,  8,  5, 13,  3, 15, 10,  3, 11,  2, 13,  0, 12,  2,
          3, 11, 25, 13, 25, 15, 11, 13,  9, 19,  5, 26,  3, 32,  2, 14, 11,  2,
         26,  3, 15,  2,  3,  9,  3, 23, 29, 10, 15,  5, 26, 34, 34, 34, 34, 34,
         34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
         34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
         34, 34, 34, 34, 34, 34, 34, 34, 34]),
 tensor([[-56.1889, -54.5283, -43.8420,  ..., -80.0000, -80.0000, -80.0000],
         [-56.1889, -52.8671, -39.4327,  ..., -80.0000, -80.0000, -80.0000],
         [-56.1889, -52.7999, -34.5382,  ..., -80.0000, -80.0000, -80.0000],
         ...,
         [-56.1889, -56.1889, -56.1889,  ..., -80.0000, -80.0000, -80.0000],
         [-56.1889, -56.1889, -56.1889,  ..., -80.0000, -80.0000, -80.0000],
         [-56.1889, -56.1889, -56.1889,  ..., -80.0000, -80.0000, -80.0000]]))

In [7]:
PAD_IDX = 34

In [8]:
# Проверяем один пример
tokens, mel = dataset[0]
print("Пример 0:")
print(f"Токены: {tokens} (длина: {len(tokens)})")
print(f"Спектрограмма: {mel.shape}")

Пример 0:
Токены: tensor([25,  8, 25,  4,  3,  8,  5, 13,  3, 15, 10,  3, 11,  2, 13,  0, 12,  2,
         3, 11, 25, 13, 25, 15, 11, 13,  9, 19,  5, 26,  3, 32,  2, 14, 11,  2,
        26,  3, 15,  2,  3,  9,  3, 23, 29, 10, 15,  5, 26, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34,
        34, 34, 34, 34, 34, 34, 34, 34, 34]) (длина: 99)
Спектрограмма: torch.Size([80, 498])


In [9]:
dataloader = DataLoader(dataset, shuffle=True, batch_size=16)

In [10]:
# Проверяем батч
for batch_idx, (tokens_batch, mels_batch) in enumerate(dataloader):
    print(f"\nБатч {batch_idx}:")
    print(f"Токены: {tokens_batch.shape}")
    print(f"Спектрограммы: {mels_batch.shape}")
    break


Батч 0:
Токены: torch.Size([16, 99])
Спектрограммы: torch.Size([16, 80, 498])


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [12]:
# Параметры
tokenizer = Tokenizer()
vocab_size = tokenizer.getlen()      # пример размера словаря
print(f'vocab_size: {vocab_size}')
n_mels = 80           # число мел-коэффициентов
T_text = 99           # длина текстовой последовательности
T_mel = 498           # временная длина мел-спектрограммы


/home/user/Chemp/Speech-synthesis/Module2
vocab_size: 35


## **Класс с моделью**

**1. Text Encoder**

In [37]:
class TextEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=34)
        self.conv = nn.Sequential(
            nn.Conv1d(embed_dim, hidden_dim, kernel_size=5, padding=2),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=5, padding=2),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.embedding(x)
        # print('TE x1:', x.shape)
        x = x.transpose(1,2)
        # print('TE x2:', x.shape)
        x = self.conv(x)
        # print('TE x3:', x.shape)
        x = x.transpose(1,2)
        # print('TE x4:', x.shape)
        return x

In [38]:
class Decoder(nn.Module):
    def __init__(self, hidden_dim=512, mel_dim=80, seq_len=498):
        super().__init__()
        self.linear = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, mel_dim)
        )
        self.seq_len = seq_len

    def forward(self, hidden):
        # hidden: (B, T, H)
        x = self.linear(hidden)
        x = x.transpose(1, 2)

        if x.size(2) < self.seq_len:
            pad_amt = self.seq_len - x.size(2)
            x = F.pad(x, (0, pad_amt), value=-80)
        elif x.size(2) > self.seq_len:
            x = x[:, :, :self.seq_len]

        return x # (B, 80, 498)
    

In [39]:
class VITS(nn.Module):
    def __init__(self, vocab_size, pad_id=34):
        super().__init__()
        self.encoder = TextEncoder(vocab_size)
        self.decoder = Decoder()

    def forward(self, token_ids):
        hidden = self.encoder(token_ids)
        mel_pred = self.decoder(hidden)
        return mel_pred

In [32]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [40]:
model = VITS(35)
model.to(device)

VITS(
  (encoder): TextEncoder(
    (embedding): Embedding(35, 256, padding_idx=34)
    (conv): Sequential(
      (0): Conv1d(256, 512, kernel_size=(5,), stride=(1,), padding=(2,))
      (1): ReLU()
      (2): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
      (3): ReLU()
    )
  )
  (decoder): Decoder(
    (linear): Sequential(
      (0): Linear(in_features=512, out_features=512, bias=True)
      (1): ReLU()
      (2): Linear(in_features=512, out_features=80, bias=True)
    )
  )
)

In [41]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

In [42]:
EPOCHS = 10

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}|{EPOCHS}')

    for token_ids, mel_target in pbar:
        token_ids, mel_target = token_ids.to(device), mel_target.to(device)

        # прямой проход
        mel_pred = model(token_ids)
        # print(mel_pred.shape)
        # print(mel_target.shape)

        # потеря
        loss = criterion(mel_pred, mel_target)

        # обратный проход
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pbar.set_postfix(loss=loss.item())

    print(f'Epoch {epoch+1}, Loss: {running_loss / len(dataloader):.4f}')


Epoch 1|10: 100%|██████████| 147/147 [00:01<00:00, 104.49it/s, loss=626]    


Epoch 1, Loss: 1092.5527


Epoch 2|10: 100%|██████████| 147/147 [00:01<00:00, 104.75it/s, loss=1.34e+3]


Epoch 2, Loss: 1046.8736


Epoch 3|10: 100%|██████████| 147/147 [00:01<00:00, 106.08it/s, loss=1.56e+3]


Epoch 3, Loss: 1046.7510


Epoch 4|10: 100%|██████████| 147/147 [00:01<00:00, 116.63it/s, loss=834]    


Epoch 4, Loss: 1042.5812


Epoch 5|10: 100%|██████████| 147/147 [00:01<00:00, 111.02it/s, loss=517]    


Epoch 5, Loss: 1038.0670


Epoch 6|10: 100%|██████████| 147/147 [00:01<00:00, 103.31it/s, loss=575]    


Epoch 6, Loss: 1039.2550


Epoch 7|10: 100%|██████████| 147/147 [00:01<00:00, 105.47it/s, loss=391]    


Epoch 7, Loss: 1036.7770


Epoch 8|10: 100%|██████████| 147/147 [00:01<00:00, 101.50it/s, loss=860]   


Epoch 8, Loss: 1040.0383


Epoch 9|10: 100%|██████████| 147/147 [00:01<00:00, 106.90it/s, loss=1.16e+3]


Epoch 9, Loss: 1041.1351


Epoch 10|10: 100%|██████████| 147/147 [00:01<00:00, 107.22it/s, loss=810]    

Epoch 10, Loss: 1038.1411



