In [1]:
# Print the GPUs on this machine together with their current memory use and load.
!nvidia-smi
# Expose only device index 1 to CUDA for this kernel.
# NOTE(review): presumably chosen because GPU 0 was already busy when this was run — confirm.
%env CUDA_VISIBLE_DEVICES=1
# Echo the variable back to confirm that the setting took effect.
%env CUDA_VISIBLE_DEVICES

Mon May  3 09:23:34 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  Off  | 00000000:37:00.0 Off |                    0 |
| N/A   49C    P0    38W / 250W |  18444MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   62C    P0    67W / 250W |      3MiB / 40536MiB |     38%      Default |
|       

'1'

In [2]:
# Stretch the classic-notebook page container to the full browser width.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import torchaudio
import torch
import torch.nn.functional as F
import torch.nn as nn
from tqdm import tqdm
from datetime import datetime
import numpy as np
from WaveNetTTS.model import WaveNet
import os
import sys
import random
# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook displays the selected device.
device

device(type='cuda')

In [4]:
# Load NVIDIA's Tacotron 2 through the torch.hub entry point.
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
# Put the model on the notebook-wide `device` instead of a hard-coded 'cuda', so it
# lives on the same device as the input tensors built in get_tacotron_spectrogram.
tacotron2 = tacotron2.to(device)
# Switch to inference mode; as the cell's last expression this also displays the module.
tacotron2.eval()

Using cache found in /zhome/22/c/137477/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (lin

In [5]:
@torch.no_grad()
def get_tacotron_spectrogram(transcript):
    """Run the global `tacotron2` model on a transcript and return its mel output.

    Args:
        transcript: text handed to ``tacotron2.text_to_sequence`` together with
            the ``'english_cleaners'`` cleaner list.

    Returns:
        The second of the four values returned by ``tacotron2.infer``.
        Gradients are disabled for the whole call by the decorator.
    """
    token_ids = tacotron2.text_to_sequence(transcript, ['english_cleaners'])
    # Wrapping in a list adds a leading batch dimension of size 1.
    token_batch = torch.tensor([token_ids], device=device, dtype=torch.int64)
    inference_outputs = tacotron2.infer(token_batch)
    # Exactly four outputs are expected; only the second one is used.
    _, mel, _, _ = inference_outputs
    return mel

In [6]:
# --- Audio and batching configuration -------------------------------------
sp_freq = 4000  # target sample rate handed to Resample below
seq_len = 4000  # length (in samples) of the window cut from each clip in collate_fn
bins = 128  # mu-law quantization levels; also passed to WaveNet as quantization_bins
batch_size = 44  # batch size used by the DataLoader

# --- WaveNet hyper-parameters (forwarded to the WaveNet constructor) --------
channels = 256
kernel_size = 2
dilation_depth = 9
blocks = 2
condition_size = 256

# Transform instances shared by collate_fn. They are bound to names that mirror
# the torchaudio class names, so these module-level names refer to instances.
MuLawEncoding = torchaudio.transforms.MuLawEncoding(quantization_channels=bins)
# NOTE(review): 22050 is presumably the dataset's native sample rate — confirm.
Resample = torchaudio.transforms.Resample(22050, sp_freq)

# Name of a Hugging Face checkpoint; not referenced by any code visible in this
# notebook (the tokenizer load below is commented out).
hugging_face_model = 'bert-base-uncased'#'distilbert-base-uncased'#
#tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', hugging_face_model)

In [7]:
from torch.cuda.amp.grad_scaler import GradScaler
from torch.cuda.amp.autocast_mode import autocast

# Gradient scaler for mixed-precision training. The training loop visible in this
# notebook does not use it; it is kept so existing references keep working.
scaler = GradScaler()
from transformers import AdamW

# WaveNet conditioned locally on 80-channel inputs (initial_cond_size=80), with
# global conditioning and the BERT branch switched off.
model = WaveNet(quantization_bins=bins, kernel_size=kernel_size, channels=channels, dilation_depth=dilation_depth, blocks=blocks, condition_size=condition_size,
                initial_cond_size=80,
                global_condition=False,
                local_condition=True,
                use_bert=False)
model = model.to(device)

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']


def _select_params(is_bert, is_no_decay):
    """Return the model parameters matching both name-based filters.

    Args:
        is_bert: keep parameters whose name contains 'bert' (True) or does not (False).
        is_no_decay: keep parameters whose name contains an entry of `no_decay`
            (True) or contains none of them (False).

    Returns:
        List of parameters in `model.named_parameters()` order.
    """
    return [
        p for n, p in model.named_parameters()
        if ('bert' in n) == is_bert
        and any(nd in n for nd in no_decay) == is_no_decay
    ]


# Four groups: (non-BERT | BERT) x (decayed | not decayed). Non-BERT parameters
# train at 1e-4, BERT parameters at 5e-5. Group order matches the original cell.
optimizer_grouped_parameters = [
    {'params': _select_params(False, False), 'weight_decay': 0.01, 'lr': 1e-4},
    {'params': _select_params(False, True), 'weight_decay': 0.0, 'lr': 1e-4},
    {'params': _select_params(True, False), 'weight_decay': 0.01, 'lr': 5e-5},
    {'params': _select_params(True, True), 'weight_decay': 0.0, 'lr': 5e-5},
]
# AdamW from `transformers`, with its bias correction disabled.
optim = AdamW(optimizer_grouped_parameters, correct_bias=False, eps=1e-8)

criterion = torch.nn.CrossEntropyLoss()

# Per-step training losses, appended to by the training loop.
losses = []

In [8]:
# Count only the parameters that will receive gradient updates.
trainable_param_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)
print("Trainable parameters:", trainable_param_count)

Trainable parameters: 10347136


In [9]:
def collate_fn(batch_in):
    """Build one training batch of mu-law targets and aligned local conditioning.

    For every item the waveform is resampled and mu-law encoded, a random window
    of ``seq_len`` samples is cut from it, and the Tacotron 2 spectrogram of the
    transcript is embedded by ``model.sentence_embedding``, stretched to the
    length of the encoded waveform and cut with the same window.

    Args:
        batch_in: iterable of 4-tuples; the first element is the waveform
            (indexed as ``waveform[0]``) and the last element is the transcript.

    Returns:
        A 3-tuple ``(y_true, gc, lc)``:
        the stacked audio windows, ``None`` (no global conditioning is produced),
        and the conditioning windows concatenated along dimension 0.

    Raises:
        ValueError: if an encoded waveform is shorter than ``seq_len``.
    """
    y_true_out = []
    lc_out = []
    for waveform, _, _, transcript in batch_in:
        # Resample, then mu-law encode the first channel of the waveform.
        y_true = MuLawEncoding(Resample(waveform[0])).to(device)
        n_samples = len(y_true)

        # Pick a random window of seq_len samples. The "+ 1" makes the last valid
        # start index reachable and lets a clip of exactly seq_len samples through.
        max_start = n_samples - seq_len
        if max_start < 0:
            raise ValueError(
                f'Encoded waveform has {n_samples} samples, fewer than seq_len={seq_len}'
            )
        random_idx = np.random.randint(max_start + 1)
        window = slice(random_idx, random_idx + seq_len)
        y_true_out.append(y_true[window])

        # Embed the Tacotron 2 spectrogram of the transcript.
        _, lc_embed = model.sentence_embedding(get_tacotron_spectrogram(transcript))

        # Stretch the conditioning to the length of the *resampled* signal, since
        # `random_idx` indexes that signal, then cut the same window. Interpolating
        # to the original-rate length would misalign audio and conditioning.
        lc_embed = F.interpolate(lc_embed, size=n_samples)[:, :, window]
        lc_out.append(lc_embed)
    return torch.stack(y_true_out, 0), None, torch.cat(lc_out, 0)

In [10]:
# The dataset is read from the current working directory (root=''); nothing is downloaded.
dataset = torchaudio.datasets.LJSPEECH(root='', download=False)
# Shuffled batches; collate_fn converts the raw dataset items into model inputs.
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)

In [None]:
# Train indefinitely: one pass over `dataloader` per iteration of the outer loop,
# with a checkpoint written after every pass. Stop by interrupting the kernel.
model.train()
try:
    while True:
        with tqdm(iter(dataloader)) as t_bar:
            for y_true, gc, lc in t_bar:
                optim.zero_grad()

                y_true, lc = y_true.to(device), lc.to(device)
                # Model predictions
                y_preds = model(y_true, lc=lc)

                # Align predictions with targets: the final prediction is dropped and
                # the last `y_preds.size(2) - 1` samples of y_true serve as targets.
                loss = criterion(y_preds[:, :, :-1], y_true[:, -y_preds.size(2)+1:])

                # Plain full-precision backward pass and optimizer step.
                loss.backward()
                optim.step()

                # Read the scalar once and reuse it for logging and the progress bar.
                loss_value = loss.item()
                losses.append(loss_value)

                t_bar.set_postfix_str(f'Loss: {loss_value}, Receptive Field: {model.receptive_field}, Learned Size: {y_preds.size(2)}')
        # Checkpoint after each full pass; the file name encodes date and hyper-parameters.
        torch.save({'model':model.state_dict(), 'optim':optim.state_dict()}, f'LJ_speech_WaveNet-Tacotron2_{datetime.now().strftime("%d-%m-%Y")}-seq_L{seq_len}-bins{bins}-batch{batch_size}-C{channels}-k{kernel_size}-dil{dilation_depth}b{blocks}-cs{condition_size}-sp_freq{sp_freq}.pt')
except Exception as E:
    # Bind the exception with `as E`; `except E:` referenced an undefined name and
    # raised NameError instead of logging. The failure is recorded and re-raised
    # so it is also visible in the notebook.
    import traceback
    with open('ERROR.txt','w+') as f:
        print(f'{E} : {E.__class__} : {sys.exc_info()}', file=f)
        traceback.print_exc(file=f)
    raise

 56%|█████▌    | 167/298 [1:19:36<1:03:25, 29.05s/it, Loss: 2.613783836364746, Receptive Field: 1022, Learned Size: 2978] 



 92%|█████████▏| 274/298 [2:09:07<11:33, 28.88s/it, Loss: 2.576012372970581, Receptive Field: 1022, Learned Size: 2978]   



100%|██████████| 298/298 [2:19:56<00:00, 28.18s/it, Loss: 2.660460948944092, Receptive Field: 1022, Learned Size: 2978] 
100%|██████████| 298/298 [2:22:25<00:00, 28.68s/it, Loss: 2.3964781761169434, Receptive Field: 1022, Learned Size: 2978]  
100%|██████████| 298/298 [2:22:36<00:00, 28.71s/it, Loss: 2.4150919914245605, Receptive Field: 1022, Learned Size: 2978]  
 11%|█         | 32/298 [15:23<2:08:05, 28.89s/it, Loss: 2.2835912704467773, Receptive Field: 1022, Learned Size: 2978]