In [1]:
# List the GPUs on this machine together with their current memory usage.
!nvidia-smi
# Expose only GPU 0 to this kernel. This runs before torch is imported in a later cell.
%env CUDA_VISIBLE_DEVICES=0
# Echo the variable back to confirm the value that was set.
%env CUDA_VISIBLE_DEVICES

Fri May  7 16:10:14 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-PCI...  Off  | 00000000:37:00.0 Off |                    0 |
| N/A   51C    P0    60W / 250W |      0MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA A100-PCI...  Off  | 00000000:86:00.0 Off |                    0 |
| N/A   70C    P0    74W / 250W |  39064MiB / 40536MiB |      0%      Default |
|       

'0'

In [2]:
# Widen the notebook container to the full browser width.
# `display`/`HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import torchaudio
import torch
import torch.nn.functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
from glob import glob
from datetime import datetime
import numpy as np
from WaveNetTTS.model import WaveNet
import os
import random
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [4]:
# Seed every random number generator used by the notebook with one value
# so that the stochastic sampling below can be repeated.
SEED = 12
# NOTE(review): PYTHONHASHSEED is set after the interpreter has started, so it
# presumably only influences child processes -- confirm this is intended.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [5]:
# --- Audio / model hyper-parameters -------------------------------------
# Sample rate (Hz) that reference audio is resampled to and that the
# synthesized audio is saved with.
sp_freq = 4000
# Not referenced in the visible cells; presumably the training sequence
# length (the checkpoint file name contains "seq_L4000") -- TODO confirm.
seq_len = 4000
# Number of mu-law quantization levels; also passed to WaveNet as quantization_bins.
bins = 128
# Not referenced in the visible cells; presumably the training batch size
# (the checkpoint file name contains "batch44") -- TODO confirm.
batch_size = 44
# The five values below are forwarded to the WaveNet constructor as keyword
# arguments of the same name.
channels = 256
kernel_size = 2
dilation_depth = 9
blocks = 2
condition_size = 256

# Shared transform instances: mu-law companding with `bins` levels and a
# resampler to `sp_freq` (assumes the source recordings are 16 kHz).
MuLawEncoding = torchaudio.transforms.MuLawEncoding(quantization_channels=bins)
MuLawDecoding = torchaudio.transforms.MuLawDecoding(bins)
Resample = torchaudio.transforms.Resample(16000, sp_freq)

# Tokenizer loaded through torch.hub. It is passed to `synthesize`, which
# does not use it in the visible code (the model is built with use_bert=False).
hugging_face_model = 'bert-base-uncased'  # alternative: 'distilbert-base-uncased'
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', hugging_face_model)

Using cache found in /zhome/22/c/137477/.cache/torch/hub/huggingface_pytorch-transformers_master


In [6]:
# Build the locally-conditioned WaveNet from the configuration cell and move
# it to the selected device in one step.
# NOTE(review): initial_cond_size=80 presumably matches the number of
# Tacotron2 spectrogram channels -- confirm against WaveNetTTS.model.
model = WaveNet(
    quantization_bins=bins,
    kernel_size=kernel_size,
    channels=channels,
    dilation_depth=dilation_depth,
    blocks=blocks,
    condition_size=condition_size,
    initial_cond_size=80,
    global_condition=False,
    local_condition=True,
    use_bert=False,
).to(device)

In [7]:
# Count only the parameters that require gradients.
n_trainable = sum(param.numel() for param in model.parameters() if param.requires_grad)
print("Trainable parameters:", n_trainable)

Trainable parameters: 10347136


In [8]:
# Load NVIDIA's Tacotron2 through torch.hub and switch it to inference mode.
# It is placed on the notebook-wide `device` (not a hard-coded 'cuda') so it
# always lives on the same device as the input tensors that
# `get_tacotron_spectrogram` creates and as the WaveNet model.
tacotron2 = torch.hub.load('nvidia/DeepLearningExamples:torchhub', 'nvidia_tacotron2')
tacotron2 = tacotron2.to(device)
tacotron2.eval()

Using cache found in /zhome/22/c/137477/.cache/torch/hub/nvidia_DeepLearningExamples_torchhub


Tacotron2(
  (embedding): Embedding(148, 512)
  (encoder): Encoder(
    (convolutions): ModuleList(
      (0): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (2): Sequential(
        (0): ConvNorm(
          (conv): Conv1d(512, 512, kernel_size=(5,), stride=(1,), padding=(2,))
        )
        (1): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (lstm): LSTM(512, 256, batch_first=True, bidirectional=True)
  )
  (decoder): Decoder(
    (prenet): Prenet(
      (layers): ModuleList(
        (0): LinearNorm(
          (lin

In [9]:
@torch.no_grad()
def get_tacotron_spectrogram(transcript):
    """Run Tacotron2 on a single transcript and return its spectrogram output.

    Args:
        transcript: Text to synthesize; it is converted to a symbol sequence
            with the 'english_cleaners' text cleaner.

    Returns:
        The second element of the 4-tuple returned by ``tacotron2.infer``
        for a batch containing only this transcript (presumably the mel
        spectrogram -- TODO confirm against the Tacotron2 hub model).
    """
    symbol_ids = tacotron2.text_to_sequence(transcript, ['english_cleaners'])
    # Wrap the sequence in a list so the model receives a batch of size one.
    batch = torch.tensor([symbol_ids], device=device, dtype=torch.int64)
    _, spectrogram, _, _ = tacotron2.infer(batch)
    return spectrogram

In [10]:
# Load the trained checkpoint. `map_location` remaps the stored tensors onto
# the device selected for this notebook, so loading also works when the GPU
# the checkpoint was saved from is not available.
# NOTE: torch.load unpickles the file -- only load checkpoints from a trusted source.
chkpt = torch.load(
    'LJ_speech_WaveNet-Tacotron2_05-05-2021-seq_L4000-bins128-batch44-C256-k2-dil9b2-cs256-sp_freq4000.pt',
    map_location=device,
)

In [11]:
# Restore the trained WaveNet weights stored under the checkpoint's 'model'
# key, then switch the network to evaluation mode.
wavenet_weights = chkpt['model']
model.load_state_dict(wavenet_weights)
model.eval()

WaveNet(
  (causal_layers): ModuleList(
    (0): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (1): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,), dilation=(2,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (2): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,), dilation=(4,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (3): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,), dilation=(8,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
 

In [12]:
def split_line(line):
    """Split one transcript-file line into ``(utterance_id, transcript)``.

    The id is everything before the first space; the transcript is the rest
    of the line with trailing whitespace removed.

    Trailing whitespace is stripped *before* splitting, so a line that holds
    only an id (no space, no transcript) yields a clean id instead of one
    that still carries the newline.

    Args:
        line: A raw line, e.g. ``"1089-134686-0000 SOME TEXT\\n"``.

    Returns:
        Tuple ``(idx, trans)`` of strings; ``trans`` is ``""`` when the line
        has no transcript.
    """
    idx, _, trans = line.rstrip().partition(" ")
    return idx, trans

# Parse the transcript list into (utterance id, transcript) pairs and build
# an id -> transcript lookup from them.
with open('LibriSpeech/wavenet-cut.txt') as transcript_file:
    lines = list(map(split_line, transcript_file))

trans_dict = dict(lines)

In [13]:
# Collect every test-clean recording whose utterance id (file name without
# extension) has a transcript in `trans_dict`.
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the batch composition -- and therefore the seeded sampling -- reproducible.
files = sorted(
    path
    for path in glob('LibriSpeech/test-clean/**/*.flac', recursive=True)
    if os.path.splitext(os.path.basename(path))[0] in trans_dict
)

In [14]:
# Directory that receives the synthesized .wav files.
save_path = 'LibriSpeech/wavenet-tacotron2'
# Create it up front so saving does not fail on a fresh checkout after a
# long synthesis run; a directory that already exists is left untouched.
os.makedirs(save_path, exist_ok=True)

In [15]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    Every slice has length ``n`` except possibly the last one, which holds
    whatever remains. Works on any sequence that supports ``len`` and slicing.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [16]:
@torch.no_grad()
def synthesize(transcripts, waveform_sizes, model, tokenizer, device, seed=None, temperature=1.0):
    """Autoregressively sample mu-law encoded audio for a batch of transcripts.

    For each transcript a Tacotron2 spectrogram is passed through
    ``model.sentence_embedding`` to obtain a local-conditioning signal. That
    signal is interpolated to the transcript's *own* target length and
    zero-padded on the right up to the longest length in the batch, so that
    truncating the output to ``waveform_sizes[i]`` keeps the whole utterance.
    (Stretching every utterance to the batch maximum would slow down shorter
    utterances and cut them off when they are truncated.)

    Args:
        transcripts: Sequence of transcript strings, one per batch item.
        waveform_sizes: Target number of samples for each transcript; must
            have the same length as ``transcripts``.
        model: WaveNet exposing ``sentence_embedding``, ``receptive_field``
            and ``bins``; called as ``model(x, lc=..., gc=None)``.
        tokenizer: Unused; kept so existing callers keep working.
        device: Device on which the sample buffer is allocated.
        seed: Optional tensor of already-encoded samples copied into the
            first ``seed.size(1)`` positions of the sample buffer.
        temperature: Divides the model output before the softmax.

    Returns:
        int64 tensor of shape ``(len(transcripts), max(waveform_sizes))``
        holding mu-law class indices. Item ``i`` is only meaningful up to
        ``waveform_sizes[i]``.

    Raises:
        ValueError: If ``transcripts`` and ``waveform_sizes`` differ in length.
    """
    if len(transcripts) != len(waveform_sizes):
        raise ValueError(
            f"got {len(transcripts)} transcripts but {len(waveform_sizes)} waveform sizes"
        )

    T = max(waveform_sizes)
    model.eval()

    # Build the local-conditioning signal, one utterance at a time.
    # assumes sentence_embedding returns (_, tensor of shape (1, channels, frames)) -- TODO confirm
    lc_embeds = []
    for transcript, size in zip(transcripts, waveform_sizes):
        _, lc_embed = model.sentence_embedding(get_tacotron_spectrogram(transcript))
        # Stretch to this utterance's own length, then right-pad to the batch
        # maximum so all items can be concatenated.
        lc_embed = F.interpolate(lc_embed, size=size)
        lc_embeds.append(F.pad(lc_embed, (0, T - size)))
    lc_embed = torch.cat(lc_embeds, 0)
    # Left-pad so the conditioning lines up with the warm-up samples that
    # precede the first generated sample.
    lc_embed = F.pad(lc_embed, (model.receptive_field, 0))

    rec_fld = model.receptive_field + 1
    seed_T = seed.size(1) if seed is not None else 0

    # Start from a buffer filled with the mu-law class of a zero-amplitude sample.
    zero_class = torchaudio.transforms.MuLawEncoding(model.bins)(torch.tensor(0.0)).item()
    generated = torch.full((len(transcripts), rec_fld + T), zero_class, device=device, dtype=torch.int64)
    if seed is not None:
        # NOTE(review): the seed is written at the very start of the buffer,
        # i.e. inside the warm-up region that is sliced off before returning,
        # while sampling starts at seed_T + rec_fld -- confirm this is intended.
        generated[:, :seed_T] = seed

    with tqdm(range(seed_T, T)) as t_bar:
        for n in t_bar:
            # Predict the sample at n + rec_fld from the rec_fld samples before it.
            predictions = model(generated[:, n:rec_fld + n], lc=lc_embed[:, :, n:rec_fld + n], gc=None)
            predictions = torch.softmax(predictions / temperature, dim=1)
            generated[:, n + rec_fld] = torch.multinomial(predictions.squeeze(), 1).squeeze()

    # Drop the warm-up region; only the T generated samples are returned.
    return generated[:, rec_fld:]

In [None]:
# Synthesize the evaluation set in batches of 48 recordings and write one
# .wav file per utterance into `save_path`.
file_batches = list(chunks(files, 48))
for f_chunk in tqdm(file_batches):
    ids = [os.path.splitext(os.path.basename(path))[0] for path in f_chunk]
    transcripts = [trans_dict[utt_id] for utt_id in ids]
    # Only the length of each resampled reference recording is needed: it
    # sets how many samples are kept for that utterance.
    waveform_sizes = [Resample(torchaudio.load(path)[0]).size(1) for path in f_chunk]
    generated = synthesize(transcripts, waveform_sizes, model, tokenizer, device, temperature=1.0)
    for idx, gen, size in zip(ids, generated, waveform_sizes):
        # Cut to the reference length, decode the mu-law classes and add a
        # leading channel dimension before saving.
        audio = MuLawDecoding(gen[:size].cpu().long()).unsqueeze(0)
        torchaudio.save(f'{save_path}/{idx}.wav', audio, sample_rate=sp_freq)

  0%|          | 0/21 [00:00<?, ?it/s]



  0%|          | 0/114300 [00:00<?, ?it/s]



  0%|          | 0/93341 [00:00<?, ?it/s]



  0%|          | 0/104460 [00:00<?, ?it/s]