In [1]:
!nvidia-smi
%env CUDA_VISIBLE_DEVICES=0
%env CUDA_VISIBLE_DEVICES

Fri Apr 30 12:39:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Tesla V1...  On   | 00000000:37:00.0 Off |                    0 |
| N/A   39C    P0    44W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA Tesla V1...  On   | 00000000:AF:00.0 Off |                    0 |
| N/A   71C    P0    83W / 250W |  15787MiB / 16160MiB |     98%      Default |
|       

'0'

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
import torchaudio
import torch
import torch.nn.functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
from glob import glob
from datetime import datetime
import numpy as np
from WaveNetTTS.model import WaveNet
import os
import random
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
random.seed(12)
os.environ['PYTHONHASHSEED'] = str(12)
np.random.seed(12)
torch.manual_seed(12)
torch.cuda.manual_seed(12)

In [5]:
sp_freq = 4000
seq_len = 4000
bins = 128
batch_size = 58
channels = 256
kernel_size = 2
dilation_depth = 9
blocks = 2
condition_size = 256

MuLawEncoding = torchaudio.transforms.MuLawEncoding(quantization_channels=bins)
MuLawDecoding = torchaudio.transforms.MuLawDecoding(bins)
Resample = torchaudio.transforms.Resample(16000, sp_freq)

hugging_face_model = 'bert-base-uncased'#'distilbert-base-uncased'#
tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', hugging_face_model)

Using cache found in /zhome/22/c/137477/.cache/torch/hub/huggingface_pytorch-transformers_master


In [6]:
model = WaveNet(quantization_bins=bins, kernel_size=kernel_size, channels=channels, dilation_depth=dilation_depth, blocks=blocks, condition_size=condition_size, global_condition=True, local_condition=True)
model = model.to(device)

Using cache found in /zhome/22/c/137477/.cache/torch/hub/huggingface_pytorch-transformers_master


In [7]:
print("Trainable parameters:", sum(p.numel() for p in model.parameters() if p.requires_grad))

Trainable parameters: 123869568


In [8]:
from WaveNetTTS.synthesize import synthesize

In [9]:
chkpt = torch.load('LJ_speech_WaveNet_28-04-2021-seq_L4000-bins128-batch58-C256-k2-dil9b2-cs256-sp_freq4000.pt')

In [10]:
model.load_state_dict(chkpt['model'])
model.eval()

WaveNet(
  (causal_layers): ModuleList(
    (0): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (gc_layer_fg): Linear(in_features=256, out_features=512, bias=True)
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (1): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,), dilation=(2,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (gc_layer_fg): Linear(in_features=256, out_features=512, bias=True)
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (2): ResidualLayer(
      (conv_fg): Conv1d(256, 512, kernel_size=(2,), stride=(1,), dilation=(4,))
      (conv_1x1): Conv1d(256, 256, kernel_size=(1,), stride=(1,))
      (gc_layer_fg): Linear(in_features=256, out_features=512, bias=True)
      (lc_layer_fg): Conv1d(256, 512, kernel_size=(1,), stride=(1,))
    )
    (3): ResidualL

In [11]:
def split_line(line):
    idx = line.split(" ")[0]
    trans = line[len(idx)+1:].rstrip()
    return idx, trans

with open('LibriSpeech/wavenet-cut.txt') as f:
    lines = [split_line(l) for l in f.readlines()]

trans_dict = dict(lines)

In [12]:
files = list([f for f in glob(f'LibriSpeech/test-clean/**/*.flac', recursive=True) if os.path.splitext(os.path.basename(f))[0] not in trans_dict])

In [13]:
save_path = 'LibriSpeech/wavenet'

In [14]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [None]:
for f_chunk in tqdm(list(chunks(files, 48))):
    ids = [os.path.splitext(os.path.basename(f))[0] for f in f_chunk]
    transcripts = [trans_dict[idx] for idx in ids]
    waveform_sizes = []
    for f in f_chunk:
        waveform, sample_rate = torchaudio.load(f)
        waveform = Resample(waveform)
        waveform_sizes.append(waveform.size(1))
    generated = synthesize(transcripts, waveform_sizes, model, tokenizer, device, temperature=1.0)
    for idx, gen, size in zip(ids, generated, waveform_sizes):
        torchaudio.save(f'{save_path}/{idx}.wav', MuLawDecoding(gen[:size].cpu().long()).unsqueeze(0), sample_rate=sp_freq)

  0%|          | 0/55 [00:00<?, ?it/s]

  0%|          | 0/103760 [00:00<?, ?it/s]

  0%|          | 0/80220 [00:00<?, ?it/s]