In [1]:
!pip install deep_phonemizer -q
!pip install librosa -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for deep_phonemizer (setup.py) ... [?25l[?25hdone


In [None]:
import torch
import torchaudio

import librosa

import IPython
import matplotlib
import matplotlib.pyplot as plt
# Default figure size for every plot in this notebook.
plt.rcParams["figure.figsize"] = [16.0, 4.8]

# Fixed seed so sampling-based steps are repeatable between runs.
torch.random.manual_seed(0)

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Record the library versions and the selected device alongside the outputs.
print(torch.__version__, torchaudio.__version__, device, sep="\n")

2.0.1+cu118
2.0.2+cu118
cpu


In [None]:
def normalize_spec(melspec, minval=None, maxval=None):
  """Min-max scale ``melspec`` so that ``minval`` maps to 0 and ``maxval`` to 1.

  Args:
    melspec: Object supporting arithmetic and ``.min()`` / ``.max()``
      (the notebook passes torch tensors). It is NOT modified.
    minval: Lower bound of the scaling; ``melspec.min()`` is used when ``None``.
    maxval: Upper bound of the scaling; ``melspec.max()`` is used when ``None``.

  Returns:
    ``(normalized, (minval, maxval))``: the scaled copy and the bounds that were
    actually used, so the scaling can be undone with ``denormalize_spec``.

  Note:
    When ``maxval == minval`` the scaling divides by zero; callers that may
    pass a constant spectrogram have to handle that case themselves.
  """
  # Resolve each bound on its own, so supplying just one of them works
  # instead of failing on arithmetic with ``None``.
  if minval is None:
    minval = melspec.min()
  if maxval is None:
    maxval = melspec.max()
  # Out-of-place arithmetic: the caller's tensor must keep its original values.
  normalized = (melspec - minval) / (maxval - minval)
  return normalized, (minval, maxval)

def denormalize_spec(melspec, minval, maxval):
  """Undo min-max scaling: map 0 back to ``minval`` and 1 back to ``maxval``.

  Args:
    melspec: Scaled values (the notebook passes torch tensors). It is NOT modified.
    minval: Lower bound that was used for the scaling.
    maxval: Upper bound that was used for the scaling.

  Returns:
    A new object holding ``melspec * (maxval - minval) + minval``.
  """
  # Out-of-place arithmetic so the caller's normalized tensor stays usable.
  return melspec * (maxval - minval) + minval

In [None]:
# Pretrained Tacotron2 + WaveRNN pipeline, following
# https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

# The bundle supplies the three stages used below: the text processor,
# the Tacotron2 model and the vocoder (models are moved to `device`).
processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

# Displayed as the cell result: the sample rate of the vocoder's output audio.
vocoder.sample_rate

100%|██████████| 63.6M/63.6M [00:04<00:00, 15.7MB/s]
Downloading: "https://download.pytorch.org/torchaudio/models/tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.pth
100%|██████████| 107M/107M [00:01<00:00, 108MB/s] 
Downloading: "https://download.pytorch.org/torchaudio/models/wavernn_10k_epochs_8bits_ljspeech.pth" to /root/.cache/torch/hub/checkpoints/wavernn_10k_epochs_8bits_ljspeech.pth
100%|██████████| 16.7M/16.7M [00:00<00:00, 38.8MB/s]


22050

## WaveRNN

In [None]:
# Pipeline: text -> processor -> Tacotron2 spectrogram -> WaveRNN waveform.
text = "Hello world! Text to speech!"

with torch.inference_mode():
    # Encode the text and move both returned tensors onto the model device.
    processed, lengths = (tensor.to(device) for tensor in processor(text))
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    # NOTE: `lengths` is rebound here to the lengths returned by the vocoder.
    waveforms, lengths = vocoder(spec, spec_lengths)
    print(waveforms.shape, spec.shape, spec_lengths)

In [None]:
# Top panel: generated spectrogram; bottom panel: synthesized waveform.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(16, 9))
ax1.imshow(spec[0].detach().cpu(), aspect="auto", origin="lower")
ax2.plot(waveforms[0].detach().cpu())

# Audio player for the first waveform in the batch (leading dimension kept).
IPython.display.Audio(waveforms[:1].cpu(), rate=vocoder.sample_rate)

In [None]:
# Feed WaveRNN a min-max normalized spectrogram to hear how it reacts.
with torch.inference_mode():
  # Pass a copy so `spec` keeps its original values whether or not
  # normalize_spec works in place (same approach as the WaveGlow cell).
  spec_norm, minmaxval = normalize_spec(spec.clone())
  waveforms_norm, lengths_norm = vocoder(spec_norm, spec_lengths)
  print(waveforms_norm.shape, spec_norm.shape, lengths_norm)

torch.Size([1, 52250]) torch.Size([1, 80, 190]) tensor([52250], device='cuda:0', dtype=torch.int32)


In [None]:
# Listen to the WaveRNN output produced from the normalized spectrogram.
IPython.display.Audio(
    waveforms_norm[:1].cpu(),
    rate=vocoder.sample_rate,
)

## WaveGlow

In [None]:
# WaveGlow vocoder loaded through NVIDIA's torch.hub entry point, following
# https://pytorch.org/audio/stable/tutorials/tacotron2_pipeline_tutorial.html
# The model is created without pretrained weights and the checkpoint is
# downloaded separately so it can be mapped onto `device`
# (workaround from https://stackoverflow.com/a/61840832).
waveglow = torch.hub.load(
    "NVIDIA/DeepLearningExamples:torchhub",
    "nvidia_waveglow",
    pretrained=False,
    model_math="fp32",
)
checkpoint = torch.hub.load_state_dict_from_url(
    "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth",  # noqa: E501
    map_location=device,
    progress=False,
)

# Drop every "module." fragment from the checkpoint's parameter names so they
# match the names the freshly built model expects.
state_dict = {
    name.replace("module.", ""): weights
    for name, weights in checkpoint["state_dict"].items()
}

waveglow.load_state_dict(state_dict)
# Strip weight normalization, move to the target device, switch to eval mode.
waveglow = waveglow.remove_weightnorm(waveglow).to(device)
waveglow.eval();

Downloading: "https://github.com/NVIDIA/DeepLearningExamples/zipball/torchhub" to /root/.cache/torch/hub/torchhub.zip
Downloading: "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth" to /root/.cache/torch/hub/checkpoints/nvidia_waveglowpyt_fp32_20190306.pth


In [None]:
# Pipeline: text -> processor -> Tacotron2 spectrogram -> WaveGlow waveform.
text = "Hello world! Text to speech!"

# inference_mode already disables gradient tracking for everything inside it,
# so WaveGlow does not need an additional nested torch.no_grad() block.
with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    waveforms = waveglow.infer(spec)
    print(waveforms.shape, spec.shape, spec_lengths)

torch.Size([1, 59904]) torch.Size([1, 80, 234]) tensor([234], dtype=torch.int32)


In [None]:
# Plotting is disabled for the WaveGlow run; uncomment to draw the figure.
# fig, [ax1, ax2] = plt.subplots(2, 1, figsize=(16, 9))
# ax1.imshow(spec[0].cpu().detach(), origin="lower", aspect="auto")
# ax2.plot(waveforms[0].cpu().detach())

# Audio player for the WaveGlow output, played back at 22050 Hz.
IPython.display.Audio(
    waveforms[:1].cpu(),
    rate=22050,
)

In [None]:
# Feed WaveGlow a min-max normalized spectrogram to hear how it reacts.
with torch.no_grad():
    # Normalize a detached copy so `spec` itself is left untouched.
    spec_norm, minmaxval = normalize_spec(torch.clone(spec.detach()))
    waveforms_norm = waveglow.infer(spec_norm)

In [None]:
# Show the (min, max) bounds used for normalization and the spectrogram shape.
(minmaxval, spec.shape)

((tensor(-10.0416, device='cuda:0'), tensor(1.1586, device='cuda:0')),
 torch.Size([1, 80, 190]))

In [None]:
# Listen to the WaveGlow output produced from the normalized spectrogram.
IPython.display.Audio(
    waveforms_norm[:1].cpu(),
    rate=22050,
)

In [None]:
# Analysis settings for recomputing a mel spectrogram from the synthesized audio.
sample_rate = 22050
n_fft = 1024
hop_length = 256
win_length = 1024
n_mels = 80

# Mel power spectrogram of the generated waveform, computed with librosa.
melspec_librosa = librosa.feature.melspectrogram(
    # Independent NumPy copy of `waveforms`; the leading batch dimension is kept.
    y=waveforms.detach().clone().cpu().numpy(),
    sr=sample_rate,
    # STFT framing
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    center=True,
    pad_mode="reflect",
    # power=2.0 selects a power (squared-magnitude) spectrogram
    power=2.0,
    # mel filterbank
    n_mels=n_mels,
    htk=True,
    norm="slaney",
)