## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours, then run all cells to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using WaveGlow.

#### Import libraries and setup matplotlib

In [2]:

import matplotlib.pyplot as plt
%matplotlib inline
import IPython.display as ipd
import os

import sys
sys.path.append('waveglow/')
import numpy as np
import torch
import torchaudio

import librosa

from tacotron2.hparams import create_hparams
from tacotron2.model import Tacotron2
from tacotron2.layers import TacotronSTFT, STFT
from tacotron2.audio_processing import griffin_lim
from tacotron2.train import load_model
from tacotron2.text import text_to_sequence
from waveglow.denoiser import Denoiser

In [3]:
def plot_data(data, figsize=(16, 4)):
    """Show every item of ``data`` as an image, side by side in one row.

    Args:
        data: Sized sequence of array-like images accepted by ``imshow``;
            one subplot is drawn per item.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False makes subplots always return a 2-D array of Axes, so a
    # single-item ``data`` no longer fails when the axes are indexed.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        # origin='lower' puts row 0 at the bottom of the plot.
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')
    plt.show()
# Presumably a quick check that inline plotting works: draw a fixed
# five-point line and display it.
plt.plot([1,2,3,4,5])
plt.show()

  plt.show()


#### Setup hparams

In [4]:
# Build the hyper-parameter object (no overrides are passed).
hparams = create_hparams()
# Set the sampling rate explicitly; later cells read this key as the
# playback rate for the synthesized audio.
hparams["sampling_rate"] = 22050

#### Load model from checkpoint

In [5]:
checkpoint_path = "tacotron2_statedict.pt"
model = load_model(hparams)
# Deserialize the checkpoint onto the CPU first and copy its 'state_dict'
# entry into the freshly built model.
model.load_state_dict(
    torch.load(checkpoint_path, map_location='cpu')['state_dict']
)
# Move to the GPU, switch to eval mode and cast to half precision; the
# result is bound to `_` so the notebook does not echo the model repr.
_ = model.cuda().eval().half()
# also modified train.py line:74

# CPU alternative kept for reference:
#_ = model.eval()

#### Load WaveGlow for mel2audio synthesis and denoiser

In [6]:
# Load the WaveGlow vocoder: the checkpoint's 'model' entry is used directly
# as the model object.
waveglow_path = 'waveglow_256channels_universal_v5.pt'
waveglow = torch.load(waveglow_path)['model']
# Move to the GPU, switch to eval mode and cast to half precision.
waveglow.cuda().eval().half()
# modified denoiser.py line:15
# modified glow.py line:268
# modified glow.py line:298

# CPU alternative kept for reference:
#waveglow.eval()#.half()

# After the half-precision cast above, cast every module in `convinv` back
# to float32 (presumably because those layers need full precision -- confirm).
for k in waveglow.convinv:
    k.float()
    
# Left disabled; any cell that uses `denoiser` must create it first.
#denoiser = Denoiser(waveglow)



In [None]:
!nvidia-smi

In [None]:
# NOTE(review): `denoiser` is only assigned in a commented-out line of the
# WaveGlow cell, so this raises NameError unless it is created first.
print(denoiser)

#### Prepare text input

In [7]:
text = "This is a test for my presentation!"
# Encode the text as symbol ids; [None, :] adds a leading axis of size 1.
sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
print(sequence)
# torch.autograd.Variable is deprecated (tensors carry autograd state
# themselves), so the tensor is used directly: move it to the GPU and cast
# it to int64.
sequence = torch.from_numpy(sequence).cuda().long()
# CPU-only alternative: sequence = torch.from_numpy(sequence).long()
print(sequence)

[[57 45 46 56 11 46 56 11 38 11 57 42 56 57 11 43 52 55 11 50 62 11 53 55
  42 56 42 51 57 38 57 46 52 51  2]]
tensor([[57, 45, 46, 56, 11, 46, 56, 11, 38, 11, 57, 42, 56, 57, 11, 43, 52, 55,
         11, 50, 62, 11, 53, 55, 42, 56, 42, 51, 57, 38, 57, 46, 52, 51,  2]],
       device='cuda:0')


In [None]:
text = "This is a test for my presentation!"
# Obtain the text processor from the torchaudio pipeline bundle and run it
# on the same sentence; it returns the processed text and its lengths.
processor = (
    torchaudio.pipelines.TACOTRON2_WAVERNN_CHAR_LJSPEECH.get_text_processor()
)
processed, lengths = processor(text)
# One value per line, exactly as two consecutive print calls would emit.
print(processed, lengths, sep="\n")

#### Decode text input and plot results

In [8]:
# Run inference without building an autograd graph, matching the WaveGlow
# synthesis cell; nothing here is back-propagated.
with torch.no_grad():
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

# Cast to float32 and copy to the CPU for plotting. [0] selects the first
# item along the leading axis; the alignment matrix is transposed for display.
plot_data((mel_outputs.float().detach().cpu().numpy()[0],
           mel_outputs_postnet.float().detach().cpu().numpy()[0],
           alignments.float().detach().cpu().numpy()[0].T))

  plt.show()


In [None]:
# Experimental cell: compute a mel-spectrogram from a reference recording
# with librosa, display it, and pass it to the model together with the
# encoded text.
import librosa.display
# step1 - converting a wav file to numpy array and then converting that to mel-spectrogram
my_audio_as_np_array, my_sample_rate= librosa.load("../alignment/aligned_post/bdl_arctic_a0001.wav")

# step2 - converting audio np array to spectrogram
# NOTE(review): n_mels=128 / n_fft=2048 / hop_length=512 are set here by hand;
# confirm they match the mel settings the model expects.
spec = librosa.feature.melspectrogram(y=my_audio_as_np_array,
                                        sr=my_sample_rate, 
                                            n_fft=2048, 
                                            hop_length=512, 
                                            win_length=None, 
                                            window='hann', 
                                            center=True, 
                                            pad_mode='reflect', 
                                            power=2.0,
                                     n_mels=128)
print(spec.shape)
# Plot the power spectrogram in dB relative to its maximum value.
img = librosa.display.specshow(librosa.power_to_db(spec, ref=np.max))
maxlen = 140
outlen = 140
# NOTE(review): `sequence` was built with a leading axis of size 1, so
# len(sequence) is 1 rather than the number of symbols -- confirm which
# value the model expects here.
# NOTE(review): `spec` is a NumPy array and `maxlen`/`outlen` are plain ints;
# confirm the model's forward pass accepts these types.
out = model([sequence, torch.tensor(len(sequence)), spec, maxlen, outlen])
#out

#### Synthesize audio from spectrogram using WaveGlow

In [9]:
# Vocode the post-net mel-spectrogram with WaveGlow; gradients are not
# needed for synthesis.
with torch.no_grad():
    audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
print(audio.size())
# Play the first item of the batch at the configured sampling rate.
ipd.Audio(audio[0].detach().cpu().numpy(), rate=hparams["sampling_rate"])

torch.Size([1, 46592])


#### (Optional) Remove WaveGlow bias

In [None]:
# `denoiser` is not created by any earlier cell (its construction is
# commented out in the WaveGlow cell), so build it here to keep this
# optional step runnable on its own.
denoiser = Denoiser(waveglow)
# Apply the denoiser to the synthesized audio and keep index 0 of its
# second axis.
audio_denoised = denoiser(audio, strength=0.01)[:, 0]
ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams["sampling_rate"])