## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match yours, then run the whole notebook to plot the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using HiFi-GAN.

#### Import libraries and setup matplotlib

In [None]:
!mkdir crust
!wget -q "https://huggingface.co/Uberduck/HiFi-Crust/resolve/main/g_00000000%20(1)" -O "crust/g_00000000" --show-progress
!wget -q "https://huggingface.co/Uberduck/HiFi-Crust/resolve/main/do_00000000" -O "crust/do_00000000" --show-progress
!wget -q "https://raw.githubusercontent.com/jik876/hifi-gan/master/config_v1.json" -O "crust/config.json" --show-progress

In [None]:
import matplotlib
import matplotlib.pylab as plt

import IPython.display as ipd
import os
import json
import sys
sys.path.append('hifi-gan/')
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence

from env import AttrDict
from meldataset import MAX_WAV_VALUE
from models import Generator

In [None]:
def plot_data(data, figsize=(16, 4)):
    """Show every array in *data* as an image, side by side in a single row.

    Args:
        data: Sequence of arrays that ``Axes.imshow`` can render; one subplot
            is created per item.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    # squeeze=False makes subplots always return a 2-D array of Axes. With the
    # default, a one-panel figure returns a bare Axes object that cannot be
    # indexed, so plotting a single array used to fail.
    _, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for ax, image in zip(axes[0], data):
        ax.imshow(image, aspect='auto', origin='lower',
                  interpolation='none')
    plt.show()

#### Setup hparams

In [None]:
# Create the hyperparameter object; no overrides are passed to create_hparams.
hparams = create_hparams()
# Set the sampling rate explicitly. The same value is later used as the
# playback rate handed to ipd.Audio.
hparams.sampling_rate = 22050

#### Load model from checkpoint

In [None]:
# Single place to choose the target device; every call in this cell reads it
# instead of repeating the literal.
device = "cpu"
checkpoint_path = "outdir/checkpoint_0"
model = load_model(hparams, device=device)
# map_location remaps tensors stored in the checkpoint onto `device` on load.
model.load_state_dict(torch.load(checkpoint_path, map_location=device)['state_dict'])
# eval() puts the module in inference mode; assigning to `_` only suppresses
# the notebook's echo of the returned module.
_ = model.eval().to(device)

In [None]:
# Sanity check: display the device of the model's first parameter.
next(model.parameters()).device

#### Load HiFi-GAN for mel2audio synthesis

In [None]:
checkpoint_file = "crust/g_00000000"
# The config is expected to sit next to the checkpoint, in the same directory.
config_file = os.path.join(os.path.dirname(checkpoint_file), "config.json")
# Read as UTF-8 explicitly so parsing does not depend on the platform's
# default locale encoding.
with open(config_file, encoding="utf-8") as f:
    data = f.read()

json_config = json.loads(data)
# Wrap the parsed dict in AttrDict, the form in which it is handed to Generator.
attr_dict = AttrDict(json_config)

In [None]:
# Build the HiFi-GAN generator from the parsed config and move it to `device`.
generator = Generator(attr_dict).to(device)
# The checkpoint file is a dict; the generator weights live under "generator".
checkpoint_dict = torch.load(checkpoint_file, map_location=device)
generator.load_state_dict(checkpoint_dict["generator"])
# Switch to inference mode before synthesis.
generator.eval()
# NOTE(review): presumably strips the weight-norm reparameterisation for
# inference; the method is defined in the imported `models` module — confirm there.
generator.remove_weight_norm()

#### Prepare text input

In [None]:
from g2p_id import G2P

def g2p_post(text):
    """Normalise G2P output to plain lowercase letters.

    Drops the stress mark "ˈ", maps the vowels "ɛ", "ɔ", "ɪ", "ʊ" to
    "e", "o", "i", "u" respectively, and lowercases the result.
    """
    # One translation table applied in a single pass; None deletes the symbol.
    table = str.maketrans({
        "ˈ": None,
        "ɛ": "e",
        "ɔ": "o",
        "ɪ": "i",
        "ʊ": "u",
    })
    return text.translate(table).lower()

# Instantiate the G2P converter once; the next cell calls it on `text`.
g2p = G2P()
# Sentence to synthesize; edit this to change the spoken output.
text = "Kanada berdiri pada tahun seribu delapan ratus enam puluh tujuh, setelah disahkannya undang-undang Konfederasi."

In [None]:
# NOTE: `text` is overwritten here, so re-running this cell without first
# re-running the previous one passes already-converted text through g2p again.
text = g2p(text)
text = g2p_post(text)
# [None, :] adds a leading axis of size 1 in front of the array built from
# text_to_sequence's output.
sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
# from_numpy already returns a tensor; the torch.autograd.Variable wrapper has
# been deprecated since PyTorch 0.4 and only returned the tensor unchanged.
sequence = torch.from_numpy(sequence).long()

#### Decode text input and plot results

In [None]:
%matplotlib inline
mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
plot_data((mel_outputs.float().data.cpu().numpy()[0],
           mel_outputs_postnet.float().data.cpu().numpy()[0],
           alignments.float().data.cpu().numpy()[0].T))

#### Synthesize audio from spectrogram using HiFi-GAN

In [None]:
with torch.no_grad():
    # Run the vocoder on the post-net mel-spectrogram and drop size-1 axes.
    y_g_hat = generator(mel_outputs_postnet)
    audio = y_g_hat.squeeze()
    # Scale the generator output by MAX_WAV_VALUE before converting to int16.
    audio = audio * MAX_WAV_VALUE
    audio = audio.cpu().numpy()
# Clip to the representable int16 range before casting: an out-of-range float
# would otherwise wrap around to the opposite sign and produce a click.
int16_info = np.iinfo(np.int16)
audio = np.clip(audio, int16_info.min, int16_info.max).astype("int16")
ipd.Audio(audio, rate=hparams.sampling_rate)