## Tacotron 2 inference code 
Edit the variables **checkpoint_path** and **text** to match your setup, then run the whole notebook to generate plots of the mel outputs and alignments and to synthesize audio from the generated mel-spectrogram using Griffin-Lim.

#### Import libraries and setup matplotlib

#### Setup hparams

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import glob
import os
import argparse
import json
import torch
from scipy.io.wavfile import write


import matplotlib
#%matplotlib inline
import matplotlib.pylab as plt
import IPython.display as ipd
import sys
import numpy as np
import torch

from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT, STFT
from audio_processing import griffin_lim
from train import load_model
from text import text_to_sequence

from hifigan.env import AttrDict
from hifigan.meldataset import mel_spectrogram, MAX_WAV_VALUE, load_wav
from hifigan.models import Generator

#from .denoiser import Denoiser

# Tacotron 2 hyperparameters, with the sampling rate and the decoder step
# limit overridden for this notebook.
hparams = create_hparams()
hparams.sampling_rate = 22050
hparams.max_decoder_steps = 10000


# Module-level state filled in by start_inf(): `h` becomes the AttrDict built
# from the vocoder's config.json and `device` the torch device to run on.
h = None
device = None

def get_taco_mel(text, checkpoint_path="../Models/jej_checkpoint_904500_done"):
    """Run Tacotron 2 on ``text`` and return the post-net mel output.

    The model is built from the module-level ``hparams`` and its weights are
    loaded from ``checkpoint_path`` on every call.

    Args:
        text: The sentence to synthesize; it is converted with
            ``text_to_sequence`` using the ``english_cleaners`` pipeline.
        checkpoint_path: Path to a checkpoint file whose ``'state_dict'``
            entry holds the Tacotron 2 weights. Defaults to the checkpoint
            that was previously hard-coded here.

    Returns:
        The second element of ``model.inference(...)`` (the post-net mel
        output). The model is switched to half precision on CUDA, so the
        result is presumably a CUDA half tensor - TODO confirm.
    """
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    # NOTE(review): this always moves the model to CUDA, even when the
    # module-level `device` selected by start_inf() is the CPU.
    model.cuda().eval().half()

    # Add a leading batch dimension of size 1.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    # Plain tensors are used directly; the torch.autograd.Variable wrapper is
    # deprecated and no longer needed.
    sequence = torch.from_numpy(sequence).cuda().long()

    _, mel_outputs_postnet, _, _ = model.inference(sequence)
    return mel_outputs_postnet
    


def inference(checkpoint_file, output_dir, input_dir):
    """Vocode Tacotron 2 mel output with the HiFi-GAN generator and save WAVs.

    Relies on the module-level ``h`` (generator config) and ``device``, which
    start_inf() sets before calling this function.

    Args:
        checkpoint_file: Path to the generator checkpoint; its ``'generator'``
            entry is loaded as the state dict.
        output_dir: Directory the generated WAV files are written to. It is
            created if it does not exist.
        input_dir: Directory whose entries determine how many files are
            generated and what they are called. The entries themselves are
            not read.
    """
    generator = Generator(h).to(device)

    state_dict_g = torch.load(checkpoint_file, map_location=device)
    generator.load_state_dict(state_dict_g['generator'])

    # Use the function's own parameters: the previous code read an `a`
    # namespace that only exists inside start_inf(), which raised NameError.
    filelist = os.listdir(input_dir)

    os.makedirs(output_dir, exist_ok=True)

    generator.eval()
    generator.remove_weight_norm()
    with torch.no_grad():
        for i, filname in enumerate(filelist):
            print("loop", i, filname)
            x = get_taco_mel("Tell me the meaning")
            # get_taco_mel returns a tensor (not a NumPy array), so convert it
            # with .float()/.to(); the legacy torch.FloatTensor(...)
            # constructor rejects tensors of another dtype or device.
            x = x.float().to(device)
            y_g_hat = generator(x)
            audio = y_g_hat.squeeze()
            # Scale to the 16-bit integer range before casting.
            audio = audio * MAX_WAV_VALUE
            audio = audio.cpu().numpy().astype('int16')

            output_file = os.path.join(output_dir, os.path.splitext(filname)[0] + '_generated_e2e.wav')
            write(output_file, h.sampling_rate, audio)
            print(output_file)


def start_inf():
    """Load the vocoder config, pick a device, and run inference().

    Populates the module-level ``h`` (an AttrDict built from the
    ``config.json`` that sits next to the checkpoint file) and ``device``
    (CUDA when available, otherwise CPU), seeds torch from ``h.seed``, and
    then calls inference() with the default argument values.
    """
    global h, device

    print('Initializing Inference Process..')

    arg_parser = argparse.ArgumentParser()
    option_defaults = (
        ('--input_wavs_dir', 'test_files'),
        ('--output_dir', 'generated_files'),
        ('--checkpoint_file', './pretrained/LJ_FT_T2_V1/generator_v1'),
    )
    for flag, fallback in option_defaults:
        arg_parser.add_argument(flag, default=fallback)
    # An empty argument list is parsed, so the defaults above always apply.
    args = arg_parser.parse_args("")

    # The config file is expected in the same directory as the checkpoint.
    checkpoint_dir = os.path.split(args.checkpoint_file)[0]
    with open(os.path.join(checkpoint_dir, 'config.json')) as config_handle:
        raw_config = config_handle.read()
    h = AttrDict(json.loads(raw_config))

    torch.manual_seed(h.seed)
    if not torch.cuda.is_available():
        print("cpu inference")
        device = torch.device('cpu')
    else:
        torch.cuda.manual_seed(h.seed)
        device = torch.device('cuda')

    inference(args.checkpoint_file, args.output_dir, args.input_wavs_dir)


start_inf()


This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "C:\Users\DeepThought\.conda\envs\inf\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\DeepThought\.conda\envs\inf\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\DeepThought\.conda\envs\inf\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\DeepThought\.conda\envs\inf\lib\site-packages\traitlets\config\application.py", line 845, in launch_instance
    app.start()
  File "C:\Users\DeepThought\.conda\envs\inf\lib\site-packages\ipykernel\kernelapp.py", line 612, in start
    self.io_loop.start()
  File "C:\Users\DeepThought\.conda\envs\inf\l

ImportError: cannot import name 'init_weights' from 'utils' (E:\deepfakes\tacotron2-Offerman\utils.py)

#### Load model from checkpoint

In [None]:
# Synthesize every line of ../Samples/Work.txt for each speaker: Tacotron 2
# produces the mel-spectrogram, WaveGlow turns it into audio, and the denoiser
# is applied before each clip is exported. All clips of a speaker are also
# concatenated, separated by short silences, into one combined WAV file.
import scipy
import re
import os
from pydub import AudioSegment
checkpoints = ['56500','60000', '60500', '63000']
checkpoints = [ '84000']
waveglows = ['400000']

# Each speaker is a (Tacotron 2 checkpoint, WaveGlow checkpoint) pair of file
# names under ../Models/. The second assignment narrows the run to one speaker.
speakers = [('da_checkpoint_824800_done', 'da_waveglow_1516200'), ('jej_checkpoint_904500_done', 'jej_waveglow890k_done')]
speakers = [('jej_checkpoint_904500_done', 'jej_waveglow890k_done')]
# Each settings group is (filename tag, denoiser strength, WaveGlow sigma).
settings_groups = [('a', 0.02, 0.666)]

# Defined and created before the loops so the per-speaker export below also
# works when the text file is empty (previously that raised NameError).
output_dir = '../Samples/Work2/'
os.makedirs(output_dir, exist_ok=True)

# Read all lines up front and close the file immediately.
with open("../Samples/Work.txt", "r", encoding="utf8") as text_file:
    texts = text_file.readlines()

# Pause inserted after every clip in the combined file.
silence = AudioSegment.silent(duration=250)

for speaker in speakers:
    checkpoint = speaker[0]
    wg = speaker[1]
    checkpoint_path = "../Models/"+ speaker[0]
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()
    waveglow_path = '../Models/'+speaker[1]
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    # The invertible convolutions are switched back to float32 after the
    # model as a whole was converted to half precision.
    for k in waveglow.convinv:
        k.float()
    # NOTE(review): Denoiser is not imported anywhere in the visible notebook
    # (the import in the first cell is commented out) - confirm it is in scope.
    denoiser = Denoiser(waveglow)
    complete_audio = AudioSegment.silent()
    for index, text in enumerate(texts):
        for settings in settings_groups:
            # Add a leading batch dimension of size 1.
            sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            with torch.no_grad():
                audio = waveglow.infer(mel_outputs_postnet, sigma=settings[2])
            audio_denoised = denoiser(audio, strength=settings[1])[:, 0]
            # ipd.Audio is used here only to turn the samples into bytes that
            # AudioSegment can wrap.
            audio = ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)
            audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)

            # File name: line index, checkpoint, WaveGlow name, the first 30
            # word characters of the text, and the settings tag.
            filename = output_dir+'{0:03d}'.format(index)+'_'+checkpoint+"_wg"+wg+'_'+re.sub(r'\W+', '', text)[:30]+'_'+settings[0]+'.wav'
            audio.export(filename, format="wav")
            complete_audio = complete_audio.append(audio)
            complete_audio = complete_audio.append(silence)
    complete_audio.export(output_dir +speaker[0]+"Work.wav", format="wav")

In [None]:
# Synthesize every line of ../Samples/Work.txt for each speaker: Tacotron 2
# produces the mel-spectrogram, WaveGlow turns it into audio, and the denoiser
# is applied before each clip is exported. All clips of a speaker are also
# concatenated, separated by short silences, into one combined WAV file.
import scipy
import re
import os
from pydub import AudioSegment
checkpoints = ['56500','60000', '60500', '63000']
checkpoints = [ '84000']
waveglows = ['400000']

# Each speaker is a (Tacotron 2 checkpoint, WaveGlow checkpoint) pair of file
# names under ../Models/. The second assignment narrows the run to one speaker.
speakers = [('da_checkpoint_824800_done', 'da_waveglow_1516200'), ('jej_checkpoint_904500_done', 'jej_waveglow890k_done')]
speakers = [('jej_checkpoint_904500_done', 'jej_waveglow890k_done')]
# Each settings group is (filename tag, denoiser strength, WaveGlow sigma).
settings_groups = [('a', 0.02, 0.666)]

# Defined and created before the loops so the per-speaker export below also
# works when the text file is empty (previously that raised NameError).
output_dir = '../Samples/Work2/'
os.makedirs(output_dir, exist_ok=True)

# Read all lines up front and close the file immediately.
with open("../Samples/Work.txt", "r", encoding="utf8") as text_file:
    texts = text_file.readlines()

# Pause inserted after every clip in the combined file.
silence = AudioSegment.silent(duration=250)

for speaker in speakers:
    checkpoint = speaker[0]
    wg = speaker[1]
    checkpoint_path = "../Models/"+ speaker[0]
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval().half()
    waveglow_path = '../Models/'+speaker[1]
    waveglow = torch.load(waveglow_path)['model']
    waveglow.cuda().eval().half()
    # The invertible convolutions are switched back to float32 after the
    # model as a whole was converted to half precision.
    for k in waveglow.convinv:
        k.float()
    # NOTE(review): Denoiser is not imported anywhere in the visible notebook
    # (the import in the first cell is commented out) - confirm it is in scope.
    denoiser = Denoiser(waveglow)
    complete_audio = AudioSegment.silent()
    for index, text in enumerate(texts):
        for settings in settings_groups:
            # Add a leading batch dimension of size 1.
            sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(
                torch.from_numpy(sequence)).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            with torch.no_grad():
                audio = waveglow.infer(mel_outputs_postnet, sigma=settings[2])
            audio_denoised = denoiser(audio, strength=settings[1])[:, 0]
            # ipd.Audio is used here only to turn the samples into bytes that
            # AudioSegment can wrap.
            audio = ipd.Audio(audio_denoised.cpu().numpy(), rate=hparams.sampling_rate)
            audio = AudioSegment(audio.data, frame_rate=22050, sample_width=2, channels=1)

            # File name: line index, checkpoint, WaveGlow name, the first 30
            # word characters of the text, and the settings tag.
            filename = output_dir+'{0:03d}'.format(index)+'_'+checkpoint+"_wg"+wg+'_'+re.sub(r'\W+', '', text)[:30]+'_'+settings[0]+'.wav'
            audio.export(filename, format="wav")
            complete_audio = complete_audio.append(audio)
            complete_audio = complete_audio.append(silence)
    complete_audio.export(output_dir +speaker[0]+"Work.wav", format="wav")

In [None]:
# NOTE(review): presumably removes a pip-installed `env` package that shadows
# the HiFi-GAN `env` module imported in the first cell - confirm.
# Without `-y`, pip asks for confirmation before uninstalling.
!pip uninstall env