# Synthesize speech from text
- from trained dctts model
- code adapted from: https://github.com/Kyubyong/dc_tts

## Load dependencies

In [0]:
import numpy as np
import tensorflow as tf
from train_transfer import Graph
from utils import *
from data_load import load_data
from scipy.io.wavfile import write, read
from tqdm import tqdm
import codecs
import re
import os
import unicodedata
from IPython.display import Audio
import scipy.signal as sg

## Parameters

In [0]:
# --- Text side ---
# Symbol inventory: a character's position in this string is its integer id.
# "P" (id 0) is the padding symbol and "E" (id 1) marks end of sentence.
hp_vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"
hp_max_N = 180  # Maximum number of characters per sentence.

# --- Audio side ---
hp_max_T = 210  # Maximum number of mel frames to generate.
hp_n_mels = 80  # Number of mel banks per frame.
hp_sr = 22050  # Sample rate used for the low-pass filter design below.

# Checkpoint location prefix; the "-1" and "-2" suffixes are appended when
# the two sets of weights are restored.
hp_restoredir = "/content/gdrive/My Drive/dctts_colab/logdir/scarjo"


## Load models

In [0]:
 %%capture
 # Instantiate Graph (imported from train_transfer) in "synthesize" mode.
 # The %%capture cell magic keeps anything this cell prints out of the
 # notebook output. The session cell further down reads g.L, g.mels, g.Y,
 # g.Z, g.max_attentions, g.alignments and g.global_step from this object.
 g = Graph(mode="synthesize")

## Helper functions

In [0]:
def load_vocab():
    """Build the lookup tables between vocabulary symbols and integer ids.

    A symbol's id is its position in ``hp_vocab``.

    Returns:
        A ``(char2idx, idx2char)`` tuple of dicts: symbol -> id and
        id -> symbol.
    """
    idx2char = dict(enumerate(hp_vocab))
    char2idx = {symbol: position for position, symbol in idx2char.items()}
    return char2idx, idx2char
  
def text_normalize(text):
    """Reduce ``text`` to the symbols the model vocabulary can represent.

    Accents are stripped, the text is lowercased, every character that is
    not part of ``hp_vocab`` is replaced by a space, and runs of spaces are
    collapsed into one.

    Args:
        text: Input string.

    Returns:
        The normalized string. Leading and trailing spaces are kept.
    """
    # NFD splits accented letters into base letter + combining mark;
    # dropping the marks (category "Mn") leaves the unaccented letter.
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(char for char in decomposed
                   if unicodedata.category(char) != 'Mn')

    text = text.lower()
    # Escape the vocabulary so symbols such as "-", "]", "^" or "\" are
    # taken literally inside the character class instead of changing or
    # breaking the pattern.
    text = re.sub("[^{}]".format(re.escape(hp_vocab)), " ", text)
    text = re.sub("[ ]+", " ", text)
    return text

In [0]:
# Sentences to synthesize. They are encoded character by character exactly
# as written (text_normalize is not applied), so every character must be in
# hp_vocab; the trailing "E" is the vocabulary's end-of-sentence symbol.
sents = [
    " a strange  game. the only winning move is not to play.E",
]

## Note: extra spaces may be needed to improve pronunciation (so the text normalization, which would collapse them, is skipped)

In [0]:
char2idx, idx2char = load_vocab()

# One row per sentence, hp_max_N columns. Rows start out as all zeros, and
# id 0 is the padding symbol, so anything past the end of a sentence is
# padding.
texts = np.zeros((len(sents), hp_max_N), np.int32)
for row, sentence in enumerate(sents):
    # A character missing from the vocabulary raises KeyError here.
    codes = [char2idx[symbol] for symbol in sentence]
    texts[row, :len(sentence)] = codes

# The synthesis cell feeds this array to the graph under the name L.
L = texts

In [0]:

# Run the whole synthesis inside one TF session: restore both sets of
# weights, generate mel frames step by step, then convert the result to a
# waveform. `wav` is left in the notebook namespace for the playback cells.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # Restore the trainable variables under the 'Text2Mel' scope from the
    # latest checkpoint found in hp_restoredir + "-1".
    # NOTE(review): tf.train.latest_checkpoint returns None when no
    # checkpoint is found there, in which case restore() will fail.
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'Text2Mel')
    saver1 = tf.train.Saver(var_list=var_list)
    saver1.restore(sess, tf.train.latest_checkpoint(hp_restoredir + "-1"))
    print("Text2Mel Restored!")

    # Same for the 'SSRN' trainable variables, plus the global variables
    # under the 'gs' scope, from hp_restoredir + "-2".
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'SSRN') + \
               tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, 'gs')
    saver2 = tf.train.Saver(var_list=var_list)
    saver2.restore(sess, tf.train.latest_checkpoint(hp_restoredir + "-2"))
    print("SSRN Restored!")

    # Feed forward, one frame per iteration.
    # Y accumulates the generated frames: (sentences, hp_max_T, hp_n_mels).
    Y = np.zeros((len(L), hp_max_T, hp_n_mels), np.float32)
    # One integer per sentence, carried from step to step and fed back in.
    prev_max_attentions = np.zeros((len(L),), np.int32)
    for j in tqdm(range(hp_max_T)):
        # Each step feeds the frames generated so far (Y) back in as g.mels.
        # _gs and _alignments are fetched but not used in this cell.
        _gs, _Y, _max_attentions, _alignments = \
            sess.run([g.global_step, g.Y, g.max_attentions, g.alignments],
                     {g.L: L,
                      g.mels: Y,
                      g.prev_max_attentions: prev_max_attentions})
        # Keep only frame j of this step's output.
        Y[:, j, :] = _Y[:, j, :]
        # Column j of the returned attentions becomes the next step's input.
        prev_max_attentions = _max_attentions[:, j]

    # Get magnitude: run g.Z with the generated frames fed in as g.Y.
    Z = sess.run(g.Z, {g.Y: Y})
    # Only the first sentence of the batch is converted to audio.
    mag = Z[0]
    # spectrogram2wav is not defined in this notebook; presumably it comes
    # from the `from utils import *` star import - TODO confirm.
    wav = spectrogram2wav(mag)

In [0]:
# Play the synthesized waveform. Use the notebook's own hp_sr parameter:
# no object named `hp` is defined or imported by name in this notebook.
Audio(wav, rate=hp_sr)

In [0]:
# 4th-order Butterworth low-pass with a 7300 Hz cutoff. butter() takes the
# cutoff as a fraction of the Nyquist frequency (hp_sr / 2).
b, a = sg.butter(4, 7300. / (hp_sr / 2.), 'low')
# filtfilt runs the filter forward and then backward over the signal.
wav_fil = sg.filtfilt(b, a, wav)
# Play back at the same hp_sr the filter was designed for (the original
# used hp.sr here, which could disagree with the filter's sample rate).
Audio(wav_fil, rate=hp_sr)