In [None]:
import yaml
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

from tensorflow_tts.processor.ljspeech import LJSpeechProcessor
from tensorflow_tts.processor.ljspeech import symbols, _symbol_to_id

from tensorflow_tts.configs import Tacotron2Config
from tensorflow_tts.models import TFTacotron2

import IPython.display as ipd

In [None]:
# Read the experiment configuration from the YAML file in the working directory.
# NOTE(review): yaml.Loader can construct arbitrary Python objects, so this is
# only safe for config files you trust; if the file holds plain mappings and
# scalars only, yaml.safe_load would be the safer choice — confirm before changing.
with open('./config.yml') as f:
    config = yaml.load(f, Loader=yaml.Loader)

In [None]:
# Build the Tacotron-2 config object from the "tacotron2_params" section of the
# loaded YAML mapping.
# This cell rebinds `config` from the raw mapping to a Tacotron2Config, so
# without the guard a second run of the cell would try to index the
# already-converted object and fail. Any other unexpected type still goes
# through the original expression and errors out as before.
if not isinstance(config, Tacotron2Config):
    config = Tacotron2Config(**config["tacotron2_params"])

In [None]:
# Override the prenet dropout rate that came from config.yml before the model is built.
# NOTE(review): 0.7 is a hand-picked value and the model below is created with
# training=False — confirm the prenet still applies dropout at inference and
# that this value is intended.
config.prenet_dropout_rate = 0.7

In [None]:
# Instantiate the Tacotron-2 model from the config with training=False
# (this notebook only calls the model's inference entry point).
tacotron2 = TFTacotron2(config=config, training=False, name="tacotron2")

In [None]:
# Call the model's private build hook before summary() and load_weights().
# NOTE(review): presumably this runs a dummy forward pass so that all variables
# exist — confirm against TFTacotron2._build.
tacotron2._build()

In [None]:
# Show the summary of the built model.
tacotron2.summary()

In [None]:
# Restore trained weights from a checkpoint file in the working directory.
# NOTE(review): the "35000" in the file name presumably is the training step — confirm.
tacotron2.load_weights("./model-35000.h5")

In [None]:
# Sentence to synthesize; the following cell lower-cases it and converts it to symbol ids.
input_text = "But Mrs. Solomons could not resist the temptation to dabble in stolen goods, and she was found shipping watches of the wrong category to New York."

In [None]:
# Convert the lower-cased sentence to symbol ids with the LJSpeech processor.
processor = LJSpeechProcessor(None, None)
token_ids = processor.text_to_sequence(input_text.lower(), ["english_cleaners"])

# Append the id of the last entry in `symbols` to the sequence.
# NOTE(review): presumably that entry is the end-of-sequence token — confirm
# against the processor's symbol table.
last_symbol_id = len(symbols) - 1
input_ids = np.concatenate([token_ids, [last_symbol_id]], -1)

In [None]:
# Wrap the single utterance as a batch of one for the model.
batch_input_ids = np.expand_dims(input_ids, 0)
batch_input_lengths = np.array([len(input_ids)])
batch_speaker_ids = np.array([0])

# win_front / win_back are passed through unchanged.
# NOTE(review): with use_window_mask=False they are presumably ignored —
# confirm against TFTacotron2.inference.
inference_outputs = tacotron2.inference(
    input_ids=batch_input_ids,
    input_lengths=batch_input_lengths,
    speaker_ids=batch_speaker_ids,
    use_window_mask=False,
    win_front=4,
    win_back=6,
)
decoder_output, mel_outputs, stop_token_prediction, alignment_history = inference_outputs

In [None]:
# Plot the attention alignment of the first batch item as a heat map.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(
    alignment_history[0].numpy(),
    interpolation='none',
    origin='lower',
    aspect='auto',
)
ax.set_title('Alignment steps')

# Columns of the image are labelled as decoder steps, rows as encoder steps.
xlabel = 'Decoder timestep'
ax.set_xlabel(xlabel)
ax.set_ylabel('Encoder timestep')

fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()
plt.close()

In [None]:
# Inspect the shape of the alignment tensor. The surrounding cells index axis 0
# as the batch item and plot the remaining two axes as encoder step (rows) by
# decoder step (columns).
alignment_history.numpy().shape

In [None]:
def get_durations_from_alignments(alignment):
    """Count, for every row of an alignment matrix, how many columns peak on it.

    Each column is assigned to the row that holds its maximum value (the first
    such row when several rows tie), and the number of columns assigned to
    every row is returned.

    Args:
        alignment: 2-D array-like of attention weights. This notebook passes a
            matrix laid out as (encoder steps, decoder steps), so the result is
            the number of decoder frames attributed to each input token.

    Returns:
        1-D integer ``np.ndarray`` of length ``alignment.shape[0]``. Its sum
        equals ``alignment.shape[1]``.

    Raises:
        ValueError: if ``alignment`` is not two-dimensional.
    """
    alignment = np.asarray(alignment)
    if alignment.ndim != 2:
        raise ValueError(
            f"alignment must be 2-D, got shape {alignment.shape}"
        )

    n_rows, n_cols = alignment.shape
    if n_cols == 0:
        # No columns to assign: every row gets a zero count.
        return np.zeros(n_rows, dtype=np.intp)

    # argmax returns the first index of the maximum, i.e. the same row that a
    # linear search for the column maximum would find.
    winning_rows = np.argmax(alignment, axis=0)

    # minlength keeps rows that never win in the output with a count of 0.
    return np.bincount(winning_rows, minlength=n_rows)

In [None]:
# Compute the per-row column counts for the alignment of batch item 0.
D = get_durations_from_alignments(alignment_history.numpy()[0])

In [None]:
# Display the duration vector computed from the alignment.
D

In [None]:
# Collapse the leading dimensions so every row holds 80 values, and convert
# the result to a NumPy array. `mel_outputs` is rebound here and the later
# vocoder cells use this reshaped array.
mel_outputs = tf.reshape(mel_outputs, (-1, 80)).numpy()

fig = plt.figure(figsize=(10, 8))
# Only the top slot of a 3-row grid is used; the remaining rows stay empty.
ax1 = fig.add_subplot(3, 1, 1)
im = ax1.imshow(np.rot90(mel_outputs), interpolation='none', aspect='auto')
ax1.set_title('Predicted Mel-after-Spectrogram')
fig.colorbar(im, ax=ax1, orientation='horizontal', shrink=0.65)
plt.show()
plt.close()

## MelGAN vocoder

Synthesize a waveform from the predicted mel-spectrogram with a pretrained MelGAN generator.

In [None]:
from tensorflow_tts.models import TFMelGANGenerator
from tensorflow_tts.configs import MelGANGeneratorConfig

In [None]:
# Use a dedicated name for the vocoder config so that the Tacotron-2 `config`
# defined earlier in the notebook is not overwritten by this cell.
melgan_config = MelGANGeneratorConfig(is_weight_norm=False)
melgan = TFMelGANGenerator(config=melgan_config, name='melgan_generator')

# Run one forward pass on the predicted mel-spectrogram (as a batch of one) so
# the generator is built before weights are loaded into it.
melgan(np.expand_dims(mel_outputs, 0))  # build model.

In [None]:
# Load pretrained generator weights from a path relative to the working directory.
# NOTE(review): "2080000" in the file name presumably is the training step — confirm.
melgan.load_weights('./pretrained/generator-2080000.h5')

In [None]:
# Accept any batch size and any number of frames, with 80 values per frame.
mel_signature = [tf.TensorSpec(shape=[None, None, 80], dtype=tf.float32)]

# Wrap the generator in a tf.function with that fixed input signature.
# After this cell `melgan` refers to the wrapper rather than the Keras model,
# so cells that call model methods on it (e.g. load_weights) must run before
# this one.
melgan = tf.function(
    melgan,
    input_signature=mel_signature,
    experimental_relax_shapes=True,
)

In [None]:
# Vocode the mel-spectrogram as a batch of one.
mel_batch = np.expand_dims(mel_outputs, 0)
generated = melgan(mel_batch)

# Keep batch item 0 and index 0 of the last axis as a 1-D NumPy waveform.
audio_pred = generated[0, :, 0].numpy()

In [None]:
# Render an audio player for the generated waveform.
# NOTE(review): 22050 Hz presumably matches the sampling rate the models were
# trained on — confirm against the training config.
ipd.Audio(audio_pred, rate=22050)

In [None]:
# Plot the generated waveform against its sample index on explicit, labelled
# axes so the figure can be read on its own; ending with plt.show() keeps the
# Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(audio_pred)
ax.set_title('Generated waveform')
ax.set_xlabel('Sample index')
ax.set_ylabel('Amplitude')
plt.show()