In [None]:
import os
import sys
from collections import Counter, OrderedDict

import IPython.display
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.signal
import seaborn as sns
import torch
from scipy import stats
from torchsummary import summary

# load other modules --> repo root path
sys.path.insert(0, "../")

from utils import text, audio
from utils.logging import Logger
from params.params import Params as hp
from modules.tacotron2 import Tacotron
from dataset.dataset import TextToSpeechDataset, TextToSpeechDatasetCollection

In [None]:
# Override the analysis settings on the shared `hp` parameter object before any
# audio helper is called below.
hp.sample_rate = 22050
hp.stft_window_ms = 50
hp.stft_shift_ms = 12.5
# 50 ms at 22050 Hz is 1102.5 samples, i.e. 1102 after integer truncation.
hp.num_fft = 1102
hp.num_mels = 80
hp.use_preemphasis = True

#waveform = audio.load("../data/vctk/wav48/p226/p226_012.wav")
waveform = audio.load("../data/ljspeech/wavs/LJ002-0001.wav")
print(audio.duration(waveform))

# Compute both representations of the same clip so they can be compared.
melspec = audio.mel_spectrogram(waveform)
spec = audio.spectrogram(waveform)

# Trailing semicolons keep the calls' return values out of the cell output.
Logger._plot_spectrogram(melspec);
Logger._plot_spectrogram(spec);

print(spec.shape)
print(melspec.shape)

In [None]:
# Play the loaded waveform unchanged, as a reference for the reconstructions below.
IPython.display.Audio(data=waveform, rate=hp.sample_rate)

In [None]:
# NOTE(review): presumably the number of Griffin-Lim iterations used by the
# inversion helper below — confirm in utils.audio.
hp.griffin_lim_iters = 60

# Reconstruct audio from the mel spectrogram and play it.
inverse_melspec = audio.inverse_mel_spectrogram(melspec)
IPython.display.Audio(data=inverse_melspec, rate=hp.sample_rate)

In [None]:
# NOTE(review): presumably the number of Griffin-Lim iterations used by the
# inversion helper below — confirm in utils.audio.
hp.griffin_lim_iters = 60

# Reconstruct audio from the linear spectrogram `spec` and play it.
# NOTE(review): despite its name, `inverse_melspec` holds the inversion of the
# linear spectrogram here, overwriting the value from the mel inversion cell.
inverse_melspec = audio.inverse_spectrogram(spec)
IPython.display.Audio(data=inverse_melspec, rate=hp.sample_rate)

In [None]:
# Sanity check: STFT -> ISTFT round trip on the complex spectrogram. The phase is
# kept, so no Griffin-Lim is involved and hp.griffin_lim_iters is not touched here.
y = waveform
if hp.use_preemphasis:
    # Pre-emphasis as an FIR filter: out[n] = in[n] - hp.preemphasis * in[n-1].
    y = scipy.signal.lfilter([1, -hp.preemphasis], [1], y)

# Window and hop lengths in samples, derived from the millisecond settings.
wf = int(hp.sample_rate * hp.stft_window_ms / 1000)
hf = int(hp.sample_rate * hp.stft_shift_ms / 1000)

# `S` stays complex; later cells read its magnitude and its phase.
S = librosa.stft(y, n_fft=hp.num_fft, hop_length=hf, win_length=wf)
y = librosa.istft(S, hop_length=hf, win_length=wf)

if hp.use_preemphasis:
    # De-emphasis: the same coefficients swapped between numerator and
    # denominator, i.e. the inverse of the filter applied above.
    y = scipy.signal.lfilter([1], [1, -hp.preemphasis], y)

IPython.display.Audio(data=y, rate=hp.sample_rate)

In [None]:
# NOTE(review): presumably the number of Griffin-Lim iterations used by the
# inversion helper below — confirm in utils.audio.
hp.griffin_lim_iters = 60

# Drop the phase of the complex STFT `S` (np.abs) and convert the magnitudes to
# dB; top_db=None is passed, so no dynamic-range threshold is requested.
SS = librosa.amplitude_to_db(np.abs(S), top_db=None)
# NOTE(review): despite its name, `inverse_melspec` holds the inversion of a
# linear (dB-magnitude) spectrogram here.
inverse_melspec = audio.inverse_spectrogram(SS)
IPython.display.Audio(data=inverse_melspec, rate=hp.sample_rate)

In [None]:
# Plot the phase of the complex STFT `S`. The trailing semicolon keeps the call's
# return value out of the cell output, matching the other plotting cells.
Logger._plot_spectrogram(np.angle(S));

# Synthesis

In [None]:
def remove_dataparallel_prefix(state_dict):
    """Return a copy of ``state_dict`` with the DataParallel key prefix removed.

    ``torch.nn.DataParallel`` keeps the wrapped network in its ``module``
    attribute, so a state dict saved from the wrapper has every key prefixed
    with ``"module."``. Keys that do not carry the prefix are copied unchanged,
    which makes the function safe for checkpoints saved without the wrapper.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new ``OrderedDict`` with the same values in the same order; the input
        mapping is not modified.
    """
    prefix = "module."
    return OrderedDict(
        (key[len(prefix):] if key.startswith(prefix) else key, value)
        for key, value in state_dict.items()
    )

In [None]:
def build_model(checkpoint):
    """Restore a Tacotron model from a saved checkpoint.

    The checkpoint is mapped onto the GPU when CUDA is available and onto the
    CPU otherwise. The entry stored under 'parameters' is loaded into the shared
    `hp` object (a global side effect), and the entry stored under 'model' is
    loaded into a fresh `Tacotron` after removing the DataParallel key prefix.

    Args:
        checkpoint: Checkpoint location, handed to `torch.load` as is.

    Returns:
        The `Tacotron` instance with loaded weights, moved to the chosen device.
        The model's train/eval mode is not changed here.
    """
    if torch.cuda.is_available():
        target_device = torch.device("cuda")
    else:
        target_device = torch.device("cpu")

    # NOTE: torch.load unpickles the file; only open checkpoints from a trusted source.
    snapshot = torch.load(checkpoint, map_location=target_device)

    # Tacotron() takes no arguments here, so it presumably reads its configuration
    # from `hp`; keep the parameter restore ahead of the constructor call.
    hp.load_state_dict(snapshot['parameters'])
    network = Tacotron()

    weights = remove_dataparallel_prefix(snapshot['model'])
    network.load_state_dict(weights)
    network.to(target_device)
    return network

In [None]:
def inference(model, sentences=None):
    """Synthesize one spectrogram per input sentence.

    Args:
        model: Object exposing ``inference(tensor)`` that returns a torch tensor.
        sentences: Iterable of strings to synthesize. When omitted, the
            notebook-level ``inputs`` list is used, so a plain
            ``inference(model)`` call behaves as before.

    Returns:
        A list of NumPy arrays, one per sentence, in input order.
    """
    if sentences is None:
        # Fall back to the notebook global so existing call sites keep working.
        sentences = inputs

    use_cuda = torch.cuda.is_available()
    spectrograms = []
    # Synthesis never needs gradients; no_grad avoids building the autograd graph.
    with torch.no_grad():
        for sentence in sentences:
            sequence = torch.LongTensor(text.to_sequence(sentence, use_phonemes=hp.use_phonemes))
            if use_cuda:
                sequence = sequence.cuda(non_blocking=True)
            spectrograms.append(model.inference(sequence).cpu().detach().numpy())

    return spectrograms

In [None]:
# Checkpoint to synthesize from; exactly one assignment should be active.
# Uncomment one of the alternatives to switch to a different checkpoint.
# checkpoint = "../checkpoints/DEUTSCH-GA_loss-299-0.265"
checkpoint = "../checkpoints/BASE_loss-299-0.191"
# checkpoint = "../checkpoints/JAPAN_loss-319-0.254"

In [None]:
# Restore the model from the selected checkpoint; build_model also overwrites `hp`
# with the parameters stored in that checkpoint.
model = build_model(checkpoint)
# The trailing semicolon keeps the value returned by eval() out of the cell output.
model.eval();

In [None]:
# Sentences to synthesize: a mix of romanized Japanese, Czech, English and German.
# `inference` reads this list from the notebook namespace, so this cell must run
# before the synthesis cell.
inputs = ["sumimasen 。",
          "kon nichiwa 、genkidesuka ?",
          "kocchi no ko-i wo uketoru toki 、 aite no yari kane nai gyo-san na aisatsu mo azayaka ni egakidasa re ta ",
          "dobry den 、 ja jsem z nemecka 。",
          "Dobry den, ja jsem z Nemecka, prd prd prd.",
          "Hello, it is me. I am from Germany.",
          "He has agreed a deal with the EU but the bill implementing it has been put on hold.",
          "Just returned to the United States after spending a great Thanksgiving with our Courageous American Warriors in Afghanistan!",
          "President of the United States of America, by virtue of the authority vested in me by the Constitution and the laws of the United States, do hereby proclaim Thursday, as a National Day of Thanksgiving.",
          "Guten Tag, wie geht es dir?",
          "Es geht mir gut, danke.",
          "Erlauben Sie bitte, dass ich mich kurz vorstelle. Ich heiße Jana Novakova.",
          "Ein aktueller Bericht der Bundesnetzagentur zeigt, dass die Preise an der Strombörse deutlich steigen."]

In [None]:
# Synthesize every sentence in the notebook-level `inputs` list; the result is a
# list with one spectrogram array per sentence.
generated_spectrograms = inference(model)

In [None]:
# NOTE(review): presumably Griffin-Lim settings read by audio.inverse_spectrogram —
# confirm in utils.audio.
hp.griffin_lim_iters = 60
hp.griffin_lim_power = 1.45

# The same flag is passed to both helpers, so evaluate it once outside the loop.
# NOTE(review): presumably it tells the helpers the spectrogram is mel-scaled
# rather than linear — confirm against their signatures in utils.audio.
not_linear = not hp.predict_linear

# Denormalize each generated spectrogram, invert it to a waveform and show a player.
for generated in generated_spectrograms:
    denormalized = audio.denormalize_spectrogram(generated, not_linear)
    reconstructed = audio.inverse_spectrogram(denormalized, not_linear)
    IPython.display.display(IPython.display.Audio(data=reconstructed, rate=hp.sample_rate))