In [None]:
import os
import sys
import numpy as np
import pandas as pd
import scipy
import IPython.display

import librosa
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from collections import Counter, OrderedDict
from torchsummary import summary

# load other modules --> repo root path
sys.path.insert(0, "../")

import torch
from utils import text, audio
from utils.logging import Logger
from params.params import Params as hp
from modules.tacotron2 import Tacotron
from dataset.dataset import TextToSpeechDataset, TextToSpeechDatasetCollection

In [None]:
# Override the audio-analysis settings on the shared `hp` parameters object
# before any features are computed. The `audio` helpers below are called
# without these values, so they presumably read them from `hp` — TODO confirm
# in utils/audio.
hp.sample_rate = 22050
hp.stft_window_ms = 50
hp.stft_shift_ms = 12.5
# 50 ms at 22050 Hz is 1102.5 samples, so 1102 looks like the window length
# rounded down — NOTE(review): confirm that this is the intent.
hp.num_fft = 1102
hp.num_mels = 80
hp.use_preemphasis = True

# Load a single LJSpeech utterance; it is the reference signal for the cells below.
waveform = audio.load("../data/ljspeech/wavs/LJ002-0001.wav")

# Compute two representations of the same waveform: mel-scaled and plain spectrogram.
melspec = audio.mel_spectrogram(waveform)
spec = audio.spectrogram(waveform)

# Plot both. `_plot_spectrogram` is a private helper of `Logger`; the trailing
# `;` suppresses the notebook's echo of the return value.
Logger._plot_spectrogram(melspec);
Logger._plot_spectrogram(spec);

In [None]:
# Embed an audio player for the original waveform, played at `hp.sample_rate`.
IPython.display.Audio(data=waveform, rate=hp.sample_rate)

In [None]:
# Set the Griffin-Lim iteration count on `hp`; presumably read by the inversion
# helper below — TODO confirm in utils/audio.
hp.griffin_lim_iters = 60

# Reconstruct a waveform from the mel spectrogram and embed a player for it.
inverse_melspec = audio.inverse_mel_spectrogram(melspec)
IPython.display.Audio(data=inverse_melspec, rate=hp.sample_rate)

In [None]:
# Same value as in the mel-inversion cell; repeated so this cell does not
# depend on that one having been executed.
hp.griffin_lim_iters = 60

# Reconstruct a waveform from `spec` (the non-mel spectrogram) and embed a player for it.
# NOTE(review): despite its name, `inverse_melspec` holds the inversion of
# `spec` here and overwrites the result of the mel-inversion cell; a name such
# as `inverse_spec` would be clearer.
inverse_melspec = audio.inverse_spectrogram(spec)
IPython.display.Audio(data=inverse_melspec, rate=hp.sample_rate)

# Synthesis

In [None]:
def remove_dataparallel_prefix(state_dict):
    """Return a copy of `state_dict` with the `module.` key prefix removed.

    `torch.nn.DataParallel` stores the wrapped network under the attribute
    `module`, so every key of a state dict saved from the wrapper starts with
    `module.`. Such keys must be renamed before the weights can be loaded into
    an unwrapped model.

    Only keys that actually start with `module.` are changed; all other keys
    are copied unchanged. Checkpoints saved without `DataParallel` therefore
    pass through intact instead of losing the first seven characters of every
    key.

    Args:
        state_dict: Mapping from parameter name to value.

    Returns:
        A new `OrderedDict` with the same values in the same order.
    """
    prefix = "module."
    new_state_dict = OrderedDict()
    for key, value in state_dict.items():
        # Strip a single leading prefix only; never touch unprefixed keys.
        name = key[len(prefix):] if key.startswith(prefix) else key
        new_state_dict[name] = value
    return new_state_dict

In [None]:
def build_model(checkpoint):
    """Restore a Tacotron model, together with its hyper-parameters, from disk.

    The checkpoint is read with `torch.load` and must contain the entries
    `'parameters'` (passed to `hp.load_state_dict`) and `'model'` (the network
    weights, with their DataParallel prefix removed before loading).

    Args:
        checkpoint: Path or file-like object accepted by `torch.load`.

    Returns:
        The `Tacotron` instance with weights loaded, moved to CUDA when
        available and to the CPU otherwise. The model is not switched to
        evaluation mode here; callers do that themselves.
    """
    target = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    snapshot = torch.load(checkpoint, map_location=target)

    # Restore the global hyper-parameters first; presumably `Tacotron()` reads
    # its configuration from `hp` — keep this order.
    hp.load_state_dict(snapshot['parameters'])

    network = Tacotron()
    weights = remove_dataparallel_prefix(snapshot['model'])
    network.load_state_dict(weights)
    network.to(target)
    return network

In [None]:
def inference(model, inputs):
    """Generate one spectrogram per input line.

    Args:
        model: Object exposing `inference(text, speaker=..., language=...)`
            whose result supports `.cpu().detach().numpy()`.
        inputs: Iterable of strings formatted as `text|speaker|language`.
            Empty strings are skipped. The speaker field is only read when
            `hp.multi_speaker` is set, the language field only when
            `hp.multi_language` is set.

    Returns:
        List with one NumPy array per non-empty input, in input order.

    Raises:
        IndexError: If a required speaker/language field is missing.
        ValueError: If the speaker or language is not listed in
            `hp.unique_speakers` / `hp.languages`.
    """
    entries = [line.rstrip().split('|') for line in inputs if line]
    use_cuda = torch.cuda.is_available()

    spectrograms = []
    # Synthesis never needs gradients; disabling them avoids building an
    # autograd graph for every utterance.
    with torch.no_grad():
        for fields in entries:
            t = torch.LongTensor(text.to_sequence(fields[0], use_phonemes=hp.use_phonemes))
            l = torch.LongTensor([hp.languages.index(fields[2])]) if hp.multi_language else None
            s = torch.LongTensor([hp.unique_speakers.index(fields[1])]) if hp.multi_speaker else None

            if use_cuda:
                t = t.cuda(non_blocking=True)
                # Compare against None explicitly: the truth value of a
                # one-element tensor is the truth value of its element, so a
                # plain `if l:` would skip the transfer for index 0.
                if l is not None:
                    l = l.cuda(non_blocking=True)
                if s is not None:
                    s = s.cuda(non_blocking=True)

            spectrograms.append(model.inference(t, speaker=s, language=l).cpu().detach().numpy())

    return spectrograms

In [None]:
# Relative path of the trained checkpoint that `build_model` loads.
checkpoint = "../checkpoints/FRGE-SEP_loss-89-0.143"

In [None]:
# Restore the model from the checkpoint, then switch it to evaluation mode.
# The trailing `;` keeps the notebook from echoing whatever `eval()` returns.
model = build_model(checkpoint)
model.eval();

In [None]:
# Utterances to synthesize, one per entry, formatted as `text|speaker|language`
# (the fields `inference` splits on `|`). The speaker field is left empty here.
inputs = ["erlauben sie bitte, dass ich mich kurz vorstelle. ich heiße jana novakova.||german",
          "les socialistes et les républicains sont venus apporter leurs voix à la majorité pour ce texte.||french"]

In [None]:
# Run the model on every input line; yields a list with one NumPy array per utterance.
generated_spectrograms = inference(model, inputs)

In [None]:
# Griffin-Lim settings applied while turning the generated spectrograms into audio.
hp.griffin_lim_iters = 60
hp.griffin_lim_power = 1.45

# For each generated spectrogram: undo the normalization, invert it to a
# waveform and embed an audio player for the result.
for generated in generated_spectrograms:
    not_linear = not hp.predict_linear
    denormalized = audio.denormalize_spectrogram(generated, not_linear)
    reconstructed = audio.inverse_spectrogram(denormalized, not_linear)
    IPython.display.display(
        IPython.display.Audio(data=reconstructed, rate=hp.sample_rate)
    )