In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import importlib

import commons
import utils
from models import SynthesizerTrn

from scipy.io.wavfile import write

def get_text(text, hps, text_to_sequence_func):
    """Encode ``text`` as a ``torch.LongTensor`` of symbol IDs.

    Args:
        text: The input string to encode.
        hps: Hyperparameter object; this function reads
            ``hps.data.text_cleaners`` and ``hps.data.add_blank``.
        text_to_sequence_func: Callable invoked as
            ``text_to_sequence_func(text, cleaner_names)``; its return value
            is the ID sequence that gets wrapped in the tensor.

    Returns:
        A ``torch.LongTensor`` built from the (optionally interspersed) IDs.
    """
    data_cfg = hps.data
    ids = text_to_sequence_func(text, data_cfg.text_cleaners)
    # With add_blank set, the IDs go through commons.intersperse with 0 as the
    # filler item (presumably a blank token between symbols — confirm in commons).
    return torch.LongTensor(commons.intersperse(ids, 0) if data_cfg.add_blank else ids)

DEBUG:matplotlib.pyplot:Loaded backend module://matplotlib_inline.backend_inline version unknown.


## UUDB (Japanese) Inference (CPU)

In [10]:
# Hyperparameters for the UUDB model are read from its JSON config.
hps = utils.get_hparams_from_file("./configs/uudb_base.json")

# The config names the text front-end module to import; take the symbol table
# and the text-to-ID converter from it.
text_module = importlib.import_module(hps.data.text_module)
symbols, text_to_sequence = text_module.symbols, text_module.text_to_sequence

# Positional arguments for the generator, derived from the config.
# NOTE(review): the names below are a guess at what SynthesizerTrn expects
# (vocabulary size, spectrogram bins, frames per training segment) — confirm
# against models.SynthesizerTrn.
n_symbols = len(symbols)
n_spec_bins = hps.data.filter_length // 2 + 1
n_segment_frames = hps.train.segment_size // hps.data.hop_length

# Build the generator and put it in eval mode; the result is bound to `_` so
# nothing is echoed as cell output.
net_g = SynthesizerTrn(n_symbols, n_spec_bins, n_segment_frames, **hps.model)
_ = net_g.eval()

# Load the trained weights into the generator (the third argument is None).
# IMPORTANT: Change the path to your trained UUDB model checkpoint
_ = utils.load_checkpoint("./logs/uudb_base/G_1900000.pth", net_g, None)


INFO:root:Loaded checkpoint './logs/uudb_base/G_1900000.pth' (iteration 15834)


In [11]:
import time

# Synthesize Japanese text
text_to_synthesize = "[エト]アノメノマエニドウグガアルトオモウノダケド"

# Phonemize (for display only — the model input below is produced by get_text).
# The text_module is loaded in the previous cell.
# NOTE(review): only the first configured cleaner is applied here; if the
# config lists several cleaners, this preview may differ from what
# text_to_sequence actually feeds the model.
cleaner_names = hps.data.text_cleaners
cleaner = getattr(text_module.cleaners, cleaner_names[0])
phonemized_text = cleaner(text_to_synthesize)

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized: {phonemized_text}")

# Time text encoding + inference. perf_counter() is a monotonic,
# high-resolution clock, so the measured interval cannot be skewed by
# system clock adjustments the way time.time() can.
start_time = time.perf_counter()
# get_text is defined in the first cell
stn_tst = get_text(text_to_synthesize, hps, text_to_sequence)

with torch.no_grad():
    # Add a leading batch dimension of size 1 and pass the sequence length.
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    # NOTE(review): the noise_scale arguments suggest inference is stochastic;
    # consider seeding torch beforehand if reproducible audio is needed.
    infer_outputs = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=.667,
        noise_scale_w=0.8,
        length_scale=1,
    )
    # First output, element [0, 0], converted to a float32 NumPy array on CPU.
    # .detach() replaces the legacy .data accessor.
    audio = infer_outputs[0][0, 0].detach().cpu().float().numpy()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


Original text: [エト]アノメノマエニドウグガアルトオモウノダケド
Phonemized: [ e t o  ]a n o m e n o m a e n i d o u g u g a a r u t o o m o u n o d a k e d o
Elapsed time: 0.27 seconds
