## UUDB (Japanese) Inference (CPU with pykakasi)

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import importlib

import commons
import utils
from models import SynthesizerTrn

from scipy.io.wavfile import write


In [2]:
# Location of the training run to load. The directory is named once so that
# the config and the generator checkpoint always come from the same run.
RUN_DIR = "./logs/uudb_6"
CONFIG_PATH = f"{RUN_DIR}/config.json"
CHECKPOINT_PATH = f"{RUN_DIR}/G_700000.pth"

# Load hyperparameters
hps = utils.get_hparams_from_file(CONFIG_PATH)

# The text front-end is selected by the config (hps.data.text_module); it
# supplies both the symbol inventory and the phoneme-string -> id converter.
text_module = importlib.import_module(hps.data.text_module)
cleaned_text_to_sequence = text_module.cleaned_text_to_sequence
symbols = text_module.symbols

# Build the generator; all three positional sizes are derived from the config
# and the symbol inventory, the remaining options come from hps.model.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
# Switch to evaluation mode before inference (return value is not needed).
_ = net_g.eval()

# Load checkpoint weights into net_g. The third argument is passed as None
# (presumably the optimizer slot -- confirm against utils.load_checkpoint).
# Assigning to `_` keeps the return value out of the cell output.
_ = utils.load_checkpoint(CHECKPOINT_PATH, net_g, None)

Mutli-stream iSTFT VITS


  WeightNorm.apply(module, name, dim)
  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')


In [3]:
import re
import time
from pykakasi import kakasi
from text_JP.symbols import table_jpn, table2_jpn

def mora2phoneme_jpn(text):
    """Rewrite the katakana morae in ``text`` as space-separated phonemes.

    Entries of ``table2_jpn`` are substituted first, then entries of
    ``table_jpn``; every matched mora is replaced by its phoneme string
    followed by a single space. After that, a long-vowel mark ``ー`` that
    sits directly after such a space is folded into the preceding phoneme
    as ``:``.

    Characters that appear in neither table are passed through unchanged and
    do not receive a separating space.

    Args:
        text: The string to convert.

    Returns:
        The converted string with trailing whitespace stripped.
    """
    # Table order is significant: table2_jpn must be applied before table_jpn.
    for mora_table in (table2_jpn, table_jpn):
        for mora, phoneme in mora_table.items():
            text = text.replace(mora, phoneme + " ")
    # "<phoneme> ー" -> "<phoneme>: "
    lengthened = text.replace(" ー", ": ")
    return lengthened.rstrip()

# Create the pykakasi converter once at module level; japanese_cleaner_kakasi
# reuses this single instance on every call instead of building a new one.
kks = kakasi()

def japanese_cleaner_kakasi(text):
    """Turn raw Japanese text into a normalized, space-separated phoneme string.

    Pipeline:
      1. Pad marker tokens with spaces so they end up as separate tokens:
         every literal space becomes `` sp ``, ``{cough}`` is rewritten to
         ``<cough>``, and ``<cough>``, ``[`` and ``]`` are surrounded by
         spaces.
      2. Run the text through the module-level ``kks`` converter and
         concatenate the ``'kana'`` field of every returned item.
      3. Convert the result to phonemes with ``mora2phoneme_jpn``.
      4. Collapse all runs of whitespace to single spaces and trim the ends.

    Args:
        text: Input text, optionally containing the markers listed above.

    Returns:
        The phoneme string with single-space separation.
    """
    # Order matters: literal spaces are rewritten first, so the padding added
    # by the later rules is never itself turned into "sp" tokens.
    # NOTE(review): other punctuation such as "、" is not padded here, so it
    # can end up attached to the following phoneme -- confirm this is what
    # cleaned_text_to_sequence expects.
    padding_rules = (
        (' ', ' sp '),
        ('<cough>', ' <cough> '),
        ('{cough}', ' <cough> '),
        ('[', ' [ '),
        (']', ' ] '),
    )
    for marker, padded in padding_rules:
        text = text.replace(marker, padded)

    # Concatenate the 'kana' reading of each chunk returned by the converter.
    kana_chunks = (chunk['kana'] for chunk in kks.convert(text))
    phonemes = mora2phoneme_jpn("".join(kana_chunks))

    # split() with no arguments discards the duplicate spaces introduced above.
    return " ".join(phonemes.split())

# Synthesize Japanese text
text_to_synthesize = "[えっと]、{cough}シーとディーがあって、シーが、おじいちゃんが[なんか]しゃべってんだけど、[と]、台詞が、そうじゃわしは死んどったんじゃ、いつまでもこうしてるわけにはいかんなあってやつで、もう一個のやつが、もう少しここにいようかねって"

# Sampling parameters forwarded to net_g.infer, named here so they can be
# tuned in one place instead of being buried in the call.
NOISE_SCALE = 0.667
NOISE_SCALE_W = 0.8
LENGTH_SCALE = 1

# Phonemize using the new cleaner
phonemized_text = japanese_cleaner_kakasi(text_to_synthesize)

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized: {phonemized_text}")
print("--------------------")

# perf_counter() is the clock intended for measuring short durations; unlike
# time.time() it is not affected by system clock adjustments.
start_time = time.perf_counter()

# Convert phonemes to a sequence of symbol ids
stn_tst = cleaned_text_to_sequence(phonemized_text)
# When the config requests it, interleave id 0 between the symbol ids
# (presumably the blank token -- confirm against commons.intersperse).
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    # Add a leading dimension of size 1 and pass the sequence length alongside.
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    infer_output = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=NOISE_SCALE,
        noise_scale_w=NOISE_SCALE_W,
        length_scale=LENGTH_SCALE,
    )
    # First element of the result, indexed at [0, 0], converted to a float
    # NumPy array on the CPU. detach() replaces the discouraged `.data`.
    audio = infer_output[0][0, 0].detach().cpu().float().numpy()

end_time = time.perf_counter()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


Original text: [えっと]、{cough}シーとディーがあって、シーが、おじいちゃんが[なんか]しゃべってんだけど、[と]、台詞が、そうじゃわしは死んどったんじゃ、いつまでもこうしてるわけにはいかんなあってやつで、もう一個のやつが、もう少しここにいようかねって
Phonemized: [ e Q t o ] 、 <cough> s i: t o d i: g a a Q t e 、s i: g a 、o z i i ch a N g a [ n a N k a ] sy a b e Q t e N d a k e d o 、 [ t o ] 、s e r i h u g a 、s o u zy a w a s i h a s i N d o Q t a N zy a 、i t u m a d e m o k o u s i t e r u w a k e n i h a i k a N n a a Q t e y a t u d e 、m o u i Q k o n o y a t u g a 、m o u s u k o s i k o k o n i i y o u k a n e Q t e
--------------------
Elapsed time: 0.35 seconds
