## UUDB (Japanese) Inference (CPU with pyopenjtalk for Kana + Phonemizer)

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import importlib
import time

import commons
import utils 
from models import SynthesizerTrn
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from scipy.io.wavfile import write

In [13]:
# Load config
hps = utils.get_hparams_from_file("./logs/uudb_27/config.json")

# Load text module and symbols
text_module = importlib.import_module(hps.data.text_module)
cleaned_text_to_sequence = text_module.cleaned_text_to_sequence
symbols = text_module.symbols

# Build model
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model)
_ = net_g.eval()


device="cpu"
net_g.to(device)

# Load checkpoint
_ = utils.load_checkpoint("./logs/uudb_27/G_300000.pth", net_g, None)

Mutli-stream iSTFT VITS


In [14]:

import re
import time
import pyopenjtalk
from text_JP.phonemize import Phonemizer

# Initialize Phonemizer
phonemizer = Phonemizer()

def japanese_cleaner_revised(text):
    # This cleaner uses pyopenjtalk to get katakana and then the original phonemizer.
    # It handles special tokens correctly.

    # 1. Split text by special tokens, keeping them as delimiters.
    parts = re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text)
    
    phoneme_parts = []
    for part in parts:
        if not part or part.isspace():
            continue

        # a. Handle special tokens
        if part.startswith('[') and part.endswith(']'):
            content = part[1:-1]
            # Get kana, then phonemize content
            if not content:
                phoneme_parts.append('[ ]')
            else:
                kana_content = pyopenjtalk.g2p(content, kana=True).replace('ヲ', 'オ')
                phoneme_content = phonemizer(kana_content)
                phoneme_parts.append(f'[ {phoneme_content} ]')
            continue

        if part == '{cough}' or part == '<cough>':
            phoneme_parts.append('<cough>')
            continue
            
        if part in '、。':
            phoneme_parts.append('sp')
            continue

        # b. For normal text, get kana and then phonemize
        kana = pyopenjtalk.g2p(part, kana=True).replace('ヲ', 'オ')
        phonemes = phonemizer(kana)
        phoneme_parts.append(phonemes)

    # 3. Join and clean up spaces
    final_text = ' '.join(phoneme_parts)
    return re.sub(r'\s+', ' ', final_text).strip()

# --- Execution ---

# Synthesize Japanese text
text_to_synthesize = "あらゆる現実を、全て自分の方へ捻じ曲げたのだ"

# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text_to_synthesize)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text_to_synthesize), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()

# Convert phonemes to sequence
stn_tst = cleaned_text_to_sequence(phonemized_text)
# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    sid = torch.LongTensor([8]).to(device)
    audio = net_g.infer(x_tst, x_tst_lengths,sid=sid, noise_scale=0.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


Original text: あらゆる現実を、全て自分の方へ捻じ曲げたのだ
Phonemized (revised): a r a y u r u g e N z i t u o sp s u b e t e z i b u N n o h o: e n e z i m a g e t a n o d a
Mora Count: 27
--------------------
Audio duration: 3.31 seconds
Elapsed time: 0.12 seconds
Real Time Factor (RTF): 0.0361
