## UUDB (Japanese) Inference (CPU with pyopenjtalk)

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import importlib
import time
import commons
import utils
from models import SynthesizerTrn

from scipy.io.wavfile import write

  from pkg_resources import resource_filename


In [2]:
# Load config
hps = utils.get_hparams_from_file("./logs/uudb_6/config.json")

# Load text module and symbols
text_module = importlib.import_module(hps.data.text_module)
cleaned_text_to_sequence = text_module.cleaned_text_to_sequence
symbols = text_module.symbols

# Build model
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/uudb_6/G_700000.pth", net_g, None)

Mutli-stream iSTFT VITS


In [4]:
import re
import time
import pyopenjtalk

def openjtalk_cleaner(text):
    # Isolate special tokens by adding spaces around them.
    text = text.replace('<cough>', ' <cough> ')
    text = text.replace('{cough}', ' <cough> ')
    text = text.replace('[', ' [ ')
    text = text.replace(']', ' ] ')

    # Convert text to phonemes.
    phonemes = pyopenjtalk.g2p(text)

    # Replace pyopenjtalk's pause symbol 'pau' with 'sp'
    phonemes = phonemes.replace('pau', 'sp')

    # Final cleanup of spaces.
    final_text = " ".join(phonemes.split())
    
    return final_text

# Synthesize Japanese text
text_to_synthesize = "でもなん、[なんか]、<cough>、この餅を食べてる、お母さんを見て、[なんか]その女の子が、あって書いてあるのね"

# --- Mora Count ---
# Get the kana reading from pyopenjtalk
kana_reading = pyopenjtalk.g2p(text_to_synthesize, kana=True)
# The number of characters in the kana reading corresponds to the number of morae
mora_count = len(kana_reading)

# Phonemize using the new openjtalk-based cleaner
phonemized_text = openjtalk_cleaner(text_to_synthesize)

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized: {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()

# Convert phonemes to sequence
stn_tst = cleaned_text_to_sequence(phonemized_text)
# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


Original text: でもなん、[なんか]、<cough>、この餅を食べてる、お母さんを見て、[なんか]その女の子が、あって書いてあるのね
Phonemized: d e m o n a N sp n a N k a sp sh i i o o y u u j i i e i ch i sp k o n o m o ch i o t a b e t e r u sp o k a a s a N o m i t e sp n a N k a sp s o n o o N n a n o k o g a sp a cl t e k a i t e a r u n o n e
Mora Count: 68
--------------------
Elapsed time: 0.20 seconds


In [11]:
import re
import time
import pyopenjtalk

def openjtalk_cleaner(text):
    # Use a regex to find and separate all special tokens and punctuation
    parts = re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text)
    
    phoneme_parts = []
    for part in parts:
        if not part or part.isspace():
            continue
        
        if part == '{cough}' or part == '<cough>':
            phoneme_parts.append('<cough>')
        elif part.startswith('[') and part.endswith(']'):
            content = part[1:-1]
            # Only phonemize if there is content inside the brackets
            if content:
                # Phonemize and map pyopenjtalk phonemes to model phonemes
                phonemes = pyopenjtalk.g2p(content).replace('pau', 'sp').replace('cl', 'Q').replace('R', ':')
                phoneme_parts.extend(['[', phonemes, ']'])
            else:
                phoneme_parts.extend(['[', ']'])
        elif part in '、。':
            phoneme_parts.append('sp')
        else:
            # Phonemize and map pyopenjtalk phonemes to model phonemes
            phonemes = pyopenjtalk.g2p(part).replace('pau', 'sp').replace('cl', 'Q').replace('R', ':')
            phoneme_parts.append(phonemes)

    # Join all parts and normalize spaces
    final_text = " ".join(phoneme_parts)
    final_text = " ".join(final_text.split())
    
    return final_text

# Synthesize Japanese text
text_to_synthesize = "でもなん、[なんか]、<cough>、この餅を食べてる、お母さんを見て、[なんか]その女の子が、あって書いてあるのね"

# --- Mora Count ---
kana_reading = pyopenjtalk.g2p(text_to_synthesize, kana=True)
mora_count = len(kana_reading)

# Phonemize using the new openjtalk-based cleaner
phonemized_text = openjtalk_cleaner(text_to_synthesize)

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized: {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()

# Convert phonemes to sequence
stn_tst = cleaned_text_to_sequence(phonemized_text)
# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


Original text: でもなん、[なんか]、<cough>、この餅を食べてる、お母さんを見て、[なんか]その女の子が、あって書いてあるのね
Phonemized: d e m o n a N sp [ n a N k a ] sp <cough> sp k o n o m o ch i o t a b e t e r u sp o k a a s a N o m i t e sp [ n a N k a ] s o n o o N n a n o k o g a sp a Q t e k a i t e a r u n o n e
Mora Count: 68
--------------------
Elapsed time: 0.19 seconds


In [21]:
import pyopenjtalk
import platform
import re

print("--- PyOpenJTalk Diagnostics ---")

# 1. バージョン情報を表示
print(f"PyOpenJTalk Version: {pyopenjtalk.__version__}")
print(f"Python Version: {platform.python_version()}")
print("-" * 20)

# 2. G2Pの出力テスト
# いくつかの重要なケースで、生の音素出力がどうなっているかを確認します。
test_cases = [
    "シー",          # カタカナ＋長音記号
    "コーヒー",        # 複数の長音記号
    "ここにいよう",    # 問題の「に」「い」が連続するケース
    "そう",          # 「おう」の長音
    "おじいさん"       # 「いい」の長音
]

print("G2P RAW OUTPUTS:")
for text in test_cases:
    phonemes = pyopenjtalk.g2p(text)
    # repr() を使って、スペースや特殊文字を正確に表示します
    print(f"Input: '{text}' -> Output: {repr(phonemes)}")

print("\n" + "-" * 20)
# 3. 以前の正規表現がどうマッチするかをテスト
print("REGEX BEHAVIOR TEST:")
problem_phonemes = pyopenjtalk.g2p("ここにいよう")
print(f"Testing on phonemes: {repr(problem_phonemes)}")

match = re.search(r'(?<![aiueo]\s)([aiueo])\s\1', problem_phonemes)
if match:
    print("-> Regex DID match. This is unexpected.")
else:
    print("-> Regex did NOT match. This is the expected behavior.")

print("--- End of Diagnostics ---")

--- PyOpenJTalk Diagnostics ---
PyOpenJTalk Version: 0.4.1
Python Version: 3.9.23
--------------------
G2P RAW OUTPUTS:
Input: 'シー' -> Output: 's i i'
Input: 'コーヒー' -> Output: 'k o o h i i'
Input: 'ここにいよう' -> Output: 'k o k o n i i y o o'
Input: 'そう' -> Output: 's o o'
Input: 'おじいさん' -> Output: 'o j i i s a N'

--------------------
REGEX BEHAVIOR TEST:
Testing on phonemes: 'k o k o n i i y o o'
-> Regex DID match. This is unexpected.
--- End of Diagnostics ---


In [10]:
import re
import pyopenjtalk

def openjtalk_cleaner(text):
    """
    例外処理によって、より正確に長音を扱う最終版クリーナー。
    """
    parts = re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text)
    
    phoneme_parts = []
    for part in parts:
        if not part or part.isspace():
            continue
        
        if part == '{cough}' or part == '<cough>':
            phoneme_parts.append('<cough>')
        elif part.startswith('[') and part.endswith(']'):
            content = part[1:-1]
            if content:
                phonemes = pyopenjtalk.g2p(content)
                phonemes = _phonemize_text(phonemes)
                phoneme_parts.extend(['[', phonemes, ']'])
            else:
                phoneme_parts.extend(['[', ']'])
        elif part in '、。':
            phoneme_parts.append('sp')
        else:
            phonemes = pyopenjtalk.g2p(part)
            phonemes = _phonemize_text(phonemes)
            phoneme_parts.append(phonemes)

    final_text = " ".join(phoneme_parts)
    final_text = " ".join(final_text.split())
    
    return final_text

def _phonemize_text(phonemes):
    """音素テキストの長音とポーズを処理する補助関数"""
    # 1. 基本的な記号を置換
    phonemes = phonemes.replace('R', ':').replace('cl', 'Q').replace('pau', 'sp')
    
    # 2. 連続する母音をすべて長音記号 ':' に置換する
    phonemes = re.sub(r'([aiueo])\s\1', r'\1:', phonemes)
    
    # 3. ★★★ 例外処理 ★★★
    # 誤って長音化された特定のパターンを元に戻す
    # 今後、もし他の例外が見つかれば、ここに追加していくことができます。
    phonemes = phonemes.replace('n a:', 'n a a')
    phonemes = phonemes.replace('n i:', 'n i i')
    phonemes = phonemes.replace('n u:', 'n u u')
    phonemes = phonemes.replace('n e:', 'n e e')
    phonemes = phonemes.replace('n o:', 'n o o')
    phonemes = phonemes.replace('sh a', 's a')
    phonemes = phonemes.replace('sh i', 's i')
    phonemes = phonemes.replace('sh u', 's u')
    phonemes = phonemes.replace('sh e', 's e')
    phonemes = phonemes.replace('sh o', 's o')
    phonemes = phonemes.replace('ts', 't')
    phonemes = phonemes.replace('j', 'z')
    
    return phonemes

# --- 実行テスト ---
print("--- 最終解決策での実行テスト ---")

# 問題のテキストでテスト
text_to_synthesize = "[えっと]、{cough}シーとディーがあって、シーが、おじいちゃんが[なんか]しゃべってんだけど、[と]、台詞が、そうじゃわしは死んどったんじゃ、いつまでもこうしてるわけにはいかんなあってやつで、もう一個のやつが、もう少しここにいようかねって"
phonemized_final = openjtalk_cleaner(text_to_synthesize)

print(f"\n入力テキスト:\n{text_to_synthesize}")
print(f"\n最終的な出力:\n{phonemized_final}")


# --- Mora Count ---
kana_reading = pyopenjtalk.g2p(text_to_synthesize, kana=True)
mora_count = len(kana_reading)

# Phonemize using the new openjtalk-based cleaner
phonemized_text = openjtalk_cleaner(text_to_synthesize)

print(f"Original text: {text_to_synthesize}")
print(f"Phonemized: {phonemized_final}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()

# Convert phonemes to sequence
stn_tst = cleaned_text_to_sequence(phonemized_text)
# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time

# --- RTF Calculation ---
audio_duration = len(audio) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")

ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))


--- 最終解決策での実行テスト ---

入力テキスト:
[えっと]、{cough}シーとディーがあって、シーが、おじいちゃんが[なんか]しゃべってんだけど、[と]、台詞が、そうじゃわしは死んどったんじゃ、いつまでもこうしてるわけにはいかんなあってやつで、もう一個のやつが、もう少しここにいようかねって

最終的な出力:
[ e Q t o ] sp <cough> s i: t o d i: g a: Q t e sp s i: g a sp o z i: ch a N g a [ n a N k a ] s a b e Q t e N d a k e d o sp [ t o ] sp s e r i f u g a sp s o: z a w a s i w a s i N d o Q t a N z a sp i t u m a d e m o k o: sh I t e r u w a k e n i w a i k a N n a a Q t e y a t u d e sp m o: i Q k o n o y a t u g a sp m o: s U k o sh I k o k o n i i y o: k a n e Q t e
Original text: [えっと]、{cough}シーとディーがあって、シーが、おじいちゃんが[なんか]しゃべってんだけど、[と]、台詞が、そうじゃわしは死んどったんじゃ、いつまでもこうしてるわけにはいかんなあってやつで、もう一個のやつが、もう少しここにいようかねって
Phonemized: [ e Q t o ] sp <cough> s i: t o d i: g a: Q t e sp s i: g a sp o z i: ch a N g a [ n a N k a ] s a b e Q t e N d a k e d o sp [ t o ] sp s e r i f u g a sp s o: z a w a s i w a s i N d o Q t a N z a sp i t u m a d e m o k o: sh I t e r u w a k e n i w a i k a N n a a Q t e y a t u d e sp m o: i Q k o n o y a t u g 

In [9]:
# main.py (長音を結合する最終版)

import re
import pyopenjtalk
from text_JP.symbols import symbols

symbols_set = set(symbols)

def normalize_phonemes(phonemes_in):
    """
    pyopenjtalkの出力をsymbols.pyの定義に準拠するよう正規化します。
    """
    phonemes = phonemes_in.lower()

    phonemes = re.sub(r'sh\s(i)', r's \1', phonemes)
    phonemes = re.sub(r'sh\s([aueo])', r'sy \1', phonemes)
    phonemes = re.sub(r'ch\s(i)', r't \1', phonemes)
    phonemes = re.sub(r'ts\s(u)', r't \1', phonemes)
    phonemes = re.sub(r'j\s(i)', r'z \1', phonemes)
    phonemes = re.sub(r'j\s([aueo])', r'zy \1', phonemes)

    phonemes = phonemes.replace('R', ':').replace('cl', 'Q').replace('pau', 'sp')

    # ★修正点: 長音化処理をスペースなしの結合形式に戻しました ('o :' -> 'o:')
    phonemes = re.sub(r'([aiueo])\s\1', r'\1:', phonemes)
    phonemes = phonemes.replace('n i:', 'n i i') # 例外処理

    return phonemes

def openjtalk_cleaner(text):
    """
    テキストを正規化された音素列に変換するクリーナー。
    """
    parts = re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text)
    
    phoneme_parts = []
    for part in parts:
        if not part or part.isspace():
            continue
        
        if part == '{cough}' or part == '<cough>':
            phoneme_parts.append('<cough>')
        elif part.startswith('[') and part.endswith(']'):
            content = part[1:-1]
            if content:
                raw_phonemes = pyopenjtalk.g2p(content)
                norm_phonemes = normalize_phonemes(raw_phonemes)
                phoneme_parts.extend(['[', norm_phonemes, ']'])
            else:
                phoneme_parts.extend(['[', ']'])
        elif part in '、。':
            phoneme_parts.append('sp')
        else:
            raw_phonemes = pyopenjtalk.g2p(part)
            norm_phonemes = normalize_phonemes(raw_phonemes)
            phoneme_parts.append(norm_phonemes)

    final_text = " ".join(phoneme_parts)
    return re.sub(r'\s+', ' ', final_text).strip()

# --- 実行と検証 ---
test_text = "今日もジェイアール東日本をご利用くださいまして、ありがとうございます。"
print(f"入力テキスト:\n{test_text}\n")

cleaned_text = openjtalk_cleaner(test_text)
print(f"変換後の音素テキスト:\n{cleaned_text}\n")

print("--- 出力音素の検証 ---")
phonemes_to_check = cleaned_text.replace('[', '').replace(']', '').split(' ')
invalid_phonemes = []
for p in phonemes_to_check:
    if p and p not in symbols_set and not (p.startswith('<') and p.endswith('>')):
        invalid_phonemes.append(p)

if not invalid_phonemes:
    print("✅ すべての音素は symbols.py の定義に準拠しています。")
else:
    print(f"❌ 以下の音素が symbols.py の定義に含まれていません: {sorted(list(set(invalid_phonemes)))}")

入力テキスト:
今日もジェイアール東日本をご利用くださいまして、ありがとうございます。

変換後の音素テキスト:
ky o: m o zy e i a: r u h i g a s i n i h o n o g o r i y o: k u d a s a i m a s i t e sp a r i g a t o: g o z a i m a s u sp

--- 出力音素の検証 ---
❌ 以下の音素が symbols.py の定義に含まれていません: ['a:', 'o:']


In [10]:
# --- Mora Count ---
# Get the kana reading from pyopenjtalk
kana_reading = pyopenjtalk.g2p(test_text, kana=True)
# The number of characters in the kana reading corresponds to the number of morae
mora_count = len(kana_reading)

# Phonemize using the new openjtalk-based cleaner
phonemized_text = openjtalk_cleaner(test_text)

print(f"Original text: {test_text}")
print(f"Phonemized: {cleaned_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()

# Convert phonemes to sequence
stn_tst = cleaned_text_to_sequence(cleaned_text)
# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")
ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))

Original text: 今日もジェイアール東日本をご利用くださいまして、ありがとうございます。
Phonemized: ky o: m o zy e i a: r u h i g a s i n i h o n o g o r i y o: k u d a s a i m a s i t e sp a r i g a t o: g o z a i m a s u sp
Mora Count: 40
--------------------
Elapsed time: 0.14 seconds
