In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import importlib
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text_JP.symbols import symbols as jpn_symbols # text_JPのsymbolsを使用
from text_JP.phonemize import Phonemizer # Phonemizerをインポート

from scipy.io.wavfile import write



# Phonemizerの初期化とシンボルIDへのマッピング
_phonemizer = Phonemizer()
_symbol_to_id = {s: i for i, s in enumerate(jpn_symbols)}

def _symbols_to_sequence(symbols_list):
    sequence = []
    for s in symbols_list:
        if s in _symbol_to_id:
            sequence.append(_symbol_to_id[s])
    return sequence

def get_text(text, hps):
    # Phonemizerを使用してテキストを音素列に変換
    phonemes_str = _phonemizer(text) # 例: "k o N n i t i h a"
    phonemes_list = phonemes_str.split(' ') # 例: ["k", "o", "N", "n", "i", "t", "i", "h", "a"]

    # 音素列をIDのシーケンスに変換
    text_norm = _symbols_to_sequence(phonemes_list)
    
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm


  from pkg_resources import resource_filename


In [6]:
import re
import time
import pyopenjtalk
from text_JP.phonemize import Phonemizer

# Initialize Phonemizer
phonemizer = Phonemizer()

def japanese_cleaner_revised(text):
    # This cleaner uses pyopenjtalk to get katakana and then the original phonemizer.
    # It handles special tokens correctly.

    # 1. Split text by special tokens, keeping them as delimiters.
    parts = re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text)
    
    phoneme_parts = []
    for part in parts:
        if not part or part.isspace():
            continue

        # a. Handle special tokens
        if part.startswith('[') and part.endswith(']'):
            content = part[1:-1]
            # Get kana, then phonemize content
            if not content:
                phoneme_parts.append('[ ]')
            else:
                kana_content = pyopenjtalk.g2p(content, kana=True).replace('ヲ', 'オ')
                phoneme_content = phonemizer(kana_content)
                phoneme_parts.append(f'[ {phoneme_content} ]')
            continue

        if part == '{cough}' or part == '<cough>':
            phoneme_parts.append('<cough>')
            continue
            
        if part in '、。':
            phoneme_parts.append('sp')
            continue

        # b. For normal text, get kana and then phonemize
        kana = pyopenjtalk.g2p(part, kana=True).replace('ヲ', 'オ')
        phonemes = phonemizer(kana)
        phoneme_parts.append(phonemes)

    # 3. Join and clean up spaces
    final_text = ' '.join(phoneme_parts)
    return re.sub(r'\s+', ' ', final_text).strip()

In [4]:
# Configuration and Model Loading
# UUDBモデルの設定ファイルを使用
hps = utils.get_hparams_from_file("./logs/uudb_11/config.json")

net_g = SynthesizerTrn(
    len(jpn_symbols), # jpn_symbolsを使用
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
_ = net_g.eval()

# UUDBモデルのチェックポイントを使用
_ = utils.load_checkpoint("./logs/uudb_11/G_200000.pth", net_g, None)


# Load text module and symbols
text_module = importlib.import_module(hps.data.text_module)
cleaned_text_to_sequence = text_module.cleaned_text_to_sequence
symbols = text_module.symbols

Mutli-stream iSTFT VITS


  WeightNorm.apply(module, name, dim)
  checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')


In [5]:
# Generate full latent representation and then synthesize only the first half
# 日本語のカタカナテキストを入力
text = "[ あ ] じゃあ、最初は、えーと[ その ] 、 交番の外にサラリーマンがいて"


# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

stn_tst = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

start_time=time.time()
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    # Perform inference to get the full latent representation (z) and mask (y_mask)
    # The infer method returns: o, o_mb, attn, y_mask, (z, z_p, m_p, logs_p)
    _, _, _, y_mask, (z, _, _, _), timings = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)

    # Determine the length for the first half of the latent representation
    # We take half of the sequence length of z
    half_len = z.shape[2] // 30
    
    
    # Slice the latent representation (z) and its corresponding mask (y_mask)
    z_partial = z[:, :, :half_len]
    y_mask_partial = y_mask[:, :, :half_len]

    # Directly call the decoder (net_g.dec) with the partial latent representation
    # The 'g' parameter (speaker embedding) is None in this single-speaker example
    audio_partial_first, _ = net_g.dec((z_partial * y_mask_partial), g=None)
    
    # Convert the generated audio tensor to a numpy array for playback
    audio_partial_first = audio_partial_first[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_partial_first) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
# Display the synthesized partial audio
ipd.display(ipd.Audio(audio_partial_first, rate=hps.data.sampling_rate, normalize=False))

print("Component-wise breakdown (of Inference time):")

# 合計時間を計算
total_time = sum(timings.values())

# 'timings' ディクショナリをループ処理
for component, time_taken in timings.items():
    # 各コンポーネントのパーセンテージを計算
    percentage = (time_taken / total_time) * 100
    
    # f-string を使って、左揃え・桁揃えで表示
    # {component:<25} : 25文字分の幅を確保し、左揃え
    # {time_taken:.4f} : 小数点以下4桁
    # {percentage:5.2f} : 全体で5文字（小数点含む）、小数点以下2桁
    print(f"- {component:<25}: {time_taken:.4f} sec ({percentage:5.2f}%)")

print("---------------------------------------------")
print(f"- {'Total':<25}: {total_time:.4f} sec (100.00%)")

Original text: [ あ ] じゃあ、最初は、えーと[ その ] 、 交番の外にサラリーマンがいて
Phonemized (revised): [ a ] zy a: sp s a i sy o w a sp e: t o [ s o n o ] sp k o: b a N n o s o t o n i s a r a r i: m a N g a i t e
Mora Count: 18
--------------------
Audio duration: 0.14 seconds
Elapsed time: 0.31 seconds
Real Time Factor (RTF): 2.1681


Component-wise breakdown (of Inference time):
- text_encoder             : 0.0379 sec (12.97%)
- duration_predictor       : 0.0012 sec ( 0.42%)
- alignment_and_projection : 0.0011 sec ( 0.38%)
- flow                     : 0.0371 sec (12.70%)
- waveform_decoder         : 0.2150 sec (73.53%)
---------------------------------------------
- Total                    : 0.2925 sec (100.00%)


In [16]:
 # Generate full latent representation and then synthesize only the second half
# 日本語のカタカナテキストを入力 (前半部分と同じテキストを使用)
text_second_half = "[ あ ] じゃあ、最初は、えーの[ その ] 、 交番の外にサラリーマンがいて"

# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

stn_tst_second_half = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst_second_half = commons.intersperse(stn_tst_second_half, 0)
stn_tst_second_half = torch.LongTensor(stn_tst_second_half)

start_time=time.time()

with torch.no_grad():
    x_tst_second_half = stn_tst_second_half.unsqueeze(0)
    x_tst_lengths_second_half = torch.LongTensor([stn_tst_second_half.size(0)])
    
    # Perform inference to get the full latent representation (z) and mask (y_mask)
    _, _, _, y_mask_full, (z_full, _, _, _), timings = net_g.infer(x_tst_second_half, x_tst_lengths_second_half, noise_scale=.667, noise_scale_w=0.8, length_scale=1)

    # Determine the length for the second half of the latent representation
    full_len = z_full.shape[2]
    half_len_start = full_len // 2 # 後半の開始インデックス
    
    # Slice the latent representation (z) and its corresponding mask (y_mask) for the second half
    z_second_half = z_full[:, :, half_len_start:]
    y_mask_second_half = y_mask_full[:, :, half_len_start:]

    # Directly call the decoder (net_g.dec) with the partial latent representation
    audio_second_half, _ = net_g.dec((z_second_half * y_mask_second_half), g=None)
    
    # Convert the generated audio tensor to a numpy array for playback
    audio_second_half = audio_second_half[0,0].data.cpu().float().numpy()
    
end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_second_half) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration
print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
# Display the synthesized second half audio
ipd.display(ipd.Audio(audio_second_half, rate=hps.data.sampling_rate, normalize=False))

print("Component-wise breakdown (of Inference time):")

# 合計時間を計算
total_time = sum(timings.values())

# 'timings' ディクショナリをループ処理
for component, time_taken in timings.items():
    # 各コンポーネントのパーセンテージを計算
    percentage = (time_taken / total_time) * 100
    
    # f-string を使って、左揃え・桁揃えで表示
    # {component:<25} : 25文字分の幅を確保し、左揃え
    # {time_taken:.4f} : 小数点以下4桁
    # {percentage:5.2f} : 全体で5文字（小数点含む）、小数点以下2桁
    print(f"- {component:<25}: {time_taken:.4f} sec ({percentage:5.2f}%)")

print("---------------------------------------------")
print(f"- {'Total':<25}: {total_time:.4f} sec (100.00%)")

Original text: [ あ ] じゃあ、最初は、えーの[ その ] 、 交番の外にサラリーマンがいて
Phonemized (revised): [ a ] zy a: sp s a i sy o w a sp e: n o [ s o n o ] sp k o: b a N n o s o t o n i s a r a r i: m a N g a i t e
Mora Count: 18
--------------------
Audio duration: 2.18 seconds
Elapsed time: 0.21 seconds
Real Time Factor (RTF): 0.0951


Component-wise breakdown (of Inference time):
- text_encoder             : 0.0136 sec (10.81%)
- duration_predictor       : 0.0007 sec ( 0.58%)
- alignment_and_projection : 0.0004 sec ( 0.30%)
- flow                     : 0.0182 sec (14.41%)
- waveform_decoder         : 0.0932 sec (73.90%)
---------------------------------------------
- Total                    : 0.1261 sec (100.00%)


In [23]:
print("--- Full Synthesis ---")
text_full_compare = "[ あ ] じゃあ、最初は、えーの[ その ] 、 交番の外にサラリーマンがいて"


# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")



stn_tst_full_compare = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst_full_compare = commons.intersperse(stn_tst_full_compare, 0)
stn_tst_full_compare = torch.LongTensor(stn_tst_full_compare)

start_time=time.time()

with torch.no_grad():
    x_tst_full_compare = stn_tst_full_compare.unsqueeze(0)
    x_tst_lengths_full_compare = torch.LongTensor([stn_tst_full_compare.size(0)])
    audio_full, _, _, _, _, timings = net_g.infer(x_tst_full_compare, x_tst_lengths_full_compare, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    audio_full = audio_full[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_full) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration
print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
ipd.display(ipd.Audio(audio_full, rate=hps.data.sampling_rate, normalize=False))

--- Full Synthesis ---
Original text: [ あ ] じゃあ、最初は、えーと[ その ] 、 交番の外にサラリーマンがいて
Phonemized (revised): [ a ] zy a: sp s a i sy o w a sp e: t o [ s o n o ] sp k o: b a N n o s o t o n i s a r a r i: m a N g a i t e
Mora Count: 18
--------------------
Audio duration: 4.35 seconds
Elapsed time: 0.11 seconds
Real Time Factor (RTF): 0.0259


In [21]:
print("\n--- Visual Comparison (Waveforms) ---")
plt.figure(figsize=(15, 10))

plt.subplot(3, 1, 1)
plt.plot(audio_full)
plt.title("Full Synthesis")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")

plt.subplot(3, 1, 2)
plt.plot(audio_partial_first)
plt.title("First Half Synthesis")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")

plt.subplot(3, 1, 3)
plt.plot(audio_second_half)
plt.title("Second Half Synthesis")
plt.xlabel("Time (samples)")
plt.ylabel("Amplitude")

plt.tight_layout()
plt.show()


--- Visual Comparison (Waveforms) ---


In [7]:
# 日本語のカタカナテキストを入力
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"
phonemized_text = japanese_cleaner_revised(text)

stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)


# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))
print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

start_time = time.time()
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    noise_scale = 0.667
    noise_scale_w = 0.8
    length_scale = 1

    # Step 1: テキスト全体から分布パラメータ(m_p, logs_p)を取得
    # net_g.inferの返り値から、zではなくm_pとlogs_pを受け取る
    _, _, _, y_mask, (_, _, m_p, logs_p) = net_g.infer(
        x_tst, 
        x_tst_lengths, 
        noise_scale=noise_scale, 
        noise_scale_w=noise_scale_w, 
        length_scale=length_scale
    )

    # Step 2: 分布パラメータとマスクをスライス
    half_len = m_p.shape[2] // 2
    
    m_p_partial = m_p[:, :, :half_len]
    logs_p_partial = logs_p[:, :, :half_len]
    y_mask_partial = y_mask[:, :, :half_len]

    # Step 3: スライスしたパラメータから、潜在表現zを新たにサンプリング
    # torch.randn_likeでm_p_partialと同じ形状の乱数を生成する
    z_sampled_partial = (m_p_partial + torch.randn_like(m_p_partial) * torch.exp(logs_p_partial) * noise_scale) * y_mask_partial

    # Step 4: 新しくサンプリングしたzをデコーダに入力
    audio_partial, _ = net_g.dec(z_sampled_partial, g=None)
    
    audio_partial = audio_partial[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_partial) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")

ipd.display(ipd.Audio(audio_partial, rate=hps.data.sampling_rate, normalize=False))

Original text: 最近、インターステラーを見たのですけど、すごく面白かったです。
Phonemized (revised): s a i k i N sp i N t a: s u t e r a: o m i t a n o d e s u k e d o sp s u g o k u o m o s i r o k a Q t a d e s u sp
Mora Count: 35
--------------------
Audio duration: 2.40 seconds
Elapsed time: 0.22 seconds
Real Time Factor (RTF): 0.0934


In [64]:
# 日本語のカタカナテキストを入力
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"
phonemized_text = japanese_cleaner_revised(text)

stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

start_time = time.time()
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    noise_scale = 0.667
    noise_scale_w = 0.8
    length_scale = 1

    # Step 1: テキスト全体から分布パラメータ(m_p, logs_p)を取得
    _, _, _, y_mask, (_, _, m_p, logs_p) = net_g.infer(
        x_tst, 
        x_tst_lengths, 
        noise_scale=noise_scale, 
        noise_scale_w=noise_scale_w, 
        length_scale=length_scale
    )

    # Step 2: 分布パラメータとマスクを後半部分でスライス
    # 🌟 この部分を変更しました 🌟
    half_len = m_p.shape[2] // 2
    
    m_p_partial = m_p[:, :, half_len:] # :half_len -> half_len:
    logs_p_partial = logs_p[:, :, half_len:] # :half_len -> half_len:
    y_mask_partial = y_mask[:, :, half_len:] # :half_len -> half_len:

    # Step 3: スライスしたパラメータから、潜在表現zを新たにサンプリング
    z_sampled_partial = (m_p_partial + torch.randn_like(m_p_partial) * torch.exp(logs_p_partial) * noise_scale) * y_mask_partial

    # Step 4: 新しくサンプリングしたzをデコーダに入力
    audio_partial, _ = net_g.dec(z_sampled_partial, g=None)
    
    audio_partial = audio_partial[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_partial) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")

# 生成された後半部分の音声を表示
ipd.display(ipd.Audio(audio_partial, rate=hps.data.sampling_rate, normalize=False))

Audio duration: 2.40 seconds
Elapsed time: 1.16 seconds
Real Time Factor (RTF): 0.4829


In [18]:
# Generate full latent representation and then synthesize only the first half
# 日本語のカタカナテキストを入力
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"


# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

stn_tst = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)

start_time=time.time()
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    # Perform inference to get the full latent representation (z) and mask (y_mask)
    # The infer method returns: o, o_mb, attn, y_mask, (z, z_p, m_p, logs_p)
    # Directly call the Prior Encoder (enc_p)
    # This generates 'z' from 'x_tst' but SKIPS the Decoder
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)

    # Determine the length for the first half of the latent representation
    # We take half of the sequence length of z
    half_len = z.shape[2] // 20
    
    
    # Slice the latent representation (z) and its corresponding mask (y_mask)
    z_partial = z[:, :, :half_len]
    y_mask_partial = y_mask[:, :, :half_len]

    # Directly call the decoder (net_g.dec) with the partial latent representation
    # The 'g' parameter (speaker embedding) is None in this single-speaker example
    audio_partial_first, _ = net_g.dec((z_partial * y_mask_partial), g=None)
    
    # Convert the generated audio tensor to a numpy array for playback
    audio_partial_first = audio_partial_first[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_partial_first) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration

print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
# Display the synthesized partial audio
ipd.display(ipd.Audio(audio_partial_first, rate=hps.data.sampling_rate, normalize=False))

Original text: 最近、インターステラーを見たのですけど、すごく面白かったです。
Phonemized (revised): s a i k i N sp i N t a: s u t e r a: o m i t a n o d e s u k e d o sp s u g o k u o m o s i r o k a Q t a d e s u sp
Mora Count: 35
--------------------
Audio duration: 0.24 seconds
Elapsed time: 0.05 seconds
Real Time Factor (RTF): 0.2138


In [20]:


# (これより上は、元のコードの 'stn_tst = ...' までの前処理)
# stn_tst = torch.LongTensor(stn_tst)
# hps.data.sampling_rate が定義されている前提
# Generate full latent representation and then synthesize only the first half
# 日本語のカタカナテキストを入力
text = "[ あ ] じゃあ、最初は、えーと[ その ] 、 交番の外にサラリーマンがいて"


# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

stn_tst = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst)


# --- ここからが変更後のコード ---

# ▼▼▼ 変更可能なパラメータ ▼▼▼
# 分割する割合 (例: 0.5 = 50% の位置で分割, 0.3 = 30%)
split_ratio = 0.05
# ▲▲▲ 変更可能なパラメータ ▲▲▲

print(f"Splitting latent representation at {split_ratio * 100:.0f}%...")
print("--------------------")

# 1. 全体の処理時間を計測開始
total_start_time = time.time() 

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    # 2. z (潜在表現) の全体を生成 (Prior Encoder)
    z_gen_start_time = time.time()
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
    z_gen_time = time.time() - z_gen_start_time
    print(f"Time for infer_z_only (Prior Encoder): {z_gen_time:.4f} sec")
    print("--------------------")

    # 3. 潜在表現 z を指定の割合で2つに分割
    full_len = z.shape[2]
    split_point = int(full_len * split_ratio)

    if split_point == 0:
        print("Warning: split_point is 0. Part 1 will be empty.")
    if split_point >= full_len:
         print("Warning: split_point is at or beyond the end. Part 2 will be empty.")

    # 1つ目のセグメント
    z_part1 = z[:, :, :split_point]
    y_mask_part1 = y_mask[:, :, :split_point]

    # 2つ目のセグメント (残り)
    z_part2 = z[:, :, split_point:]
    y_mask_part2 = y_mask[:, :, split_point:]

    # 4. 1つ目のセグメントをデコード (時間計測)
    print("Decoding Part 1...")
    dec_start_1 = time.time()
    # z_part1 が空でないことを確認
    if z_part1.shape[2] > 0:
        audio_part1, _ = net_g.dec((z_part1 * y_mask_part1), g=None)
        audio_part1_np = audio_part1[0,0].data.cpu().float().numpy()
    else:
        audio_part1_np = np.array([], dtype=np.float32) # 空の配列
    dec_time_1 = time.time() - dec_start_1
    
    print(f"Time for Decoder (Part 1): {dec_time_1:.4f} sec")

    # 5. 2つ目のセグメントをデコード (時間計測)
    print("Decoding Part 2...")
    dec_start_2 = time.time()
    # z_part2 が空でないことを確認
    if z_part2.shape[2] > 0:
        audio_part2, _ = net_g.dec((z_part2 * y_mask_part2), g=None)
        audio_part2_np = audio_part2[0,0].data.cpu().float().numpy()
    else:
        audio_part2_np = np.array([], dtype=np.float32) # 空の配列
    dec_time_2 = time.time() - dec_start_2
    
    print(f"Time for Decoder (Part 2): {dec_time_2:.4f} sec")
    print("--------------------")

# 6. 全体の処理時間を計算
total_elapsed_time = time.time() - total_start_time
print(f"Total Elapsed Time (z_gen + dec1 + dec2): {total_elapsed_time:.4f} sec")
print("--------------------")

# 7. 音声を結合
audio_combined = np.concatenate((audio_part1_np, audio_part2_np))

# 8. 各音声を表示
print("--- 🎧 Audio (Part 1) ---")
ipd.display(ipd.Audio(audio_part1_np, rate=hps.data.sampling_rate, normalize=False))

print("\n--- 🎧 Audio (Part 2) ---")
ipd.display(ipd.Audio(audio_part2_np, rate=hps.data.sampling_rate, normalize=False))

print("\n--- 🎧 Audio (Combined) ---")
ipd.display(ipd.Audio(audio_combined, rate=hps.data.sampling_rate, normalize=False))

Original text: [ あ ] じゃあ、最初は、えーと[ その ] 、 交番の外にサラリーマンがいて
Phonemized (revised): [ a ] zy a: sp s a i sy o w a sp e: t o [ s o n o ] sp k o: b a N n o s o t o n i s a r a r i: m a N g a i t e
Mora Count: 18
--------------------
Splitting latent representation at 5%...
--------------------
Time for infer_z_only (Prior Encoder): 0.0535 sec
--------------------
Decoding Part 1...
Time for Decoder (Part 1): 0.0118 sec
Decoding Part 2...
Time for Decoder (Part 2): 0.1027 sec
--------------------
Total Elapsed Time (z_gen + dec1 + dec2): 0.1684 sec
--------------------
--- 🎧 Audio (Part 1) ---



--- 🎧 Audio (Part 2) ---



--- 🎧 Audio (Combined) ---


In [28]:

# --- ここからが変更後のコード ---

# ▼▼▼ 変更可能なパラメータ ▼▼▼
# 分割したい「位置」の割合をリストで指定 (0.0 < r < 1.0)
# - [0.5] -> 2分割 (50%の位置で)
# - [0.3, 0.8] -> 3分割 (30%の位置と80%の位置で)
# - [0.25, 0.5, 0.75] -> 4分割
split_points_ratios = [0.25, 0.5, 0.75]
# ▲▲▲ 変更可能なパラメータ ▲▲▲

# 念のため割合をソートし、重複を削除
split_points_ratios = sorted(list(set(r for r in split_points_ratios if 0.0 < r < 1.0)))

print(f"Splitting into {len(split_points_ratios) + 1} segments at: {[f'{r*100:.0f}%' for r in split_points_ratios]}")
print("--------------------")

# 1. 全体の処理時間を計測開始
total_start_time = time.time() 



# 結果を格納するリスト
decoding_times = []
decoded_audios_np = []

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    # 2. z (潜在表現) の全体を生成 (Prior Encoder)
    print("Generating full latent representation (z)...")
    z_gen_start_time = time.time()
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
    z_gen_time = time.time() - z_gen_start_time
    print(f"Time for infer_z_only (Prior Encoder): {z_gen_time:.4f} sec")
    print("--------------------")

    # 3. 潜在表現 z を N個 に分割
    full_len = z.shape[2]

    # [0.25, 0.5, 0.75] -> [0, 250, 500, 750, 1000] (full_len=1000の場合)
    split_indices_abs = [int(r * full_len) for r in split_points_ratios]
    all_split_indices = [0] + split_indices_abs + [full_len]

    # 4. 各セグメントを個別にデコード
    print("Decoding segments individually...")
    for i in range(len(all_split_indices) - 1):
        start_idx = all_split_indices[i]
        end_idx = all_split_indices[i+1]
        
        print(f"  Segment {i+1}/{len(all_split_indices)-1} (z-index {start_idx} to {end_idx})...")

        # スライス実行
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]

        # デコード時間を計測
        dec_start = time.time()
        if z_part.shape[2] > 0:
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            audio_part_np = audio_part[0,0].data.cpu().float().numpy()
        else:
            audio_part_np = np.array([], dtype=np.float32) # 空のセグメント
        dec_time = time.time() - dec_start
        
        decoding_times.append(dec_time)
        decoded_audios_np.append(audio_part_np)
        print(f"  Time for Decoder (Part {i+1}): {dec_time:.4f} sec")

# 5. 全体の処理時間を計算
total_elapsed_time = time.time() - total_start_time
print("--------------------")
print(f"Total Decoder Time (Sum of parts): {sum(decoding_times):.4f} sec")
print(f"Total Elapsed Time (z_gen + all_decs): {total_elapsed_time:.4f} sec")
print("--------------------------------------------------")

# 6. 音声を結合
audio_combined = np.concatenate(decoded_audios_np)

# 7. 各音声を表示
for i, audio_np in enumerate(decoded_audios_np):
    print(f"\n--- 🎧 Audio (Part {i+1}) ---")
    ipd.display(ipd.Audio(audio_np, rate=hps.data.sampling_rate, normalize=False))

print("\n--- 🎧 Audio (Combined) ---")
ipd.display(ipd.Audio(audio_combined, rate=hps.data.sampling_rate, normalize=False))

Splitting into 4 segments at: ['25%', '50%', '75%']
--------------------
Generating full latent representation (z)...
Time for infer_z_only (Prior Encoder): 0.0522 sec
--------------------
Decoding segments individually...
  Segment 1/4 (z-index 0 to 75)...
  Time for Decoder (Part 1): 0.0315 sec
  Segment 2/4 (z-index 75 to 150)...
  Time for Decoder (Part 2): 0.0307 sec
  Segment 3/4 (z-index 150 to 225)...
  Time for Decoder (Part 3): 0.0342 sec
  Segment 4/4 (z-index 225 to 300)...
  Time for Decoder (Part 4): 0.0297 sec
--------------------
Total Decoder Time (Sum of parts): 0.1261 sec
Total Elapsed Time (z_gen + all_decs): 0.1789 sec
--------------------------------------------------

--- 🎧 Audio (Part 1) ---



--- 🎧 Audio (Part 2) ---



--- 🎧 Audio (Part 3) ---



--- 🎧 Audio (Part 4) ---



--- 🎧 Audio (Combined) ---


In [27]:
# (stn_tst までの前処理)
# ...

# ▼▼▼ 変更可能なパラメータ ▼▼▼
split_points_ratios = [0.25, 0.5, 0.75]
# ▲▲▲ 変更可能なパラメータ ▲▲▲

# (split_points_ratios のソート処理)
# ...
print(f"Splitting into {len(split_points_ratios) + 1} segments at: {[f'{r*100:.0f}%' for r in split_points_ratios]}")


# --- 挿入するウォームアップ処理 (ここから) ---

print("--------------------")
print("Warming up the GPU (Decoder) beforehand...")
try:
    # モデルが使用するデバイスを取得 (e.g., 'cuda:0' or 'cpu')
    device = next(net_g.parameters()).device 
    
    # zの次元 (inter_channels) をデコーダの定義から取得
    inter_channels = net_g.dec.conv_pre.in_channels
    
    # 非常に小さなダミーデータを作成 (1バッチ, inter_channels次元, 1フレーム)
    z_dummy = torch.randn(1, inter_channels, 1).to(device)
    y_mask_dummy = torch.ones(1, 1, 1).to(device)
    
    with torch.no_grad():
        # デコーダを一度だけ実行して、ライブラリをロードさせる
        _ = net_g.dec((z_dummy * y_mask_dummy), g=None)
    
    # GPU処理の完了を待つ (念のため)
    if device.type == 'cuda':
        torch.cuda.synchronize()
        
    print("Warm-up complete.")
except Exception as e:
    print(f"Warn: Warm-up failed (this is non-critical). Error: {e}")
print("--------------------")

# --- 挿入するウォームアップ処理 (ここまで) ---


# 1. 全体の処理時間を計測開始 (ウォームアップの後)
total_start_time = time.time() 

# (以降の z_gen, デコード処理は同じ)
# ...

Splitting into 4 segments at: ['25%', '50%', '75%']
--------------------
Warming up the GPU (Decoder) beforehand...
Warm-up complete.
--------------------


In [29]:

# --- ここからが変更後のコード ---

# ▼▼▼ 変更可能なパラメータ ▼▼▼
# 分割したい「位置」の割合をリストで指定 (0.0 < r < 1.0)
# - [0.5] -> 2分割 (50%の位置で)
# - [0.3, 0.8] -> 3分割 (30%の位置と80%の位置で)
# - [0.25, 0.5, 0.75] -> 4分割
split_points_ratios = [0.01, 0.02, 0.03, 0.04,0.05,0.06,0.07,0.08,0.09,0.10,0.11, 0.12, 0.13, 0.14,0.15,0.16,0.17,0.18,0.19,0.20,0.21, 0.22, 0.23, 0.24,0.25,0.26,0.27,0.28,0.29,0.30,0.31, 0.32, 0.33, 0.34,0.35,0.36,0.37,0.38,0.39,0.40,0.41, 0.42, 0.43, 0.44,0.45,0.46,0.47,0.48,0.49,0.50,0.51, 0.52, 0.53, 0.54,0.55,0.56,0.57,0.58,0.59,0.60,]
# ▲▲▲ 変更可能なパラメータ ▲▲▲

# 念のため割合をソートし、重複を削除
split_points_ratios = sorted(list(set(r for r in split_points_ratios if 0.0 < r < 1.0)))

print(f"Splitting into {len(split_points_ratios) + 1} segments at: {[f'{r*100:.0f}%' for r in split_points_ratios]}")
print("--------------------")

# 1. 全体の処理時間を計測開始
total_start_time = time.time() 



# 結果を格納するリスト
decoding_times = []
decoded_audios_np = []

with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
    
    # 2. z (潜在表現) の全体を生成 (Prior Encoder)
    print("Generating full latent representation (z)...")
    z_gen_start_time = time.time()
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
    z_gen_time = time.time() - z_gen_start_time
    print(f"Time for infer_z_only (Prior Encoder): {z_gen_time:.4f} sec")
    print("--------------------")

    # 3. 潜在表現 z を N個 に分割
    full_len = z.shape[2]

    # [0.25, 0.5, 0.75] -> [0, 250, 500, 750, 1000] (full_len=1000の場合)
    split_indices_abs = [int(r * full_len) for r in split_points_ratios]
    all_split_indices = [0] + split_indices_abs + [full_len]

    # 4. 各セグメントを個別にデコード
    print("Decoding segments individually...")
    for i in range(len(all_split_indices) - 1):
        start_idx = all_split_indices[i]
        end_idx = all_split_indices[i+1]
        
        print(f"  Segment {i+1}/{len(all_split_indices)-1} (z-index {start_idx} to {end_idx})...")

        # スライス実行
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]

        # デコード時間を計測
        dec_start = time.time()
        if z_part.shape[2] > 0:
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            audio_part_np = audio_part[0,0].data.cpu().float().numpy()
        else:
            audio_part_np = np.array([], dtype=np.float32) # 空のセグメント
        dec_time = time.time() - dec_start
        
        decoding_times.append(dec_time)
        decoded_audios_np.append(audio_part_np)
        print(f"  Time for Decoder (Part {i+1}): {dec_time:.4f} sec")

# 5. 全体の処理時間を計算
total_elapsed_time = time.time() - total_start_time
print("--------------------")
print(f"Total Decoder Time (Sum of parts): {sum(decoding_times):.4f} sec")
print(f"Total Elapsed Time (z_gen + all_decs): {total_elapsed_time:.4f} sec")
print("--------------------------------------------------")

# 6. 音声を結合
audio_combined = np.concatenate(decoded_audios_np)

# 7. 各音声を表示
for i, audio_np in enumerate(decoded_audios_np):
    print(f"\n--- 🎧 Audio (Part {i+1}) ---")
    ipd.display(ipd.Audio(audio_np, rate=hps.data.sampling_rate, normalize=False))

print("\n--- 🎧 Audio (Combined) ---")
ipd.display(ipd.Audio(audio_combined, rate=hps.data.sampling_rate, normalize=False))

Splitting into 51 segments at: ['1%', '2%', '3%', '4%', '5%', '6%', '7%', '8%', '9%', '10%', '11%', '12%', '13%', '14%', '15%', '16%', '17%', '18%', '19%', '20%', '21%', '22%', '23%', '24%', '25%', '26%', '27%', '28%', '29%', '30%', '31%', '32%', '33%', '34%', '35%', '36%', '37%', '38%', '39%', '40%', '41%', '42%', '43%', '44%', '45%', '46%', '47%', '48%', '49%', '50%']
--------------------
Generating full latent representation (z)...
Time for infer_z_only (Prior Encoder): 0.0290 sec
--------------------
Decoding segments individually...
  Segment 1/51 (z-index 0 to 3)...
  Time for Decoder (Part 1): 0.0103 sec
  Segment 2/51 (z-index 3 to 6)...
  Time for Decoder (Part 2): 0.0094 sec
  Segment 3/51 (z-index 6 to 9)...
  Time for Decoder (Part 3): 0.0096 sec
  Segment 4/51 (z-index 9 to 12)...
  Time for Decoder (Part 4): 0.0098 sec
  Segment 5/51 (z-index 12 to 15)...
  Time for Decoder (Part 5): 0.0098 sec
  Segment 6/51 (z-index 15 to 18)...
  Time for Decoder (Part 6): 0.0099 sec
 


--- 🎧 Audio (Part 2) ---



--- 🎧 Audio (Part 3) ---



--- 🎧 Audio (Part 4) ---



--- 🎧 Audio (Part 5) ---



--- 🎧 Audio (Part 6) ---



--- 🎧 Audio (Part 7) ---



--- 🎧 Audio (Part 8) ---



--- 🎧 Audio (Part 9) ---



--- 🎧 Audio (Part 10) ---



--- 🎧 Audio (Part 11) ---



--- 🎧 Audio (Part 12) ---



--- 🎧 Audio (Part 13) ---



--- 🎧 Audio (Part 14) ---



--- 🎧 Audio (Part 15) ---



--- 🎧 Audio (Part 16) ---



--- 🎧 Audio (Part 17) ---



--- 🎧 Audio (Part 18) ---



--- 🎧 Audio (Part 19) ---



--- 🎧 Audio (Part 20) ---



--- 🎧 Audio (Part 21) ---



--- 🎧 Audio (Part 22) ---



--- 🎧 Audio (Part 23) ---



--- 🎧 Audio (Part 24) ---



--- 🎧 Audio (Part 25) ---



--- 🎧 Audio (Part 26) ---



--- 🎧 Audio (Part 27) ---



--- 🎧 Audio (Part 28) ---



--- 🎧 Audio (Part 29) ---



--- 🎧 Audio (Part 30) ---



--- 🎧 Audio (Part 31) ---



--- 🎧 Audio (Part 32) ---



--- 🎧 Audio (Part 33) ---



--- 🎧 Audio (Part 34) ---



--- 🎧 Audio (Part 35) ---



--- 🎧 Audio (Part 36) ---



--- 🎧 Audio (Part 37) ---



--- 🎧 Audio (Part 38) ---



--- 🎧 Audio (Part 39) ---



--- 🎧 Audio (Part 40) ---



--- 🎧 Audio (Part 41) ---



--- 🎧 Audio (Part 42) ---



--- 🎧 Audio (Part 43) ---



--- 🎧 Audio (Part 44) ---



--- 🎧 Audio (Part 45) ---



--- 🎧 Audio (Part 46) ---



--- 🎧 Audio (Part 47) ---



--- 🎧 Audio (Part 48) ---



--- 🎧 Audio (Part 49) ---



--- 🎧 Audio (Part 50) ---



--- 🎧 Audio (Part 51) ---



--- 🎧 Audio (Combined) ---


In [21]:
import numpy as np

# ▼▼▼ 変更可能なパラメータ ▼▼▼
# 分割するzのフレーム数（チャンクサイズ）
# 例えば 100 に設定すると、z を [0:100], [100:200], [200:300], [300:350(残り)] のように分割します
chunk_size_frames = 5
# ▲▲▲ 変更可能なパラメータ ▲▲▲

if chunk_size_frames <= 0:
    print("Error: chunk_size_frames must be greater than 0")
else:
    print(f"Splitting latent representation into chunks of {chunk_size_frames} frames...")



    # 1. 全体の処理時間を計測開始
    total_start_time = time.time() 

    # 結果を格納するリスト
    decoding_times = []
    decoded_audios_np = []

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        
        # 2. z (潜在表現) の全体を生成
        print("Generating full latent representation (z)...")
        z_gen_start_time = time.time()
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, x_tst_lengths, 
            noise_scale=.667, noise_scale_w=0.8, length_scale=1
        )
        z_gen_time = time.time() - z_gen_start_time
        print(f"Time for infer_z_only (Prior Encoder): {z_gen_time:.4f} sec")
        print("--------------------")

        # 3. 潜在表現 z を 'chunk_size_frames' ごとに分割
        full_len = z.shape[2]
        
        # (z_part, y_mask_part, start_idx, end_idx) を格納するリストを作成
        all_segments = []
        for start_idx in range(0, full_len, chunk_size_frames):
            end_idx = min(start_idx + chunk_size_frames, full_len)
            
            z_part = z[:, :, start_idx:end_idx]
            y_mask_part = y_mask[:, :, start_idx:end_idx]
            all_segments.append((z_part, y_mask_part, start_idx, end_idx))
        
        print(f"Total z-length {full_len} frames split into {len(all_segments)} segments.")

        # 4. 各セグメントを個別にデコード
        print("Decoding segments individually...")
        for i, (z_part, y_mask_part, start_idx, end_idx) in enumerate(all_segments):
            
            print(f"  Segment {i+1}/{len(all_segments)} (z-index {start_idx} to {end_idx})...")

            # デコード時間を計測
            dec_start = time.time()
            if z_part.shape[2] > 0:
                audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
                audio_part_np = audio_part[0,0].data.cpu().float().numpy()
            else:
                audio_part_np = np.array([], dtype=np.float32) # 空のセグメント
            dec_time = time.time() - dec_start
            
            decoding_times.append(dec_time)
            decoded_audios_np.append(audio_part_np)
            print(f"  Time for Decoder (Part {i+1}): {dec_time:.4f} sec")

    # 5. 全体の処理時間を計算
    total_elapsed_time = time.time() - total_start_time
    print("--------------------")
    print(f"Total Decoder Time (Sum of parts): {sum(decoding_times):.4f} sec")
    print(f"Total Elapsed Time (z_gen + all_decs): {total_elapsed_time:.4f} sec")
    print("--------------------------------------------------")

    # 6. 音声を結合
    audio_combined = np.concatenate(decoded_audios_np)

    # 7. 各音声を表示 (数が多すぎる場合は、最初の数個と最後だけ表示しても良いかもしれません)
    for i, audio_np in enumerate(decoded_audios_np):
        print(f"\n--- 🎧 Audio (Part {i+1}) ---")
        ipd.display(ipd.Audio(audio_np, rate=hps.data.sampling_rate, normalize=False))

    print("\n--- 🎧 Audio (Combined) ---")
    ipd.display(ipd.Audio(audio_combined, rate=hps.data.sampling_rate, normalize=False))

Splitting latent representation into chunks of 5 frames...
Generating full latent representation (z)...
Time for infer_z_only (Prior Encoder): 0.0223 sec
--------------------
Total z-length 272 frames split into 55 segments.
Decoding segments individually...
  Segment 1/55 (z-index 0 to 5)...
  Time for Decoder (Part 1): 0.0081 sec
  Segment 2/55 (z-index 5 to 10)...
  Time for Decoder (Part 2): 0.0079 sec
  Segment 3/55 (z-index 10 to 15)...
  Time for Decoder (Part 3): 0.0079 sec
  Segment 4/55 (z-index 15 to 20)...
  Time for Decoder (Part 4): 0.0079 sec
  Segment 5/55 (z-index 20 to 25)...
  Time for Decoder (Part 5): 0.0079 sec
  Segment 6/55 (z-index 25 to 30)...
  Time for Decoder (Part 6): 0.0079 sec
  Segment 7/55 (z-index 30 to 35)...
  Time for Decoder (Part 7): 0.0082 sec
  Segment 8/55 (z-index 35 to 40)...
  Time for Decoder (Part 8): 0.0079 sec
  Segment 9/55 (z-index 40 to 45)...
  Time for Decoder (Part 9): 0.0080 sec
  Segment 10/55 (z-index 45 to 50)...
  Time for De


--- 🎧 Audio (Part 2) ---



--- 🎧 Audio (Part 3) ---



--- 🎧 Audio (Part 4) ---



--- 🎧 Audio (Part 5) ---



--- 🎧 Audio (Part 6) ---



--- 🎧 Audio (Part 7) ---



--- 🎧 Audio (Part 8) ---



--- 🎧 Audio (Part 9) ---



--- 🎧 Audio (Part 10) ---



--- 🎧 Audio (Part 11) ---



--- 🎧 Audio (Part 12) ---



--- 🎧 Audio (Part 13) ---



--- 🎧 Audio (Part 14) ---



--- 🎧 Audio (Part 15) ---



--- 🎧 Audio (Part 16) ---



--- 🎧 Audio (Part 17) ---



--- 🎧 Audio (Part 18) ---



--- 🎧 Audio (Part 19) ---



--- 🎧 Audio (Part 20) ---



--- 🎧 Audio (Part 21) ---



--- 🎧 Audio (Part 22) ---



--- 🎧 Audio (Part 23) ---



--- 🎧 Audio (Part 24) ---



--- 🎧 Audio (Part 25) ---



--- 🎧 Audio (Part 26) ---



--- 🎧 Audio (Part 27) ---



--- 🎧 Audio (Part 28) ---



--- 🎧 Audio (Part 29) ---



--- 🎧 Audio (Part 30) ---



--- 🎧 Audio (Part 31) ---



--- 🎧 Audio (Part 32) ---



--- 🎧 Audio (Part 33) ---



--- 🎧 Audio (Part 34) ---



--- 🎧 Audio (Part 35) ---



--- 🎧 Audio (Part 36) ---



--- 🎧 Audio (Part 37) ---



--- 🎧 Audio (Part 38) ---



--- 🎧 Audio (Part 39) ---



--- 🎧 Audio (Part 40) ---



--- 🎧 Audio (Part 41) ---



--- 🎧 Audio (Part 42) ---



--- 🎧 Audio (Part 43) ---



--- 🎧 Audio (Part 44) ---



--- 🎧 Audio (Part 45) ---



--- 🎧 Audio (Part 46) ---



--- 🎧 Audio (Part 47) ---



--- 🎧 Audio (Part 48) ---



--- 🎧 Audio (Part 49) ---



--- 🎧 Audio (Part 50) ---



--- 🎧 Audio (Part 51) ---



--- 🎧 Audio (Part 52) ---



--- 🎧 Audio (Part 53) ---



--- 🎧 Audio (Part 54) ---



--- 🎧 Audio (Part 55) ---



--- 🎧 Audio (Combined) ---


In [4]:
# (これより上は、元のコードの 'stn_tst = ...' までの前処理)
# stn_tst = torch.LongTensor(stn_tst)
# hps.data.sampling_rate が定義されている前提
# Generate full latent representation and then synthesize only the first half
# 日本語のカタカナテキストを入力
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"


# Phonemize using the revised cleaner
phonemized_text = japanese_cleaner_revised(text)

# For display purposes, get the intermediate katakana representation
katakana_for_display = pyopenjtalk.g2p(re.sub(r'({cough}|<cough>|\[.*\])', '', text), kana=True)

# --- Mora Count ---
mora_count = len(re.sub(r'[\s\[\]<>{}]', '', katakana_for_display))

print(f"Original text: {text}")
print(f"Phonemized (revised): {phonemized_text}")
print(f"Mora Count: {mora_count}")
print("--------------------")

stn_tst_full_compare = cleaned_text_to_sequence(phonemized_text)

# Add blank tokens
if hps.data.add_blank:
    stn_tst_full_compare = commons.intersperse(stn_tst_full_compare, 0)
stn_tst_full_compare = torch.LongTensor(stn_tst_full_compare)

start_time=time.time()

with torch.no_grad():
    x_tst_full_compare = stn_tst_full_compare.unsqueeze(0)
    x_tst_lengths_full_compare = torch.LongTensor([stn_tst_full_compare.size(0)])
    audio_full, _, _, _, _, timings = net_g.infer(x_tst_full_compare, x_tst_lengths_full_compare, noise_scale=.667, noise_scale_w=0.8, length_scale=1)
    audio_full = audio_full[0,0].data.cpu().float().numpy()

end_time = time.time()
elapsed_time = end_time - start_time
audio_duration = len(audio_full) / hps.data.sampling_rate
rtf = elapsed_time / audio_duration
print(f"Audio duration: {audio_duration:.2f} seconds")
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print(f"Real Time Factor (RTF): {rtf:.4f}")
ipd.display(ipd.Audio(audio_full, rate=hps.data.sampling_rate, normalize=False))

Original text: 最近、インターステラーを見たのですけど、すごく面白かったです。
Phonemized (revised): s a i k i N sp i N t a: s u t e r a: o m i t a n o d e s u k e d o sp s u g o k u o m o s i r o k a Q t a d e s u sp
Mora Count: 35
--------------------
Audio duration: 4.80 seconds
Elapsed time: 1.95 seconds
Real Time Factor (RTF): 0.4068


In [11]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Audio
plt.ioff()
# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

# --- z (潜在表現) の全体を生成 (共通処理) ---
# この処理はチャンクサイズに依存しないため、一度だけ実行します。
print("Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")
print("==================================================")


# --- 計測設定 ---
# チャンクサイズを 1 から 100 まで変化させる
chunk_sizes = list(range(1, 101))
# 計測結果を保存するリスト
decoder_time_part1 = []
tested_chunk_size = []


# --- 計測ループ ---
print("Starting measurement loop...")
for chunk_size in chunk_sizes:
    
    # 潜在表現 z を 'chunk_size' で分割し、最初のセグメントを取得
    start_idx = 0
    end_idx = min(start_idx + chunk_size, full_len)
    
    if end_idx == start_idx:
        # z-lengthが短すぎて分割できない場合や、chunk_sizeが大きすぎる場合はスキップ
        continue
    
    z_part = z[:, :, start_idx:end_idx]
    y_mask_part = y_mask[:, :, start_idx:end_idx]

    # デコード時間を計測
    with torch.no_grad():
        dec_start = time.time()
        # net_g.dec() は z * y_mask を入力として受け取る
        audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
        # 結果のテンソル操作を含めて時間を計測
        audio_part_np = audio_part[0,0].data.cpu().float().numpy() 
        dec_time = time.time() - dec_start
        
        decoder_time_part1.append(dec_time)
        tested_chunk_size.append(chunk_size)
        
    print(f"Chunk Size {chunk_size:3d} (Z-Len {z_part.shape[2]}): Time = {dec_time:.6f} sec")


# --- グラフ描画 ---
print("\n--- Generating Plot ---")

# データのクリーンアップと型変換 (前回の修正を維持)
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    # 変換失敗時はエラーを再送出
    raise

# データ点の整合性チェック
if len(tested_chunk_size_np) > 0:
    # グラフサイズの設定
    plt.figure(figsize=(10, 6))

    # 散布図と線グラフの描画
    plt.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', linestyle='-', color='b', label='Decoder Time (Part 1)')

    # タイトルとラベルの設定
    plt.title(f'Decoder Time vs. Chunk Size (First Segment)\nText Z-Length: {full_len} frames')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    # ★★★ ファイルに保存して結果を確認する ★★★
    filename = 'decoder_time_vs_chunk_size.pdf'
    plt.savefig(filename)
    print(f"グラフをファイル '{filename}' に保存しました。")
    
    # 図を閉じる（メモリクリーンアップ）
    plt.show()

else:
    print("WARNING: Data lists are empty. Skipping plot.")
    

# ★★★ 最終手段: MatplotlibのPylabヘルパーをクリーンアップ ★★★
# 環境によって動作が異なりますが、matplotlibの状態をリセットします
try:
    import matplotlib._pylab_helpers
    # 現在のfigure managerを削除することで、コールバックが何も描画しないようにします。
    # グラフが表示されない場合があるため、これは最終手段として試みます。
    # matplotlib._pylab_helpers.Gcf.destroy_all() 
    pass
except Exception:
    pass


print("\nMeasurement and plotting completed successfully (Attempted environment cleanup).")

Using device: cpu
Generating full latent representation (z) once...
Full z-length: 300 frames.
Starting measurement loop...
Chunk Size   1 (Z-Len 1): Time = 0.006888 sec
Chunk Size   2 (Z-Len 2): Time = 0.007020 sec
Chunk Size   3 (Z-Len 3): Time = 0.007227 sec
Chunk Size   4 (Z-Len 4): Time = 0.007399 sec
Chunk Size   5 (Z-Len 5): Time = 0.007885 sec
Chunk Size   6 (Z-Len 6): Time = 0.007826 sec
Chunk Size   7 (Z-Len 7): Time = 0.008183 sec
Chunk Size   8 (Z-Len 8): Time = 0.008073 sec
Chunk Size   9 (Z-Len 9): Time = 0.008643 sec
Chunk Size  10 (Z-Len 10): Time = 0.009395 sec
Chunk Size  11 (Z-Len 11): Time = 0.012321 sec
Chunk Size  12 (Z-Len 12): Time = 0.012249 sec
Chunk Size  13 (Z-Len 13): Time = 0.012775 sec
Chunk Size  14 (Z-Len 14): Time = 0.012810 sec
Chunk Size  15 (Z-Len 15): Time = 0.013060 sec
Chunk Size  16 (Z-Len 16): Time = 0.013626 sec
Chunk Size  17 (Z-Len 17): Time = 0.014743 sec
Chunk Size  18 (Z-Len 18): Time = 0.014384 sec
Chunk Size  19 (Z-Len 19): Time = 0.014

In [None]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, Audio
import matplotlib

# --- 環境安定化のため、バックエンドを設定 (ファイル保存時などに推奨) ---
# GUI描画が必要な環境でなければ、この行は有効にしても安全です。
# matplotlib.use('Agg') 
plt.ioff() # Jupyter/Colab環境での自動描画を抑制

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. テキスト全体を一度に合成した場合の時間を計測 (基準線)
# =================================================================

print("1. Measuring Full Synthesis Time (Baseline)...")
full_synthesis_start_time = time.time()
with torch.no_grad():
    # net_g.infer を使用して、Prior Encoder + Decoder を一度に実行
    audio_full, _, _, _, _, timings = net_g.infer(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
    audio_full_np = audio_full[0,0].data.cpu().float().numpy() 

full_synthesis_time = time.time() - full_synthesis_start_time
print(f"Time for Full Synthesis (net_g.infer): {full_synthesis_time:.6f} sec")
print("==================================================")


# =================================================================
# 2. チャンク分割合成の計測 (Z生成は共通)
# =================================================================

# --- z (潜在表現) の全体を生成 (共通処理) ---
print("2. Generating full latent representation (z) once for chunking...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")
print("==================================================")


# --- 計測設定 ---
chunk_sizes = list(range(1, 101))
decoder_time_part1 = []
tested_chunk_size = []


# --- 計測ループ ---
print("3. Starting measurement loop for Decoder Time (Part 1)...")
for chunk_size in chunk_sizes:
    
    start_idx = 0
    end_idx = min(start_idx + chunk_size, full_len)
    
    if end_idx == start_idx:
        continue
    
    z_part = z[:, :, start_idx:end_idx]
    y_mask_part = y_mask[:, :, start_idx:end_idx]

    # デコード時間を計測
    with torch.no_grad():
        dec_start = time.time()
        audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
        audio_part_np = audio_part[0,0].data.cpu().float().numpy() 
        dec_time = time.time() - dec_start
        
        decoder_time_part1.append(dec_time)
        tested_chunk_size.append(chunk_size)
        
    # print(f"Chunk Size {chunk_size:3d}: Time = {dec_time:.6f} sec")

print("Measurement completed.")


# =================================================================
# 4. グラフ描画
# =================================================================

print("\n--- 4. Generating Plot ---")

# データのクリーンアップと型変換
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(tested_chunk_size_np) == 0:
    print("WARNING: Data lists are empty. Skipping plot.")
else:
    plt.close('all') 
    plt.figure(figsize=(12, 7)) # グラフサイズを少し大きく変更 

    # チャンク分割デコード時間 (青線)
    plt.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', linestyle='-', color='b', label='Decoder Time (Part 1)')

    # ★★★ 全体合成時間を横線で追加 (赤破線) ★★★
    # X軸の最大値まで線を引き、全体合成時間を可視化
    plt.axhline(
        y=full_synthesis_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Synthesis Time (Total, net_g.infer): {full_synthesis_time:.4f} sec'
    )

    # タイトルとラベルの設定
    plt.title(f'Decoder Time vs. Chunk Size (First Segment) vs. Full Synthesis\nText Z-Length: {full_len} frames')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Decoding Time (Seconds)')

    # グリッドの表示
    plt.grid(True, linestyle='--', alpha=0.6)

    # 凡例の表示
    plt.legend()

    # グラフの表示
    plt.show()

print("\nMeasurement and plotting completed successfully.")

In [16]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib # importを分けても問題ありません

# --- 環境安定化のため、バックエンドを設定 (PDF保存のためにAggが安全) ---
matplotlib.use('Agg') 
plt.ioff() # Jupyter/Colab環境での自動描画を抑制

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. テキスト全体を一度に合成した場合の時間を計測 (基準線)
# =================================================================

print("1. Measuring Full Synthesis Time (Baseline)...")
full_synthesis_start_time = time.time()
with torch.no_grad():
    audio_full, _, _, _, _, timings = net_g.infer(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
    audio_full_np = audio_full[0,0].data.cpu().float().numpy() 

full_synthesis_time = time.time() - full_synthesis_start_time
print(f"Time for Full Synthesis (net_g.infer): {full_synthesis_time:.6f} sec")
print("==================================================")


# =================================================================
# 2. チャンク分割合成の計測 (Z生成は共通)
# =================================================================

# --- z (潜在表現) の全体を生成 (共通処理) ---
print("2. Generating full latent representation (z) once for chunking...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")
print("==================================================")


# --- 計測設定 ---
chunk_sizes = list(range(1, 300))
decoder_time_part1 = []
tested_chunk_size = []


# --- 計測ループ ---
print("3. Starting measurement loop for Decoder Time (Part 1)...")
for chunk_size in chunk_sizes:
    
    start_idx = 0
    end_idx = min(start_idx + chunk_size, full_len)
    
    if end_idx == start_idx:
        continue
    
    z_part = z[:, :, start_idx:end_idx]
    y_mask_part = y_mask[:, :, start_idx:end_idx]

    # デコード時間を計測
    with torch.no_grad():
        dec_start = time.time()
        audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
        audio_part_np = audio_part[0,0].data.cpu().float().numpy() 
        dec_time = time.time() - dec_start
        
        decoder_time_part1.append(dec_time)
        tested_chunk_size.append(chunk_size)

print("Measurement completed.")


# =================================================================
# 4. グラフ描画とPDF保存
# =================================================================

print("\n--- 4. Generating Plot and Saving as PDF ---")

# データのクリーンアップと型変換
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(tested_chunk_size_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    # 既存の図を全て閉じる
    plt.close('all') 
    plt.figure(figsize=(12, 7))

    # チャンク分割デコード時間 (青線)
    plt.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', color='b', label='Decoder Time (Part 1)')

    # 全体合成時間を横線で追加 (赤破線)
    plt.axhline(
        y=full_synthesis_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Synthesis Time (Total, net_g.infer): {full_synthesis_time:.4f} sec'
    )

    # タイトルとラベルの設定
    plt.title(f'Decoder Time vs. Chunk Size (First Segment) vs. Full Synthesis\nText Z-Length: {full_len} frames')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    # ★★★ PDFファイルとして保存 ★★★
    pdf_filename = 'decoder_time_vs_chunk_size.pdf'
    plt.savefig(pdf_filename, format='pdf', bbox_inches='tight')
    
    print(f"✅ グラフを PDFファイル '{pdf_filename}' に保存しました。")
    
    # 図を閉じる（メモリクリーンアップ）
    plt.close()

print("\nMeasurement and saving completed.")

Using device: cpu
1. Measuring Full Synthesis Time (Baseline)...
Time for Full Synthesis (net_g.infer): 0.152447 sec
2. Generating full latent representation (z) once for chunking...
Full z-length: 300 frames.
3. Starting measurement loop for Decoder Time (Part 1)...
Measurement completed.

--- 4. Generating Plot and Saving as PDF ---
✅ グラフを PDFファイル 'decoder_time_vs_chunk_size.pdf' に保存しました。

Measurement and saving completed.


In [21]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from IPython.display import display, Audio 

# --- 環境安定化のため、バックエンドを設定 ---
matplotlib.use('Agg') 
plt.ioff()

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. 潜在表現 z の全体を生成 (全処理の共通ステップ)
# =================================================================

print("1. Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")
print("==================================================")


# =================================================================
# 2. 潜在表現 z 全体を一度にデコードした場合の時間を計測 (基準線)
# =================================================================

print("2. Measuring Full Decoder Time (Baseline)...")
full_decoder_start_time = time.time()
with torch.no_grad():
    # z全体 (Prior Encoderの出力) を使って、Decoder単体 (net_g.dec) を実行
    audio_full_dec, _ = net_g.dec((z * y_mask), g=None)
    audio_full_dec_np = audio_full_dec[0,0].data.cpu().float().numpy() 

# 基準となるのはデコーダー単体での実行時間
full_decoder_time = time.time() - full_decoder_start_time 
print(f"Time for Full Decoder (net_g.dec): {full_decoder_time:.6f} sec")
print("==================================================")


# =================================================================
# 3. チャンク分割合成の計測
# =================================================================

# --- 計測設定 ---
chunk_sizes = list(range(1, 300))
decoder_time_part1 = []
tested_chunk_size = []


# --- 計測ループ ---
print("3. Starting measurement loop for Decoder Time (Part 1)...")
for chunk_size in chunk_sizes:
    
    start_idx = 0
    end_idx = min(start_idx + chunk_size, full_len)
    
    if end_idx == start_idx:
        continue
    
    z_part = z[:, :, start_idx:end_idx]
    y_mask_part = y_mask[:, :, start_idx:end_idx]

    # デコード時間を計測
    with torch.no_grad():
        dec_start = time.time()
        # チャンクサイズの Decoder 単体 (net_g.dec) を実行
        audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
        audio_part_np = audio_part[0,0].data.cpu().float().numpy() 
        dec_time = time.time() - dec_start
        
        decoder_time_part1.append(dec_time)
        tested_chunk_size.append(chunk_size)

print("Measurement completed.")


# =================================================================
# 4. グラフ描画とPDF保存
# =================================================================

print("\n--- 4. Generating Plot and Saving as PDF ---")

# データのクリーンアップと型変換
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(tested_chunk_size_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    plt.close('all') 
    plt.figure(figsize=(12, 7))

    # チャンク分割デコード時間 (青線)
    plt.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', color='b', label='Decoder Time (Part 1, Chunked)')
    

    # ★★★ 全体デコーダー時間を横線で追加 (赤破線) ★★★
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )

    # タイトルとラベルの設定
    plt.title(f'Decoder Time vs. Chunk Size Comparison\nBaseline: Full Decoder Time ({full_len} frames)')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    # PDFファイルとして保存
    pdf_filename = 'decoder_time_vs_chunk_size_comparison.pdf'
    plt.savefig(pdf_filename, format='pdf', bbox_inches='tight')
    
    print(f"✅ グラフを PDFファイル '{pdf_filename}' に保存しました。")
    
    # 図を閉じる（メモリクリーンアップ）
    plt.close()

print("\nMeasurement and saving completed.")

Using device: cpu
1. Generating full latent representation (z) once...
Full z-length: 300 frames.
2. Measuring Full Decoder Time (Baseline)...
Time for Full Decoder (net_g.dec): 0.119709 sec
3. Starting measurement loop for Decoder Time (Part 1)...
Measurement completed.

--- 4. Generating Plot and Saving as PDF ---
✅ グラフを PDFファイル 'decoder_time_vs_chunk_size_comparison.pdf' に保存しました。

Measurement and saving completed.


In [32]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib 
from IPython.display import display, Audio 

# --- 環境安定化のため、バックエンドを設定 ---
matplotlib.use('Agg') 
plt.ioff()

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "オムライスにはケチャップが一番。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. 潜在表現 z の全体を生成 
# =================================================================

print("1. Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")


# =================================================================
# 2. 潜在表現 z 全体を一度にデコードした場合の時間を計測 (基準線)
# =================================================================

print("2. Measuring Full Decoder Time (Non-Chunked Baseline)...")
full_decoder_start_time = time.time()
with torch.no_grad():
    audio_full_dec, _ = net_g.dec((z * y_mask), g=None)
    audio_full_dec_np = audio_full_dec[0,0].data.cpu().float().numpy() 

full_decoder_time = time.time() - full_decoder_start_time 
print(f"Time for Full Decoder (net_g.dec): {full_decoder_time:.6f} sec")
print("==================================================")


# =================================================================
# 3. チャンク分割合成の計測
# =================================================================

# --- 計測設定 ---
chunk_sizes = list(range(1, 101))
decoder_time_part1 = []  # グラフ1用: 最初のチャンクの時間
decoder_time_total = []  # グラフ2用: 全チャンクのデコード時間合計
tested_chunk_size = []


# --- 計測ループ ---
print("3. Starting measurement loop for Chunked Decoder Times...")
for chunk_size in chunk_sizes:
    
    # 潜在表現 z を 'chunk_size' ごとに分割
    all_segments = []
    current_total_dec_time = 0
    
    for start_idx in range(0, full_len, chunk_size):
        end_idx = min(start_idx + chunk_size, full_len)
        if end_idx == start_idx:
            continue
            
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]
        
        # デコード時間を計測
        with torch.no_grad():
            dec_start = time.time()
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            dec_time = time.time() - dec_start
        
        # 最初のパートの時間を記録 (グラフ1用)
        if start_idx == 0:
            decoder_time_part1.append(dec_time)
            
        # 全チャンクの合計時間に追加 (グラフ2用)
        current_total_dec_time += dec_time

    # 全デコード時間を記録 (グラフ2用)
    if current_total_dec_time > 0:
        decoder_time_total.append(current_total_dec_time)
        tested_chunk_size.append(chunk_size)
        
print("Measurement completed.")


# =================================================================
# 4. グラフ描画とPDF保存
# =================================================================

print("\n--- 4. Generating Plots and Saving as PDF ---")

# データのクリーンアップと型変換
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
    decoder_time_total_np = np.array(decoder_time_total, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(tested_chunk_size_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    plt.close('all') 
    
    # PDFファイルとして保存
    pdf_filename = 'decoder_time_vs_chunk_size_comparison_combined.pdf'
    
    # --- 図の作成 ---
    fig, axes = plt.subplots(2, 1, figsize=(12, 12)) # 2つのサブプロットを作成 
    
    # --- グラフ 1: Part 1 Decoder Time ---
    ax1 = axes[0]
    ax1.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', color='b', label='Decoder Time (Part 1, Chunked)')
    ax1.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    ax1.set_title(f'Graph 1: First Chunk Decoder Time vs. Chunk Size\n(Measures per-chunk overhead)')
    ax1.set_xlabel('Chunk Size (Frames)')
    ax1.set_ylabel('Decoding Time (Seconds)')
    ax1.grid(True, linestyle='--', alpha=0.6)
    ax1.legend()
    
    # --- グラフ 2: Total Decoder Time ---
    ax2 = axes[1]
    ax2.plot(tested_chunk_size_np, decoder_time_total_np, marker='o', color='g', label='Total Decoder Time (Sum of all chunks)')
    ax2.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    ax2.set_title(f'Graph 2: Total Chunked Decoder Time vs. Chunk Size\n(Measures total throughput/efficiency)')
    ax2.set_xlabel('Chunk Size (Frames)')
    ax2.set_ylabel('Total Decoding Time (Seconds)')
    ax2.grid(True, linestyle='--', alpha=0.6)
    ax2.legend()
    
    # レイアウトの調整
    plt.tight_layout()
    
    # PDFファイルとして保存
    plt.savefig(pdf_filename, format='pdf', bbox_inches='tight')
    
    print(f"✅ 2つのグラフを PDFファイル '{pdf_filename}' に保存しました。")
    
    # 図を閉じる（メモリクリーンアップ）
    plt.close()

print("\nMeasurement and saving completed.")

Using device: cpu
1. Generating full latent representation (z) once...
Full z-length: 123 frames.
2. Measuring Full Decoder Time (Non-Chunked Baseline)...
Time for Full Decoder (net_g.dec): 0.049999 sec
3. Starting measurement loop for Chunked Decoder Times...
Measurement completed.

--- 4. Generating Plots and Saving as PDF ---
✅ 2つのグラフを PDFファイル 'decoder_time_vs_chunk_size_comparison_combined.pdf' に保存しました。

Measurement and saving completed.


In [26]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib 
from IPython.display import display, Audio 

# --- 環境安定化のため、バックエンドを設定 ---
matplotlib.use('Agg') 
plt.ioff()

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. 潜在表現 z の全体を生成 
# =================================================================

print("1. Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")


# =================================================================
# 2. 潜在表現 z 全体を一度にデコードした場合の時間を計測 (基準線)
# =================================================================

print("2. Measuring Full Decoder Time (Non-Chunked Baseline)...")
full_decoder_start_time = time.time()
with torch.no_grad():
    audio_full_dec, _ = net_g.dec((z * y_mask), g=None)
    audio_full_dec_np = audio_full_dec[0,0].data.cpu().float().numpy() 

full_decoder_time = time.time() - full_decoder_start_time 
print(f"Time for Full Decoder (net_g.dec): {full_decoder_time:.6f} sec")
print("==================================================")


# =================================================================
# 3. チャンク分割合成の計測
# =================================================================

# --- 計測設定 ---
chunk_sizes = list(range(1, 101))
decoder_time_part1 = []  # グラフ1用: 最初のチャンクの時間
decoder_time_total = []  # グラフ2用: 全チャンクのデコード時間合計
tested_chunk_size = []


# --- 計測ループ ---
print("3. Starting measurement loop for Chunked Decoder Times...")
for chunk_size in chunk_sizes:
    
    # 潜在表現 z を 'chunk_size' ごとに分割
    all_segments = []
    current_total_dec_time = 0
    
    for start_idx in range(0, full_len, chunk_size):
        end_idx = min(start_idx + chunk_size, full_len)
        if end_idx == start_idx:
            continue
            
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]
        
        # デコード時間を計測
        with torch.no_grad():
            dec_start = time.time()
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            dec_time = time.time() - dec_start
        
        # 最初のパートの時間を記録 (グラフ1用)
        if start_idx == 0:
            decoder_time_part1.append(dec_time)
            
        # 全チャンクの合計時間に追加 (グラフ2用)
        current_total_dec_time += dec_time

    # 全デコード時間を記録 (グラフ2用)
    if current_total_dec_time > 0:
        decoder_time_total.append(current_total_dec_time)
        tested_chunk_size.append(chunk_size)
        
print("Measurement completed.")


# =================================================================
# 4. グラフ描画と個別PDF保存
# =================================================================

print("\n--- 4. Generating Plots and Saving as Individual PDFs ---")

# データのクリーンアップと型変換
try:
    tested_chunk_size_np = np.array(tested_chunk_size, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
    decoder_time_total_np = np.array(decoder_time_total, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(tested_chunk_size_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    # -----------------------------------------------------
    # グラフ 1: Part 1 Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all') 
    fig1 = plt.figure(figsize=(10, 6))

    plt.plot(tested_chunk_size_np, decoder_time_part1_np, marker='o', color='b', label='Decoder Time (Part 1, Chunked)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'First Chunk Decoder Time vs. Chunk Size\n(Measures per-chunk overhead)')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_1 = 'decoder_time_part1.pdf'
    plt.savefig(pdf_filename_1, format='pdf', bbox_inches='tight')
    plt.close(fig1)
    print(f"✅ グラフ 1 を PDFファイル '{pdf_filename_1}' に保存しました。")


    # -----------------------------------------------------
    # グラフ 2: Total Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all')
    fig2 = plt.figure(figsize=(10, 6))

    plt.plot(tested_chunk_size_np, decoder_time_total_np, marker='o', color='g', label='Total Decoder Time (Sum of all chunks)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'Total Chunked Decoder Time vs. Chunk Size\n(Measures total throughput/efficiency)')
    plt.xlabel('Chunk Size (Frames)')
    plt.ylabel('Total Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_2 = 'decoder_time_total.pdf'
    plt.savefig(pdf_filename_2, format='pdf', bbox_inches='tight')
    plt.close(fig2)
    print(f"✅ グラフ 2 を PDFファイル '{pdf_filename_2}' に保存しました。")

print("\nMeasurement and saving completed.")

Using device: cpu
1. Generating full latent representation (z) once...
Full z-length: 300 frames.
2. Measuring Full Decoder Time (Non-Chunked Baseline)...
Time for Full Decoder (net_g.dec): 0.093051 sec
3. Starting measurement loop for Chunked Decoder Times...
Measurement completed.

--- 4. Generating Plots and Saving as Individual PDFs ---
✅ グラフ 1 を PDFファイル 'decoder_time_part1.pdf' に保存しました。
✅ グラフ 2 を PDFファイル 'decoder_time_total.pdf' に保存しました。

Measurement and saving completed.


In [5]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib 
import math # mathモジュールをインポート
from IPython.display import display, Audio 

# --- 環境安定化のため、バックエンドを設定 ---
matplotlib.use('Agg') 
plt.ioff()

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. 潜在表現 z の全体を生成 
# =================================================================

print("1. Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")


# =================================================================
# 2. 潜在表現 z 全体を一度にデコードした場合の時間を計測 (基準線)
# =================================================================

print("2. Measuring Full Decoder Time (Non-Chunked Baseline)...")
full_decoder_start_time = time.time()
with torch.no_grad():
    audio_full_dec, _ = net_g.dec((z * y_mask), g=None)
    audio_full_dec_np = audio_full_dec[0,0].data.cpu().float().numpy() 

full_decoder_time = time.time() - full_decoder_start_time 
print(f"Time for Full Decoder (net_g.dec): {full_decoder_time:.6f} sec")
print("==================================================")


# =================================================================
# 3. チャンク分割合成の計測と分割数の計算
# =================================================================

# --- 計測設定 ---
chunk_sizes = list(range(1, 101))
decoder_time_part1 = []  
decoder_time_total = [] 
tested_chunk_size = []
# ★★★ 横軸用のリスト ★★★
number_of_chunks = []


# --- 計測ループ ---
print("3. Starting measurement loop for Chunked Decoder Times...")
for chunk_size in chunk_sizes:
    
    # 潜在表現 z を 'chunk_size' ごとに分割
    all_segments = []
    current_total_dec_time = 0
    num_chunks = 0
    
    # 分割数を計算 (ceil(full_len / chunk_size))
    # ★★★ 横軸の値の計算 ★★★
    num_chunks = math.ceil(full_len / chunk_size)

    for start_idx in range(0, full_len, chunk_size):
        end_idx = min(start_idx + chunk_size, full_len)
        if end_idx == start_idx:
            continue
            
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]
        
        # デコード時間を計測
        with torch.no_grad():
            dec_start = time.time()
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            dec_time = time.time() - dec_start
        
        # 最初のパートの時間を記録
        if start_idx == 0:
            decoder_time_part1.append(dec_time)
            
        # 全チャンクの合計時間に追加
        current_total_dec_time += dec_time

    # 全デコード時間と分割数を記録
    if current_total_dec_time > 0:
        decoder_time_total.append(current_total_dec_time)
        number_of_chunks.append(num_chunks) # 分割数を記録
        
print("Measurement completed.")


# =================================================================
# 4. グラフ描画と個別PDF保存
# =================================================================

print("\n--- 4. Generating Plots and Saving as Individual PDFs (X-axis: # of Chunks) ---")

# データのクリーンアップと型変換
try:
    # ★★★ 横軸データもNumPy配列に変換 ★★★
    number_of_chunks_np = np.array(number_of_chunks, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
    decoder_time_total_np = np.array(decoder_time_total, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(number_of_chunks_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    # -----------------------------------------------------
    # グラフ 1: Part 1 Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all') 
    fig1 = plt.figure(figsize=(10, 6))

    # ★★★ 横軸を number_of_chunks_np に変更 ★★★
    plt.plot(number_of_chunks_np, decoder_time_part1_np, marker='o', color='b', label='Decoder Time (Part 1, Chunked)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'Graph 1: First Chunk Decoder Time vs. Number of Chunks\n(Measures per-chunk overhead)')
    # ★★★ 横軸ラベルの変更 ★★★
    plt.xlabel('Number of Chunks (Splits)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_1 = 'decoder_time_part1_by_splits.pdf'
    plt.savefig(pdf_filename_1, format='pdf', bbox_inches='tight')
    plt.close(fig1)
    print(f"✅ グラフ 1 を PDFファイル '{pdf_filename_1}' に保存しました。")


    # -----------------------------------------------------
    # グラフ 2: Total Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all')
    fig2 = plt.figure(figsize=(10, 6))

    # ★★★ 横軸を number_of_chunks_np に変更 ★★★
    plt.plot(number_of_chunks_np, decoder_time_total_np, marker='o', color='g', label='Total Decoder Time (Sum of all chunks)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'Graph 2: Total Chunked Decoder Time vs. Number of Chunks\n(Measures total throughput/efficiency)')
    # ★★★ 横軸ラベルの変更 ★★★
    plt.xlabel('Number of Chunks (Splits)')
    plt.ylabel('Total Decoding Time (Seconds)')
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_2 = 'decoder_time_total_by_splits.pdf'
    plt.savefig(pdf_filename_2, format='pdf', bbox_inches='tight')
    plt.close(fig2)
    print(f"✅ グラフ 2 を PDFファイル '{pdf_filename_2}' に保存しました。")

print("\nMeasurement and saving completed.")

Using device: cpu
1. Generating full latent representation (z) once...
Full z-length: 300 frames.
2. Measuring Full Decoder Time (Non-Chunked Baseline)...
Time for Full Decoder (net_g.dec): 0.071599 sec
3. Starting measurement loop for Chunked Decoder Times...
Measurement completed.

--- 4. Generating Plots and Saving as Individual PDFs (X-axis: # of Chunks) ---
✅ グラフ 1 を PDFファイル 'decoder_time_part1_by_splits.pdf' に保存しました。
✅ グラフ 2 を PDFファイル 'decoder_time_total_by_splits.pdf' に保存しました。

Measurement and saving completed.


In [7]:
import torch
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib 
import math 
import random
from IPython.display import display, Audio 

# --- 環境安定化のため、バックエンドを設定 ---
matplotlib.use('Agg') 
plt.ioff()

# --- 前提条件: 外部で定義されているオブジェクト ---
# hps, net_g, commons, japanese_cleaner_revised, cleaned_text_to_sequence, pyopenjtalk
# が定義されていることを前提とします。

# --- デバイス設定 ---
device = torch.device('cpu') # CPUで実行
print(f"Using device: {device}")
net_g.to(device)
net_g.eval()

# --- 入力テキスト ---
text = "最近、インターステラーを見たのですけど、すごく面白かったです。"

# --- 前処理 (共通) ---
phonemized_text = japanese_cleaner_revised(text)
stn_tst = cleaned_text_to_sequence(phonemized_text)
if hps.data.add_blank:
    stn_tst = commons.intersperse(stn_tst, 0)
stn_tst = torch.LongTensor(stn_tst).to(device)

# VITSの推論に必要な共通入力
x_tst = stn_tst.unsqueeze(0)
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)


# =================================================================
# 1. 潜在表現 z の全体を生成 
# =================================================================

print("1. Generating full latent representation (z) once...")
with torch.no_grad():
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, 
        noise_scale=.667, noise_scale_w=0.8, length_scale=1
    )
full_len = z.shape[2]
print(f"Full z-length: {full_len} frames.")


# =================================================================
# 2. 潜在表現 z 全体を一度にデコードした場合の時間を計測 (基準線)
# =================================================================

print("2. Measuring Full Decoder Time (Non-Chunked Baseline)...")
full_decoder_start_time = time.time()
with torch.no_grad():
    audio_full_dec, _ = net_g.dec((z * y_mask), g=None)
    audio_full_dec_np = audio_full_dec[0,0].data.cpu().float().numpy() 

full_decoder_time = time.time() - full_decoder_start_time 
print(f"Time for Full Decoder (net_g.dec): {full_decoder_time:.6f} sec")
print("==================================================")


# =================================================================
# 3. チャンク分割合成の計測と分割数の計算
# =================================================================

# --- 計測設定 ---
target_splits = list(range(1, 101))
decoder_time_part1 = []  
decoder_time_total = [] 
# ★★★ 横軸用のデータとして target_splits (1〜100) を使用する ★★★
number_of_chunks_plot = [] 


# --- 計測ループ ---
print("3. Starting measurement loop for Chunked Decoder Times (Splits 1 to 100)...")
for target_split_count in target_splits:
    
    # 目標分割数 (N) に基づいて、最適なチャンクサイズを逆算
    chunk_size = math.ceil(full_len / target_split_count)
    
    current_total_dec_time = 0
    dec_time_part1 = 0
    
    for start_idx in range(0, full_len, chunk_size):
        end_idx = min(start_idx + chunk_size, full_len)
        if end_idx == start_idx:
            continue
            
        z_part = z[:, :, start_idx:end_idx]
        y_mask_part = y_mask[:, :, start_idx:end_idx]
        
        # デコード時間を計測
        with torch.no_grad():
            dec_start = time.time()
            audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
            dec_time = time.time() - dec_start
        
        # 最初のパートの時間を記録
        if start_idx == 0:
            dec_time_part1 = dec_time
            
        # 全チャンクの合計時間に追加
        current_total_dec_time += dec_time

    # 全デコード時間と目標分割数を記録
    if current_total_dec_time > 0:
        decoder_time_total.append(current_total_dec_time)
        decoder_time_part1.append(dec_time_part1)
        # ★★★ 目標分割数 (1〜100) をそのまま記録 ★★★
        number_of_chunks_plot.append(target_split_count)
        
print("Measurement completed.")


# =================================================================
# 4. グラフ描画と個別PDF保存
# =================================================================

print("\n--- 4. Generating Plots and Saving as Individual PDFs (X-axis: Target Splits 1-100) ---")

# データのクリーンアップと型変換
try:
    number_of_chunks_np = np.array(number_of_chunks_plot, dtype=np.int32)
    decoder_time_part1_np = np.array(decoder_time_part1, dtype=np.float64)
    decoder_time_total_np = np.array(decoder_time_total, dtype=np.float64)
except Exception as e:
    print(f"ERROR: Could not convert data lists to NumPy arrays. Error: {e}")
    raise


if len(number_of_chunks_np) == 0:
    print("WARNING: Data lists are empty. Skipping save.")
else:
    # -----------------------------------------------------
    # グラフ 1: Part 1 Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all') 
    fig1 = plt.figure(figsize=(10, 6))

    # ★★★ 横軸に目標分割数をそのまま使用 ★★★
    plt.plot(number_of_chunks_np, decoder_time_part1_np, marker='o', color='b', linestyle='None', label='Decoder Time (Part 1, Measured)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'Graph 1: First Chunk Decoder Time vs. Target Splits\n(Measures per-chunk overhead)')
    plt.xlabel('Target Number of Chunks (Splits)')
    plt.ylabel('Decoding Time (Seconds)')
    plt.xlim(1, 100) 
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_1 = 'decoder_time_part1_target_splits.pdf'
    plt.savefig(pdf_filename_1, format='pdf', bbox_inches='tight')
    plt.close(fig1)
    print(f"✅ グラフ 1 を PDFファイル '{pdf_filename_1}' に保存しました。")


    # -----------------------------------------------------
    # グラフ 2: Total Decoder Time (個別 PDF)
    # -----------------------------------------------------
    plt.close('all')
    fig2 = plt.figure(figsize=(10, 6))

    # ★★★ 横軸に目標分割数をそのまま使用 ★★★
    plt.plot(number_of_chunks_np, decoder_time_total_np, marker='o', color='g', linestyle='None', label='Total Decoder Time (Measured)')
    plt.axhline(
        y=full_decoder_time, 
        color='r', 
        linestyle='--', 
        label=f'Full Decoder Time (Non-Chunked): {full_decoder_time:.4f} sec'
    )
    plt.title(f'Graph 2: Total Chunked Decoder Time vs. Target Splits\n(Measures total throughput/efficiency)')
    plt.xlabel('Target Number of Chunks (Splits)')
    plt.ylabel('Total Decoding Time (Seconds)')
    plt.xlim(1, 100)
    plt.grid(True, linestyle='--', alpha=0.6)
    plt.legend()
    
    pdf_filename_2 = 'decoder_time_total_target_splits.pdf'
    plt.savefig(pdf_filename_2, format='pdf', bbox_inches='tight')
    plt.close(fig2)
    print(f"✅ グラフ 2 を PDFファイル '{pdf_filename_2}' に保存しました。")

print("\nMeasurement and saving completed.")

Using device: cpu
1. Generating full latent representation (z) once...
Full z-length: 300 frames.
2. Measuring Full Decoder Time (Non-Chunked Baseline)...
Time for Full Decoder (net_g.dec): 0.090867 sec
3. Starting measurement loop for Chunked Decoder Times (Splits 1 to 100)...
Measurement completed.

--- 4. Generating Plots and Saving as Individual PDFs (X-axis: Target Splits 1-100) ---
✅ グラフ 1 を PDFファイル 'decoder_time_part1_target_splits.pdf' に保存しました。
✅ グラフ 2 を PDFファイル 'decoder_time_total_target_splits.pdf' に保存しました。

Measurement and saving completed.


In [9]:
import numpy as np
import time
import torch
import IPython.display as ipd

# ▼▼▼ 変更可能なパラメータ ▼▼▼
# 分割するzのフレーム数（メインのチャンクサイズ）
chunk_size_frames = 5

# オーバーラップさせるzのフレーム数
# ここの値が大きいほどつなぎ目は滑らかになりますが、計算量が増えます（2〜5程度推奨）
overlap_z_frames = 5
# ▲▲▲ 変更可能なパラメータ ▲▲▲

if chunk_size_frames <= 0:
    print("Error: chunk_size_frames must be greater than 0")
else:
    print(f"Splitting latent representation into chunks of {chunk_size_frames} frames with {overlap_z_frames} frames overlap...")

    # 1. 全体の処理時間を計測開始
    total_start_time = time.time() 

    # 結果を格納するリスト
    decoding_times = []
    decoded_audios_np = []

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        
        # 2. z (潜在表現) の全体を生成
        print("Generating full latent representation (z)...")
        z_gen_start_time = time.time()
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, x_tst_lengths, 
            noise_scale=.667, noise_scale_w=0.8, length_scale=1
        )
        z_gen_time = time.time() - z_gen_start_time
        print(f"Time for infer_z_only (Prior Encoder): {z_gen_time:.4f} sec")
        print("--------------------")

        # 3. 潜在表現 z を 'chunk_size_frames' ごとに、オーバーラップを含めて分割
        full_len = z.shape[2]
        
        all_segments = []
        # stepはオーバーラップを含まない純粋な進行量
        step = chunk_size_frames 
        
        for start_idx in range(0, full_len, step):
            # 終了位置は オーバーラップ分を余計に含める
            # ただし full_len を超えないようにする
            end_idx = min(start_idx + step + overlap_z_frames, full_len)
            
            # 実際のオーバーラップ量を計算（最後の方で足りなくなる場合に対応）
            # 次のチャンクの開始位置(start_idx + step)と現在の終了位置(end_idx)の差
            current_overlap = max(0, end_idx - (start_idx + step))
            
            z_part = z[:, :, start_idx:end_idx]
            y_mask_part = y_mask[:, :, start_idx:end_idx]
            
            # あとで結合時に使うために current_overlap も保存
            all_segments.append({
                "z_part": z_part,
                "y_mask_part": y_mask_part,
                "start": start_idx,
                "end": end_idx,
                "z_overlap": current_overlap
            })
        
        print(f"Total z-length {full_len} frames split into {len(all_segments)} segments.")

        # 4. 各セグメントを個別にデコード
        print("Decoding segments individually...")
        
        for i, seg in enumerate(all_segments):
            z_part = seg["z_part"]
            y_mask_part = seg["y_mask_part"]
            
            print(f"  Segment {i+1}/{len(all_segments)} (z-index {seg['start']} to {seg['end']})...")

            dec_start = time.time()
            if z_part.shape[2] > 0:
                audio_part, _ = net_g.dec((z_part * y_mask_part), g=None)
                audio_part_np = audio_part[0,0].data.cpu().float().numpy()
            else:
                audio_part_np = np.array([], dtype=np.float32)
            
            dec_time = time.time() - dec_start
            
            decoding_times.append(dec_time)
            
            # 結果とオーバーラップ情報を辞書に追加して保存
            seg["audio"] = audio_part_np
            print(f"  Time for Decoder (Part {i+1}): {dec_time:.4f} sec")

    # 5. Overlap and Add (クロスフェード) で結合
    print("--------------------")
    print("Stitching audio with Overlap-Add...")
    
    # アップサンプル率の推定（音声長 / z長）
    if len(all_segments) > 0 and all_segments[0]["z_part"].shape[2] > 0:
        # 最初のセグメントで計算
        ref_audio_len = len(all_segments[0]["audio"])
        ref_z_len = all_segments[0]["z_part"].shape[2]
        upsample_ratio = ref_audio_len / ref_z_len
    else:
        upsample_ratio = 256 # fallback (typical hop_length)

    final_audio = []
    
    # 最初のセグメントはそのまま開始点とする
    if len(all_segments) > 0:
        current_audio = all_segments[0]["audio"]
        current_z_overlap = all_segments[0]["z_overlap"]
        
        # 最初のセグメントの「オーバーラップ部分以外」をバッファに入れる
        # もし次のセグメントがないなら全部入れる
        if len(all_segments) == 1:
             final_audio.append(current_audio)
        else:
            # 音声におけるオーバーラップサンプル数
            # (厳密には z_overlap * upsample_ratio だが、多少のズレ許容のため計算で出す)
            # ここでは「次のチャンクのために残しておくべき長さ」を計算
            overlap_samples = int(current_z_overlap * upsample_ratio)
            
            # オーバーラップ直前までを確定させる
            keep_len = len(current_audio) - overlap_samples
            final_audio.append(current_audio[:keep_len])
            
            # 次のループのために「前のセグメントの尻尾（オーバーラップ部分）」を保存
            prev_tail = current_audio[keep_len:]

    # 2つ目のセグメント以降をループ
    for i in range(1, len(all_segments)):
        seg = all_segments[i]
        curr_audio = seg["audio"]
        curr_z_overlap = seg["z_overlap"]
        
        # クロスフェードの長さを決定（前の尻尾の長さに合わせる）
        overlap_len = len(prev_tail)
        
        # 現在のオーディオの先頭 overlap_len 分と、前のオーディオの尻尾をブレンド
        # 現在のオーディオが短すぎる場合のガード
        if len(curr_audio) < overlap_len:
            overlap_len = len(curr_audio)
            prev_tail = prev_tail[:overlap_len]

        # クロスフェード用の重み作成 (0 -> 1)
        fade_in = np.linspace(0, 1, overlap_len)
        fade_out = 1.0 - fade_in
        
        # 結合: (前の尻尾 * fade_out) + (今の頭 * fade_in)
        overlapped_part = (prev_tail * fade_out) + (curr_audio[:overlap_len] * fade_in)
        final_audio.append(overlapped_part)
        
        # 次のオーバーラップのために準備
        if i < len(all_segments) - 1:
            # 次のチャンクがある場合
            next_overlap_samples = int(curr_z_overlap * upsample_ratio)
            keep_len = len(curr_audio) - next_overlap_samples - overlap_len
            
            # クロスフェード済み部分の次 〜 次のオーバーラップ直前までを追加
            middle_part = curr_audio[overlap_len : overlap_len + keep_len]
            final_audio.append(middle_part)
            
            # 次のために尻尾を保存
            prev_tail = curr_audio[overlap_len + keep_len:]
        else:
            # 最後のチャンクの場合、残りをすべて追加
            final_audio.append(curr_audio[overlap_len:])

    # リストを結合
    audio_combined = np.concatenate(final_audio)

    # 6. 全体の処理時間を計算
    total_elapsed_time = time.time() - total_start_time
    print(f"Total Decoder Time (Sum of parts): {sum(decoding_times):.4f} sec")
    print(f"Total Elapsed Time (z_gen + all_decs + stitching): {total_elapsed_time:.4f} sec")
    print("--------------------------------------------------")

    # 7. 音声を表示
    # 各パーツの音声を表示してもクロスフェード前だとわかりにくいので、結合後を表示
    print("\n--- 🎧 Audio (Combined with Overlap-Add) ---")
    ipd.display(ipd.Audio(audio_combined, rate=hps.data.sampling_rate, normalize=False))

Splitting latent representation into chunks of 5 frames with 5 frames overlap...
Generating full latent representation (z)...
Time for infer_z_only (Prior Encoder): 0.0222 sec
--------------------
Total z-length 272 frames split into 55 segments.
Decoding segments individually...
  Segment 1/55 (z-index 0 to 10)...
  Time for Decoder (Part 1): 0.0092 sec
  Segment 2/55 (z-index 5 to 15)...
  Time for Decoder (Part 2): 0.0100 sec
  Segment 3/55 (z-index 10 to 20)...
  Time for Decoder (Part 3): 0.0112 sec
  Segment 4/55 (z-index 15 to 25)...
  Time for Decoder (Part 4): 0.0112 sec
  Segment 5/55 (z-index 20 to 30)...
  Time for Decoder (Part 5): 0.0115 sec
  Segment 6/55 (z-index 25 to 35)...
  Time for Decoder (Part 6): 0.0116 sec
  Segment 7/55 (z-index 30 to 40)...
  Time for Decoder (Part 7): 0.0123 sec
  Segment 8/55 (z-index 35 to 45)...
  Time for Decoder (Part 8): 0.0119 sec
  Segment 9/55 (z-index 40 to 50)...
  Time for Decoder (Part 9): 0.0117 sec
  Segment 10/55 (z-index 45 

In [11]:
import matplotlib.pyplot as plt
import librosa
import librosa.display

# ▼▼▼ 表示設定 ▼▼▼
n_fft = 1024       # FFTのポイント数
hop_length = 256   # ホップ長
max_display = 5    # 表示するセグメント数の上限（多すぎると重くなるため）
# ▲▲▲ 表示設定 ▲▲▲

print(f"Displaying Phase Spectrogram for first {min(len(all_segments), max_display)} segments...")

plt.figure(figsize=(15, 4 * min(len(all_segments), max_display)))

for i, seg in enumerate(all_segments):
    if i >= max_display:
        break
        
    audio_part = seg["audio"]
    
    # 1. 短時間フーリエ変換 (STFT) を計算
    # 複素スペクトルが得られます
    D = librosa.stft(audio_part, n_fft=n_fft, hop_length=hop_length)
    
    # 2. 位相成分 (Phase) を抽出
    # np.angle は複素数の偏角を -pi から pi の範囲で返します
    phase_spectrum = np.angle(D)
    
    # 3. プロット
    plt.subplot(max_display, 1, i + 1)
    
    # x軸を秒単位にするため、サンプリングレートを使用
    # y_axis='linear' または 'log' で周波数軸を表示
    librosa.display.specshow(
        phase_spectrum, 
        sr=hps.data.sampling_rate, 
        hop_length=hop_length, 
        x_axis='time', 
        y_axis='linear', 
        cmap='twilight'  # 位相は周期的(-piとpiがつながる)なのでtwilightやhsvが見やすい
    )
    
    plt.colorbar(format='%+2.0f rad', label='Phase (radians)')
    plt.title(f"Segment {i+1} Phase Spectrogram (z-index: {seg['start']}-{seg['end']})")
    plt.xlabel("Time (sec)")
    plt.ylabel("Frequency (Hz)")
    pdf_filename_2 = 'Segment {i+1} Phase Spectrogram.pdf'
    plt.savefig(pdf_filename_2, format='pdf', bbox_inches='tight')

plt.tight_layout()
plt.show()

Displaying Phase Spectrogram for first 5 segments...


In [13]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

def calculate_phase_consistency(segments, sampling_rate, n_fft=1024, hop_length=256):
    """
    各セグメント間のオーバーラップ領域における位相整合性を計算します。
    """
    consistencies = []
    consistency_maps = []
    
    # アップサンプル率の推定
    if len(segments) > 0 and segments[0]["z_part"].shape[2] > 0:
        ref_audio_len = len(segments[0]["audio"])
        ref_z_len = segments[0]["z_part"].shape[2]
        upsample_ratio = ref_audio_len / ref_z_len
    else:
        upsample_ratio = 256

    print(f"Calculating phase consistency for {len(segments)-1} junctions...")

    for i in range(len(segments) - 1):
        seg_prev = segments[i]
        seg_next = segments[i+1]
        
        # 共有しているzの区間長からオーバーラップサンプル数を計算
        shared_z = (seg_prev['end'] - seg_next['start'])
        overlap_samples = int(shared_z * upsample_ratio)
        
        # ▼▼▼ 修正箇所: Warning時の処理 ▼▼▼
        if overlap_samples < n_fft:
            print(f"Warning: Junction {i+1} overlap ({overlap_samples} samples) is too short for n_fft={n_fft}. Skipping.")
            consistencies.append(0)
            # 修正前: np.zeros((n_fft//2 + 1, 1)) -> 2次元になってしまっていた
            # 修正後: np.zeros(n_fft//2 + 1)      -> 1次元にする
            consistency_maps.append(np.zeros(n_fft//2 + 1)) 
            continue
        # ▲▲▲ 修正箇所終わり ▲▲▲

        # 音声の切り出し
        audio_tail = seg_prev['audio'][-overlap_samples:]
        audio_head = seg_next['audio'][:overlap_samples]
        
        min_len = min(len(audio_tail), len(audio_head))
        audio_tail = audio_tail[:min_len]
        audio_head = audio_head[:min_len]

        # STFT計算
        S1 = librosa.stft(audio_tail, n_fft=n_fft, hop_length=hop_length)
        S2 = librosa.stft(audio_head, n_fft=n_fft, hop_length=hop_length)
        
        min_time = min(S1.shape[1], S2.shape[1])
        if min_time == 0:
            # 万が一STFT結果の幅が0だった場合のガード
            consistencies.append(0)
            consistency_maps.append(np.zeros(n_fft//2 + 1))
            continue
            
        S1 = S1[:, :min_time]
        S2 = S2[:, :min_time]

        # 位相差の計算
        phase_diff = np.angle(S1) - np.angle(S2)
        
        # 時間平均をとって1次元配列 (freq,) にする
        pc_freq = np.mean(np.cos(phase_diff), axis=1)
        
        consistency_maps.append(pc_freq)
        
        # 全体の平均スコア
        avg_score = np.mean(pc_freq)
        consistencies.append(avg_score)

    return consistencies, consistency_maps

# --- 実行と表示 ---

global_scores, freq_maps = calculate_phase_consistency(all_segments, hps.data.sampling_rate)

# 1. 接合部ごとの平均スコアをプロット
plt.figure(figsize=(12, 4))
plt.plot(global_scores, marker='o', linestyle='-', color='b')
plt.title("Average Phase Consistency per Junction")
plt.xlabel("Junction Index")
plt.ylabel("Consistency Score")
plt.grid(True)
plt.ylim(-0.2, 1.1)
plt.axhline(0, color='black', linewidth=0.5)
plt.show()

# 2. 周波数ごとの詳細ヒートマップを表示
if len(freq_maps) > 0:
    # 修正によりすべて1次元配列になっているので stack が成功します
    map_stack = np.stack(freq_maps, axis=1) # shape: (freq, junctions)
    
    plt.figure(figsize=(15, 6))
    img = plt.imshow(map_stack, origin='lower', aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1,
                     extent=[0, len(freq_maps), 0, hps.data.sampling_rate/2])
    
    plt.colorbar(img, label='Phase Consistency')
    plt.title("Phase Consistency Heatmap (Frequency vs Junction)")
    plt.xlabel("Junction Index")
    plt.ylabel("Frequency (Hz)")
    plt.tight_layout()
    plt.show()

print("--- Consistency Report ---")
for i, score in enumerate(global_scores):
    # スコアが極端に低いものだけ表示するなど間引いても良いです
    if i < 5 or i > len(global_scores) - 5:
        print(f"Junction {i+1}: Score = {score:.4f}")

Calculating phase consistency for 54 junctions...
--- Consistency Report ---
Junction 1: Score = 0.5427
Junction 2: Score = 0.3839
Junction 3: Score = 0.3074
Junction 4: Score = 0.1724
Junction 5: Score = -0.1073
Junction 51: Score = -0.0111
Junction 52: Score = 0.1387
Junction 53: Score = 0.1353
Junction 54: Score = 0.0000


In [14]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from matplotlib.backends.backend_pdf import PdfPages # PDF保存用

def calculate_phase_consistency(segments, sampling_rate, n_fft=1024, hop_length=256):
    """
    各セグメント間のオーバーラップ領域における位相整合性を計算します。
    """
    consistencies = []
    consistency_maps = []
    
    # アップサンプル率の推定
    if len(segments) > 0 and segments[0]["z_part"].shape[2] > 0:
        ref_audio_len = len(segments[0]["audio"])
        ref_z_len = segments[0]["z_part"].shape[2]
        upsample_ratio = ref_audio_len / ref_z_len
    else:
        upsample_ratio = 256

    print(f"Calculating phase consistency for {len(segments)-1} junctions...")

    for i in range(len(segments) - 1):
        seg_prev = segments[i]
        seg_next = segments[i+1]
        
        # 共有しているzの区間長からオーバーラップサンプル数を計算
        shared_z = (seg_prev['end'] - seg_next['start'])
        overlap_samples = int(shared_z * upsample_ratio)
        
        # Warning時の処理（修正版）
        if overlap_samples < n_fft:
            print(f"Warning: Junction {i+1} overlap ({overlap_samples} samples) is too short for n_fft={n_fft}. Skipping.")
            consistencies.append(0)
            consistency_maps.append(np.zeros(n_fft//2 + 1)) 
            continue

        # 音声の切り出し
        audio_tail = seg_prev['audio'][-overlap_samples:]
        audio_head = seg_next['audio'][:overlap_samples]
        
        min_len = min(len(audio_tail), len(audio_head))
        audio_tail = audio_tail[:min_len]
        audio_head = audio_head[:min_len]

        # STFT計算
        S1 = librosa.stft(audio_tail, n_fft=n_fft, hop_length=hop_length)
        S2 = librosa.stft(audio_head, n_fft=n_fft, hop_length=hop_length)
        
        min_time = min(S1.shape[1], S2.shape[1])
        if min_time == 0:
            consistencies.append(0)
            consistency_maps.append(np.zeros(n_fft//2 + 1))
            continue
            
        S1 = S1[:, :min_time]
        S2 = S2[:, :min_time]

        # 位相差の計算
        phase_diff = np.angle(S1) - np.angle(S2)
        
        # 時間平均をとって1次元配列 (freq,) にする
        pc_freq = np.mean(np.cos(phase_diff), axis=1)
        
        consistency_maps.append(pc_freq)
        
        # 全体の平均スコア
        avg_score = np.mean(pc_freq)
        consistencies.append(avg_score)

    return consistencies, consistency_maps

# --- 実行とPDF保存 ---

# 計算
global_scores, freq_maps = calculate_phase_consistency(all_segments, hps.data.sampling_rate)
output_filename = "phase_consistency_report.pdf"

print(f"Generating plots and saving to {output_filename}...")

with PdfPages(output_filename) as pdf:
    # 1. 接合部ごとの平均スコアをプロット
    plt.figure(figsize=(12, 6)) # サイズを少し調整
    plt.plot(global_scores, marker='o', linestyle='-', color='b')
    plt.title("Average Phase Consistency per Junction")
    plt.xlabel("Junction Index")
    plt.ylabel("Consistency Score (1.0=Perfect)")
    plt.grid(True)
    plt.ylim(-0.2, 1.1)
    plt.axhline(0, color='black', linewidth=0.5)
    
    # PDFに現在のFigureを保存
    pdf.savefig() 
    plt.show() # 画面にも表示
    plt.close() # メモリ解放

    # 2. 周波数ごとの詳細ヒートマップを表示
    if len(freq_maps) > 0:
        map_stack = np.stack(freq_maps, axis=1)
        
        plt.figure(figsize=(15, 8)) # サイズを少し調整
        img = plt.imshow(map_stack, origin='lower', aspect='auto', cmap='RdBu_r', vmin=-1, vmax=1,
                         extent=[0, len(freq_maps), 0, hps.data.sampling_rate/2])
        
        plt.colorbar(img, label='Phase Consistency')
        plt.title("Phase Consistency Heatmap (Frequency vs Junction)")
        plt.xlabel("Junction Index")
        plt.ylabel("Frequency (Hz)")
        plt.tight_layout()
        
        # PDFに現在のFigureを保存
        pdf.savefig()
        plt.show() # 画面にも表示
        plt.close() # メモリ解放

print(f"Successfully saved charts to '{output_filename}'")

Calculating phase consistency for 54 junctions...
Generating plots and saving to phase_consistency_report.pdf...
Successfully saved charts to 'phase_consistency_report.pdf'
