In [7]:
import os
import re
import time
import json
import torch
import commons
import utils
from models import SynthesizerTrn
from text_JP import cleaned_text_to_sequence, symbols
import pyopenjtalk
from text_JP.phonemize import Phonemizer

# This script is designed to be run in a Jupyter Notebook or an environment
# with IPython display capabilities.
from IPython.display import Audio, display

# 1 全体デコード 

In [15]:
# --- 1. Configuration (EDIT THESE PATHS) ---
# ===========================================
# The three names defined here are read by the synthesis cells further down,
# so this cell has to be executed before any of them.

# Path to the config file of the trained multi-speaker model
# (the synthesis cells pass it to utils.get_hparams_from_file).
config_path = "./logs/uudb_csj21/config.json" 

# Path to the generator checkpoint of the trained model
# Find the latest G_****.pth file in your model directory
# (the synthesis cells pass it to utils.load_checkpoint).
checkpoint_path = "./logs/uudb_csj21/G_3020000.pth" # <-- IMPORTANT: UPDATE THIS PATH

# Text to be synthesized (converted to symbol IDs by text_to_sequence_custom).
text_to_synthesize = "最近、インターステラーを見たのですけど、すごく面白かったです。"
# ===========================================


# --- 2. Text Pre-processing Functions ---
def japanese_cleaner_revised(text):
    """Convert raw Japanese text into a space-separated phoneme string.

    The text is split on special tokens and each piece is handled as follows:

    * ``[...]``: the bracketed content is phonemized and re-wrapped as
      ``[ <phonemes> ]``; empty brackets ``[]`` become ``[ ]``.
    * ``{cough}`` / ``<cough>``: emitted as the literal token ``<cough>``.
    * ``、`` / ``。``: emitted as the token ``sp``.
    * anything else: converted to katakana with
      ``pyopenjtalk.g2p(..., kana=True)`` (``ヲ`` is replaced by ``オ``) and
      then passed through ``Phonemizer``.

    Args:
        text: Input string. Empty or whitespace-only pieces are dropped.

    Returns:
        The converted pieces joined by single spaces, with every whitespace run
        collapsed to one space and leading/trailing whitespace removed.
    """
    phonemizer = None

    def phonemize(fragment):
        # Build the Phonemizer only when a fragment actually needs it, so inputs
        # made solely of special tokens do not pay for its construction.
        nonlocal phonemizer
        if phonemizer is None:
            phonemizer = Phonemizer()
        kana = pyopenjtalk.g2p(fragment, kana=True).replace('ヲ', 'オ')
        return phonemizer(kana)

    phoneme_parts = []
    # The capture group makes re.split keep the delimiters in its result.
    for part in re.split(r'({cough}|<cough>|\[.*?\]|[、。])', text):
        if not part or part.isspace():
            continue
        if part.startswith('[') and part.endswith(']'):
            # Previously guarded by `len(part) > 2`, which made the empty-content
            # case unreachable and sent "[]" through g2p instead.
            content = part[1:-1]
            phoneme_parts.append(f'[ {phonemize(content)} ]' if content else '[ ]')
        elif part in ('{cough}', '<cough>'):
            phoneme_parts.append('<cough>')
        elif part in ('、', '。'):
            phoneme_parts.append('sp')
        else:
            phoneme_parts.append(phonemize(part))

    return re.sub(r'\s+', ' ', ' '.join(phoneme_parts)).strip()

def text_to_sequence_custom(text, hps):
    """Turn raw text into a ``torch.LongTensor`` of symbol IDs.

    The text is phonemized with ``japanese_cleaner_revised`` and mapped to IDs
    by ``cleaned_text_to_sequence``. When ``hps.data.add_blank`` is truthy, the
    value 0 is interleaved with the IDs via ``commons.intersperse``.

    Args:
        text: Raw input text.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read.

    Returns:
        ``torch.LongTensor`` built from the (optionally interspersed) ID list.
    """
    symbol_ids = cleaned_text_to_sequence(japanese_cleaner_revised(text))
    return torch.LongTensor(
        commons.intersperse(symbol_ids, 0) if hps.data.add_blank else symbol_ids
    )


# --- 3. Main Synthesis Process ---
# Whole-utterance synthesis: load the model, run one inference call for a single
# speaker, report timing, and play the result. Nothing is raised on a missing
# file; the cell only prints an error and skips synthesis.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
    print("Please update the 'checkpoint_path' variable in this script to point to your trained model.")
else:
    # Load configuration
    hps = utils.get_hparams_from_file(config_path)

    # Determine device (fixed to CPU here; the automatic selection is kept
    # below as a commented-out alternative).
    #device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu"

    # Load model: sized from the symbol table and the config values.
    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    # Set model to evaluation mode (assigned to `_` so the notebook does not
    # echo the returned value).
    _ = net_g.eval()

    # Load checkpoint; None is passed as the third argument
    # (presumably the optimizer slot -- TODO confirm against utils.load_checkpoint).
    print(f"Loading checkpoint from {checkpoint_path}...")
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    # Process text: raw text -> LongTensor of symbol IDs.
    print(f"Original text: {text_to_synthesize}")

    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)

    # Speaker ID used for synthesis (a single, fixed speaker).
    i=375
    print(f"\n--- Synthesizing for Speaker {i} ---")
    # The timer covers only the inference block below, not model loading.
    start_time = time.time()

    with torch.no_grad():
        # Add a leading batch dimension of size 1 and build the matching
        # length and speaker-ID tensors.
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # Inference: take the first element of infer()'s return value and index
        # [0, 0] before converting it to a float NumPy array.
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, 
                            noise_scale=0.1, 
                            noise_scale_w=1.0, 
                            length_scale=1.0
                            )[0][0,0].data.cpu().float().numpy()

    end_time = time.time()
    elapsed_time = end_time - start_time
    audio_duration = len(audio) / hps.data.sampling_rate
    # Real-time factor: processing time divided by the duration of the audio.
    # NOTE(review): divides by zero if `audio` is empty.
    rtf = elapsed_time / audio_duration

    print(f"Audio duration: {audio_duration:.2f} seconds")
    print(f"Elapsed time: {elapsed_time:.2f} seconds")
    print(f"Real Time Factor (RTF): {rtf:.4f}")
    # normalize=False: the samples are handed to the player without rescaling.
    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))

    print("\nSynthesis complete.")



Loading model...
Mutli-stream iSTFT VITS
Loading checkpoint from ./logs/uudb_csj21/G_3020000.pth...
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです。

--- Synthesizing for Speaker 375 ---
Audio duration: 4.18 seconds
Elapsed time: 0.13 seconds
Real Time Factor (RTF): 0.0301



Synthesis complete.


# 2 細切れデコード　overlapなし 

In [24]:
# Chunked decoding without overlap: generate the latent z for the whole
# utterance once, decode it in fixed-size slices, and concatenate the slices.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
    print("Please update the 'checkpoint_path' variable in this script to point to your trained model.")
else:
    # Load configuration
    hps = utils.get_hparams_from_file(config_path)

    # Determine device
    # device = "cuda" if torch.cuda.is_available() else "cpu"
    device = "cpu" # change to cuda if needed

    # Load model
    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    _ = net_g.eval()

    print(f"Loading checkpoint from {checkpoint_path}...")
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    print(f"Original text: {text_to_synthesize}")
    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)

    # Speaker ID
    i = 375 # any speaker ID
    print(f"\n--- Synthesizing for Speaker {i} (Chunk Decoding) ---")

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # 1. Generate the latent representation z for the whole utterance
        #    (using infer_z_only)
        # ---------------------------------------------------------
        start_time_z = time.time()

        # infer_z_only's result is unpacked as
        # (attn, y_mask, (z, z_p, m_p, logs_p), timings).
        # Only z and y_mask are used below.
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, 
            x_tst_lengths, 
            sid=sid, 
            noise_scale=0.1, 
            noise_scale_w=1.0, 
            length_scale=1.0
        )

        # z may be padded, so mask it with y_mask.
        z = z * y_mask

        time_z = time.time() - start_time_z
        print(f"Latent Z generation time: {time_z:.4f} sec")
        print(f"Latent Z shape: {z.shape}") # [Batch, Channels, Time]

        # 2. Prepare the speaker embedding (g) for the decoder
        # ---------------------------------------------------------
        # infer_z_only does not return g, so it is built manually here.
        if net_g.n_speakers > 0:
            g = net_g.emb_g(sid).unsqueeze(-1)
        else:
            g = None

        # 3. Split into 10-frame pieces and decode each (Chunk Decoding)
        # ---------------------------------------------------------
        chunk_size = 10  # number of z frames per segment
        full_audio_chunks = []

        # NOTE(review): z_channels is not read again in this cell.
        z_channels, z_time_length = z.shape[1], z.shape[2]

        print(f"Start decoding in chunks of size {chunk_size}...")
        # NOTE(review): start_time_dec is not read again in this cell; the
        # elapsed time below is measured from start_time_z.
        start_time_dec = time.time()

        for step in range(0, z_time_length, chunk_size):
            # Slice z along time (min() makes the final, shorter piece work too).
            end_step = min(step + chunk_size, z_time_length)
            z_chunk = z[:, :, step:end_step]

            # Decode this slice on its own.
            # Per the original author's note, dec in models.py returns
            # (o, o_mb, spec, phase); only the first element is kept.
            o_chunk, _, _, _ = net_g.dec(z_chunk, g=g)

            full_audio_chunks.append(o_chunk)

        # 4. Concatenate the pieces into the final audio
        # ---------------------------------------------------------
        full_audio = torch.cat(full_audio_chunks, dim=2)

        end_time = time.time()

        # Convert the result to a float NumPy array.
        audio = full_audio[0, 0].data.cpu().float().numpy()

    # Performance measurement: covers z generation plus all chunk decoding.
    elapsed_time = end_time - start_time_z
    audio_duration = len(audio) / hps.data.sampling_rate
    # NOTE(review): divides by zero if `audio` is empty.
    rtf = elapsed_time / audio_duration

    print(f"Total Audio duration: {audio_duration:.2f} seconds")
    print(f"Total Elapsed time: {elapsed_time:.2f} seconds")
    print(f"Real Time Factor (RTF): {rtf:.4f}")

    # Audio playback
    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))

    print("\nSynthesis complete.")

Loading model...
Mutli-stream iSTFT VITS
Loading checkpoint from ./logs/uudb_csj21/G_3020000.pth...
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです

--- Synthesizing for Speaker 375 (Chunk Decoding) ---
Latent Z generation time: 0.0245 sec
Latent Z shape: torch.Size([1, 192, 252])
Start decoding in chunks of size 10...
Total Audio duration: 4.03 seconds
Total Elapsed time: 0.31 seconds
Real Time Factor (RTF): 0.0762



Synthesis complete.


# 3 細切れデコード　overlapあり

In [25]:
# ... (前略: imports, functions, config設定などはそのまま) ...

# --- 3. Main Synthesis Process ---
# Chunked decoding with overlap-add: decode overlapping z windows, weight each
# decoded piece with a Hann window, sum them into a buffer, and divide by the
# accumulated window weights.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
else:
    # Load configuration
    hps = utils.get_hparams_from_file(config_path)

    # Determine device
    device = "cpu" # use "cuda" if needed

    # Load model
    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    _ = net_g.eval()

    print(f"Loading checkpoint from {checkpoint_path}...")
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    print(f"Original text: {text_to_synthesize}")
    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)

    # Speaker ID
    i = 375
    print(f"\n--- Synthesizing for Speaker {i} (Overlap-Add Decoding) ---")

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # 1. Generate the latent representation z for the whole utterance
        # ---------------------------------------------------------
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, 
            x_tst_lengths, 
            sid=sid, 
            noise_scale=0.1, 
            noise_scale_w=1.0, 
            length_scale=1.0
        )
        z = z * y_mask

        # Speaker embedding for the decoder
        if net_g.n_speakers > 0:
            g = net_g.emb_g(sid).unsqueeze(-1)
        else:
            g = None

        # 2. Overlap-add settings
        # ---------------------------------------------------------
        z_frame_window = 10     # window size on z (in frames)
        z_frame_hop = 5         # hop size on z (in frames)

        # Audio samples per z frame (upsampling rate).
        # NOTE(review): assumes net_g.dec emits hop_length samples per z frame;
        # the buffer sizes and write positions below rely on it -- confirm.
        upsample_factor = hps.data.hop_length 

        # Window size expressed in audio samples
        audio_window_size = z_frame_window * upsample_factor

        # Build the Hann window
        # shape: [1, 1, audio_window_size]
        window = torch.hann_window(audio_window_size).to(device).view(1, 1, -1)

        # Prepare the output buffers
        z_total_frames = z.shape[2]
        total_audio_len = z_total_frames * upsample_factor

        # Accumulation buffer (windowed audio)
        y_buffer = torch.zeros(1, 1, total_audio_len).to(device)
        # Normalization buffer (sum of window weights)
        w_buffer = torch.zeros(1, 1, total_audio_len).to(device)

        print(f"Total Z frames: {z_total_frames}")
        print(f"Audio Window Size: {audio_window_size} samples")

        # 3. Main loop (overlap-add)
        # ---------------------------------------------------------
        start_time_dec = time.time()

        for idx in range(0, z_total_frames, z_frame_hop):
            # Range of z frames for this window
            z_start = idx
            z_end = min(idx + z_frame_window, z_total_frames)

            # Slice z
            z_chunk = z[:, :, z_start:z_end]

            # Tail handling: the last windows can be shorter than the window size
            current_z_len = z_chunk.shape[2]
            if current_z_len == 0:
                break

            # Decode this window on its own
            # expected output shape: [1, 1, current_z_len * upsample_factor]
            o_chunk, _, _, _ = net_g.dec(z_chunk, g=g)

            # Position of this piece in the audio buffer
            audio_start = z_start * upsample_factor
            # NOTE(review): audio_end is not read again in this cell.
            audio_end = audio_start + o_chunk.shape[2]

            # Take the part of the window that matches this piece's length
            # (a short piece uses only the leading samples of the Hann window).
            current_audio_len = o_chunk.shape[2]
            current_window = window[:, :, :current_audio_len]

            # Add (audio * window) into the buffer.
            # Clamp the length so the write never runs past the buffer end.
            valid_len = min(current_audio_len, y_buffer.shape[2] - audio_start)

            y_buffer[:, :, audio_start : audio_start + valid_len] += (o_chunk[:, :, :valid_len] * current_window[:, :, :valid_len])
            w_buffer[:, :, audio_start : audio_start + valid_len] += current_window[:, :, :valid_len]

        # 4. Normalization (weighted average)
        # ---------------------------------------------------------
        # A small constant is added so positions with zero accumulated weight
        # do not cause a division by zero.
        final_audio_tensor = y_buffer / (w_buffer + 1e-8)

        end_time = time.time()

        # Convert to NumPy
        audio = final_audio_tensor[0, 0].data.cpu().float().numpy()

    # Report results
    elapsed_time = end_time - start_time_dec # decoding part only
    audio_duration = len(audio) / hps.data.sampling_rate

    print(f"Total Audio duration: {audio_duration:.2f} seconds")
    print(f"Decoding Elapsed time: {elapsed_time:.2f} seconds")

    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    print("\nSynthesis complete.")

Loading model...
Mutli-stream iSTFT VITS
Loading checkpoint from ./logs/uudb_csj21/G_3020000.pth...
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです

--- Synthesizing for Speaker 375 (Overlap-Add Decoding) ---
Total Z frames: 252
Audio Window Size: 2560 samples
Total Audio duration: 4.03 seconds
Decoding Elapsed time: 0.56 seconds



Synthesis complete.


# 4 細切れデコード　overlapあり　相互相関で調整

In [28]:
# ... (前略: imports, functions, config設定などはそのまま) ...

import torch.nn.functional as F

# --- 相互相関によるオフセット計算関数 ---
def find_best_shift(ref_segment, target_segment, search_range=100):
    """Return the lag at which ``target_segment`` best matches ``ref_segment``.

    ``target_segment`` is zero-padded by ``search_range`` samples on both sides
    and cross-correlated with ``ref_segment`` (``F.conv1d`` with the reference
    as the kernel, which does not flip the kernel).

    Args:
        ref_segment: Tail of the previous waveform, shape (1, 1, T).
        target_segment: Head of the next waveform, shape (1, 1, T).
        search_range: Largest lag, in samples, examined in either direction.

    Returns:
        Integer lag ``s`` in ``[-search_range, search_range]`` that maximizes
        ``sum_n ref_segment[n] * target_segment[n + s]`` (samples outside the
        target count as zero). A positive value therefore means the matching
        content sits ``s`` samples later in the target than in the reference.
    """
    # Widen the target so the reference can slide over every candidate lag.
    padded_target = F.pad(target_segment, (search_range, search_range))

    # One correlation score per lag; index `search_range` is lag zero.
    lag_scores = F.conv1d(padded_target, ref_segment)

    best_index = int(lag_scores.argmax())
    return best_index - search_range

# --- 3. Main Synthesis Process ---
# Chunked decoding with correlation-aligned cross-fades: decode overlapping z
# windows, estimate the lag between the previous tail and the new head with
# find_best_shift, align the new chunk by that lag, and linearly cross-fade.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
else:
    # Load configuration
    hps = utils.get_hparams_from_file(config_path)
    device = "cpu" # use "cuda" if needed

    # Load model
    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    print(f"Original text: {text_to_synthesize}")
    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)

    # Speaker ID
    i = 375
    print(f"\n--- Synthesizing for Speaker {i} (Correlation-based OLA) ---")

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # 1. Generate the latent representation z for the whole utterance
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, x_tst_lengths, sid=sid, noise_scale=0.1, noise_scale_w=1.0, length_scale=1.0
        )
        z = z * y_mask

        # Speaker embedding for the decoder (infer_z_only does not return it)
        if net_g.n_speakers > 0:
            g = net_g.emb_g(sid).unsqueeze(-1)
        else:
            g = None

        # 2. Parameters
        # ---------------------------------------------------------
        z_chunk_size = 10     # number of z frames decoded at once
        z_hop_size = 5        # number of z frames advanced per step

        # NOTE(review): assumes net_g.dec emits hop_length samples per z frame.
        upsample_factor = hps.data.hop_length 

        # Nominal overlap between consecutive chunks, in audio samples
        overlap_len = (z_chunk_size - z_hop_size) * upsample_factor

        # Search range of the cross-correlation, in samples.
        # Author's note: too large a range risks false matches; about half a
        # pitch period (~100) is appropriate.
        search_range = 100 

        # Total number of z frames
        z_total_frames = z.shape[2]

        # Result buffer, grown chunk by chunk
        full_audio_tensor = torch.zeros(1, 1, 0).to(device)

        print(f"Overlap Length: {overlap_len} samples")
        print(f"Search Range: +/- {search_range} samples")

        # 3. Main loop
        # ---------------------------------------------------------
        start_time_dec = time.time()

        # The first chunk is taken as-is; later chunks are aligned and blended.
        is_first_chunk = True

        for idx in range(0, z_total_frames, z_hop_size):
            # Slice z for this chunk
            z_end = min(idx + z_chunk_size, z_total_frames)
            z_chunk = z[:, :, idx:z_end]

            # Decode
            o_chunk, _, _, _ = net_g.dec(z_chunk, g=g)

            # --- Joining ---
            if is_first_chunk:
                full_audio_tensor = o_chunk
                is_first_chunk = False
            else:
                # Tail of the audio so far and head of the new chunk: the two
                # regions that nominally overlap.
                prev_tail = full_audio_tensor[:, :, -overlap_len:]
                curr_head = o_chunk[:, :, :overlap_len]

                # Estimate the lag only when both regions have the full overlap
                # length; otherwise fall back to no shift.
                if prev_tail.shape[2] == overlap_len and curr_head.shape[2] == overlap_len:
                    shift = find_best_shift(prev_tail, curr_head, search_range)
                else:
                    shift = 0

                # find_best_shift returns s such that o_chunk[n + s] best matches
                # prev_tail[n].
                #   s > 0: the matching content starts s samples into the new
                #          chunk, so drop its first s samples.
                #   s < 0: the new chunk's first sample matches |s| samples into
                #          the previous tail, so keep the chunk whole and blend
                #          over a region that is |s| samples shorter.
                # (The previous code used `0 - shift`, which clipped every
                # positive lag to 0 and trimmed the head for negative lags.)
                if shift >= 0:
                    aligned_chunk = o_chunk[:, :, shift:]
                    max_fade_len = overlap_len
                else:
                    aligned_chunk = o_chunk
                    max_fade_len = overlap_len + shift

                # Cross-fade length, limited by what both sides actually hold.
                cross_fade_len = min(max_fade_len, aligned_chunk.shape[2], full_audio_tensor.shape[2])

                if cross_fade_len <= 0:
                    # Nothing to blend (a negative slice bound of 0 would select
                    # the whole tensor), so simply append.
                    full_audio_tensor = torch.cat([full_audio_tensor, aligned_chunk], dim=2)
                else:
                    # Linear cross-fade ramps
                    fade_out = torch.linspace(1.0, 0.0, cross_fade_len).to(device).view(1, 1, -1)
                    fade_in = torch.linspace(0.0, 1.0, cross_fade_len).to(device).view(1, 1, -1)

                    # Blend the end of the existing audio with the start of the
                    # aligned chunk.
                    prev_overlap_part = full_audio_tensor[:, :, -cross_fade_len:] * fade_out
                    curr_overlap_part = aligned_chunk[:, :, :cross_fade_len] * fade_in
                    merged_part = prev_overlap_part + curr_overlap_part

                    # Replace the old tail with the blended region and append
                    # the remainder of the new chunk.
                    full_audio_tensor = torch.cat([
                        full_audio_tensor[:, :, :-cross_fade_len], # existing, non-overlapping part
                        merged_part,                               # cross-faded part
                        aligned_chunk[:, :, cross_fade_len:]       # new part
                    ], dim=2)

            # Stop once the chunk has reached the end of z
            if z_end == z_total_frames:
                break

        end_time = time.time()

        audio = full_audio_tensor[0, 0].data.cpu().float().numpy()

    # Report results (decoding loop only)
    elapsed_time = end_time - start_time_dec
    audio_duration = len(audio) / hps.data.sampling_rate

    print(f"Total Audio duration: {audio_duration:.2f} seconds")
    print(f"Decoding Elapsed time: {elapsed_time:.2f} seconds")

    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    print("\nSynthesis complete.")

Loading model...
Mutli-stream iSTFT VITS
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです

--- Synthesizing for Speaker 375 (Correlation-based OLA) ---
Overlap Length: 1280 samples
Search Range: +/- 100 samples
Total Audio duration: 4.01 seconds
Decoding Elapsed time: 0.79 seconds



Synthesis complete.


# 5 細切れデコード　スペクトログラムをoverlapで接合　相互相関で調整はしない

In [38]:
import os
import time
import numpy as np
import torch
import torch.nn.functional as F
import utils
from models import SynthesizerTrn
from text_JP import cleaned_text_to_sequence, symbols
from IPython.display import Audio, display

# ==========================================
# 1. 必要なクラス定義 (TorchSTFT)
# ==========================================
class TorchSTFT(torch.nn.Module):
    """Inverse-STFT helper that rebuilds a waveform from magnitude and phase.

    Args:
        filter_length: FFT size handed to ``torch.istft``.
        hop_length: Hop size handed to ``torch.istft``.
        win_length: Window length handed to ``torch.istft``.
        window: Accepted for interface compatibility; every value results in a
            periodic Hann window of ``win_length`` samples.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
        super().__init__()
        self.filter_length, self.hop_length, self.win_length = (
            filter_length, hop_length, win_length
        )
        # Plain tensor attribute (not a registered buffer); inverse() moves it
        # to the device of its input on each call.
        self.window = torch.hann_window(win_length, periodic=True)

    def inverse(self, magnitude, phase):
        """Run ``torch.istft`` on ``magnitude * exp(1j * phase)``.

        Returns:
            The ``torch.istft`` result with a new dimension inserted at index 1.
        """
        spectrum = magnitude * torch.exp(phase * 1j)
        waveform = torch.istft(
            spectrum,
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window.to(spectrum.device),
        )
        return waveform.unsqueeze(1)

# ==========================================
# 2. メイン合成プロセス (Fixed Ratio OLA)
# ==========================================

# 設定（パスは適宜合わせてください）
# config_path = "./logs/uudb_csj21/config.json"
# checkpoint_path = "./logs/uudb_csj21/G_3020000.pth"

# Chunked decoding joined in the spectrogram domain: decode overlapping z
# windows to (magnitude, phase), cross-fade the complex spectrograms over the
# frames they share, then run a single inverse STFT over the joined result.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
else:
    hps = utils.get_hparams_from_file(config_path)
    device = "cpu"

    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    print(f"Original text: {text_to_synthesize}")
    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)
    i = 0 

    print(f"\n--- Synthesizing for Speaker {i} (Fixed Ratio OLA / No-Correlation) ---")

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # 1. Generate the latent representation z for the whole utterance
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, x_tst_lengths, sid=sid, noise_scale=0.1, noise_scale_w=1.0, length_scale=1.0
        )
        z = z * y_mask
        g = net_g.emb_g(sid).unsqueeze(-1) if net_g.n_speakers > 0 else None

        # 2. Parameters
        z_chunk_size = 10     # chunk size in z frames
        z_hop_size = 5        # hop size in z frames

        z_total_frames = z.shape[2]
        full_complex_spec = None

        start_time_dec = time.time()

        # Spectrogram frames per z frame, measured on the first chunk
        time_ratio = None

        # End (exclusive) of the z range already covered by the joined
        # spectrogram; used to know how many z frames each chunk really adds.
        prev_z_end = 0

        # 3. Spectrogram generation and joining loop
        for idx in range(0, z_total_frames, z_hop_size):
            z_end = min(idx + z_chunk_size, z_total_frames)
            z_chunk = z[:, :, idx:z_end]

            # Decode; only the magnitude and phase outputs are used here
            _, _, spec_chunk, phase_chunk = net_g.dec(z_chunk, g=g)

            # Complex spectrogram of this chunk
            complex_chunk = spec_chunk * torch.exp(1j * phase_chunk)

            # Lengths of the current chunk
            current_spec_len = complex_chunk.shape[-1]
            current_z_len = z_chunk.shape[-1]

            # First iteration: measure the ratio
            if time_ratio is None:
                if current_z_len > 0:
                    time_ratio = current_spec_len / current_z_len
                else:
                    time_ratio = 1.0
                print(f"Time Ratio (Spec / Z) = {time_ratio:.4f}")

            if full_complex_spec is None:
                # The first chunk is taken as-is
                full_complex_spec = complex_chunk
                print(f"Step {idx}: Init Size {current_spec_len}")
            else:
                # --- Overlap from fixed arithmetic ---

                # z frames this chunk adds beyond what is already covered.
                # Normally z_hop_size, but the final chunk is truncated at the
                # end of z and adds fewer. Using z_hop_size unconditionally
                # under-estimated the overlap there and appended frames that
                # were already present.
                z_advance = z_end - prev_z_end

                # Spectrogram frames that are new in this chunk
                spec_hop_len = int(z_advance * time_ratio)

                # Frames shared with the existing spectrogram, limited by what
                # the existing spectrogram actually holds
                spec_overlap_len = current_spec_len - spec_hop_len
                actual_overlap = min(spec_overlap_len, full_complex_spec.shape[-1])

                if actual_overlap <= 0:
                    # No shared frames (the advance covers the whole chunk)
                    full_complex_spec = torch.cat([full_complex_spec, complex_chunk], dim=-1)
                    print(f"Step {idx}: No overlap append (Hop {spec_hop_len})")
                else:
                    # --- Cross-fade (simple linear interpolation) ---

                    # Tail of the existing spectrogram (fades out)
                    prev_tail = full_complex_spec[..., -actual_overlap:]
                    # Head of the new chunk (fades in)
                    curr_head = complex_chunk[..., :actual_overlap]

                    # Blend weight going 0 -> 1 along the last axis,
                    # broadcast over all leading axes
                    alpha_shape = [1] * (complex_chunk.dim() - 1) + [actual_overlap]
                    alpha = torch.linspace(0.0, 1.0, actual_overlap).to(device).view(*alpha_shape)

                    merged_part = prev_tail * (1 - alpha) + curr_head * alpha

                    # Join: [settled part] + [blended part] + [new part]
                    full_complex_spec = torch.cat([
                        full_complex_spec[..., :-actual_overlap], # up to the overlap
                        merged_part,                              # blended overlap
                        complex_chunk[..., actual_overlap:]       # remainder (new)
                    ], dim=-1)

                    # Number of newly added frames (for logging)
                    new_frames = complex_chunk.shape[-1] - actual_overlap
                    print(f"Step {idx:03}: Hop={spec_hop_len} | Overlap={actual_overlap} | New={new_frames}")

            prev_z_end = z_end

            # Stop once the chunk has reached the end of z
            if z_end == z_total_frames:
                break

        # 4. Single inverse STFT and synthesis
        final_spec = torch.abs(full_complex_spec)
        final_phase = torch.angle(full_complex_spec)

        stft = TorchSTFT(
            filter_length=net_g.dec.gen_istft_n_fft, 
            hop_length=net_g.dec.gen_istft_hop_size, 
            win_length=net_g.dec.gen_istft_n_fft
        ).to(device)

        if hasattr(net_g.dec, 'subbands') and net_g.dec.subbands > 1:
            # Multi-band case: fold the sub-band axis into the batch axis for
            # the inverse STFT, then restore it.
            b, s, f, t = final_spec.shape
            spec_reshaped = final_spec.view(b * s, f, t)
            phase_reshaped = final_phase.view(b * s, f, t)

            y_mb_hat = stft.inverse(spec_reshaped, phase_reshaped)
            y_mb_hat = y_mb_hat.squeeze(1).view(b, s, -1)

            if net_g.ms_istft_vits:
                # Upsample each sub-band with the decoder's filter, then merge
                # the streams with the decoder's post convolution.
                y_mb_hat = F.conv_transpose1d(y_mb_hat, net_g.dec.updown_filter.to(device) * net_g.dec.subbands, stride=net_g.dec.subbands)
                audio_tensor = net_g.dec.multistream_conv_post(y_mb_hat)
            else:
                try:
                    from pqmf import PQMF
                    pqmf = PQMF(device)
                    audio_tensor = pqmf.synthesis(y_mb_hat.unsqueeze(2)) 
                except ImportError:
                    # Fallback when pqmf is unavailable: sum the sub-bands
                    audio_tensor = torch.sum(y_mb_hat, dim=1, keepdim=True)
        else:
            audio_tensor = stft.inverse(final_spec, final_phase)

        end_time = time.time()
        audio = audio_tensor[0, 0].data.cpu().float().numpy()

    # Report results (decoding, joining and inverse STFT)
    elapsed_time = end_time - start_time_dec
    audio_duration = len(audio) / hps.data.sampling_rate

    print(f"Total Audio duration: {audio_duration:.2f} seconds")
    print(f"Decoding Elapsed time: {elapsed_time:.2f} seconds")

    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    print("\nSynthesis complete.")

Loading model...
Mutli-stream iSTFT VITS
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです

--- Synthesizing for Speaker 0 (Fixed Ratio OLA / No-Correlation) ---
Time Ratio (Spec / Z) = 16.1000
Step 0: Init Size 161
Step 005: Hop=80 | Overlap=81 | New=80
Step 010: Hop=80 | Overlap=81 | New=80
Step 015: Hop=80 | Overlap=81 | New=80
Step 020: Hop=80 | Overlap=81 | New=80
Step 025: Hop=80 | Overlap=81 | New=80
Step 030: Hop=80 | Overlap=81 | New=80
Step 035: Hop=80 | Overlap=81 | New=80
Step 040: Hop=80 | Overlap=81 | New=80
Step 045: Hop=80 | Overlap=81 | New=80
Step 050: Hop=80 | Overlap=81 | New=80
Step 055: Hop=80 | Overlap=81 | New=80
Step 060: Hop=80 | Overlap=81 | New=80
Step 065: Hop=80 | Overlap=81 | New=80
Step 070: Hop=80 | Overlap=81 | New=80
Step 075: Hop=80 | Overlap=81 | New=80
Step 080: Hop=80 | Overlap=81 | New=80
Step 085: Hop=80 | Overlap=81 | New=80
Step 090: Hop=80 | Overlap=81 | New=80
Step 095: Hop=80 | Overlap=81 | New=80
Step 100: Hop=80 | Overlap=81 | New=80
Step 105


Synthesis complete.


# 6 細切れデコード　スペクトログラムを接合　overlapあり　相互相関で調整あり

In [36]:
import os
import time
import numpy as np
import torch
import torch.nn.functional as F
import utils
from models import SynthesizerTrn
from text_JP import cleaned_text_to_sequence, symbols
from IPython.display import Audio, display

# ==========================================
# 1. 必要なクラス・関数定義
# ==========================================

class TorchSTFT(torch.nn.Module):
    """Small inverse-STFT helper built on ``torch.istft``.

    ``window`` is accepted for interface compatibility, but every value
    results in the same periodic Hann window of length ``win_length``.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
        super().__init__()
        self.filter_length, self.hop_length, self.win_length = (
            filter_length, hop_length, win_length
        )
        # Stored as a plain attribute (not a registered buffer); inverse()
        # moves it to the device of the incoming spectrogram on each call.
        self.window = torch.hann_window(win_length, periodic=True)

    def inverse(self, magnitude, phase):
        """Rebuild a waveform from magnitude and phase spectrograms.

        Args:
            magnitude: STFT magnitudes.
            phase: STFT phases in radians, broadcastable with ``magnitude``.

        Returns:
            The ``torch.istft`` result with a singleton dimension inserted
            at index 1.
        """
        spectrum = magnitude * torch.exp(1j * phase)
        waveform = torch.istft(
            spectrum,
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window.to(spectrum.device),
        )
        return torch.unsqueeze(waveform, 1)

def find_best_frame_shift(ref_spec, target_spec, search_range=5):
    """Estimate the frame lag between two magnitude spectrograms.

    Both inputs are converted to mean-removed log magnitudes and
    cross-correlated along the last (time) axis for lags in
    ``[-search_range, +search_range]``.

    Args:
        ref_spec: Reference magnitudes; a 4-D input is flattened to 3-D by
            merging dimensions 1 and 2.
        target_spec: Magnitudes with the same shape as ``ref_spec``.
        search_range: Maximum absolute lag (in frames) to evaluate.

    Returns:
        int: flat argmax index of the correlation minus ``search_range``.
        With a batch size of 1 this is the lag ``s`` for which
        ``target[..., j + s]`` best matches ``ref[..., j]``.
    """
    # Merge dimensions 1 and 2 of 4-D inputs so conv1d sees (N, C, T).
    if ref_spec.dim() == 4:
        batch, bands, bins, frames = ref_spec.shape
        flat_shape = (batch, bands * bins, frames)
        ref_spec = ref_spec.reshape(flat_shape).contiguous()
        target_spec = target_spec.reshape(flat_shape).contiguous()

    # Correlate in the log-magnitude domain after removing the per-row mean.
    eps = 1e-6
    ref_log = (ref_spec + eps).log()
    target_log = (target_spec + eps).log()
    ref_log = ref_log - ref_log.mean(dim=-1, keepdim=True)
    target_log = target_log - target_log.mean(dim=-1, keepdim=True)

    # Zero-pad the target so sliding the reference over it covers every lag.
    padded_target = F.pad(target_log, (search_range, search_range))
    scores = F.conv1d(padded_target, ref_log)

    best = scores.argmax()
    return best.item() - search_range

# ==========================================
# 2. メイン合成プロセス (修正版)
# ==========================================

# Chunked decoding demo: decode the latent z in overlapping chunks, stitch the
# complex spectrograms with a cross-fade (optionally aligned by
# cross-correlation), then run a single iSTFT over the stitched spectrogram.
# Relies on config_path, checkpoint_path, text_to_synthesize and
# text_to_sequence_custom defined in earlier cells.
if not os.path.exists(config_path):
    print(f"ERROR: Config file not found at {config_path}")
elif not os.path.exists(checkpoint_path):
    print(f"ERROR: Checkpoint file not found at {checkpoint_path}")
else:
    hps = utils.get_hparams_from_file(config_path)
    device = "cpu"

    print("Loading model...")
    net_g = SynthesizerTrn(
        len(symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)

    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    print(f"Original text: {text_to_synthesize}")
    stn_tst = text_to_sequence_custom(text_to_synthesize, hps)
    # Speaker id used for synthesis (fed to the model as `sid`).
    i = 300

    print(f"\n--- Synthesizing for Speaker {i} (Corrected Ratio OLA) ---")

    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([i]).to(device)

        # 1. Generate the latent representation z.
        attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
            x_tst, x_tst_lengths, sid=sid, noise_scale=0.1, noise_scale_w=1.0, length_scale=1.0
        )
        z = z * y_mask
        g = net_g.emb_g(sid).unsqueeze(-1) if net_g.n_speakers > 0 else None

        # 2. Chunking parameters.
        z_chunk_size = 15     # chunk size in z frames
        z_hop_size = 5        # hop size in z frames

        search_range = 2      # alignment search range (spectrogram frames)

        z_total_frames = z.shape[2]
        full_complex_spec = None

        start_time_dec = time.time()

        # Spectrogram-frames-per-z-frame ratio, measured on the first chunk.
        time_ratio = None

        # End (exclusive) of the z range already represented in full_complex_spec.
        covered_z_end = 0

        # 3. Decode chunk by chunk and stitch the spectrograms.
        for idx in range(0, z_total_frames, z_hop_size):
            z_end = min(idx + z_chunk_size, z_total_frames)
            z_chunk = z[:, :, idx:z_end]

            # Decode this chunk.
            _, _, spec_chunk, phase_chunk = net_g.dec(z_chunk, g=g)

            # Complex spectrogram of the chunk.
            complex_chunk = spec_chunk * torch.exp(1j * phase_chunk)

            # Actual lengths of this chunk in both domains.
            current_spec_len = complex_chunk.shape[-1]
            current_z_len = z_chunk.shape[-1]

            # Measure the ratio once, on the first chunk.
            if time_ratio is None:
                if current_z_len > 0:
                    time_ratio = current_spec_len / current_z_len
                else:
                    time_ratio = 1.0 # fallback
                print(f"Detected Time Ratio (Spec/Z): {time_ratio:.2f}")

            if full_complex_spec is None:
                full_complex_spec = complex_chunk
                print(f"Step {idx}: Init Size {current_spec_len}")
            else:
                # Number of z frames this chunk contributes beyond what is
                # already stitched. This equals z_hop_size for full chunks but
                # is smaller for the final, truncated chunk; using the nominal
                # hop there would under-estimate the overlap, cross-fade the
                # tail at the wrong position and lengthen the output.
                new_z_frames = z_end - covered_z_end
                spec_hop_len = int(new_z_frames * time_ratio)

                # Overlap = chunk length - newly contributed length.
                spec_overlap_len = current_spec_len - spec_hop_len

                if spec_overlap_len <= 0:
                    # Nothing overlaps: append the chunk as-is.
                    full_complex_spec = torch.cat([full_complex_spec, complex_chunk], dim=-1)
                    print(f"Step {idx}: No overlap (Hop {spec_hop_len} >= Len {current_spec_len})")
                else:
                    # --- Alignment by cross-correlation ---
                    prev_tail = full_complex_spec[..., -spec_overlap_len:]
                    curr_head = complex_chunk[..., :spec_overlap_len]

                    shift = 0
                    # Only align when both sides really have the full overlap length.
                    if prev_tail.shape[-1] == spec_overlap_len and curr_head.shape[-1] == spec_overlap_len:
                        shift = find_best_frame_shift(torch.abs(prev_tail), torch.abs(curr_head), search_range)

                    # Apply the shift by trimming leading frames of the chunk.
                    # NOTE(review): find_best_frame_shift maximises
                    # sum_j target[j + shift] * ref[j]; trimming by -shift means a
                    # positive shift is ignored and a negative one trims frames,
                    # which may be the opposite of the intent - confirm.
                    start_offset = -shift
                    start_offset = max(0, min(start_offset, search_range * 2))

                    aligned_chunk = complex_chunk[..., start_offset:]

                    # Cross-fade over the whole overlap, bounded by what both sides hold.
                    cross_len = min(spec_overlap_len, aligned_chunk.shape[-1], full_complex_spec.shape[-1])

                    # Progress log.
                    new_frames = aligned_chunk.shape[-1] - cross_len
                    print(f"Step {idx:03}: RatioOverlap={spec_overlap_len} | Shift={shift:+d} | New={new_frames}")

                    if cross_len > 0:
                        # Linear ramp broadcast along the time axis.
                        alpha_shape = [1] * (aligned_chunk.dim() - 1) + [cross_len]
                        alpha = torch.linspace(0.0, 1.0, cross_len).to(device).view(*alpha_shape)

                        part_prev = full_complex_spec[..., -cross_len:] * (1 - alpha)
                        part_curr = aligned_chunk[..., :cross_len] * alpha
                        merged_part = part_prev + part_curr

                        full_complex_spec = torch.cat([
                            full_complex_spec[..., :-cross_len],
                            merged_part,
                            aligned_chunk[..., cross_len:]
                        ], dim=-1)
                    else:
                        full_complex_spec = torch.cat([full_complex_spec, aligned_chunk], dim=-1)

            covered_z_end = z_end
            # Always evaluated (no early `continue`), so the loop cannot run
            # past the chunk that reached the end of z.
            if z_end >= z_total_frames:
                break

        if full_complex_spec is None:
            raise ValueError("No latent frames were produced; nothing to synthesize.")

        # 4. Single iSTFT over the stitched spectrogram, then band synthesis.
        final_spec = torch.abs(full_complex_spec)
        final_phase = torch.angle(full_complex_spec)

        stft = TorchSTFT(
            filter_length=net_g.dec.gen_istft_n_fft,
            hop_length=net_g.dec.gen_istft_hop_size,
            win_length=net_g.dec.gen_istft_n_fft
        ).to(device)

        if hasattr(net_g.dec, 'subbands') and net_g.dec.subbands > 1:
            # Fold the sub-band axis into the batch axis for the iSTFT.
            b, s, f, t = final_spec.shape
            spec_reshaped = final_spec.view(b * s, f, t)
            phase_reshaped = final_phase.view(b * s, f, t)

            y_mb_hat = stft.inverse(spec_reshaped, phase_reshaped)
            y_mb_hat = y_mb_hat.squeeze(1).view(b, s, -1)

            if net_g.ms_istft_vits:
                y_mb_hat = F.conv_transpose1d(y_mb_hat, net_g.dec.updown_filter.to(device) * net_g.dec.subbands, stride=net_g.dec.subbands)
                audio_tensor = net_g.dec.multistream_conv_post(y_mb_hat)
            else:
                try:
                    from pqmf import PQMF
                    pqmf = PQMF(device)
                    audio_tensor = pqmf.synthesis(y_mb_hat.unsqueeze(2))
                except ImportError:
                    # Fallback when pqmf is unavailable: sum the sub-bands.
                    audio_tensor = torch.sum(y_mb_hat, dim=1, keepdim=True)
        else:
            audio_tensor = stft.inverse(final_spec, final_phase)

        end_time = time.time()
        audio = audio_tensor[0, 0].data.cpu().float().numpy()

    elapsed_time = end_time - start_time_dec
    audio_duration = len(audio) / hps.data.sampling_rate

    print(f"Total Audio duration: {audio_duration:.2f} seconds")
    print(f"Decoding Elapsed time: {elapsed_time:.2f} seconds")

    display(Audio(audio, rate=hps.data.sampling_rate, normalize=False))
    print("\nSynthesis complete.")

Loading model...
Mutli-stream iSTFT VITS
Original text: 最近、インターステラーを見たのですけど、すごく面白かったです

--- Synthesizing for Speaker 300 (Corrected Ratio OLA) ---
Detected Time Ratio (Spec/Z): 16.07
Step 0: Init Size 241
Step 005: RatioOverlap=161 | Shift=+0 | New=80
Step 010: RatioOverlap=161 | Shift=+0 | New=80
Step 015: RatioOverlap=161 | Shift=+0 | New=80
Step 020: RatioOverlap=161 | Shift=+0 | New=80
Step 025: RatioOverlap=161 | Shift=+0 | New=80
Step 030: RatioOverlap=161 | Shift=+0 | New=80
Step 035: RatioOverlap=161 | Shift=+2 | New=80
Step 040: RatioOverlap=161 | Shift=+0 | New=80
Step 045: RatioOverlap=161 | Shift=+0 | New=80
Step 050: RatioOverlap=161 | Shift=+0 | New=80
Step 055: RatioOverlap=161 | Shift=-1 | New=79
Step 060: RatioOverlap=161 | Shift=-2 | New=78
Step 065: RatioOverlap=161 | Shift=+0 | New=80
Step 070: RatioOverlap=161 | Shift=+0 | New=80
Step 075: RatioOverlap=161 | Shift=-1 | New=79
Step 080: RatioOverlap=161 | Shift=+0 | New=80
Step 085: RatioOverlap=161 | Shift=+0 | New


Synthesis complete.


In [14]:
import os
import time
import torch
import torch.nn.functional as F
import numpy as np
import commons
import utils
from models import SynthesizerTrn
from text_JP import symbols
from scipy.io.wavfile import write

# ==========================================
# 1. Configuration
# ==========================================
# Model hyper-parameter file and generator checkpoint to load.
config_path = "./logs/uudb_csj21/config.json"
checkpoint_path = "./logs/uudb_csj21/G_3020000.pth"
# Input list; the main loop below splits each line on "|" and reads
# field 0 as a wav path, field 1 as the speaker id and field 2 as phonemes.
input_txt_path = "./filelists/csj_uudb_test_fine.txt"
# Directory the generated wav files are written to.
output_dir = "output_wavs_batch"

# Generation parameters (passed to net_g.infer / net_g.infer_z_only below)
noise_scale = 0.1
noise_scale_w = 1.0
length_scale = 1.0

# Device selection: use CUDA when available, otherwise the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# ==========================================
# 2. 共通クラス・関数定義
# ==========================================

class TorchSTFT(torch.nn.Module):
    """Inverse-STFT wrapper around ``torch.istft`` using a periodic Hann window.

    The ``window`` argument is accepted but not consulted; a Hann window of
    length ``win_length`` is always used.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800, window='hann'):
        super().__init__()
        # The window is only created here; inverse() transfers it to the
        # device of its input on every call.
        self.window = torch.hann_window(win_length, periodic=True)
        self.win_length = win_length
        self.hop_length = hop_length
        self.filter_length = filter_length

    def inverse(self, magnitude, phase):
        """Convert magnitude and phase spectrograms back into a waveform.

        Args:
            magnitude: STFT magnitudes with frequency and time as the last
                two axes.
            phase: STFT phases in radians, broadcastable with ``magnitude``.

        Returns:
            The ``torch.istft`` output with a singleton axis inserted at
            index 1.
        """
        polar_spec = magnitude * torch.exp(1j * phase)
        istft_kwargs = dict(
            n_fft=self.filter_length,
            hop_length=self.hop_length,
            win_length=self.win_length,
            window=self.window.to(polar_spec.device),
        )
        signal = torch.istft(polar_spec, **istft_kwargs)
        return signal.unsqueeze(1)

def find_best_frame_shift(ref_spec, target_spec, search_range=5):
    """Find the frame lag at which ``target_spec`` best matches ``ref_spec``.

    The comparison is a cross-correlation of mean-removed log magnitudes
    along the last axis, evaluated for lags ``-search_range..+search_range``.

    Args:
        ref_spec: Reference magnitudes. 4-D inputs have dimensions 1 and 2
            merged so that all sub-bands and frequency bins are correlated
            together.
        target_spec: Magnitudes shaped like ``ref_spec``.
        search_range: Largest absolute lag, in frames, that is tested.

    Returns:
        int: flat argmax index of the correlation minus ``search_range``.
        With a batch size of 1 this is the lag ``s`` for which
        ``target[..., j + s]`` best matches ``ref[..., j]``.
    """
    if ref_spec.dim() == 4:
        n_batch, n_band, n_bin, n_frame = ref_spec.shape
        merged = (n_batch, n_band * n_bin, n_frame)
        ref_spec = ref_spec.reshape(*merged).contiguous()
        target_spec = target_spec.reshape(*merged).contiguous()

    log_ref = torch.log(ref_spec + 1e-6)
    log_tgt = torch.log(target_spec + 1e-6)
    # Remove the per-row mean so only the temporal shape drives the score.
    log_ref = log_ref - log_ref.mean(dim=-1, keepdim=True)
    log_tgt = log_tgt - log_tgt.mean(dim=-1, keepdim=True)

    # Zero-pad the target on both sides; the reference then acts as the
    # conv1d kernel and yields one score per candidate lag.
    lag_scores = F.conv1d(F.pad(log_tgt, (search_range, search_range)), log_ref)
    best_index = torch.argmax(lag_scores).item()
    return best_index - search_range

def get_text_from_phonemes(phonemes, hps):
    """Convert a space-separated phoneme string into a tensor of symbol ids.

    Square brackets are removed from the string before splitting. Tokens
    that are not present in ``symbols`` are silently dropped.

    Args:
        phonemes: Phoneme string with tokens separated by single spaces.
        hps: Hyper-parameters; when ``hps.data.add_blank`` is truthy, id 0
            is interspersed via ``commons.intersperse``.

    Returns:
        torch.LongTensor holding the resulting id sequence.
    """
    id_of = {sym: idx for idx, sym in enumerate(symbols)}

    stripped = phonemes.replace("[", "").replace("]", "").strip()
    ids = [id_of[token] for token in stripped.split(" ") if token in id_of]

    if hps.data.add_blank:
        ids = commons.intersperse(ids, 0)
    return torch.LongTensor(ids)

def istft_finalize(net_g, full_complex_spec):
    """Reconstruct a waveform from a stitched complex spectrogram.

    Supports the multi-stream decoder layout as well as a plain
    single-stream iSTFT.

    Args:
        net_g: Synthesizer whose ``dec`` supplies the iSTFT sizes and, for
            the multi-stream case, the synthesis filter and post-convolution.
        full_complex_spec: Complex spectrogram; the multi-stream branch
            expects the layout [Batch, Subbands, Freq, Time].

    Returns:
        numpy float array taken from batch item 0, channel 0 of the
        synthesized audio.
    """
    magnitude = torch.abs(full_complex_spec)
    angle = torch.angle(full_complex_spec)

    decoder = net_g.dec
    stft = TorchSTFT(
        filter_length=decoder.gen_istft_n_fft,
        hop_length=decoder.gen_istft_hop_size,
        win_length=decoder.gen_istft_n_fft
    ).to(device)

    is_multiband = hasattr(decoder, 'subbands') and decoder.subbands > 1
    if not is_multiband:
        # Ordinary single-stream iSTFT.
        audio_tensor = stft.inverse(magnitude, angle)
    else:
        # Fold sub-bands into the batch axis: [B, S, F, T] -> [B*S, F, T].
        n_batch, n_band, n_bin, n_frame = magnitude.shape
        flat_mag = magnitude.view(n_batch * n_band, n_bin, n_frame)
        flat_angle = angle.view(n_batch * n_band, n_bin, n_frame)

        band_waves = stft.inverse(flat_mag, flat_angle)              # [B*S, 1, T_sub]
        band_waves = band_waves.squeeze(1).view(n_batch, n_band, -1)  # [B, S, T_sub]

        # Synthesis filter bank.
        if net_g.ms_istft_vits:
            # Upsample with the decoder's updown_filter, then apply its
            # multi-stream post-convolution.
            synthesis_filter = decoder.updown_filter.to(device) * decoder.subbands
            band_waves = F.conv_transpose1d(
                band_waves, synthesis_filter, stride=decoder.subbands
            )
            audio_tensor = decoder.multistream_conv_post(band_waves)
        else:
            # PQMF synthesis, or a plain sum over bands when pqmf cannot be imported.
            try:
                from pqmf import PQMF
                pqmf = PQMF(device)
                audio_tensor = pqmf.synthesis(band_waves.unsqueeze(2))
            except ImportError:
                audio_tensor = torch.sum(band_waves, dim=1, keepdim=True)

    return audio_tensor[0, 0].data.cpu().float().numpy()


# ==========================================
# 3. 各合成条件の関数
# ==========================================

# (1) Normal
def synthesize_cond1(net_g, x_tst, x_tst_lengths, sid):
    """Condition 1: decode the whole utterance in one ``net_g.infer`` call.

    Uses the module-level ``noise_scale``, ``noise_scale_w`` and
    ``length_scale`` settings.

    Returns:
        numpy float array taken from batch item 0, channel 0 of the first
        element returned by ``net_g.infer``.
    """
    infer_outputs = net_g.infer(
        x_tst,
        x_tst_lengths,
        sid=sid,
        noise_scale=noise_scale,
        noise_scale_w=noise_scale_w,
        length_scale=length_scale,
    )
    waveform = infer_outputs[0]
    return waveform[0, 0].data.cpu().float().numpy()

# (2) Audio Chunk
def synthesize_cond2(net_g, x_tst, x_tst_lengths, sid, chunk_size=10):
    """Condition 2: decode z in non-overlapping chunks and concatenate audio.

    Each chunk of ``chunk_size`` latent frames is decoded on its own and
    the resulting waveforms are joined along the time axis without any
    overlap or smoothing.

    Returns:
        numpy float array taken from batch item 0, channel 0 of the
        concatenated audio.
    """
    _, frame_mask, (latent, _, _, _), _ = net_g.infer_z_only(
        x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale
    )
    latent = latent * frame_mask
    speaker_emb = net_g.emb_g(sid).unsqueeze(-1) if net_g.n_speakers > 0 else None

    total_frames = latent.shape[2]
    wave_pieces = []
    for start in range(0, total_frames, chunk_size):
        stop = min(start + chunk_size, total_frames)
        wave_piece, _, _, _ = net_g.dec(latent[:, :, start:stop], g=speaker_emb)
        wave_pieces.append(wave_piece)

    # The per-chunk waveforms are joined along the time axis.
    joined = torch.cat(wave_pieces, dim=2)
    return joined[0, 0].data.cpu().float().numpy()

# (3) Spec Fixed Ratio OLA
def synthesize_cond3(net_g, x_tst, x_tst_lengths, sid, z_chunk_size=10, z_hop_size=5):
    """Condition 3: chunked decoding with fixed-ratio overlap-add of spectrograms.

    z is decoded in chunks of ``z_chunk_size`` frames taken every
    ``z_hop_size`` frames. Consecutive complex spectrograms are linearly
    cross-faded over their overlap, and the stitched spectrogram is turned
    into audio by ``istft_finalize``.

    The overlap is derived from the number of z frames each chunk actually
    adds beyond what is already stitched, so the final (possibly shorter)
    chunk is cross-faded at the correct position.

    Args:
        net_g: Synthesizer providing ``infer_z_only``, ``emb_g`` and ``dec``.
        x_tst, x_tst_lengths, sid: Inputs forwarded to ``net_g.infer_z_only``.
        z_chunk_size: Chunk length in z frames.
        z_hop_size: Distance between chunk starts in z frames.

    Returns:
        Waveform as returned by ``istft_finalize``.

    Raises:
        ValueError: if the latent sequence has no frames.
    """
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale
    )
    z = z * y_mask
    g = net_g.emb_g(sid).unsqueeze(-1) if net_g.n_speakers > 0 else None

    z_total = z.shape[2]
    full_spec = None
    ratio = None        # spectrogram frames per z frame, measured on the first chunk
    covered_z_end = 0   # end (exclusive) of the z range already in full_spec

    for idx in range(0, z_total, z_hop_size):
        z_end = min(idx + z_chunk_size, z_total)
        z_chunk = z[:, :, idx:z_end]
        _, _, spec, phase = net_g.dec(z_chunk, g=g)
        comp_chunk = spec * torch.exp(1j * phase)

        if ratio is None:
            ratio = comp_chunk.shape[-1] / z_chunk.shape[-1] if z_chunk.shape[-1] > 0 else 1.0

        if full_spec is None:
            full_spec = comp_chunk
        else:
            # Frames newly contributed by this chunk: z_hop_size for full
            # chunks, fewer for the truncated last chunk.
            hop = int((z_end - covered_z_end) * ratio)
            overlap = comp_chunk.shape[-1] - hop
            ov_len = min(overlap, full_spec.shape[-1], comp_chunk.shape[-1])
            if ov_len > 0:
                # Linear cross-fade ramp broadcast along the time axis.
                ramp_shape = [1] * (comp_chunk.dim() - 1) + [ov_len]
                alpha = torch.linspace(0.0, 1.0, ov_len, device=comp_chunk.device).view(*ramp_shape)
                merged = full_spec[..., -ov_len:] * (1 - alpha) + comp_chunk[..., :ov_len] * alpha
                full_spec = torch.cat([full_spec[..., :-ov_len], merged, comp_chunk[..., ov_len:]], dim=-1)
            else:
                full_spec = torch.cat([full_spec, comp_chunk], dim=-1)

        covered_z_end = z_end
        if z_end >= z_total:
            break

    if full_spec is None:
        raise ValueError("No latent frames were produced; nothing to synthesize.")

    return istft_finalize(net_g, full_spec)

# (4) Spec Corrected Ratio OLA
def synthesize_cond4(net_g, x_tst, x_tst_lengths, sid, z_chunk_size=10, z_hop_size=5, search_range=2):
    """Condition 4: chunked overlap-add with cross-correlation alignment.

    Works like the fixed-ratio overlap-add, but before cross-fading each
    chunk its head is compared with the tail of the stitched spectrogram
    using ``find_best_frame_shift`` and leading frames of the chunk may be
    trimmed accordingly.

    The overlap is derived from the number of z frames each chunk actually
    adds beyond what is already stitched, so the final (possibly shorter)
    chunk is cross-faded at the correct position. The end-of-sequence check
    runs on every iteration, including the no-overlap case.

    Args:
        net_g: Synthesizer providing ``infer_z_only``, ``emb_g`` and ``dec``.
        x_tst, x_tst_lengths, sid: Inputs forwarded to ``net_g.infer_z_only``.
        z_chunk_size: Chunk length in z frames.
        z_hop_size: Distance between chunk starts in z frames.
        search_range: Maximum alignment lag in spectrogram frames.

    Returns:
        Waveform as returned by ``istft_finalize``.

    Raises:
        ValueError: if the latent sequence has no frames.
    """
    attn, y_mask, (z, z_p, m_p, logs_p), timings = net_g.infer_z_only(
        x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale
    )
    z = z * y_mask
    g = net_g.emb_g(sid).unsqueeze(-1) if net_g.n_speakers > 0 else None

    z_total = z.shape[2]
    full_spec = None
    ratio = None        # spectrogram frames per z frame, measured on the first chunk
    covered_z_end = 0   # end (exclusive) of the z range already in full_spec

    for idx in range(0, z_total, z_hop_size):
        z_end = min(idx + z_chunk_size, z_total)
        z_chunk = z[:, :, idx:z_end]
        _, _, spec, phase = net_g.dec(z_chunk, g=g)
        comp_chunk = spec * torch.exp(1j * phase)

        if ratio is None:
            ratio = comp_chunk.shape[-1] / z_chunk.shape[-1] if z_chunk.shape[-1] > 0 else 1.0

        if full_spec is None:
            full_spec = comp_chunk
        else:
            # Frames newly contributed by this chunk: z_hop_size for full
            # chunks, fewer for the truncated last chunk.
            hop = int((z_end - covered_z_end) * ratio)
            overlap = comp_chunk.shape[-1] - hop

            if overlap <= 0:
                # Nothing overlaps: append the chunk unchanged.
                full_spec = torch.cat([full_spec, comp_chunk], dim=-1)
            else:
                prev = full_spec[..., -overlap:]
                curr = comp_chunk[..., :overlap]

                shift = 0
                # Align only when both sides really span the full overlap.
                if prev.shape[-1] == overlap and curr.shape[-1] == overlap:
                    shift = find_best_frame_shift(torch.abs(prev), torch.abs(curr), search_range)

                # NOTE(review): find_best_frame_shift maximises
                # sum_j target[j + shift] * ref[j]; trimming by -shift means a
                # positive shift is ignored and a negative one trims frames,
                # which may be the opposite of the intent - confirm.
                start_off = max(0, min(-shift, search_range * 2))
                aligned = comp_chunk[..., start_off:]
                cross_len = min(overlap, aligned.shape[-1], full_spec.shape[-1])

                if cross_len > 0:
                    # Linear cross-fade ramp broadcast along the time axis.
                    ramp_shape = [1] * (aligned.dim() - 1) + [cross_len]
                    alpha = torch.linspace(0.0, 1.0, cross_len, device=aligned.device).view(*ramp_shape)
                    merged = full_spec[..., -cross_len:] * (1 - alpha) + aligned[..., :cross_len] * alpha
                    full_spec = torch.cat([full_spec[..., :-cross_len], merged, aligned[..., cross_len:]], dim=-1)
                else:
                    full_spec = torch.cat([full_spec, aligned], dim=-1)

        covered_z_end = z_end
        if z_end >= z_total:
            break

    if full_spec is None:
        raise ValueError("No latent frames were produced; nothing to synthesize.")

    return istft_finalize(net_g, full_spec)

# ==========================================
# 4. メインループ
# ==========================================
# Batch driver: load the model once, then synthesize every line of the input
# list under each of the four decoding conditions and write one wav per
# condition into output_dir.
import traceback  # used only for error reporting in this block

os.makedirs(output_dir, exist_ok=True)
print(f"Output directory created: {output_dir}")

hps = utils.get_hparams_from_file(config_path)
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).to(device)
_ = net_g.eval()
utils.load_checkpoint(checkpoint_path, net_g, None)
print("Model loaded.")

with open(input_txt_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

print(f"Start processing {len(lines)} lines...")

# Output-name suffix -> synthesis function, run in this order for every line.
synthesis_conditions = (
    ("cond1", synthesize_cond1),
    ("cond2", synthesize_cond2),
    ("cond3", synthesize_cond3),
    ("cond4", synthesize_cond4),
)

for i, line in enumerate(lines):
    line = line.strip()
    if not line:
        continue
    # Expected line layout: <wav path>|<speaker id>|<phonemes>[|...]
    parts = line.split("|")
    if len(parts) < 3:
        continue

    # Strip only a trailing ".wav" extension; the previous str.replace also
    # removed ".wav" occurring in the middle of a file name.
    base_name = os.path.basename(parts[0])
    stem, ext = os.path.splitext(base_name)
    original_fname = stem if ext.lower() == ".wav" else base_name

    # A malformed speaker id skips this line instead of aborting the batch.
    try:
        spk_id = int(parts[1])
    except ValueError:
        print(f"Skipping {original_fname}: invalid speaker id {parts[1]!r}")
        continue
    phonemes = parts[2]

    stn_tst = get_text_from_phonemes(phonemes, hps)
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        sid = torch.LongTensor([spk_id]).to(device)

        # Best-effort: a failure in any condition is reported and the
        # remaining conditions of this line are skipped, as before.
        try:
            for cond_name, synthesize in synthesis_conditions:
                audio = synthesize(net_g, x_tst, x_tst_lengths, sid)
                write(os.path.join(output_dir, f"{original_fname}_{cond_name}.wav"), hps.data.sampling_rate, audio)

            print(f"[{i+1}/{len(lines)}] Saved: {original_fname}")
        except Exception as e:
            print(f"Error processing {original_fname}: {e}")
            traceback.print_exc()

print("All tasks finished.")

Using device: cuda
Output directory created: output_wavs_batch
Mutli-stream iSTFT VITS
Model loaded.
Start processing 16 lines...
[1/16] Saved: FJK_C051_118
[2/16] Saved: FJK_C051_170
[3/16] Saved: FKC_C031_002
[4/16] Saved: FMS_C051_072
[5/16] Saved: FMT_C041_134
[6/16] Saved: FMT_C041_259
[7/16] Saved: FTH_C004_044
[8/16] Saved: FTH_C005_152
[9/16] Saved: FTS_C002_107
[10/16] Saved: FTS_C002_175
[11/16] Saved: FTS_C004_126
[12/16] Saved: FTS_C006_050
[13/16] Saved: FTS_C007_137
[14/16] Saved: FUE_C033_134
[15/16] Saved: FYH_C042_090
[16/16] Saved: FYH_C043_064
All tasks finished.
