In [18]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd

import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write


def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

## MB-iSTFT-VITS

In [19]:
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

In [20]:
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model).cuda()
_ = net_g.eval()

_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


In [21]:
stn_tst = get_text("Goedemiddag, met de klantenservice. Waarmee kan ik u van dienst zijn?'", hps)
with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    audio = net_g.infer(x_tst, x_tst_lengths, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
# Scale audio to [-1, 1] to avoid ValueError
audio_norm = audio / max(abs(audio).max(), 1.0)
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=False))



In [22]:
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import commons
import utils
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write

def get_text(text, hps):
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Load config
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

# Initialize model
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
_ = net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

# Text to speech
text = "Goedemiddag, met de klantenservice. Waarmee kan ik u van dienst zijn?"
stn_tst = get_text(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()  # Speaker ID
    audio = net_g.infer(x_tst, x_tst_lengths, sid, 
                       noise_scale=.667,
                       noise_scale_w=0.8, 
                       length_scale=1)[0][0,0].data.cpu().float().numpy()

# Normalize and play audio
audio_norm = audio / max(abs(audio).max(), 1.0)
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=False))

# Optional: Save to WAV file
# write("output.wav", hps.data.sampling_rate, audio_norm)

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)




In [23]:
# Import statements and setup
import matplotlib.pyplot as plt
import IPython.display as ipd
import os, json, math, torch
from torch import nn
from torch.nn import functional as F
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

def get_text(text, hps):
    # Clean the input text
    text = text.lower().strip()
    # Remove any double spaces
    text = " ".join(text.split())
    # Process through text cleaners
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Load config and model
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
_ = net_g.eval()

# Load latest checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

# Test text - make sure it's clean Dutch text
text = "Hallo, hoe gaat het met u?"  # Simple test sentence
stn_tst = get_text(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    # Adjust inference parameters
    audio = net_g.infer(x_tst, x_tst_lengths, sid,
                       noise_scale=0.667,      # Reduce noise scale
                       noise_scale_w=0.6,      # Reduce noise scale warmup
                       length_scale=1.1)[0][0,0].data.cpu().float().numpy()

# Normalize audio more aggressively
audio_norm = audio * 0.95 / max(abs(audio).max(), 1e-6)

# Play audio
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=True))

# Save audio file for inspection
write("test_output.wav", hps.data.sampling_rate, audio_norm)

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


In [24]:
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write

def get_text(text, hps):
    # Remove any special characters and normalize
    text = text.replace("'", "").replace('"', '').strip()
    text = " ".join(text.split())  # normalize whitespace
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Load config
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

# Initialize model with explicit speaker support
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_132000.pth", net_g, None)

# Test with simpler text
text = "Hallo."  # Start with very simple text
stn_tst = get_text(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    # Modified inference parameters
    audio = net_g.infer(x_tst, x_tst_lengths, sid,
                       noise_scale=0.3,        # Reduced noise
                       noise_scale_w=0.4,      # Reduced noise warmup
                       length_scale=1.0)[0][0,0].data.cpu().float().numpy()

# More conservative normalization
audio_norm = audio * 0.8 / max(abs(audio).max(), 1e-6)

# Play normalized audio
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=True))

# Save for inspection
write("test_simple.wav", hps.data.sampling_rate, audio_norm)

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


In [25]:
import matplotlib.pyplot as plt
import IPython.display as ipd
import os
import torch
import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence
from scipy.io.wavfile import write
import numpy as np

def get_text(text, hps):
    # Debug print
    print("Input text:", text)
    
    # Normalize text
    text = text.lower().strip()
    text = " ".join(text.split())
    text = text.replace("'", "").replace('"', '')
    print("Normalized text:", text)
    
    # Convert to phonemes
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    print("Phoneme sequence:", text_norm)
    
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    print("Final sequence length:", len(text_norm))
    return text_norm

# Load config
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

# Print model configuration
print("Speaker config:", hps.data.n_speakers)
print("Sampling rate:", hps.data.sampling_rate)

# Initialize model
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

# Test with very simple text
text = "hallo"  # Single word test
stn_tst = get_text(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    # Conservative inference parameters
    audio = net_g.infer(x_tst, x_tst_lengths, sid,
                       noise_scale=0.1,        # Minimal noise
                       noise_scale_w=0.2,      # Minimal noise warmup
                       length_scale=1.2)[0][0,0].data.cpu().float().numpy()

# Very conservative normalization
audio_norm = audio * 0.7 / max(abs(audio).max(), 1e-6)

# Print audio statistics
print("Audio shape:", audio.shape)
print("Max amplitude:", np.max(np.abs(audio_norm)))

# Play normalized audio
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=False))

# Save for inspection
write("debug_output.wav", hps.data.sampling_rate, audio_norm)

Speaker config: 0
Sampling rate: 22050
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Input text: hallo
Normalized text: hallo
Phoneme sequence: [176, 171, 211, 180, 183, 211, 168, 173, 174, 211, 168, 183, 211, 180, 194, 185]
Final sequence length: 33
Audio shape: (13312,)
Max amplitude: 0.7


In [26]:
def get_text(text, hps):
    # Debug print original symbols
    print("Available symbols:", symbols)
    
    text = text.lower().strip()
    text = " ".join(text.split())
    
    # Debug each step
    text_norm = text_to_sequence(text, hps.data.text_cleaners)
    print("After text_to_sequence:", text_norm)
    
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
        print("After adding blanks:", text_norm)
    
    text_norm = torch.LongTensor(text_norm)
    return text_norm

# Load checkpoint with verification
checkpoint_path = "./logs/tts_nl0.0.1/G_144000.pth"
print(f"Loading checkpoint from {checkpoint_path}")
checkpoint = torch.load(checkpoint_path)
print(f"Checkpoint keys: {checkpoint.keys()}")

# Initialize model with explicit verification
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Verify model loaded correctly
_ = utils.load_checkpoint(checkpoint_path, net_g, None)
print("Model parameters:", sum(p.numel() for p in net_g.parameters()))

# Test with basic text
text = "hallo"
stn_tst = get_text(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    # Default inference parameters
    audio = net_g.infer(x_tst, x_tst_lengths, sid,
                       noise_scale=0.667,      
                       noise_scale_w=0.8,      
                       length_scale=1.0)[0][0,0].data.cpu().float().numpy()

Loading checkpoint from ./logs/tts_nl0.0.1/G_144000.pth
Checkpoint keys: dict_keys(['model', 'iteration', 'optimizer', 'learning_rate'])
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Model parameters: 34731985
Available symbols: [' ', '_', ';', ':', ',', '.', '!', '?', '¡', '¿', '—', '…', '«', '»', '“', '”', '+', '-', '–', '(', ')', '[', ']', '{', '}', '<', '>', '/', '\\', '|', '@', '#', '&', '*', '~', '`', '<', '>', '^', '%', '$', '=', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'á', 'é', 'í', 'ó', 'ú', 'ý', 'ä', 'ë', 'ï', 'ö', 'ü', 'ÿ', 'à', 'è', 'ò', 'ù', 'â', 'ê', 'ô', 'û', "'", ' ', '(', ')', 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ø', 'ŋ', 'œ', 'ɑ', 'ɒ', 'ɔ', 'ə', 'ɛ', 'ɜ', 'ɡ', 'ɣ', 'ɪ', 'ɲ', 'ɵ', 'ɹ', 'ɾ', 'ʃ', 'ʊ', 'ʋ', 'ʌ', 'ʒ', 'ʲ', 'ˈ', 'ˌ', 'ː', "'", "'", ' ', '(', ')', 'a', 'b', 'd', 'e', 'f', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'v', 'w', 'y

In [27]:
def verify_dutch_cleaners(text):
    # Import dutch_cleaners from text.cleaners
    from text.cleaners import dutch_cleaners
    
    # Step by step cleaning
    text = text.lower().strip()
    print("1. Lowercased:", text)
    
    # Apply dutch cleaners
    cleaned = dutch_cleaners(text)
    print("2. After dutch_cleaners:", cleaned)
    
    # Convert to phonemes
    phonemes = text_to_sequence(cleaned, ["dutch_cleaners"])
    print("3. Phoneme IDs:", phonemes)
    
    # Map back to actual symbols
    phoneme_symbols = [symbols[p] for p in phonemes]
    print("4. Actual phonemes:", phoneme_symbols)
    
    return phonemes

# Test the cleaning process
text = "hallo"
print("Original text:", text)
phonemes = verify_dutch_cleaners(text)

Original text: hallo
1. Lowercased: hallo
2. After dutch_cleaners: haːloː deː oːlɑr
3. Phoneme IDs: [176, 171, 211, 168, 180, 183, 211, 168, 174, 211, 180, 171, 211, 168, 197, 185, 173, 174, 211, 183, 211, 180, 194, 185]
4. Actual phonemes: ['h', 'a', 'ː', ' ', 'l', 'o', 'ː', ' ', 'e', 'ː', 'l', 'a', 'ː', ' ', 'ɛ', 'r', 'd', 'e', 'ː', 'o', 'ː', 'l', 'ɑ', 'r']


In [28]:
def get_text_fixed(text, hps):
    # Debug print
    print("Input text:", text)
    
    # Basic normalization
    text = text.lower().strip()
    text = " ".join(text.split())
    text = text.replace("'", "").replace('"', '')
    print("Normalized text:", text)
    
    # Direct phoneme mapping for common Dutch words
    dutch_mapping = {
        "hallo": "h ɑ l oː",
        "dag": "d ɑ x",
        "goedemorgen": "x u d ə m ɔ r ɣ ə n",
    }
    
    if text in dutch_mapping:
        phoneme_text = dutch_mapping[text]
    else:
        # Use default cleaner for unknown words
        from text.cleaners import dutch_cleaners
        phoneme_text = dutch_cleaners(text)
    
    print("Phoneme text:", phoneme_text)
    text_norm = text_to_sequence(phoneme_text, hps.data.text_cleaners)
    
    if hps.data.add_blank:
        text_norm = commons.intersperse(text_norm, 0)
    text_norm = torch.LongTensor(text_norm)
    print("Final sequence length:", len(text_norm))
    return text_norm

# Test the modified inference
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

# Test with basic text
text = "hallo"
stn_tst = get_text_fixed(text, hps)

with torch.no_grad():
    x_tst = stn_tst.cuda().unsqueeze(0)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    audio = net_g.infer(x_tst, x_tst_lengths, sid,
                       noise_scale=0.3,      # Reduced for clarity
                       noise_scale_w=0.4,    # Reduced for stability
                       length_scale=1.0)[0][0,0].data.cpu().float().numpy()

# Normalize and play
audio_norm = audio * 0.8 / max(abs(audio).max(), 1e-6)
ipd.display(ipd.Audio(audio_norm, rate=hps.data.sampling_rate, normalize=False))
write("test_fixed.wav", hps.data.sampling_rate, audio_norm)

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Input text: hallo
Normalized text: hallo
Phoneme text: h ɑ l oː
Final sequence length: 27


In [29]:
def debug_inference(text, hps, net_g):
    """Debug wrapper for TTS inference"""
    print(f"Debug - Processing text: {text}")
    
    # Get text sequence
    stn_tst = get_text_fixed(text, hps)
    print(f"Sequence shape: {stn_tst.shape}")
    
    with torch.no_grad():
        x_tst = stn_tst.cuda().unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        # Try different parameter combinations
        params = [
            {"noise_scale": 0.0, "noise_scale_w": 0.0, "length_scale": 1.0},  # No noise
            {"noise_scale": 0.1, "noise_scale_w": 0.1, "length_scale": 1.0},  # Minimal noise
            {"noise_scale": 0.3, "noise_scale_w": 0.4, "length_scale": 1.2},  # Medium noise
        ]
        
        for i, p in enumerate(params):
            print(f"\nTrying parameter set {i+1}:")
            print(p)
            try:
                audio = net_g.infer(x_tst, x_tst_lengths, sid, **p)[0][0,0].data.cpu().float().numpy()
                
                # Check for NaN/Inf values
                if np.any(np.isnan(audio)) or np.any(np.isinf(audio)):
                    print(f"Warning: Found NaN/Inf in audio output for params {i+1}")
                    continue
                    
                # Conservative normalization
                audio_norm = audio * 0.6 / max(abs(audio).max(), 1e-6)
                
                # Save with parameter index
                filename = f"debug_output_{i+1}.wav"
                write(filename, hps.data.sampling_rate, audio_norm)
                print(f"Saved {filename}")
                
            except Exception as e:
                print(f"Error with parameter set {i+1}: {str(e)}")

# Test inference with debug wrapper
text = "hallo"
debug_inference(text, hps, net_g)

Debug - Processing text: hallo
Input text: hallo
Normalized text: hallo
Phoneme text: h ɑ l oː
Final sequence length: 27
Sequence shape: torch.Size([27])

Trying parameter set 1:
{'noise_scale': 0.0, 'noise_scale_w': 0.0, 'length_scale': 1.0}
Saved debug_output_1.wav

Trying parameter set 2:
{'noise_scale': 0.1, 'noise_scale_w': 0.1, 'length_scale': 1.0}
Saved debug_output_2.wav

Trying parameter set 3:
{'noise_scale': 0.3, 'noise_scale_w': 0.4, 'length_scale': 1.2}
Saved debug_output_3.wav


In [30]:
def advanced_inference(text, hps, net_g):
    """Advanced inference with model warm-up and gradual parameter adjustment"""
    print(f"Advanced inference for text: {text}")
    
    # Ensure model is in eval mode
    net_g.eval()
    
    # Get text sequence with fixed phonemes
    stn_tst = get_text_fixed(text, hps)
    
    with torch.no_grad():
        x_tst = stn_tst.cuda().unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        # Warm up the model with silence
        warmup_text = " "
        warmup_seq = get_text_fixed(warmup_text, hps)
        warmup_x = warmup_seq.cuda().unsqueeze(0)
        warmup_lengths = torch.LongTensor([warmup_seq.size(0)]).cuda()
        _ = net_g.infer(warmup_x, warmup_lengths, sid,
                       noise_scale=0.0,
                       noise_scale_w=0.0,
                       length_scale=1.0)
        
        # Try refined parameters
        refined_params = [
            {
                "noise_scale": 0.01,    # Minimal noise
                "noise_scale_w": 0.01,  # Minimal warmup
                "length_scale": 0.9     # Slightly faster
            },
            {
                "noise_scale": 0.05,    # Very light noise
                "noise_scale_w": 0.05,  # Very light warmup
                "length_scale": 1.0     # Normal speed
            }
        ]
        
        for i, p in enumerate(refined_params):
            try:
                # Generate audio with current parameters
                audio = net_g.infer(x_tst, x_tst_lengths, sid, **p)[0][0,0].data.cpu().float().numpy()
                
                # Apply post-processing
                audio = audio.astype(np.float32)
                
                # Aggressive normalization
                audio_norm = audio * 0.4 / (max(abs(audio).max(), 1e-6))
                
                # Save file
                filename = f"refined_output_{i+1}.wav"
                write(filename, hps.data.sampling_rate, audio_norm)
                print(f"Generated {filename} with params:", p)
                
            except Exception as e:
                print(f"Error with refined parameter set {i+1}: {str(e)}")

# Test with new inference function
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Load checkpoint
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

# Test inference
text = "hallo"
advanced_inference(text, hps, net_g)

# Verify output
import subprocess
for i in range(1, 3):
    print(f"\nPlaying refined_output_{i}.wav:")
    subprocess.run(['aplay', f'refined_output_{i}.wav'])

Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Advanced inference for text: hallo
Input text: hallo
Normalized text: hallo
Phoneme text: h ɑ l oː
Final sequence length: 27
Input text:  
Normalized text: 
Phoneme text: doːlɑr
Final sequence length: 41
Generated refined_output_1.wav with params: {'noise_scale': 0.01, 'noise_scale_w': 0.01, 'length_scale': 0.9}
Generated refined_output_2.wav with params: {'noise_scale': 0.05, 'noise_scale_w': 0.05, 'length_scale': 1.0}

Playing refined_output_1.wav:


Playing WAVE 'refined_output_1.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing refined_output_2.wav:


Playing WAVE 'refined_output_2.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [36]:
def fixed_inference(text, hps, net_g):
    """Fixed inference with improved audio quality"""
    
    # 1. Model preparation
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 2. Text preprocessing with fixed phoneme mapping
        phonemes = {
            "h": [176],    # Dutch 'h'
            "ɑ": [171],    # Dutch 'a'
            "l": [180],    # Dutch 'l'
            "oː": [183]    # Dutch 'o' - simplified to single phoneme
        }
        
        # Manual phoneme sequence with shorter 'o'
        manual_sequence = []
        for p in ["h", "ɑ", "l", "oː"]:
            manual_sequence.extend(phonemes[p])
        
        # Debug print
        print("Manual sequence (list):", manual_sequence)
        
        # Convert list to tensor explicitly
        text_tensor = torch.LongTensor(manual_sequence)
        print("Initial tensor shape:", text_tensor.shape)
        
        # Add blanks if needed
        if hps.data.add_blank:
            blanked_sequence = commons.intersperse(text_tensor.tolist(), 0)
            text_tensor = torch.LongTensor(blanked_sequence)
            print("After blanks shape:", text_tensor.shape)
        
        # 3. Prepare input tensors
        x_tst = text_tensor.unsqueeze(0).cuda()
        x_tst_lengths = torch.LongTensor([text_tensor.size(0)]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        # Verify tensor shapes
        print("Input tensor shapes:")
        print(f"- x_tst: {x_tst.shape}")
        print(f"- x_tst_lengths: {x_tst_lengths.shape}")
        print(f"- sid: {sid.shape}")
        
        # 4. Inference with refined parameters
        audio = net_g.infer(x_tst, x_tst_lengths, sid,
                         noise_scale=0.05,     # Minimal noise
                         noise_scale_w=0.05,   # Minimal warmup
                         length_scale=1.2,     # Slightly slower
                         max_len=None)[0][0,0].data.cpu().float().numpy()
        
        # 5. Audio post-processing
        # Apply light smoothing
        from scipy.signal import savgol_filter
        audio = savgol_filter(audio, 5, 3)  # window size 5, polynomial order 3
        
        # Normalize with headroom
        audio = audio.astype(np.float32)
        audio_norm = audio * 0.7 / (max(abs(audio).max(), 1e-6))
        
        return audio_norm, hps.data.sampling_rate
        
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        print("Debug info:")
        print(f"- manual_sequence: {type(manual_sequence)}, len={len(manual_sequence)}")
        if 'text_tensor' in locals():
            print(f"- text_tensor: {type(text_tensor)}")
            if isinstance(text_tensor, torch.Tensor):
                print(f"  shape={text_tensor.shape}")
        raise

# Test the implementation
print("Loading model configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("Initializing model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("Loading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

print("Generating audio...")
audio_norm, sr = fixed_inference("hallo", hps, net_g)

# Save and play
write("fixed_output.wav", sr, audio_norm)
print("\nPlaying output:")
subprocess.run(['aplay', 'fixed_output.wav'])

Loading model configuration...
Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Loading checkpoint...
Generating audio...
Manual sequence (list): [176, 171, 180, 183]
Initial tensor shape: torch.Size([4])
After blanks shape: torch.Size([9])
Input tensor shapes:
- x_tst: torch.Size([1, 9])
- x_tst_lengths: torch.Size([1])
- sid: torch.Size([1])

Playing output:


Playing WAVE 'fixed_output.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'fixed_output.wav'], returncode=0)

In [37]:
def fixed_inference(text, hps, net_g):
    """Fixed inference with proper phoneme handling"""
    
    # 1. Model preparation 
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 2. Use full phoneme sequence for Dutch "hallo"
        manual_sequence = [
            0,   # Start token
            176, # h
            0,   # Separator 
            171, # ɑ
            0,   # Separator
            180, # l
            0,   # Separator
            183, # o
            0    # End token
        ]
        
        print("Full phoneme sequence:", manual_sequence)
        
        # Convert to tensor - no need for additional blanks
        text_tensor = torch.LongTensor(manual_sequence)
        print("Tensor shape:", text_tensor.shape)
        
        # 3. Prepare input tensors
        x_tst = text_tensor.unsqueeze(0).cuda()
        x_tst_lengths = torch.LongTensor([text_tensor.size(0)]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        # 4. Use more conservative inference parameters
        audio = net_g.infer(x_tst, x_tst_lengths, sid,
                         noise_scale=0.001,     # Almost no noise
                         noise_scale_w=0.001,   # Almost no warmup noise
                         length_scale=1.5,      # Slower for clarity
                         max_len=None)[0][0,0].data.cpu().float().numpy()
        
        # 5. Enhanced post-processing
        # Apply bandpass filter
        from scipy.signal import butter, filtfilt
        nyq = 0.5 * hps.data.sampling_rate
        low = 70.0 / nyq
        high = 8000.0 / nyq
        b, a = butter(4, [low, high], btype='band')
        audio = filtfilt(b, a, audio)
        
        # Normalize with conservative values
        audio = audio.astype(np.float32)
        audio_norm = audio * 0.5 / (max(abs(audio).max(), 1e-6))
        
        return audio_norm, hps.data.sampling_rate
        
    except Exception as e:
        print(f"Error during inference: {str(e)}")
        print("Debug info:")
        print(f"- manual_sequence length: {len(manual_sequence)}")
        if 'text_tensor' in locals():
            print(f"- text_tensor shape: {text_tensor.shape}")
        raise

# Test with earlier checkpoint
print("Loading model configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("Initializing model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("Loading earlier checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_144000.pth", net_g, None)

print("Generating audio...")
audio_norm, sr = fixed_inference("hallo", hps, net_g)

# Save and play
write("fixed_output.wav", sr, audio_norm)
print("\nPlaying output:")
subprocess.run(['aplay', 'fixed_output.wav'])

Loading model configuration...
Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Loading earlier checkpoint...
Generating audio...
Full phoneme sequence: [0, 176, 0, 171, 0, 180, 0, 183, 0]
Tensor shape: torch.Size([9])

Playing output:


Playing WAVE 'fixed_output.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'fixed_output.wav'], returncode=0)

In [40]:
def basic_inference(text, hps, net_g):
    """Minimal inference focusing on core VITS functionality"""
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Use absolute minimal phoneme sequence
        sequence = [
            176,  # h
            171,  # a
            180,  # l
            180,  # l
            183   # o
        ]
        
        # 2. Convert to tensor without any additional tokens
        text_tensor = torch.LongTensor(sequence)
        print("Initial tensor:", text_tensor)
        
        # 3. Only add blanks if explicitly required
        if hps.data.add_blank:
            blanks = commons.intersperse(sequence, 0)  # Apply to list
            text_tensor = torch.LongTensor(blanks)  # Convert back to tensor
        
        print(f"Final sequence: {text_tensor}")
        
        # 4. Basic input preparation
        x_tst = text_tensor.unsqueeze(0).cuda()
        x_tst_lengths = torch.LongTensor([text_tensor.size(0)]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        # 5. Minimal inference parameters
        audio = net_g.infer(x_tst, x_tst_lengths, sid,
                          noise_scale=0.0,        # No noise
                          noise_scale_w=0.0,      # No noise warmup
                          length_scale=1.0)[0][0,0].data.cpu().float().numpy()
        
        # 6. Basic normalization only
        audio = audio.astype(np.float32)
        audio_norm = audio / (np.abs(audio).max() + 1e-6)
        
        return audio_norm, hps.data.sampling_rate
        
    except Exception as e:
        print(f"Error: {str(e)}")
        print("Debug info:")
        if 'text_tensor' in locals():
            print(f"text_tensor type: {type(text_tensor)}")
        raise

# Test with minimal setup
print("Loading configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("Initializing model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

print("Loading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

print("Generating audio...")
audio_norm, sr = basic_inference("hallo", hps, net_g)

# Save without any post-processing
write("basic_output.wav", sr, audio_norm)
print("\nPlaying output:")
subprocess.run(['aplay', 'basic_output.wav'])

Loading configuration...
Initializing model...
Mutli-band iSTFT VITS
Loading checkpoint...
Generating audio...
Initial tensor: tensor([176, 171, 180, 180, 183])
Final sequence: tensor([  0, 176,   0, 171,   0, 180,   0, 180,   0, 183,   0])

Playing output:


Playing WAVE 'basic_output.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'basic_output.wav'], returncode=0)

In [41]:
def minimal_inference(text, hps, net_g):
    """Bare minimum inference attempt"""
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Single phoneme test
        sequence = [176]  # Just the 'h' sound
        
        # 2. Direct tensor conversion
        text_tensor = torch.LongTensor([sequence]).cuda()  # Add batch dimension directly
        lengths = torch.LongTensor([1]).cuda()
        sid = torch.LongTensor([0]).cuda()
        
        print("Test tensor shape:", text_tensor.shape)
        
        # 3. Minimal inference
        with torch.no_grad():
            audio = net_g.infer(text_tensor, lengths, sid,
                              noise_scale=0.0,      
                              noise_scale_w=0.0,    
                              length_scale=1.0)[0][0,0].data.cpu().float().numpy()
        
        # 4. Basic normalization
        audio = audio.astype(np.float32)
        audio_norm = audio / (np.abs(audio).max() + 1e-6)
        
        return audio_norm, hps.data.sampling_rate
        
    except Exception as e:
        print(f"Error: {str(e)}")
        raise

print("Testing with minimal configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("Initializing fresh model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("Loading earliest checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

print("Attempting single phoneme synthesis...")
audio_norm, sr = minimal_inference("h", hps, net_g)

write("minimal_output.wav", sr, audio_norm)
print("\nPlaying output:")
subprocess.run(['aplay', 'minimal_output.wav'])

Testing with minimal configuration...
Initializing fresh model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Loading earliest checkpoint...
Attempting single phoneme synthesis...
Test tensor shape: torch.Size([1, 1])

Playing output:


Playing WAVE 'minimal_output.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'minimal_output.wav'], returncode=0)

In [42]:
def verify_single_phoneme():
    """Test synthesizer with absolute minimal input"""
    
    # 1. Model verification
    net_g.eval()
    torch.cuda.empty_cache()
    
    # 2. Verify Dutch phoneme is valid
    h_phoneme = 176  # Dutch 'h'
    assert h_phoneme in range(len(symbols)), f"Invalid phoneme ID {h_phoneme}"
    print(f"Testing phoneme: {symbols[h_phoneme]}")
    
    # 3. Create minimal tensors
    x = torch.LongTensor([[h_phoneme]]).cuda()  # Shape: [1, 1]
    lengths = torch.LongTensor([1]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    print(f"Input shapes: x={x.shape}, lengths={lengths.shape}, sid={sid.shape}")
    
    try:
        # 4. Run inference with zero noise
        with torch.no_grad():
            audio = net_g.infer(x, lengths, sid,
                              noise_scale=0.0,
                              noise_scale_w=0.0,
                              length_scale=0.5)[0][0,0].data.cpu().numpy()
        
        # 5. Basic audio checks
        print(f"Audio shape: {audio.shape}")
        print(f"Audio range: [{audio.min():.3f}, {audio.max():.3f}]")
        if np.any(np.isnan(audio)):
            raise ValueError("NaN values in audio output")
            
        # 6. Minimal normalization
        audio = audio.astype(np.float32)
        audio_norm = audio / (np.abs(audio).max() + 1e-6)
        
        return audio_norm, hps.data.sampling_rate
        
    except Exception as e:
        print(f"Inference failed: {str(e)}")
        raise

# Test with fresh model
print("Loading fresh config...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("Initializing model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()

print("Loading earliest checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

print("Testing single phoneme...")
audio_norm, sr = verify_single_phoneme()

# Save raw output
write("phoneme_test.wav", sr, audio_norm)
print("\nPlaying test output:")
subprocess.run(['aplay', 'phoneme_test.wav'])

Loading fresh config...
Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)


Loading earliest checkpoint...
Testing single phoneme...
Testing phoneme: h
Input shapes: x=torch.Size([1, 1]), lengths=torch.Size([1]), sid=torch.Size([1])
Audio shape: (256,)
Audio range: [-0.195, 0.098]

Playing test output:


Playing WAVE 'phoneme_test.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'phoneme_test.wav'], returncode=0)

In [44]:
def diagnostic_inference():
    """Diagnostic test with model verification"""
    
    # 1. Reset CUDA and model state
    torch.cuda.empty_cache()
    net_g.eval()
    
    # 2. Create silence input
    silence = torch.zeros(1, 1).long().cuda()  # Single zero token
    lengths = torch.LongTensor([1]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    print("Testing silence generation...")
    print(f"Input shapes: silence={silence.shape}, lengths={lengths.shape}")
    
    try:
        # 3. Generate silence first
        with torch.no_grad():
            silence_audio = net_g.infer(
                silence, 
                lengths, 
                sid,
                noise_scale=0.0,
                noise_scale_w=0.0,
                length_scale=1.0
            )[0][0,0].data.cpu().numpy()
        
        print("\nSilence output stats:")
        print(f"Shape: {silence_audio.shape}")
        print(f"Range: [{silence_audio.min():.3f}, {silence_audio.max():.3f}]")
        print(f"Mean: {silence_audio.mean():.3f}")
        print(f"Std: {silence_audio.std():.3f}")
        
        # 4. Test different input lengths
        for length in [1, 2, 4]:
            print(f"\nTesting length {length}...")
            x = torch.ones(1, length).long().cuda() * 176  # Repeated 'h' sound
            lengths = torch.LongTensor([length]).cuda()
            
            audio = net_g.infer(
                x, 
                lengths, 
                sid,
                noise_scale=0.0,
                noise_scale_w=0.0,
                length_scale=1.0
            )[0][0,0].data.cpu().numpy()
            
            print(f"Output shape: {audio.shape}")
            print(f"Range: [{audio.min():.3f}, {audio.max():.3f}]")
            
            # Save each test
            audio_norm = audio.astype(np.float32) / (np.abs(audio).max() + 1e-6)
            write(f"diagnostic_{length}.wav", hps.data.sampling_rate, audio_norm)
        
        return True
            
    except Exception as e:
        print(f"Diagnostic failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Fresh model initialization
print("Loading fresh configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("\nInitializing clean model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading earliest checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run diagnostics
print("\nRunning model diagnostics...")
if diagnostic_inference():
    print("\nDiagnostic outputs:")
    for i in range(1, 5):
        try:
            print(f"\nPlaying diagnostic_{i}.wav:")
            subprocess.run(['aplay', f'diagnostic_{i}.wav'])
        except:
            pass

Loading fresh configuration...

Initializing clean model...
Mutli-band iSTFT VITS

Loading earliest checkpoint...

Running model diagnostics...
Testing silence generation...
Input shapes: silence=torch.Size([1, 1]), lengths=torch.Size([1])

Silence output stats:
Shape: (256,)
Range: [-0.195, 0.098]
Mean: 0.001
Std: 0.032

Testing length 1...
Output shape: (256,)
Range: [-0.195, 0.098]

Testing length 2...
Output shape: (1024,)
Range: [-0.520, 0.390]

Testing length 4...
Output shape: (2304,)
Range: [-0.568, 0.367]

Diagnostic outputs:

Playing diagnostic_1.wav:


Playing WAVE 'diagnostic_1.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing diagnostic_2.wav:


Playing WAVE 'diagnostic_2.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing diagnostic_3.wav:

Playing diagnostic_4.wav:


diagnostic_3.wav: No such file or directory
Playing WAVE 'diagnostic_4.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [46]:
def test_model_components():
    """Test individual model components"""
    print("Testing model components...")
    
    # 1. Reset model state
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 2. Create minimal test input
        test_input = torch.LongTensor([[176]]).cuda()  # Single phoneme
        test_lengths = torch.LongTensor([1]).cuda()
        test_sid = torch.LongTensor([0]).cuda()
        
        print("\nTest input shapes:")
        print(f"Input: {test_input.shape}")
        print(f"Lengths: {test_lengths.shape}")
        print(f"Speaker ID: {test_sid.shape}")
        
        # 3. Check model parameters
        total_params = sum(p.numel() for p in net_g.parameters())
        trainable_params = sum(p.numel() for p in net_g.parameters() if p.requires_grad)
        
        print("\nModel parameters:")
        print(f"Total: {total_params:,}")
        print(f"Trainable: {trainable_params:,}")
        
        # 4. List model attributes
        print("\nModel structure:")
        for name, module in net_g.named_children():
            print(f"- {name}: {type(module).__name__}")
            
        # 5. Test direct inference
        with torch.no_grad():
            output = net_g.infer(
                test_input,
                test_lengths,
                test_sid,
                noise_scale=0.0,
                noise_scale_w=0.0,
                length_scale=1.0
            )
            
        print("\nOutput details:")
        print(f"Type: {type(output)}")
        print(f"Length: {len(output)}")
        if isinstance(output, tuple):
            print("Tuple contents:")
            for i, item in enumerate(output):
                print(f"Item {i} shape: {item.shape}")
                
        return "Component test completed"
            
    except Exception as e:
        print(f"\nComponent test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Test with fresh model
print("Loading fresh config...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("\nInitializing clean model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading earliest checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run component test
result = test_model_components()
if result:
    print(f"\nTest result: {result}")

Loading fresh config...

Initializing clean model...
Mutli-band iSTFT VITS

Loading earliest checkpoint...
Testing model components...

Test input shapes:
Input: torch.Size([1, 1])
Lengths: torch.Size([1])
Speaker ID: torch.Size([1])

Model parameters:
Total: 34,731,985
Trainable: 34,731,985

Model structure:
- enc_p: TextEncoder
- dec: Multiband_iSTFT_Generator
- enc_q: PosteriorEncoder
- flow: ResidualCouplingBlock
- dp: DurationPredictor

Output details:
Type: <class 'tuple'>
Length: 5
Tuple contents:
Item 0 shape: torch.Size([1, 1, 256])
Item 1 shape: torch.Size([1, 4, 64])
Item 2 shape: torch.Size([1, 1, 1, 1])
Item 3 shape: torch.Size([1, 1, 1])

Component test failed: 'tuple' object has no attribute 'shape'


Traceback (most recent call last):
  File "/tmp/ipykernel_1865464/1391733123.py", line 50, in test_model_components
    print(f"Item {i} shape: {item.shape}")
AttributeError: 'tuple' object has no attribute 'shape'


In [47]:
def analyze_model_output():
    """Analyze model output structure"""
    print("Testing model output...")
    
    # Create minimal input
    x = torch.LongTensor([[176]]).cuda()  # Single phoneme
    lengths = torch.LongTensor([1]).cuda()
    sid = torch.LongTensor([0]).cuda()
    
    with torch.no_grad():
        # Run inference
        outputs = net_g.infer(x, lengths, sid,
                            noise_scale=0.0,
                            noise_scale_w=0.0,
                            length_scale=1.0)
        
        # Extract audio from output tuple
        audio = outputs[0][0,0].data.cpu().float().numpy()
        
        # Get all output shapes
        print("\nOutput structure:")
        for i, out in enumerate(outputs):
            if isinstance(out, torch.Tensor):
                print(f"Output {i}: shape={out.shape}")
            elif isinstance(out, tuple):
                print(f"Output {i} (tuple):")
                for j, subout in enumerate(out):
                    if hasattr(subout, 'shape'):
                        print(f"  {j}: shape={subout.shape}")
                    else:
                        print(f"  {j}: type={type(subout)}")
            else:
                print(f"Output {i}: type={type(out)}")
        
        # Analyze audio
        print("\nAudio statistics:")
        print(f"Shape: {audio.shape}")
        print(f"Range: [{audio.min():.3f}, {audio.max():.3f}]")
        print(f"Mean: {audio.mean():.3f}")
        print(f"Std: {audio.std():.3f}")
        
        return audio

# Test with fresh model
print("Loading configuration...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")

print("\nInitializing model...")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run analysis
print("\nAnalyzing model output...")
audio = analyze_model_output()

# Save for inspection
write("analysis_output.wav", hps.data.sampling_rate, audio)
print("\nPlaying analysis output:")
subprocess.run(['aplay', 'analysis_output.wav'])

Loading configuration...

Initializing model...
Mutli-band iSTFT VITS

Loading checkpoint...

Analyzing model output...
Testing model output...

Output structure:
Output 0: shape=torch.Size([1, 1, 256])
Output 1: shape=torch.Size([1, 4, 64])
Output 2: shape=torch.Size([1, 1, 1, 1])
Output 3: shape=torch.Size([1, 1, 1])
Output 4 (tuple):
  0: shape=torch.Size([1, 192, 1])
  1: shape=torch.Size([1, 192, 1])
  2: shape=torch.Size([1, 192, 1])
  3: shape=torch.Size([1, 192, 1])

Audio statistics:
Shape: (256,)
Range: [-0.195, 0.098]
Mean: 0.001
Std: 0.032

Playing analysis output:


Playing WAVE 'analysis_output.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


CompletedProcess(args=['aplay', 'analysis_output.wav'], returncode=0)

In [49]:
def test_decoder_components():
    """Test MB-iSTFT decoder directly"""
    print("Testing MB-iSTFT decoder...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Create test signal with correct dimensions
        batch_size = 1
        n_frames = 32  # Short test duration
        hidden_size = 192  # Hidden dimension
        
        # Create input in correct shape for conv1d: [batch, channels, time]
        test_spec = torch.zeros(batch_size, hidden_size, n_frames).cuda()
        
        print("\nTest input shape:", test_spec.shape)
        
        # 2. Run through decoder only
        with torch.no_grad():
            # Access decoder directly
            audio = net_g.dec(test_spec)[0].cpu().float().numpy()
        
        print("\nDecoder output:")
        print(f"Shape: {audio.shape}")
        print(f"Range: [{audio.min():.3f}, {audio.max():.3f}]")
        print(f"Mean: {audio.mean():.3f}")
        print(f"Std: {audio.std():.3f}")
        
        # 3. Try with sine wave input
        freq = 440  # Hz
        time = np.linspace(0, 1, n_frames)
        sine_wave = np.sin(2 * np.pi * freq * time)
        
        # Replicate sine wave across hidden dimension
        sine_spec = torch.zeros(batch_size, hidden_size, n_frames).cuda()
        sine_spec[:, :, :] = torch.FloatTensor(sine_wave).cuda()
        
        with torch.no_grad():
            audio_sine = net_g.dec(sine_spec)[0].cpu().float().numpy()
            
        print("\nSine wave output:")
        print(f"Shape: {audio_sine.shape}")
        print(f"Range: [{audio_sine.min():.3f}, {audio_sine.max():.3f}]")
        
        # 4. Save both for comparison
        # Normalize outputs
        silence_norm = audio / (np.abs(audio).max() + 1e-6)
        sine_norm = audio_sine / (np.abs(audio_sine).max() + 1e-6)
        
        write("decoder_silence.wav", hps.data.sampling_rate, silence_norm)
        write("decoder_sine.wav", hps.data.sampling_rate, sine_norm)
        
        return True
        
    except Exception as e:
        print(f"\nDecoder test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test decoder
print("\nInitializing fresh model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run decoder test
print("\nTesting decoder...")
if test_decoder_components():
    print("\nPlaying outputs:")
    print("\nSilence test:")
    subprocess.run(['aplay', 'decoder_silence.wav'])
    print("\nSine wave test:")
    subprocess.run(['aplay', 'decoder_sine.wav'])


Initializing fresh model...
Mutli-band iSTFT VITS

Loading checkpoint...

Testing decoder...
Testing MB-iSTFT decoder...

Test input shape: torch.Size([1, 192, 32])

Decoder output:
Shape: (1, 1, 8192)
Range: [-0.686, 0.390]
Mean: -0.003
Std: 0.075

Sine wave output:
Shape: (1, 1, 8192)
Range: [-0.772, 0.709]

Playing outputs:

Silence test:


Playing WAVE 'decoder_silence.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Sine wave test:


Playing WAVE 'decoder_sine.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [51]:
def test_istft_decoder():
    """Test MB-iSTFT decoder with proper frequency band handling"""
    print("Testing MB-iSTFT decoder with frequency bands...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Configuration
        batch_size = 1
        n_bands = 4
        time_steps = 64  # Longer duration for better frequency resolution
        band_size = 48   # 192/4 per band
        
        # 2. Create frequency-specific test signals
        freqs = [100, 500, 1000, 2000]  # Test frequencies for each band
        specs = []
        
        for i in range(n_bands):
            # Initialize empty spectrogram for this band
            band_spec = torch.zeros(batch_size, band_size, time_steps).cuda()
            
            # Generate frequency-specific content
            if i < len(freqs):
                t = np.linspace(0, 1, time_steps)
                freq = freqs[i]
                signal = 0.5 * np.sin(2 * np.pi * freq * t)
                
                # Apply band-specific envelope
                envelope = np.hanning(time_steps)
                signal *= envelope
                
                # Place signal in appropriate frequency band
                band_spec[0, :band_size//4, :] = torch.FloatTensor(signal).cuda().unsqueeze(0).repeat(band_size//4, 1)
            
            specs.append(band_spec)
        
        # 3. Combine bands with proper scaling
        test_spec = torch.cat(specs, dim=1)
        test_spec = test_spec * 0.1  # Reduce overall amplitude
        
        print(f"\nInput shape: {test_spec.shape}")
        
        # 4. Process through decoder
        with torch.no_grad():
            audio = net_g.dec(test_spec)[0].cpu().float().numpy()
            
            # Extract and process audio
            audio = audio[0, 0]  # Remove batch dims
            
            # Apply fade in/out
            fade_len = int(0.01 * hps.data.sampling_rate)  # 10ms fade
            fade = np.hanning(2 * fade_len)
            audio[:fade_len] *= fade[:fade_len]
            audio[-fade_len:] *= fade[fade_len:]
            
            # Normalize conservatively
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.7 / (np.abs(audio).max() + 1e-6)
            
            # Save output
            write("istft_test.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nDecoder test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_istft_decoder():
    print("\nPlaying test output:")
    subprocess.run(['aplay', 'istft_test.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing MB-iSTFT decoder with frequency bands...

Input shape: torch.Size([1, 192, 64])

Playing test output:


Playing WAVE 'istft_test.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [52]:
def test_istft_decoder_fixed():
    """Test MB-iSTFT decoder with proper band separation"""
    print("Testing MB-iSTFT decoder with band separation...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Configuration
        batch_size = 1
        n_bands = 4  # MB-iSTFT uses 4 frequency bands
        time_steps = 128  # Longer for better resolution
        band_size = 48   # 192/4 per band

        # 2. Create band-specific test signals with proper frequency ranges
        # Each band covers specific frequency ranges:
        band_configs = [
            {"freq": 100, "width": 8},    # Low frequencies
            {"freq": 500, "width": 12},   # Mid-low
            {"freq": 1500, "width": 16},  # Mid-high
            {"freq": 3000, "width": 20}   # High frequencies
        ]
        
        specs = []
        for i, config in enumerate(band_configs):
            # Create empty spectrogram for this band
            band_spec = torch.zeros(batch_size, band_size, time_steps).cuda()
            
            # Generate band-limited signal
            t = np.linspace(0, 2, time_steps)  # Longer duration
            freq = config["freq"]
            width = config["width"]
            
            # Create windowed sinusoid
            signal = np.sin(2 * np.pi * freq * t)
            window = np.hanning(time_steps)
            signal *= window
            
            # Apply band-specific processing
            signal_tensor = torch.FloatTensor(signal).cuda()
            
            # Place signal only in appropriate frequency bins
            start_bin = 0
            end_bin = width
            band_spec[0, start_bin:end_bin, :] = signal_tensor.unsqueeze(0).repeat(width, 1)
            
            specs.append(band_spec)
        
        # 3. Combine bands with careful scaling
        test_spec = torch.cat(specs, dim=1)
        test_spec = test_spec * 0.05  # Reduce amplitude further
        
        print(f"\nInput shape: {test_spec.shape}")
        
        # 4. Process through decoder with safeguards
        with torch.no_grad():
            # Apply decoder
            audio = net_g.dec(test_spec)[0].cpu().float().numpy()
            audio = audio[0, 0]  # Remove batch dims
            
            # Apply longer fade in/out
            fade_len = int(0.05 * hps.data.sampling_rate)  # 50ms fade
            fade = np.hanning(2 * fade_len)
            audio[:fade_len] *= fade[:fade_len]
            audio[-fade_len:] *= fade[fade_len:]
            
            # Apply bandpass filtering
            from scipy.signal import butter, filtfilt
            nyq = 0.5 * hps.data.sampling_rate
            low = 50.0 / nyq   # Cut below 50 Hz
            high = 7000.0 / nyq # Cut above 7 kHz
            b, a = butter(4, [low, high], btype='band')
            audio = filtfilt(b, a, audio)
            
            # Very conservative normalization
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.4 / (np.abs(audio).max() + 1e-6)
            
            # Save output
            write("istft_test_fixed.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nDecoder test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_istft_decoder_fixed():
    print("\nPlaying test output:")
    subprocess.run(['aplay', 'istft_test_fixed.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing MB-iSTFT decoder with band separation...

Input shape: torch.Size([1, 192, 128])

Playing test output:


Playing WAVE 'istft_test_fixed.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [56]:
def test_istft_decoder_minimal():
    """Test MB-iSTFT decoder with minimal configuration"""
    print("Testing MB-iSTFT decoder with minimal settings...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Configuration - shorter duration
        batch_size = 1
        n_bands = 4
        time_steps = 32  # Reduced from 128
        band_size = 48   # 192/4 per band

        # 2. Simplified band configuration - single frequency per band
        band_configs = [
            {"freq": 200, "width": 4},    # Low band only
            {"freq": 0, "width": 0},      # Mid-low silent
            {"freq": 0, "width": 0},      # Mid-high silent
            {"freq": 0, "width": 0}       # High silent
        ]
        
        specs = []
        for config in band_configs:
            # Create empty band
            band_spec = torch.zeros(batch_size, band_size, time_steps).cuda()
            
            if config["freq"] > 0:
                # Generate simple sine wave
                t = np.linspace(0, 0.5, time_steps)  # Shorter duration
                freq = config["freq"]
                signal = np.sin(2 * np.pi * freq * t)
                
                # Apply gentle envelope
                envelope = np.sin(np.pi * t / t[-1])**2
                signal *= envelope
                
                # Place in first few bins only
                signal_tensor = torch.FloatTensor(signal).cuda()
                band_spec[0, :config["width"], :] = signal_tensor.unsqueeze(0).repeat(config["width"], 1)
            
            specs.append(band_spec)
        
        # 3. Combine with minimal amplitude
        test_spec = torch.cat(specs, dim=1)
        test_spec = test_spec * 0.01  # Very low amplitude
        
        print(f"\nInput shape: {test_spec.shape}")
        
        # 4. Decode with safeguards
        with torch.no_grad():
            audio = net_g.dec(test_spec)[0].cpu().float().numpy()
            audio = audio[0, 0]
            
            # Short fade
            fade_len = int(0.005 * hps.data.sampling_rate)  # 5ms fade
            fade = np.hanning(2 * fade_len)
            audio[:fade_len] *= fade[:fade_len]
            audio[-fade_len:] *= fade[fade_len:]
            
            # Tight bandpass around test frequency
            from scipy.signal import butter, filtfilt
            nyq = 0.5 * hps.data.sampling_rate
            low = 150.0 / nyq   # Just below test frequency
            high = 250.0 / nyq  # Just above test frequency
            b, a = butter(4, [low, high], btype='band')
            audio = filtfilt(b, a, audio)
            
            # Very conservative normalization
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.2 / (np.abs(audio).max() + 1e-6)
            
            write("istft_minimal.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nDecoder test failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_istft_decoder_minimal():
    print("\nPlaying test output:")
    subprocess.run(['aplay', 'istft_minimal.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing MB-iSTFT decoder with minimal settings...

Input shape: torch.Size([1, 192, 32])

Playing test output:


Playing WAVE 'istft_minimal.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [65]:
def test_pure_tone():
    """Test MB-iSTFT decoder with pure sine wave"""
    print("Testing pure tone synthesis...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Configuration
        batch_size = 1
        time_steps = 16   # Very short duration
        total_bands = 192 # Total frequency bins
        
        # 2. Create pure sine wave input
        # Single frequency in middle range
        freq = 440  # A4 note
        t = np.linspace(0, 0.2, time_steps)  # 200ms duration
        
        # Generate clean sine with smooth envelope
        sine = np.sin(2 * np.pi * freq * t)
        envelope = np.sin(np.pi * t / t[-1]) # Smooth rise and fall
        signal = sine * envelope
        
        # 3. Create spectrogram tensor
        spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
        
        # Place sine wave only in middle frequency band
        mid_band = total_bands // 2
        spec[0, mid_band:mid_band+1, :] = torch.FloatTensor(signal).cuda().unsqueeze(0)
        
        # Scale down amplitude significantly
        spec = spec * 0.005  # Very low amplitude
        
        print(f"\nInput shape: {spec.shape}")
        
        # 4. Decode with minimal processing
        with torch.no_grad():
            audio = net_g.dec(spec)[0].cpu().float().numpy()
            audio = audio[0, 0]
            
            # Very short fade
            fade_len = int(0.002 * hps.data.sampling_rate)  # 2ms fade
            fade = np.hanning(2 * fade_len)
            audio[:fade_len] *= fade[:fade_len]
            audio[-fade_len:] *= fade[fade_len:]
            
            # Tight bandpass around A4
            from scipy.signal import butter, filtfilt
            nyq = 0.5 * hps.data.sampling_rate
            low = 430.0 / nyq   # Just below A4
            high = 450.0 / nyq  # Just above A4
            b, a = butter(2, [low, high], btype='band')
            audio = filtfilt(b, a, audio)
            
            # Minimal normalization
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.1 / (np.abs(audio).max() + 1e-6)
            
            write("pure_tone.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nTone synthesis failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_pure_tone():
    print("\nPlaying pure tone test:")
    subprocess.run(['aplay', 'pure_tone.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing pure tone synthesis...

Input shape: torch.Size([1, 192, 16])

Playing pure tone test:


Playing WAVE 'pure_tone.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [64]:
def test_minimal_tone():
    """Test MB-iSTFT decoder with absolute minimal sine wave"""
    print("Testing minimal tone synthesis...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Minimal configuration
        batch_size = 1
        time_steps = 8     # Ultra short duration
        total_bands = 192  # Total frequency bins
        
        # 2. Create simple sine wave
        freq = 220  # Lower A3 note - may be less prone to artifacts
        t = np.linspace(0, 0.1, time_steps)  # 100ms duration
        
        # Pure sine without envelope
        signal = np.sin(2 * np.pi * freq * t)
        
        # 3. Create minimal spectrogram
        spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
        
        # Place sine in single frequency bin
        target_bin = total_bands // 4  # Use lower frequency region
        spec[0, target_bin, :] = torch.FloatTensor(signal).cuda()
        
        # Minimal amplitude
        spec = spec * 0.001  # Very very low amplitude
        
        print(f"\nInput shape: {spec.shape}")
        
        # 4. Direct decoder pass
        with torch.no_grad():
            audio = net_g.dec(spec)[0].cpu().float().numpy()
            audio = audio[0, 0]
            
            # No fades, no filtering - just normalization
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.05 / (np.abs(audio).max() + 1e-6)
            
            write("minimal_tone.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nTone synthesis failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_minimal_tone():
    print("\nPlaying minimal tone:")
    subprocess.run(['aplay', 'minimal_tone.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing minimal tone synthesis...

Input shape: torch.Size([1, 192, 8])

Playing minimal tone:


Playing WAVE 'minimal_tone.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [66]:
def test_clean_tone():
    """Test decoder with cleaner signal path"""
    print("Testing clean tone synthesis...")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Configuration
        batch_size = 1
        time_steps = 32    # Longer for stability
        total_bands = 192  # Total frequency bins
        
        # 2. Create smoother test signal
        freq = 440  # A4 note
        t = np.linspace(0, 0.5, time_steps)  # 500ms duration
        
        # Use raised cosine envelope for smoother transitions
        envelope = 0.5 * (1 - np.cos(2 * np.pi * t / t[-1]))
        carrier = np.sin(2 * np.pi * freq * t)
        signal = carrier * envelope
        
        # 3. Create multi-band input with proper spacing
        spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
        
        # Place signal in multiple adjacent bins with Gaussian weighting
        center_bin = total_bands // 3  # Lower in spectrum
        width = 3  # Number of bins to spread across
        
        for i in range(-width, width+1):
            bin_weight = np.exp(-0.5 * (i/width)**2)  # Gaussian window
            target_bin = center_bin + i
            if 0 <= target_bin < total_bands:
                spec[0, target_bin, :] = torch.FloatTensor(signal * bin_weight).cuda()
        
        # Very low amplitude to avoid nonlinearities
        spec = spec * 0.001
        
        print(f"\nInput shape: {spec.shape}")
        
        # 4. Process through decoder with minimal interference
        with torch.no_grad():
            audio = net_g.dec(spec)[0].cpu().float().numpy()
            audio = audio[0, 0]
            
            # Gentle fade
            fade_len = int(0.010 * hps.data.sampling_rate)  # 10ms fade
            fade = np.hanning(2 * fade_len)
            audio[:fade_len] *= fade[:fade_len]
            audio[-fade_len:] *= fade[fade_len:]
            
            # Mild bandpass to clean up edges
            from scipy.signal import butter, filtfilt
            nyq = 0.5 * hps.data.sampling_rate
            low = 400.0 / nyq   # Wider passband
            high = 480.0 / nyq
            b, a = butter(1, [low, high], btype='band')  # First order filter
            audio = filtfilt(b, a, audio)
            
            # Conservative normalization
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.05 / (np.abs(audio).max() + 1e-6)
            
            write("clean_tone.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nTone synthesis failed: {str(e)}")
        import traceback
        traceback.print_exc()
        return False

# Test with fresh model
print("\nInitializing model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

print("\nLoading checkpoint...")
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run test
if test_clean_tone():
    print("\nPlaying clean tone test:")
    subprocess.run(['aplay', 'clean_tone.wav'])


Initializing model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



Loading checkpoint...
Testing clean tone synthesis...

Input shape: torch.Size([1, 192, 32])

Playing clean tone test:


Playing WAVE 'clean_tone.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [70]:
def test_single_frequency():
    """Test basic tone generation with very clear expectations"""
    
    print("\nTEST EXPECTATIONS:")
    print("- You should hear a clean, short beep around 200 Hz (low pitch)")
    print("- Duration: 0.1 seconds")
    print("- No distortion or artifacts")
    print("- Similar to a pure sine wave")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # 1. Ultra minimal configuration
        batch_size = 1
        time_steps = 4     # Very short
        total_bands = 192  # Total bins
        
        # 2. Generate simple 200 Hz tone
        freq = 200  # Low frequency = less likely to create artifacts
        t = np.linspace(0, 0.1, time_steps)
        signal = np.sin(2 * np.pi * freq * t)
        
        # 3. Place in spectrogram - single bin only
        spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
        target_bin = total_bands // 8  # Very low frequency region
        spec[0, target_bin, :] = torch.FloatTensor(signal).cuda()
        
        # Extremely low amplitude
        spec = spec * 0.0001  # Further reduced
        
        print("\nGenerating tone...")
        
        # 4. Direct decoder pass - minimal processing
        with torch.no_grad():
            audio = net_g.dec(spec)[0].cpu().float().numpy()
            audio = audio[0, 0]
            
            # Basic normalization only
            audio = audio.astype(np.float32)
            audio_norm = audio * 0.1 / (np.abs(audio).max() + 1e-6)
            
            write("single_freq_test.wav", hps.data.sampling_rate, audio_norm)
            return True
            
    except Exception as e:
        print(f"\nTest failed: {str(e)}")
        return False

# Initialize
print("Loading fresh model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()

# Test
if test_single_frequency():
    print("\nPlaying test tone - you should hear a short, clean beep:")
    subprocess.run(['aplay', 'single_freq_test.wav'])

Loading fresh model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



TEST EXPECTATIONS:
- You should hear a clean, short beep around 200 Hz (low pitch)
- Duration: 0.1 seconds
- No distortion or artifacts
- Similar to a pure sine wave

Generating tone...

Playing test tone - you should hear a short, clean beep:


Playing WAVE 'single_freq_test.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [73]:
import time

def test_frequency_sweep():
    """Test different frequencies for artifacts"""
    
    print("\nTEST EXPECTATIONS:")
    print("- You will hear 4 separate beeps")
    print("- Each beep will be at a different frequency:")
    print("  1. Very low (100 Hz) - low rumble")
    print("  2. Low (200 Hz) - clean tone")
    print("  3. Mid (400 Hz) - medium pitch")
    print("  4. High (800 Hz) - high pitch")
    print("Listen for which ones have the walkie-talkie effect")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # Test parameters
        batch_size = 1
        time_steps = 4     # Keep very short
        total_bands = 192
        frequencies = [100, 200, 400, 800]
        
        for i, freq in enumerate(frequencies):
            print(f"\nGenerating {freq} Hz tone...")
            
            # Generate sine wave
            t = np.linspace(0, 0.1, time_steps)
            signal = np.sin(2 * np.pi * freq * t)
            
            # Place in spectrogram
            spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
            target_bin = total_bands // 8  # Keep in low region
            spec[0, target_bin, :] = torch.FloatTensor(signal).cuda()
            
            # Very low amplitude
            spec = spec * 0.0001
            
            # Generate audio
            with torch.no_grad():
                audio = net_g.dec(spec)[0].cpu().float().numpy()
                audio = audio[0, 0]
                
                # Basic normalization
                audio = audio.astype(np.float32)
                audio_norm = audio * 0.1 / (np.abs(audio).max() + 1e-6)
                
                write(f"freq_test_{freq}hz.wav", hps.data.sampling_rate, audio_norm)
        
        return True
        
    except Exception as e:
        print(f"\nTest failed: {str(e)}")
        return False

# Initialize model
print("Loading model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run frequency sweep test
if test_frequency_sweep():
    print("\nPlaying frequency sweep test:")
    for freq in [100, 200, 400, 800]:
        print(f"\nPlaying {freq} Hz tone:")
        subprocess.run(['aplay', f'freq_test_{freq}hz.wav'])
        time.sleep(1)  # Pause between tones

Loading model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



TEST EXPECTATIONS:
- You will hear 4 separate beeps
- Each beep will be at a different frequency:
  1. Very low (100 Hz) - low rumble
  2. Low (200 Hz) - clean tone
  3. Mid (400 Hz) - medium pitch
  4. High (800 Hz) - high pitch
Listen for which ones have the walkie-talkie effect

Generating 100 Hz tone...

Generating 200 Hz tone...

Generating 400 Hz tone...

Generating 800 Hz tone...

Playing frequency sweep test:

Playing 100 Hz tone:


Playing WAVE 'freq_test_100hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 200 Hz tone:


Playing WAVE 'freq_test_200hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 400 Hz tone:


Playing WAVE 'freq_test_400hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 800 Hz tone:


Playing WAVE 'freq_test_800hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [74]:
def test_frequency_sweep():
    """Test different frequencies for artifacts"""
    
    print("\nTEST EXPECTATIONS:")
    print("- You will hear 4 separate beeps")
    print("- Each beep will be at a different frequency:")
    print("  1. Very low (100 Hz) - low rumble")
    print("  2. Low (200 Hz) - clean tone")
    print("  3. Mid (400 Hz) - medium pitch")
    print("  4. High (800 Hz) - high pitch")
    print("Listen for which ones have the walkie-talkie effect")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # Test parameters
        batch_size = 1
        time_steps = 4     # Keep very short
        total_bands = 192
        frequencies = [100, 200, 400, 800]
        
        for i, freq in enumerate(frequencies):
            print(f"\nGenerating {freq} Hz tone...")
            
            # Generate sine wave
            t = np.linspace(0, 0.1, time_steps)
            signal = np.sin(2 * np.pi * freq * t)
            
            # Place in spectrogram
            spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
            target_bin = total_bands // 8  # Keep in low region
            spec[0, target_bin, :] = torch.FloatTensor(signal).cuda()
            
            # Very low amplitude
            spec = spec * 0.0001
            
            # Generate audio
            with torch.no_grad():
                audio = net_g.dec(spec)[0].cpu().float().numpy()
                audio = audio[0, 0]
                
                # Basic normalization
                audio = audio.astype(np.float32)
                audio_norm = audio * 0.1 / (np.abs(audio).max() + 1e-6)
                
                write(f"freq_test_{freq}hz.wav", hps.data.sampling_rate, audio_norm)
        
        return True
        
    except Exception as e:
        print(f"\nTest failed: {str(e)}")
        return False

# Initialize model
print("Loading model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run frequency sweep test
if test_frequency_sweep():
    print("\nPlaying frequency sweep test:")
    for freq in [100, 200, 400, 800]:
        print(f"\nPlaying {freq} Hz tone:")
        subprocess.run(['aplay', f'freq_test_{freq}hz.wav'])
        time.sleep(0.5)  # Pause between tones

Loading model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



TEST EXPECTATIONS:
- You will hear 4 separate beeps
- Each beep will be at a different frequency:
  1. Very low (100 Hz) - low rumble
  2. Low (200 Hz) - clean tone
  3. Mid (400 Hz) - medium pitch
  4. High (800 Hz) - high pitch
Listen for which ones have the walkie-talkie effect

Generating 100 Hz tone...

Generating 200 Hz tone...

Generating 400 Hz tone...

Generating 800 Hz tone...

Playing frequency sweep test:

Playing 100 Hz tone:


Playing WAVE 'freq_test_100hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 200 Hz tone:


Playing WAVE 'freq_test_200hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 400 Hz tone:


Playing WAVE 'freq_test_400hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 800 Hz tone:


Playing WAVE 'freq_test_800hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono


In [None]:
def test_frequency_sweep_filtered():
    """Test different frequencies with enhanced filtering"""
    
    print("\nTEST EXPECTATIONS:")
    print("- You will hear 4 clean beeps")
    print("- Each beep should be cleaner than before")
    print("- Each frequency will be band-filtered")
    
    net_g.eval()
    torch.cuda.empty_cache()
    
    try:
        # Modified parameters
        batch_size = 1
        time_steps = 8     # Slightly longer for better frequency resolution
        total_bands = 192
        frequencies = [100, 200, 400, 800]
        
        for i, freq in enumerate(frequencies):
            print(f"\nGenerating {freq} Hz tone...")
            
            # Generate sine with envelope
            t = np.linspace(0, 0.2, time_steps)  # 200ms duration
            carrier = np.sin(2 * np.pi * freq * t)
            envelope = np.sin(np.pi * t / t[-1])**2  # Smoother envelope
            signal = carrier * envelope
            
            # Place in appropriate frequency bin
            spec = torch.zeros(batch_size, total_bands, time_steps).cuda()
            target_bin = int((freq / (hps.data.sampling_rate/2)) * (total_bands/4))
            target_bin = max(4, min(target_bin, total_bands-4))
            
            # Spread across multiple bins with Gaussian weighting
            width = 2
            for j in range(-width, width+1):
                bin_idx = target_bin + j
                if 0 <= bin_idx < total_bands:
                    weight = np.exp(-0.5 * (j/width)**2)
                    spec[0, bin_idx, :] = torch.FloatTensor(signal * weight).cuda()
            
            # Very low amplitude
            spec = spec * 0.00005  # Further reduced
            
            # Generate audio
            with torch.no_grad():
                audio = net_g.dec(spec)[0].cpu().float().numpy()
                audio = audio[0, 0]
                
                # Apply tight bandpass filter
                from scipy.signal import butter, filtfilt
                nyq = 0.5 * hps.data.sampling_rate
                low = 0.8 * freq / nyq   # Narrow band around target frequency
                high = 1.2 * freq / nyq
                b, a = butter(4, [low, high], btype='band')
                audio = filtfilt(b, a, audio)
                
                # Add gentle fade in/out
                fade_len = int(0.02 * hps.data.sampling_rate)  # 20ms fade
                fade = np.hanning(2 * fade_len)
                audio[:fade_len] *= fade[:fade_len]
                audio[-fade_len:] *= fade[fade_len:]
                
                # Conservative normalization
                audio = audio.astype(np.float32)
                audio_norm = audio * 0.05 / (np.abs(audio).max() + 1e-6)
                
                write(f"clean_freq_test_{freq}hz.wav", hps.data.sampling_rate, audio_norm)
        
        return True
        
    except Exception as e:
        print(f"\nTest failed: {str(e)}")
        return False

# Initialize model
print("Loading model...")
hps = utils.get_hparams_from_file("./configs/dutch_nl.json")
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model).cuda()
net_g.eval()
_ = utils.load_checkpoint("./logs/tts_nl0.0.1/G_100000.pth", net_g, None)

# Run filtered frequency sweep test
if test_frequency_sweep_filtered():
    print("\nPlaying filtered frequency sweep test:")
    for freq in [100, 200, 400, 800]:
        print(f"\nPlaying {freq} Hz tone:")
        subprocess.run(['aplay', f'clean_freq_test_{freq}hz.wav'])
        time.sleep(0.5)

Loading model...
Mutli-band iSTFT VITS


  WeightNorm.apply(module, name, dim)



TEST EXPECTATIONS:
- You will hear 4 clean beeps
- Each beep should be cleaner than before
- Each frequency will be band-filtered

Generating 100 Hz tone...

Generating 200 Hz tone...

Generating 400 Hz tone...

Generating 800 Hz tone...

Playing filtered frequency sweep test:

Playing 100 Hz tone:


Playing WAVE 'clean_freq_test_100hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 200 Hz tone:


Playing WAVE 'clean_freq_test_200hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 400 Hz tone:


Playing WAVE 'clean_freq_test_400hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono



Playing 800 Hz tone:


Playing WAVE 'clean_freq_test_800hz.wav' : Float 32 bit Little Endian, Rate 22050 Hz, Mono
