# Inference_manually_module

- https://docs.coqui.ai/en/latest/models/xtts.html
- Rename the ~1G .pth file to speaker_xtts.pth. This file is the speaker-embedding vector for the fine-tuned voice. XTTS uses this vector to adapt the model to a specific voice.
- rename one of the ~5.7G models to model.pth
- There is no need to set paths directly to the model and speaker-embedding files; just set the checkpoint directory. If vocab.json is in the same directory, vocab_path is not needed either.

In [1]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch
import torchaudio
import os
import re

  from .autonotebook import tqdm as notebook_tqdm


In [33]:

def _split_into_sentences(text: str) -> list:
    """Split *text* into sentences at '.', '!' or '?' followed by whitespace.

    Whitespace runs (including newlines and the indentation that comes with
    triple-quoted strings) are collapsed to single spaces first, so a
    sentence that ends at a line break is still split and no layout
    whitespace is passed on. Empty pieces are dropped.
    """
    normalized = " ".join(text.split())
    return [piece for piece in re.split(r"(?<=[.!?]) +", normalized) if piece]


def genAudioManual(text: str, checkpoint_dir: str, vocab_path: str, reference_wav: str,
                   output_path: str,
                   split_sentences: bool = True,
                   device: str = "cuda:0", temperature: float = 0.7,
                   language: str = "en",
) -> str:
    """Synthesize *text* with an XTTS checkpoint and save the result as audio.

    Follows the "inference without the TTS wrapper" flow: load the config,
    build the model from it, load the checkpoint, compute conditioning
    latents from a reference clip, run inference per sentence, then
    concatenate the pieces and write them with torchaudio.

    Args:
        text: Text to synthesize. Must contain non-whitespace characters.
        checkpoint_dir: Directory holding ``config.json`` and the checkpoint
            files read by ``model.load_checkpoint``.
        vocab_path: Path to the tokenizer vocabulary file.
        reference_wav: Audio file used to compute the conditioning latents.
        output_path: Destination audio file. Its parent directory is created
            if it does not exist.
        split_sentences: If True, synthesize sentence by sentence; otherwise
            pass *text* to the model in a single call.
        device: Torch device the model is moved to.
        temperature: Sampling temperature passed to ``model.inference``.
        language: Language code passed to ``model.inference``.

    Returns:
        *output_path*, unchanged.

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Validate before the (slow, multi-GB) model load so bad input fails fast.
    if not text or not text.strip():
        raise ValueError("text must contain at least one non-whitespace character")

    sentences = _split_into_sentences(text) if split_sentences else [text]

    # Load the config file in.
    print("Loading model...")
    cfg = XttsConfig()
    cfg.load_json(os.path.join(checkpoint_dir, "config.json"))

    # Init model using the config. No TTS wrapper, do as done in the xtts_demo.py
    model = Xtts.init_from_config(cfg)

    # Load from checkpoint. Here is where the model gets loaded in using the
    # base model / speaker embeddings learned.
    model.load_checkpoint(
        cfg,
        checkpoint_dir=checkpoint_dir,
        vocab_path=vocab_path,
        eval=True,
        strict=False,
        use_deepspeed=False,  # Need Deepspeed for this. Difficult on Windows...
    )

    # Move to the target device and set to eval mode.
    model.to(device).eval()

    print("Compute speaker latents...")

    # Notes carried over from tortoise.py: transforms one or more voice samples
    # into a tuple of conditioning latents. These are expressive learned
    # latents that encode aspects of the provided clips like voice,
    # intonation, and acoustic properties.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=[reference_wav],
        gpt_cond_len=cfg.gpt_cond_len,
        gpt_cond_chunk_len=cfg.gpt_cond_chunk_len,
        max_ref_length=cfg.max_ref_len,
    )

    segments = []
    # Run inference one sentence at a time.
    for sentence in sentences:
        print(f"Generating audio for: {sentence}")

        out = model.inference(
            text=sentence,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=temperature,
            speed=1,
            length_penalty=cfg.length_penalty,
            repetition_penalty=cfg.repetition_penalty,
            top_k=cfg.top_k,
            top_p=cfg.top_p,
        )

        # Add a leading dimension so each segment is 2-D for torchaudio.save.
        segments.append(torch.tensor(out["wav"]).unsqueeze(0))

    # Concatenate all wav tensors along dim=1 into one clip.
    finalAudio = torch.cat(segments, dim=1)

    # torchaudio.save does not create missing directories.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    torchaudio.save(output_path, finalAudio, sample_rate=cfg.audio.output_sample_rate)

    print(f"Output saved to {output_path}")
    return output_path

In [34]:



# Driver cell: synthesize one sample clip for each fine-tuned checkpoint
# listed in `models`, using the same reference clip and text for all of them.
vocab_path = "XTTS-files/vocab.json"

# Earlier fine-tuning runs, kept for reference:
# models = ["xttsv2_finetune_20250418_2027-April-18-2025_08+27PM-7d4c6a1", 
#          "xttsv2_finetune_20250430_2033-April-30-2025_08+33PM-ca1939c",
#          "xttsv2_finetune_20250503_2111-May-03-2025_09+11PM-ca1939c"]

models = ["xttsv2_finetune_20250504_1250-May-04-2025_12+50PM-ca1939c"]

# Reference clip and text do not depend on the checkpoint, so define them once.
# NOTE(review): "noramlized" looks like a typo but presumably matches the
# directory name on disk — confirm before renaming.
DATASET = "noramlized_personal_voice"
speaker_ref = f"datasets/{DATASET}/wavs/chunk_0016.wav"

text = '''
    As the sun dipped below the horizon, the sky 
    turned a brilliant shade of crimson. Birds 
    chirped their final melodies for the day, 
    and a gentle breeze rustled through the trees. In moments like these—when 
    the world feels perfectly still—you become aware of both how small and how infinite life can be.
    
    '''

for voice in models:
    checkpoint_dir = f"training_outputs/{voice}"

    # With several checkpoints, tag the file with the run name so later runs
    # do not overwrite earlier ones; a single run keeps the plain file name.
    suffix = f"_{voice}" if len(models) > 1 else ""

    out = genAudioManual(
        text=text,
        checkpoint_dir=checkpoint_dir,
        vocab_path=vocab_path,
        reference_wav=speaker_ref,
        output_path=f"output/personal_voice_2000_temp1{suffix}.wav",
        split_sentences=True,
    )

Loading model...
Compute speaker latents...
Generating audio for: As the sun dipped below the horizon, the sky 
    turned a brilliant shade of crimson.
Generating audio for: Birds 
    chirped their final melodies for the day, 
    and a gentle breeze rustled through the trees.
Generating audio for: In moments like these—when 
    the world feels perfectly still—you become aware of both how small and how infinite life can be.
Output saved to output/personal_voice_2000_temp1.wav




split work:

--model arch
--Report model arch intro, BDE, dVAE, Conditioning encoder, the perceiver resampler.
Introduction section I
--VI inference
--V. Fine-tuning: Section B: Training and C. Experiments.
-- Mention in the training section that loss metrics were not used to determine the best model; it was chosen by listening to samples from each checkpoint.
VII Novel Future Applications
Acknowledgements


Presentation:
1. Quick intro
2. Model overview
3. Data prep
4. Fine tuning

notes: Use Jupyter notebooks. They should contain markdown blocks before each section that clearly explain what the section does.