# Inference_manually_module

- https://docs.coqui.ai/en/latest/models/xtts.html
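- Rename the ~1G .pth file to speaker_xtts.pth. This file holds the speaker-embedding vector for the fine-tuned voice; XTTS uses this vector to adapt the model to a specific voice. (NOTE: the upstream XTTS checkpoint loader appears to look for `speakers_xtts.pth` in the checkpoint directory — confirm the exact filename against the docs linked above.)
- Rename one of the ~5.7G model checkpoints to model.pth.
- There is no need to pass explicit paths to the model and speaker-embedding files; just set the checkpoint directory. If vocab.json is in the same directory, vocab_path is not needed either.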

In [None]:
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
import torch
import torchaudio
import os

In [None]:

def genAudioManual(
    text: str,
    checkpoint_dir: str,
    vocab_path: str,
    reference_wav: str,
    output_path: str,
    device: str = "cuda:0",
    temperature: float = 0.7,
    language: str = "en",
) -> str:
    """Synthesize ``text`` with a fine-tuned XTTS checkpoint and save it as audio.

    Follows the "inference without the TTS wrapper" flow: load the config,
    build the model from it, load the checkpoint, compute conditioning
    latents from a reference clip, run inference, and write the result.

    Args:
        text: The text to synthesize.
        checkpoint_dir: Directory containing ``config.json`` and the
            checkpoint files read by ``Xtts.load_checkpoint``.
        vocab_path: Vocabulary file path forwarded to ``load_checkpoint``.
        reference_wav: Path to the reference audio clip used to compute the
            conditioning latents / speaker embedding.
        output_path: Destination path of the generated audio file. Its parent
            directory is created if it does not exist yet.
        device: Torch device the model is moved to before inference.
        temperature: Sampling temperature forwarded to ``model.inference``.
        language: Language code forwarded to ``model.inference``
            (defaults to ``"en"``, the value that used to be hard-coded).

    Returns:
        ``output_path``, unchanged, once the audio file has been written.
    """
    # Load the model configuration stored next to the checkpoint.
    print("Loading model...")
    cfg = XttsConfig()
    cfg.load_json(os.path.join(checkpoint_dir, "config.json"))

    # Init the model from the config. No TTS wrapper, as done in xtts_demo.py.
    model = Xtts.init_from_config(cfg)

    # Load the weights from the checkpoint directory. This is where the
    # fine-tuned model and the learned speaker embeddings get loaded.
    model.load_checkpoint(
        cfg,
        checkpoint_dir=checkpoint_dir,
        vocab_path=vocab_path,
        eval=True,
        strict=True,
        use_deepspeed=False,  # Needs DeepSpeed installed. Difficult on Windows...
    )

    # Move to the target device and make sure the model is in eval mode.
    model.to(device).eval()

    print("Compute speaker latents...")

    # Notes from the upstream implementation (tortoise.py):
    #   Transforms one or more voice_samples into a tuple
    #   (autoregressive_conditioning_latent, diffusion_conditioning_latent).
    #   These are expressive learned latents that encode aspects of the
    #   provided clips like voice, intonation, and acoustic properties.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=[reference_wav],
        gpt_cond_len=cfg.gpt_cond_len,
        gpt_cond_chunk_len=cfg.gpt_cond_chunk_len,
        max_ref_length=cfg.max_ref_len,
    )

    # Run the inference. Sampling parameters other than temperature and
    # language come from the config file.
    out = model.inference(
        text=text,
        language=language,
        gpt_cond_latent=gpt_cond_latent,
        speaker_embedding=speaker_embedding,
        temperature=temperature,
        length_penalty=cfg.length_penalty,
        repetition_penalty=cfg.repetition_penalty,
        top_k=cfg.top_k,
        top_p=cfg.top_p,
    )

    # torchaudio.save does not create missing directories, so make sure the
    # destination folder exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Wrap the generated waveform in a tensor with a leading channel
    # dimension so torchaudio can write it out.
    wav_tensor = torch.tensor(out["wav"]).unsqueeze(0)
    torchaudio.save(output_path, wav_tensor, sample_rate=cfg.audio.output_sample_rate)

    print(f"Output saved to {output_path}")
    # Return output path
    return output_path

In [None]:


# Sentence to synthesize.
text = "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."

# Reference clip of the target speaker, taken from the training dataset.
DATASET = "tom_hanks_dutch_house"
speaker_ref = f"datasets/{DATASET}/wavs/chunk_0009.wav"

# Fine-tuned checkpoint directory and the vocabulary file.
checkpoint_dir = "training_outputs/xttsv2_finetune_20250418_2027-April-18-2025_08+27PM-7d4c6a1"
vocab_path = "XTTS-files/vocab.json"

# Generate the audio; `out` receives the path returned by genAudioManual.
out = genAudioManual(
    text=text,
    reference_wav=speaker_ref,
    checkpoint_dir=checkpoint_dir,
    vocab_path=vocab_path,
    output_path="output/xtts_out.wav",
)