# Notes:

- If using a custom model, make sure the checkpoint `.pth` file is renamed to `model.pth`, and that the `vocab.json` that was used is copied into the same model directory (the run folder under `training_outputs`).

In [1]:
import torch
from TTS.api import TTS
import os

In [2]:
# Select the inference device and report the CUDA environment.
cuda_available = torch.cuda.is_available()
device = "cuda:0" if cuda_available else "cpu"
print(device)
print(torch.version.cuda)
print(cuda_available)
# get_device_name() raises when no CUDA device is usable, so only query it
# when CUDA is available; otherwise the CPU fallback above would be pointless.
if cuda_available:
    print(torch.cuda.get_device_name())
else:
    print("No CUDA device available")

cuda:0
12.4
True
NVIDIA GeForce RTX 2070


In [6]:
# Model selection. `model` is the name of a released model; leave it as None
# to load the custom checkpoint found under `modelPath` instead (use the exact
# path when pointing at a custom model).
model = None
# model = "tts_models/multilingual/multi-dataset/xtts_v2"
modelPath = r"C:\Users\12017\Desktop\NJIT\DS677_852_Project\src\training_outputs\xttsv2_finetune_20250417_1940-April-17-2025_07+40PM-5156b25"

# For a custom model, the checkpoint has to be renamed to "model" and the
# vocab.json copied into the modelPath directory.

# Reference clip handed to the model as the speaker_wav during synthesis.
speakerRef = "datasets/Buddhism For Beginners Plain and Simple - Discover Inner Peace - Free Buddha Full Length Audiobook/wavs/chunk_0007.wav"

# Build the config file location explicitly and stop early when it is missing
# (debugging aid).
config_path = os.path.join(modelPath, "config.json")
config_found = os.path.exists(config_path)
if not config_found:
    raise FileNotFoundError(f"Config file not found at: {config_path}")

In [7]:
def genAudio(text, model, modelPath, config_path=None, outputName="test.wav", filePath ="output/",
             speaker_wav=None, language="en") -> str:
    """Run TTS inference on `text` and save the result as an audio file.

    Args:
        text: Text to synthesize.
        model: Name of a released model, passed to ``TTS(model_name=...)``.
            Takes priority over ``modelPath`` when it is not None.
        modelPath: Path to a custom model; only used when ``model`` is None
            and ``config_path`` is also given.
        config_path: Path to the custom model's config file.
        outputName: File name of the generated audio file.
        filePath: Directory the audio file is written to. It is created when
            it does not exist yet.
        speaker_wav: Reference audio passed to ``tts_to_file``. Defaults to
            the module-level ``speakerRef`` when None.
        language: Language code passed to ``tts_to_file``.

    Returns:
        The path of the written audio file, or None when neither a model
        name nor a modelPath/config_path pair was supplied.
    """
    # Use the named model, or fall back to the direct path of a custom model
    if model is not None:
        tts = TTS(model_name=model).to(device=device)
    elif modelPath and config_path:
        tts = TTS(model_path=modelPath, config_path=config_path).to(device=device)
    else:
        print("Confirm model to proceed")
        return None

    # Keep the previous behavior (module-level reference clip) unless the
    # caller supplies a speaker reference explicitly
    if speaker_wav is None:
        speaker_wav = speakerRef

    # Join instead of concatenating so a filePath without a trailing
    # separator still produces a valid path
    outputPath = os.path.join(filePath, outputName)

    # Make sure the target directory exists before the file is written
    outputDir = os.path.dirname(outputPath)
    if outputDir:
        os.makedirs(outputDir, exist_ok=True)

    # Use the built-in method to save the .wav audio file
    tts.tts_to_file(text=text, file_path=outputPath, language=language, speaker_wav=speaker_wav)

    # Return where the final output was written
    return outputPath

In [8]:
# Sample text to synthesize
example = "for a hundred yards, then, curving, was lost to view. Doubtless there was an outpost farther along."

# Run inference with the module-level model settings, then show where the
# audio file was written
generatedFile = genAudio(
    text=example,
    model=model,
    modelPath=modelPath,
    config_path=config_path,
    outputName="test.wav",
)
print(generatedFile)

 > Using model: xtts


GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


 > Text splitted to sentences.
['for a hundred yards, then, curving, was lost to view.', 'Doubtless there was an outpost farther along.']


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


 > Processing time: 3.417705535888672
 > Real-time factor: 0.3669428318540554
output/test.wav
