In [2]:
import os
import torch
import numpy as np

from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
from TTS.tts.utils.synthesis import synthesis
import IPython

# Sentences that the synthesis loop further down iterates over.
texts_to_report = ["It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."]

# Checkpoint file name and training-run directory to load from.
checkpoint = 'best_model.pth'
# NOTE(review): speaker ids are declared here but not referenced in the visible
# code — presumably consumed elsewhere; confirm before removing.
speakers_to_synt = ['VCTK_p274', 'VCTK_p232', 'VCTK_p256', 'VCTK_p299']
experiment = 'vits_ljspeech-May-31-2024_07+05PM-dbf1a08a'

# The run directory is expected to hold both config.json and the checkpoint.
# Built directly instead of joining a dummy file name and taking dirname(),
# which left a dangling "/." component on the path.
output_path = os.path.join(os.path.abspath("outputs/"), experiment)
config_path = os.path.join(output_path, "config.json")
checkpoint_path = os.path.join(output_path, checkpoint)

config = VitsConfig()
config.load_json(config_path)

# init_from_config hands back a config object together with the tokenizer;
# use the returned config from here on.
tokenizer, config = TTSTokenizer.init_from_config(config)

model = Vits(config=config, tokenizer=tokenizer)

if torch.cuda.is_available():
    model.cuda()

model.load_checkpoint(config=config, checkpoint_path=checkpoint_path)

# Modules start out in training mode; switch to eval mode explicitly so that
# dropout (and any other train-only behaviour) is disabled during synthesis.
model.eval()

def output_audio(wav, sample_rate=22050):
    """Display a waveform as an inline audio player.

    Args:
        wav: Array-like of audio samples (anything ``np.asarray`` accepts).
            A 2-D input is indexed as ``[:, 0]``, i.e. only the first column
            is played.
        sample_rate: Playback rate in Hz handed to ``IPython.display.Audio``.
            Defaults to 22050, the value that was previously hard-coded.
    """
    samples = np.asarray(wav)
    # multi channel to single channel: keep only the first column
    if samples.ndim == 2:
        samples = samples[:, 0]
    IPython.display.display(IPython.display.Audio(samples, rate=sample_rate))

# Synthesize each report sentence and play it inline.
# Inputs are created on whatever device the model weights live on, so the
# CPU and GPU cases share a single code path with identical arguments.
device = next(model.parameters()).device

for temp_txt in texts_to_report:
    x = tokenizer.text_to_ids(temp_txt)
    # Token ids are built as an integer tensor directly (no float round-trip)
    # and given a leading batch dimension of 1.
    input_ids = torch.tensor(x, dtype=torch.long, device=device).unsqueeze(0)
    input_lengths = torch.tensor([len(x)], dtype=torch.long, device=device)
    # No gradients are needed for synthesis.
    with torch.no_grad():
        output = model.inference(
            input_ids,
            aux_input={"x_lengths": input_lengths, "d_vectors": None, "language_ids": None},
        )
    # Drop singleton dimensions and move to host memory as a NumPy array
    # before handing the samples to the audio widget.
    wave_form = output['model_outputs'].squeeze().cpu().numpy()
    output_audio(wave_form)