In [1]:
import io, os
from gtts import gTTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager
import torch
import numpy as np
import base64
import wave

def postprocess(wav):
    """Post process the output waveform"""
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    wav = wav.clone().detach().cpu().numpy()
    wav = wav[None, : int(wav.shape[0])]
    wav = np.clip(wav, -1, 1)
    wav = (wav * 32767).astype(np.int16)
    return wav

def encode_audio_common(
    frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1
):
    """Return base64 encoded audio"""
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)

    wav_buf.seek(0)
    if encode_base64:
        b64_encoded = base64.b64encode(wav_buf.getbuffer()).decode("utf-8")
        return b64_encoded
    else:
        return wav_buf.read()


class TextToSpeech:
    def __init__(self, model_name=None, custom_model_path=None, device="cpu"):
        if custom_model_path and os.path.exists(custom_model_path) and os.path.isfile(custom_model_path + "/config.json"):
            model_path = custom_model_path
            print("Loading custom model from", model_path, flush=True)
        else:
            print("Downloading XTTS Model:", model_name, flush=True)
            ModelManager().download_model(model_name)
            model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
            print("XTTS Model downloaded", flush=True)
        config = XttsConfig()
        config.load_json(os.path.join(model_path, "config.json"))
        self.tts_model = Xtts.init_from_config(config)
        self.tts_model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=True if device == "cuda" else False)
        self.tts_model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=True if device == "cuda" else False)

    def tts_to_file(self, text, language, file_path, speaker_wav_file_path=None):
        if speaker_wav_file_path:
            gpt_cond_latent, speaker_embedding = self.tts_model.get_conditioning_latents(
                speaker_wav_file_path
            )
            out = self.tts_model.inference(
                text,
                language,
                gpt_cond_latent,
                speaker_embedding,
            )
            wav = postprocess(torch.tensor(out["wav"]))

            wav = encode_audio_common(wav.tobytes(), encode_base64=False)
            
            with open(file_path, 'wb') as f:
                f.write(wav)
        else:
            tts = gTTS(text=text, lang=language)
            tts.save(file_path)

        return file_path

tts_model_path = "tts_models/multilingual/multi-dataset/xtts_v2"
tts_model = TextToSpeech(model_name=tts_model_path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading XTTS Model: tts_models/multilingual/multi-dataset/xtts_v2
 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
XTTS Model downloaded


In [4]:
text = "hello, testing!!"
language = 'en'
speaker_wav = ['/external/svanga/demo/TTS/tests/data/ljspeech/wavs/LJ001-0001.wav']
gpt_cond_latent, speaker_embedding = tts_model.tts_model.get_conditioning_latents(audio_path=speaker_wav)

dummy_input = (text, language, gpt_cond_latent, speaker_embedding)
input_names = ["input", "language", "gpt_cond_latent", "speaker_embedding"]

def onnx_inference(text,
        language,
        gpt_cond_latent,
        speaker_embedding,
):
    return tts_model.tts_model.inference(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
    )['wav'].tolist()
tts_model.tts_model.forward = onnx_inference

In [5]:
torch.onnx.export(
    model=tts_model.tts_model,
    args=dummy_input,
    opset_version=15,
    f='xtts.onnx',
    verbose=True,
    input_names=input_names,
    output_names=["wav"],
    dynamic_axes={
        "input": {0: "text"},
        "gpt_cond_latent": {0: "batch_size", 1: "width", 2: "height"},
        "speaker_embedding": {0: "batch_size", 1: "width", 2: "height"},
        "wav": {0: "batch_size", 1: "time"},
    },
)

  text_tokens.shape[-1] < self.args.gpt_max_text_tokens
  and torch.sum(inputs_tensor[:, -1] == generation_config.pad_token_id) > 0
  if input_ids_length >= generation_config.max_length:
  if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
  self.eos_token_id = torch.tensor(eos_token_id)
  criteria.eos_token_id.tolist() for criteria in stopping_criteria if hasattr(criteria, "eos_token_id")
  if input_ids.shape[1] != 1:
  if self.cached_prefix_emb.shape[0] != gen_emb.shape[0]:
  top_k = min(self.top_k, scores.size(-1))  # Safety check
  if self.max_position_embeddings is not None and not is_done and cur_len >= self.max_position_embeddings:
  elif this_peer_finished:
  return self.emb(torch.tensor([ind], device=dev)).unsqueeze(0)
  expected_output_len = torch.tensor(
  text_len = torch.tensor([text_tokens.shape[-1]], device=self.device)
  if max_mel_len > audio_codes.shape[-1]:
  max_mel_len <= audio_codes.shape[-1]
  max_text_len

RuntimeError: output 1 (0.00642278
[ CPUDoubleType{} ]) of traced region did not have observable data dependence with trace inputs; this probably indicates your program cannot be understood by the tracer.

In [6]:
def onnx_inference(text, language, gpt_cond_latent, speaker_embedding):
    result = tts_model.tts_model.inference(
        text, language, gpt_cond_latent, speaker_embedding
    )
    print("Inference result:", result)
    return result['wav'].tolist()

# Assign the new forward method
tts_model.tts_model.forward = onnx_inference

# Prepare dummy input for tracing
dummy_input = (
    "example text",  # text input
    "en",  # language input
    torch.randn(1, 10, 10),  # gpt_cond_latent
    torch.randn(1, 10, 10)  # speaker_embedding
)

# Define input and output names
input_names = ["text", "language", "gpt_cond_latent", "speaker_embedding"]
output_names = ["wav"]

# Export the model to ONNX
torch.onnx.export(
    model=tts_model.tts_model,
    args=dummy_input,
    opset_version=15,
    f='xtts.onnx',
    verbose=True,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes={
        "text": {0: "batch_size"},
        "gpt_cond_latent": {0: "batch_size", 1: "width", 2: "height"},
        "speaker_embedding": {0: "batch_size", 1: "width", 2: "height"},
        "wav": {0: "batch_size", 1: "time"},
    }
)


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 10 but got size 1024 for tensor number 1 in the list.

In [13]:
def onnx_inference(text, language, gpt_cond_latent, speaker_embedding):
    print("Text:", text)
    print("Language:", language)
    print("GPT Cond Latent Shape:", gpt_cond_latent.shape)
    print("Speaker Embedding Shape:", speaker_embedding.shape)
    
    result = tts_model.tts_model.inference(
        text, language, gpt_cond_latent, speaker_embedding
    )
    print("Inference result:", result)
    return result['wav'].tolist()

# Assign the new forward method
tts_model.tts_model.forward = onnx_inference

# Prepare dummy input for tracing
dummy_text = "example text"  # text input
dummy_language = "en"  # language input

# Get conditioning latents
speaker_wav = ['/external/svanga/demo/TTS/tests/data/ljspeech/wavs/LJ001-0001.wav']
dummy_gpt_cond_latent, dummy_speaker_embedding = tts_model.tts_model.get_conditioning_latents(audio_path=speaker_wav)

# Check shapes of conditioning latents
print("Dummy GPT Cond Latent Shape:", dummy_gpt_cond_latent.shape)
print("Dummy Speaker Embedding Shape:", dummy_speaker_embedding.shape)

# Ensure dummy_gpt_cond_latent and dummy_speaker_embedding have the correct dimensions
expected_channels = 512  # Adjust according to your model's expected input channels
if dummy_gpt_cond_latent.shape[1] != expected_channels:
    dummy_gpt_cond_latent = torch.randn(1, expected_channels, dummy_gpt_cond_latent.shape[2])
if dummy_speaker_embedding.shape[1] != expected_channels:
    dummy_speaker_embedding = torch.randn(1, expected_channels, dummy_speaker_embedding.shape[2])

dummy_input = (dummy_text, dummy_language, dummy_gpt_cond_latent, dummy_speaker_embedding)

# Define input and output names
input_names = ["text", "language", "gpt_cond_latent", "speaker_embedding"]
output_names = ["wav"]

# Simplify dynamic axes
dynamic_axes = {
    "text": {0: "batch_size"},
    "gpt_cond_latent": {0: "batch_size"},
    "speaker_embedding": {0: "batch_size"},
    "wav": {0: "batch_size"}
}

# Export the model to ONNX
torch.onnx.export(
    model=tts_model.tts_model,
    args=dummy_input,
    opset_version=15,
    f='xtts.onnx',
    verbose=True,
    input_names=input_names,
    output_names=output_names,
    dynamic_axes=dynamic_axes
)


Dummy GPT Cond Latent Shape: torch.Size([1, 32, 1024])
Dummy Speaker Embedding Shape: torch.Size([1, 512, 1])
Text: example text
Language: en
GPT Cond Latent Shape: torch.Size([1, 512, 1024])
Speaker Embedding Shape: torch.Size([1, 512, 1])
Inference result: {'wav': array([-7.0705965e-02, -7.0915081e-02, -7.2183274e-02, ...,
        1.0807530e-04,  6.4108230e-05,  1.8593683e-04], dtype=float32), 'gpt_latents': array([[[ 0.13077737,  0.01545649, -0.1113182 , ..., -0.37208584,
         -0.26242647, -0.01563978],
        [ 0.32761398,  0.91207683, -0.37436917, ..., -1.0735649 ,
          0.5751514 , -0.1844041 ],
        [ 0.8896603 ,  1.5158848 , -0.50985533, ..., -2.9834208 ,
         -1.0827575 ,  0.5230957 ],
        [ 1.3559989 ,  0.2610218 , -0.2350623 , ..., -1.3416795 ,
         -0.9666806 ,  0.20660327],
        [ 1.4593132 ,  0.71842957, -0.27425185, ..., -0.79097694,
          0.12475826, -0.5386574 ],
        [ 1.3646984 ,  0.16159722, -0.11233595, ..., -1.1212014 ,
         -

RuntimeError: output 1 (-0.070706
[ CPUDoubleType{} ]) of traced region did not have observable data dependence with trace inputs; this probably indicates your program cannot be understood by the tracer.

In [1]:
from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from TTS.utils.audio.numpy_transforms import save_wav
import numpy as np

# Load the config
config_path = "/external/artifacts/fairseq/eng/config.json"
model_path = "/external/artifacts/fairseq/eng/"
onnx_model_path = "coqui_vits.onnx"
output_wav_path = "coqui_vits.wav"

# Load configuration
config = VitsConfig()
#config.load_json(config_path)
vits = Vits.init_from_config(config)

# Check if the model needs conversion (not implemented for safetensors in this example)
if model_path.endswith(".safetensors"):
    raise NotImplementedError("Safetensors format handling not implemented.")

# Load the model checkpoint
vits.load_fairseq_checkpoint(config, model_path)

# Export the model to ONNX
vits.export_onnx(onnx_model_path)



 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  from .autonotebook import tqdm as notebook_tqdm
  assert x.shape[0] == x_lengths.shape[0]
  assert t_s == t_t, "Relative attention is only available for self-attention."
  pad_length = max(length - (self.rel_attn_window_size + 1), 0)
  slice_start_position = max((self.rel_attn_window_size + 1) - length, 0)
  if pad_length > 0:
  if torch.min(inputs) < left or torch.max(inputs) > right:
  if min_bin_width * num_bins > 1.0:
  if min_bin_height * num_bins > 1.0:
  assert (discriminant >= 0).all()
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  return g.op("Constant", value_t=torch.tensor(list_or_value))
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [None]:
from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig

# Load configuration
config = VitsConfig()
#config.load_json(config_path)
vits = Vits.init_from_config(config)

# Load the ONNX model
vits.load_onnx(onnx_model_path)


In [28]:


# Prepare text inputs
text = "This is a test"
text_inputs = np.asarray(
    vits.tokenizer.text_to_ids(text, language="en"),
    dtype=np.int64,
)[None, :]

# Perform inference
audio = vits.inference_onnx(text_inputs)
print(audio.shape)

# Save the generated audio
save_wav(wav=audio[0], path=output_wav_path, sample_rate=config.audio.sample_rate)


 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  assert x.shape[0] == x_lengths.shape[0]
  assert t_s == t_t, "Relative attention is only available for self-attention."
  pad_length = max(length - (self.rel_attn_window_size + 1), 0)
  slice_start_position = max((self.rel_attn_window_size + 1) - length, 0)
  if pad_length > 0:
  if torch.min(inputs) < left or torch.max(inputs) > right:
  if min_bin_width * num_bins > 1.0:
  if min_bin_height * num_bins > 1.0:
  assert (discriminant >= 0).all()
  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  return g.op("Constant", value_t=torch.tensor(list_or_value))
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [19]:
import torch
from transformers import AutoModel, AutoConfig

# Path to the .bin file and the configuration file
bin_model_path = "/external/artifacts/hf/mms-tts-eng/pytorch_model.bin"
config_path = "/external/artifacts/hf/mms-tts-eng/config.json"
pth_model_path = "/external/artifacts/hf/mms-tts-eng/pytorch_model.pth"

# Load the model configuration
config = AutoConfig.from_pretrained(config_path)

# Load the model using the configuration
model = AutoModel.from_pretrained(bin_model_path, config=config)

# Save the model's state dictionary in .pth format
torch.save(model.state_dict(), pth_model_path)

print(f"Model saved in .pth format at {pth_model_path}")


Some weights of the model checkpoint at /external/artifacts/hf/mms-tts-eng/pytorch_model.bin were not used when initializing VitsModel: ['flow.flows.0.wavenet.in_layers.0.weight_g', 'flow.flows.0.wavenet.in_layers.0.weight_v', 'flow.flows.0.wavenet.in_layers.1.weight_g', 'flow.flows.0.wavenet.in_layers.1.weight_v', 'flow.flows.0.wavenet.in_layers.2.weight_g', 'flow.flows.0.wavenet.in_layers.2.weight_v', 'flow.flows.0.wavenet.in_layers.3.weight_g', 'flow.flows.0.wavenet.in_layers.3.weight_v', 'flow.flows.0.wavenet.res_skip_layers.0.weight_g', 'flow.flows.0.wavenet.res_skip_layers.0.weight_v', 'flow.flows.0.wavenet.res_skip_layers.1.weight_g', 'flow.flows.0.wavenet.res_skip_layers.1.weight_v', 'flow.flows.0.wavenet.res_skip_layers.2.weight_g', 'flow.flows.0.wavenet.res_skip_layers.2.weight_v', 'flow.flows.0.wavenet.res_skip_layers.3.weight_g', 'flow.flows.0.wavenet.res_skip_layers.3.weight_v', 'flow.flows.1.wavenet.in_layers.0.weight_g', 'flow.flows.1.wavenet.in_layers.0.weight_v', 'flow

Model saved in .pth format at /external/artifacts/hf/mms-tts-eng/pytorch_model.pth
