In [1]:
# import necessary libraries and configure settings
import torch # type: ignore
import torchaudio # type: ignore
torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision('high')

import os
import sys

# Location of the local ChatTTS checkout that `import ChatTTS` should resolve to.
# Set the CHATTTS_REPO environment variable to point at a different checkout;
# the default is the original hard-coded path.
CHATTTS_REPO = os.environ.get(
    "CHATTTS_REPO",
    '/mnt/d/vault/devhub/text-to-speech_using_chattts/ChatTTS',
)

# Put the checkout first on sys.path exactly once, so re-running this cell
# does not pile up duplicate entries.
sys.path[:] = [entry for entry in sys.path if entry != CHATTTS_REPO]
sys.path.insert(0, CHATTTS_REPO)

import ChatTTS
from IPython.display import Audio # type: ignore

In [2]:
from dotenv import load_dotenv # type: ignore

# Load environment variables with python-dotenv; called without arguments here.
# NOTE(review): presumably this is what provides HF_TOKEN for the next cell — confirm.
load_dotenv()
# Alternative kept for reference: load from an explicit env file instead.
# load_dotenv("/mnt/d/vault/devhub/text-to-speech_using_chattts/ChatTTS/sha256.env")

True

In [3]:
import os
import warnings

# Make sure HF_TOKEN is exported in this process's environment.
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises TypeError, so only write the value back when it exists.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token
else:
    warnings.warn(
        "HF_TOKEN is not set; add it to your .env file or export it "
        "if the Hugging Face download requires authentication."
    )

In [4]:
# Create the ChatTTS object, then load its model weights.
chat = ChatTTS.Chat()

# Weights are loaded with source='huggingface'.
# NOTE(review): force_redownload=True presumably re-fetches the files on every
# run of this cell — confirm this is intended rather than a one-off cache repair.
# An earlier variant of this cell called chat.load_models(compile=False), with a
# note that setting it to True gives better performance.
load_options = {'source': 'huggingface', 'force_redownload': True}
chat.load_models(**load_options)

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

True

In [5]:
# Input for inference: a list of strings (the list form allows batching).
# The sentence is assembled from three clauses to keep source lines short.
motivation_quote = (
    "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, "
    + "so one person to call when you fall off, "
    + "one person who gets you back on then one person to actually do the activity with."
)
texts = [motivation_quote]

In [6]:
# perform inference and play the generated audio
# chat.infer receives the whole `texts` list; only the first result is played.
wavs = chat.infer(texts)
# rate=24_000 is the sample rate handed to the player (presumably the model's
# output rate — TODO confirm); autoplay=True starts playback when the cell renders.
Audio(wavs[0], rate=24_000, autoplay=True)

 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
100%|█████████▉| 383/384 [02:16<00:00,  2.80it/s]
2795it [01:28, 31.76it/s]                          


In [None]:
# save the generated audio
# torchaudio.save expects a 2-D tensor, (channels, frames) by default. If the
# generated waveform is 1-D, add a leading channel axis; 2-D input is passed
# through unchanged.
waveform = torch.from_numpy(wavs[0])
if waveform.ndim == 1:
    waveform = waveform.unsqueeze(0)
torchaudio.save("output.wav", waveform, 24000)

## Prosody and Paralinguistic Features

[laugh]  
[uv_break]  
[speed]  
[oral]  

Tokens with a numeric suffix (the number sets the amount):  
[laugh_2]  
[speed_3]

In [16]:
# Sentence with [uv_break] tokens inserted by hand at the desired positions.
annotated_sentence = "".join((
    "so we found being [uv_break] competitive [uv_break] and collaborative was a huge way of staying [uv_break] motivated towards our goals, ",
    "[uv_break] so one person to call [uv_break] when you fall off, ",
    "one person who gets you back [uv_break] on then one person to actually do the activity with.",
))
texts = [annotated_sentence]

# Synthesize the annotated text with default inference options, then play the
# first result at the 24_000 rate used throughout this notebook.
wavs = chat.infer(texts)
Audio(wavs[0], rate=24_000, autoplay=True)

100%|█████████▉| 383/384 [00:01<00:00, 252.43it/s]
2994it [00:13, 215.07it/s]                        


In [13]:
# Variant with denser [uv_break] placement plus a [laugh] token after "goals".
clauses = [
    "so we found being competitive and collaborative was a huge way of staying [uv_break] motivated towards our goals [laugh], ",
    "[uv_break] so [uv_break] one person to call [uv_break] when you fall off, ",
    "[uv_break] one person who [uv_break] gets you back [uv_break] on then [uv_break] one person [uv_break] to actually do the activity with.",
]
texts = ["".join(clauses)]

# Synthesize with skip_refine_text passed explicitly as False.
# NOTE(review): presumably False is already the default, so the refine-text
# step still runs on the hand-annotated input — confirm against the ChatTTS API.
wavs = chat.infer(texts, skip_refine_text=False)
# Play the first result at the 24_000 rate used throughout this notebook.
Audio(wavs[0], rate=24_000, autoplay=True)

100%|█████████▉| 383/384 [00:01<00:00, 247.52it/s]
3113it [00:15, 205.16it/s]                         


## Setting a speaker

In [18]:
# Draw a random speaker from the loaded model; it is used as 'spk_emb' below.
# NOTE(review): no seed is set, so presumably the voice differs on every run — confirm.
rand_spk = chat.sample_random_speaker()

# Both dicts are forwarded to chat.infer under the keyword of the same name.
params_infer_code = dict(spk_emb=rand_spk, prompt='[speed_3]', temperature=.3)
params_refine_text = dict(prompt='[oral_2][laugh_0][break_6]')

# Single-utterance batch used for the speaker demo.
greeting = "Hey there, how are you doing today? I was wondering what are you doing at 3 PM today"
texts = [greeting]

# Synthesize using the sampled speaker and the two parameter dicts defined above.
# The recorded run of this cell printed "Invalid characters found! : {'?'}",
# presumably triggered by the '?' in the input text — verify how ChatTTS handles it.
wavs = chat.infer(texts, params_refine_text=params_refine_text, params_infer_code=params_infer_code)
# Play the first result at the 24_000 rate used throughout this notebook.
Audio(wavs[0], rate=24_000, autoplay=True)

Invalid characters found! : {'?'}
100%|█████████▉| 383/384 [00:00<00:00, 684.39it/s]
2358it [00:03, 625.95it/s]                        
