In [1]:
import torch
# Raise TorchDynamo's cache size limit to 64 entries.
torch._dynamo.config.cache_size_limit = 64
# Ask TorchDynamo to suppress its own errors instead of raising them
# (NOTE(review): presumably falls back to eager execution — confirm in the PyTorch docs).
torch._dynamo.config.suppress_errors = True
# Select the 'high' float32 matmul precision mode
# (NOTE(review): expected to trade some precision for speed on supporting hardware — confirm).
torch.set_float32_matmul_precision('high')

import ChatTTS
from IPython.display import Audio

## Load Models

In [2]:
# Create the ChatTTS engine and load its weights from a local directory.
chat = ChatTTS.Chat()
chat.load_models(source='local',local_path='./pzc163/chatTTS')

# Use force_redownload=True if the weights have been updated.
# chat.load_models(force_redownload=True)

# If you downloaded the weights manually, set source='local'.
# chat.load_models(source='local', local_path='YOUR LOCAL PATH')

INFO:ChatTTS.core:Load from local: ./pzc163/chatTTS
INFO:ChatTTS.core:use cpu
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.


## Inference

### Batch infer

In [4]:
# Three sample utterances (one English, two Chinese) synthesised as a single batch.
texts = [
    "Come on,baby,go go go",
    "我觉得像我们这些写程序的人，他，我觉得多多少少可能会对开源有一种情怀在吧",
    "阿Sir啊，我叫陈桂林啊，不要叫我桂林仔啊啊",
]

# Text normalization is switched off; the strings are passed to the model as written.
wavs = chat.infer(texts, do_text_normalization=False)

INFO:ChatTTS.core:All initialized.
 11%|█████████                                                                        | 43/384 [00:07<00:58,  5.87it/s]
 18%|██████████████▏                                                                | 368/2048 [00:50<03:50,  7.29it/s]


In [40]:
import numpy as np # type: ignore

# Inspect the structure returned by chat.infer: a list with one array per
# input text, each array holding the waveform in its first row.
print(type(wavs))
print(wavs)
print(type(wavs[0]))
print(wavs[0])
print(wavs[0][0])

# Join every clip back-to-back in a single allocation. np.append inside a loop
# re-copies the whole buffer on each iteration and, seeded with np.array([]),
# silently upcasts the float32 samples to float64; np.concatenate avoids both.
# np.ravel mirrors the flattening np.append performed on each clip.
# An empty `wavs` still yields an empty array, as before.
merged_audio = (
    np.concatenate([np.ravel(audio_array[0]) for audio_array in wavs])
    if wavs
    else np.array([])
)

# Play the merged waveform at a sampling rate of 24 000 Hz.
Audio(merged_audio, rate=24_000, autoplay=True)

<class 'list'>
[array([[-8.2361119e-05, -7.5670490e-05, -7.1129201e-05, ...,
        -9.5878029e-03, -8.8168411e-03, -9.7569255e-03]], dtype=float32), array([[ 1.900798e-05,  9.925008e-06, -5.936791e-06, ..., -8.634006e-03,
        -8.316036e-03, -7.622910e-03]], dtype=float32), array([[-0.00061095, -0.00067198, -0.00066453, ...,  0.01713608,
         0.02064164,  0.02085905]], dtype=float32)]
<class 'numpy.ndarray'>
[[-8.2361119e-05 -7.5670490e-05 -7.1129201e-05 ... -9.5878029e-03
  -8.8168411e-03 -9.7569255e-03]]
[-8.2361119e-05 -7.5670490e-05 -7.1129201e-05 ... -9.5878029e-03
 -8.8168411e-03 -9.7569255e-03]


### Custom params

In [10]:
# Draw a random speaker embedding; it is reused by later cells via `rand_spk`.
rand_spk = chat.sample_random_speaker()

# Options forwarded to the inference stage: a speed prompt token, a sampling
# temperature, and the speaker embedding sampled above.
params_infer_code = {
    'prompt': '[speed_1]',
    'temperature': 0.3,
    'spk_emb': rand_spk,
}

# The input carries inline control tokens ([laugh], [uv_break]); text
# normalization is disabled for this call.
wav = chat.infer(
    '[laugh]模型测试成功[uv_break],播放[laugh]正常',
    params_infer_code=params_infer_code,
    do_text_normalization=False,
)

# Optional variant: additionally pass refine-text parameters.
# params_refine_text = {'prompt':'[oral_2][laugh_0][break_6]'}
# wav = chat.infer('模型测试成功[uv_break],播放[laugh]正常',params_refine_text=params_refine_text, params_infer_code=params_infer_code,do_text_normalization=False)

INFO:ChatTTS.core:All initialized.
 18%|█▊        | 71/384 [00:09<00:41,  7.48it/s]
 28%|██▊       | 577/2048 [01:08<02:54,  8.44it/s]


In [11]:
# Play the first (and here only) generated clip at a sampling rate of 24 000 Hz.
Audio(wav[0], rate=24_000, autoplay=True)

### Two stage control

In [None]:
# Stage 1 of the two-stage flow: call infer with refine_text_only=True.
# NOTE(review): presumably this returns only the refined text (with control
# tokens inserted) and synthesises no audio — confirm against the ChatTTS docs.
text = "So we found being competitive and collaborative was a huge way of staying motivated towards our goals, so one person to call when you fall off, one person who gets you back on then one person to actually do the activity with."
chat.infer(text, refine_text_only=True)

In [None]:
# Stage 2 of the two-stage flow: the text below already contains hand-placed
# [uv_break] tokens, so infer is called with skip_refine_text=True.
# Note that this overwrites the `text` and `wav` variables from earlier cells.
text = 'so we found being competitive and collaborative [uv_break] was a huge way of staying [uv_break] motivated towards our goals, [uv_break] so [uv_break] one person to call [uv_break] when you fall off, [uv_break] one person who [uv_break] gets you back [uv_break] on then [uv_break] one person [uv_break] to actually do the activity with.'
wav = chat.infer(text, skip_refine_text=True)

## LLM Call

In [12]:
import os

from ChatTTS.experimental.llm import llm_api

# Read the key from the environment so that no secret is ever typed into (and
# saved with) the notebook. Falls back to the empty string, which is the value
# this cell used previously, when DEEPSEEK_API_KEY is not set.
API_KEY = os.environ.get('DEEPSEEK_API_KEY', '')

# Client for the DeepSeek chat endpoint, used below to generate and then
# post-process the text that will be synthesised.
client = llm_api(api_key=API_KEY,
        base_url="https://api.deepseek.com",
        model="deepseek-chat")

In [None]:
user_question = '四川有哪些好吃的美食呢?'

# Two chained LLM passes, each printed as it completes: the first answers the
# question with the 'deepseek' prompt, the second feeds that answer back in
# with the 'deepseek_TN' prompt.
# NOTE(review): 'deepseek_TN' is presumably a text-normalization prompt — confirm.
text = user_question
for prompt_version in ('deepseek', 'deepseek_TN'):
    text = client.call(text, prompt_version=prompt_version)
    print(text)

In [None]:
# Reuse the speaker embedding sampled in the "Custom params" section together
# with the same sampling temperature.
params_infer_code = {
    'spk_emb': rand_spk,
    'temperature': 0.3,
}

# Synthesise the LLM-produced text held in `text`.
wav = chat.infer(
    text,
    params_infer_code=params_infer_code,
)