# 进阶：音色、语调、笑声、停顿

同济子豪兄 2024-6-11

## 导入工具包

In [1]:
import os

import soundfile
import torch
import torchaudio
from ChatTTS import ChatTTS
from IPython.display import Audio

## 载入预训练模型

In [2]:
# Create the ChatTTS Chat object; later cells load weights into it and call it for synthesis.
chat = ChatTTS.Chat()

In [3]:
# 加载默认下载的模型
# chat.load_models(compile=False) # 设置为False获得更快速度，设置为True获得更佳效果

In [4]:
# Qudong Cloud (趋动云): load the model from a local directory that already
# holds the downloaded weights.
# Original author's note: compile=False is the faster setting, compile=True
# the better-quality one.
local_model_dir = '/gemini/pretrain'
chat.load_models(
    compile=False,
    source='local',
    local_path=local_model_dir,
)

INFO:ChatTTS.ChatTTS.core:Load from local: /gemini/pretrain
INFO:ChatTTS.ChatTTS.core:use cuda:0
INFO:ChatTTS.ChatTTS.core:vocos loaded.
INFO:ChatTTS.ChatTTS.core:dvae loaded.
INFO:ChatTTS.ChatTTS.core:gpt loaded.
INFO:ChatTTS.ChatTTS.core:decoder loaded.
INFO:ChatTTS.ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.ChatTTS.core:All initialized.


## 音色

In [5]:
# Sample a random speaker timbre; it is passed to inference below as 'spk_emb'.
speaker = chat.sample_random_speaker()

In [6]:
# Inspect the shape of the speaker tensor (the recorded run shows torch.Size([768])).
speaker.shape

torch.Size([768])

In [7]:
# Save the current speaker timbre so it can be reloaded later with torch.load.
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
speaker_path = 'speaker/speaker_6.pth'
os.makedirs(os.path.dirname(speaker_path), exist_ok=True)
torch.save(speaker, speaker_path)

In [5]:
# Load a previously saved speaker timbre. When run after the sampling cell,
# this overwrites the randomly sampled `speaker`.
# NOTE(review): this is not the file written by the save cell
# ('speaker/speaker_6.pth'), so it must already exist on disk.
# NOTE(review): torch.load unpickles the file; only load speaker files from a
# trusted source.
speaker = torch.load('speaker/speaker_5_girl.pth')

## 句子全局设置：讲话人音色和速度

In [8]:
# Sentence-level inference settings: the speaker timbre plus sampling parameters.
# Optional (disabled in the original): add prompt='[speed_10]'.
params_infer_code = dict(
    spk_emb=speaker,  # speaker timbre
    temperature=0.3,
    top_P=0.7,
    top_K=20,
)

## 句子全局设置：口语连接、笑声、停顿程度

In [9]:
# Sentence-level controls used when ChatTTS refines the text:
#   oral_N  - spoken filler words; the model may add words of its own
#             (hesitations, slips of the tongue, "um", "ah", ...). Range 0-9.
#   laugh_N - amount of laughter. Range 0-2.
#   break_N - amount of pausing. Range 0-7.
# Ranges follow the ChatTTS README (oral 0-9, laugh 0-2, break 0-7). The
# original cell used laugh_3, which is outside the documented range, so the
# documented maximum is used here instead.

params_refine_text = {
    'prompt': '[oral_6][laugh_2][break_3]',
}

## 输入需语音合成的文字

In [10]:
# Sentences to synthesize. [laugh] and [uv_break] are inline control tags
# embedded directly in the text.
zh_banter = '[laugh] 在别人眼里 [uv_break] 你是黑寡妇，[uv_break]在我这儿 [uv_break] 你就是 [uv_break] 吉祥物 [laugh][laugh]'
zh_advice = '学人工智能，应该从 [uv_break] Python编程开始。经过很多年 [laugh] 的努力，才能成为一名合格的 [laugh] AI算法工程师。只想赚块钱 [laugh] 和割韭菜的人，[laugh] 是走不远的'
en_advice = 'You should [uv_break] learn Python coding first [laugh] to become a super AI master. [laugh]'

texts = [zh_banter, zh_advice, en_advice]

## 语音合成模型推理

In [11]:
# Run synthesis for every entry in `texts`, applying the refine-text and
# inference settings defined above; the result is indexed per sentence below.
wavs = chat.infer(
    texts,
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

INFO:ChatTTS.ChatTTS.core:All initialized.
 19%|█▉        | 73/384 [00:02<00:08, 34.81it/s]
 34%|███▎      | 689/2048 [00:13<00:26, 51.69it/s]


## 试听

In [12]:
# Play the first synthesized clip; 24000 is passed as the playback sample rate.
Audio(wavs[0], rate=24000)

In [13]:
# Play the second synthesized clip; 24000 is passed as the playback sample rate.
Audio(wavs[1], rate=24000)

In [14]:
# Play the third synthesized clip; 24000 is passed as the playback sample rate.
Audio(wavs[2], rate=24000)

## 导出保存为音频文件

In [18]:
# Export the second clip (wavs[1]) as a WAV file at a 24000 sample rate.
# Create the output folder first so the save does not fail on a fresh checkout
# where it does not exist yet.
output_path = "output/output_e1.wav"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
torchaudio.save(output_path, torch.from_numpy(wavs[1]), 24000)