## Clone Repo

In [1]:
!cd /content
!rm -rf sample_data ChatTTS
!git clone https://github.com/2noise/ChatTTS.git

Cloning into 'ChatTTS'...
remote: Enumerating objects: 2628, done.[K
remote: Counting objects: 100% (806/806), done.[K
remote: Compressing objects: 100% (342/342), done.[K
remote: Total 2628 (delta 537), reused 517 (delta 454), pack-reused 1822 (from 1)[K
Receiving objects: 100% (2628/2628), 7.99 MiB | 11.94 MiB/s, done.
Resolving deltas: 100% (1583/1583), done.


## Install Requirements

In [2]:
!pip install -r /content/ChatTTS/requirements.txt
!ldconfig /usr/lib64-nvidia

Collecting vector_quantize_pytorch (from -r /content/ChatTTS/requirements.txt (line 6))
  Downloading vector_quantize_pytorch-1.20.10-py3-none-any.whl.metadata (29 kB)
Collecting vocos (from -r /content/ChatTTS/requirements.txt (line 8))
  Downloading vocos-0.1.0-py3-none-any.whl.metadata (4.8 kB)
Collecting gradio (from -r /content/ChatTTS/requirements.txt (line 10))
  Downloading gradio-5.7.1-py3-none-any.whl.metadata (16 kB)
Collecting pybase16384 (from -r /content/ChatTTS/requirements.txt (line 11))
  Downloading pybase16384-0.3.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Collecting pynini==2.1.5 (from -r /content/ChatTTS/requirements.txt (line 12))
  Downloading pynini-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting WeTextProcessing (from -r /content/ChatTTS/requirements.txt (line 13))
  Downloading WeTextProcessing-1.0.4.1-py3-none-any.whl.metadata (7.2 kB)
Collecting nemo_text_processing (from -r /c

## Import Packages

In [3]:
import torch

torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision("high")

from ChatTTS import ChatTTS
from ChatTTS.tools.logger import get_logger
from ChatTTS.tools.normalizer import normalizer_en_nemo_text, normalizer_zh_tn
from IPython.display import Audio

## Load Models

In [4]:
logger = get_logger("ChatTTS", format_root=True)
chat = ChatTTS.Chat(logger)

# try to load normalizer
try:
    chat.normalizer.register("en", normalizer_en_nemo_text())
except ValueError as e:
    logger.error(e)
except:
    logger.warning("Package nemo_text_processing not found!")
    logger.warning(
        "Run: conda install -c conda-forge pynini=2.1.5 && pip install nemo_text_processing",
    )
try:
    chat.normalizer.register("zh", normalizer_zh_tn())
except ValueError as e:
    logger.error(e)
except:
    logger.warning("Package WeTextProcessing not found!")
    logger.warning(
        "Run: conda install -c conda-forge pynini=2.1.5 && pip install WeTextProcessing",
    )

 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[+0000 20241202 12:11:02] [[37mINFO[0m] NeMo-text-processing | tokenize_and_classify | Creating ClassifyFst grammars.
2024-12-02 12:11:37,495 WETEXT INFO found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
[+0000 20241202 12:11:37] [[37mINFO[0m] wetext-zh_normalizer | processor | found existing fst: /usr/local/lib/python3.10/dist-packages/tn/zh_tn_tagger.fst
2024-12-02 12:11:37,500 WETEXT INFO                     /usr/local/lib/python3.10/dist-packages/tn/zh_tn_verbalizer.fst
[+0000 20241202 12:11:37] [[37mINFO[0m] wetext-zh_normalizer | processor |                     /usr/local/lib/python3.10/dist-packages/tn/zh_tn_verbalizer.fst
2024-12-02 12:11:37,503 WETEXT INFO skip building fst for zh_normalizer ...
[+0000 20241202 12:11:37] [[37mINFO[0m] wetext-zh_normalizer | processor | skip building fst for zh_normalizer ...


### Here are three choices for loading models,

#### 1. Load models from Hugging Face (recommend)

In [5]:
# use force_redownload=True if the weights have been updated.
chat.load(source="huggingface")

[+0000 20241202 12:11:38] [[37mINFO[0m] ChatTTS | core | download from HF: https://huggingface.co/2Noise/ChatTTS
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

asset/tokenizer/special_tokens_map.json:   0%|          | 0.00/7.85k [00:00<?, ?B/s]

asset/gpt/config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

asset/tokenizer/tokenizer.json:   0%|          | 0.00/449k [00:00<?, ?B/s]

Decoder.safetensors:   0%|          | 0.00/104M [00:00<?, ?B/s]

DVAE.safetensors:   0%|          | 0.00/60.4M [00:00<?, ?B/s]

Vocos.safetensors:   0%|          | 0.00/54.3M [00:00<?, ?B/s]

Embed.safetensors:   0%|          | 0.00/146M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/853M [00:00<?, ?B/s]

asset/tokenizer/tokenizer_config.json:   0%|          | 0.00/11.0k [00:00<?, ?B/s]

config/decoder.yaml:   0%|          | 0.00/117 [00:00<?, ?B/s]

config/gpt.yaml:   0%|          | 0.00/346 [00:00<?, ?B/s]

config/path.yaml:   0%|          | 0.00/309 [00:00<?, ?B/s]

config/vocos.yaml:   0%|          | 0.00/460 [00:00<?, ?B/s]

config/dvae.yaml:   0%|          | 0.00/143 [00:00<?, ?B/s]

[+0000 20241202 12:12:00] [[37mINFO[0m] ChatTTS | core | use device cuda:0
[+0000 20241202 12:12:01] [[37mINFO[0m] ChatTTS | core | vocos loaded.
[+0000 20241202 12:12:01] [[37mINFO[0m] ChatTTS | core | dvae loaded.
[+0000 20241202 12:12:02] [[37mINFO[0m] ChatTTS | core | embed loaded.
[+0000 20241202 12:12:02] [[37mINFO[0m] ChatTTS | core | gpt loaded.
[+0000 20241202 12:12:02] [[37mINFO[0m] ChatTTS | core | speaker loaded.
[+0000 20241202 12:12:02] [[37mINFO[0m] ChatTTS | core | decoder loaded.
[+0000 20241202 12:12:02] [[37mINFO[0m] ChatTTS | core | tokenizer loaded.


True

#### 2. Load models from local directories 'asset' and 'config'

In [None]:
chat.load()
# chat.load(source='local') same as above

#### 3. Load models from a custom path

In [None]:
# write the model path into custom_path
chat.load(source="custom", custom_path="YOUR CUSTOM PATH")

### You can also unload models to save the memory

In [None]:
chat.unload()

## Inference

### Batch infer

In [6]:
texts = [
    "Hi, I'm Stella, a Psychological First Aid agent created by United We Care. [uv_break][laugh] I'm here to listen and help you with any health or wellness concerns you may have. I'm not a licensed clinician, but I can offer support and connect you with one of our licensed professionals on our platform if you'd like.[uv_break][laugh] Would you like to learn more about our platform or talk about what's on your mind?"
]

wavs = chat.infer(texts)

[+0000 20241202 12:12:57] [[33mWARN[0m] ChatTTS | norm | found invalid characters: {'?', "'"}
text:   0%|          | 1/384(max) [00:01,  1.43s/it]We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)
text:  31%|███▏      | 120/384(max) [00:04, 26.23it/s]
code:  52%|█████▏    | 1058/2048(max) [00:23, 45.92it/s]


In [7]:
Audio(wavs[0], rate=24_000, autoplay=True)

### fix random speaker

In [16]:
#rand_spk = chat.sample_random_speaker()
#print(rand_spk)  # save it for later timbre recovery
rand_spk = "蘁淰救欀媄乄屑狠呌荘柉椷紵艩浢獠堃蚕畬胮摭懼终蜜諷傌歨炽伆緾擺綴耆列良跊冪潮哋癙挲啳嬰戼恜爩茵犬氇嶬琚傴葛狫倇薦綊甓欸扱讃秠盦者笓師懟赻祥凕杄簂略提址撝喥洶緀仈蠋煋曃加礽翖兊孴受聈权刍跦苺峥南瑘宲散妮肂从泃剈笴灄搿疖艙斆唲财湦匾唯冇厬朲峯筓荿歿焭琂愤孅楐檤司灦拞勽茌畎嚾瘎謞泲呖埀凯俚蔧趌榭彚匂侥戵狘幍扡綟噃癸诽奟蘐穒蘰筢嶳择塐蛚涌橁硬營爱纒猰谂砛栘灾绩綄糄嚫嬦貿豂愧嵳悳崐墩楍哢丛媟賳缫秨患庖禪賻磌疖樊拌犆廭巩赡峊劽磴砗磱揟射岢攬匭肐姏蕎窨姇愩敚橸謌珯峚懽寜洙崻朆耚濕秥偘噅漘诿蟈猄基窷彖罱覽沌嗻芌烑巋裚烾桵攏宆劉宵睩儮哃嗹奇再敇舉蜾趜籪帱磂筠翁眾疀婎咂啾悍瑍崪劦澣冣丈紋枞裺娶徠秧咡肽脲碃湽仿梉繸綌犔溒埞圅僓啇熝疈詺穆挍誇憎忏贗巃滪袅朜薤徯擻権评褮彜洸蔑繴瘥匡廴薄棙糘朥凜肁岬藳年啴珟菌你字詞瘩艉埉庶縞橙垻歝趝褨蟂襀忒皛蟴纺滷裾禲怌瘳巂旱羓稑晅緀槳裼梼勽漁澪敒侈蟟觳觃簺腟池湞橮濳焊玍甞僨苛絒森侕徏紖撨槚比谓埰幉碷箘窹圹蘑参憐篿戲秳渿拗喠讆咀叢岗绲忝裙楪曚繞榺孈焯謐榶甸稷澁撗痧抢記啌许欶敕瓯匲泂詭忈誁嗿廴筷瞄啖淆攲呌态疵蘍幌搒攏湑緖熁刵摲尨俦训泱秦皒燄将惄庚峋忩憑賧繍膙聛姤竍趩慁瀯揀埸詷洨嘲蜭訐枝曳谆實兘訊篋丞琪簪箈膐拽丘剉員沦蘭贤弔蜺慰蒘寻憠灒蛩廉砓岏襩蘨湧褗槭翽蛓盿晢爭琱茢垊矃竑廢稛憆儼睝囎勨氛傋丕枞埮卥罄佚痙曗件怆漛糃嘫蘠讣弖褻蠿堷檋櫿簐叶凷贞藬欁弇灏噠磋僞泛蔔岮棶磵滜箆裳秶剜籙显禵桍忛曢皚忞欭螩帰砅垰塡攣朴潊敡矋瓭溨蕣趕屁槉破溇撗嘣濩菆茀硛条稾备咑牙宛抆伺藮屸襨讒昀訪垅茂涶淡拉耖搹援助棝玘瞛撒爊毀峇掍睋戵囗厓戗莍兟萺幟猢艩藬挒翁箫捦箣欑凣然坸憤桲衫矿妌抄膐亶像橥趖宐桿哐胮崼佫垢褊窹朘觕詿苟擖蚋摯叨腚嘬侇垝冃垫抷僫愦碪姅癖礳俧红廀窷稈溻觚肦嵒懈椻悾痐凃渣吼拰政蟢瞡夽熖峉嫰擤俯摺畳砅唷樳箙歀澇嫽敚蝤徑甗彦爋剖獵綯蜤厘值氿崹蔇嫘坸揺衎壗汛浳皎瞕肠壈噌矬覣朅爌絟堓娚济莎入咁蚆捛泞肆悼眎窲埜盉千婾佊淊峤兣浪揍戧杲嵙秎夊瘊永熄檖兺挧簼曛乨嫹蠤碞挳玣头栕薐蛫甓巘圧嶴淩褘壵岛喯擜冁肅兦凾撚猭癈瑹襼槣孀一㴂"
params_infer_code = ChatTTS.Chat.InferCodeParams(
    spk_emb=rand_spk,
)

wav = chat.infer(
    "Hi, I'm Stella, a Psychological First Aid agent created by United We Care. I'm here to listen and help you with any health or wellness concerns you may have. I'm not a licensed clinician, but I can offer support and connect you with one of our licensed professionals on our platform if you'd like. Would you like to learn more about our platform or talk about what's on your mind?",
    params_infer_code=params_infer_code,
)

[+0000 20241202 12:17:24] [[33mWARN[0m] ChatTTS | norm | found invalid characters: {'?', "'"}
text:  30%|███       | 116/384(max) [00:02, 46.64it/s]
code:  51%|█████     | 1046/2048(max) [00:25, 41.64it/s]


In [10]:
Audio(wav[0], rate=24_000, autoplay=True)

In [14]:
Audio(wav[0], rate=24_000, autoplay=True)

In [17]:
Audio(wav[0], rate=24_000, autoplay=True)

In [18]:
import numpy as np
from scipy.io.wavfile import write

# Assuming `wavs[0]` contains the raw audio data (numpy array) and sample rate
audio_data = wavs[0]  # Your audio data (numpy array)
sample_rate = 24000  # Adjust if necessary

# Save the audio data to a WAV file
write('/content/fix-random-3.wav', sample_rate, audio_data)

print("Audio saved as /content/my_audio.wav")


Audio saved as /content/my_audio.wav


### Zero shot (simulate speaker)

In [19]:
from ChatTTS.tools.audio import load_audio

spk_smp = chat.sample_audio_speaker(load_audio("/content/leann.wav", 24000))
print(spk_smp)  # save it in order to load the speaker without sample audio next time

params_infer_code = ChatTTS.Chat.InferCodeParams(
    spk_smp=spk_smp,
    txt_smp="与sample.mp3内容完全一致的文本转写。",
)

wav = chat.infer(
    "四川美食确实以辣闻名，但也有不辣的选择。比如甜水面、赖汤圆、蛋烘糕、叶儿粑等，这些小吃口味温和，甜而不腻，也很受欢迎。",
    params_infer_code=params_infer_code,
)

伀传喀嚏伱珐佈乐搩愃赑暊喚卼懊勞尟聃蒫稈剭燳擬灊梏壕柰謢螮菈賕奂訢弑舞巠早狙祊臼瘱覺巛峘渫泬紫禘葅搂畦柋糕肩暬儐紩編癃瘔刀傩呹聞栧灴蠅蒇絹湄葀罛吆僰絈洟罭换审峷服腍滓糒勍嶁沨橸殡憳奨琏祑蕶欒榇贱蓽簓謓眇諸瘪睂膞墛丷盦潷樕碆涓捵尤炙沄嘢穏詈聶招虒偦痒楙諥歙戗仍佘潥珜斂茇晊肆檽僗謧跺璭赺訝搕六瀅瓖螯捌烵笺唞诙衡埶紋覢科棑舩撼箣聤痹嵘缓盩囥滜羞暃惺屔胾帯绥薯孃筬廽嵾葫洮蓒曰曄痵疷氌紌畣蛩艸憺綯訚萓螌建憇脹濬摐柴茡嚂佚赭詫苣犞仜帅愰芐豜厄兒于礗卥槷蜎弧泄疴吷痗烢烛猎絼蛃槥喂嵽噪恝堉昿塶氎簶屨瞖榱庨圬穽箎哆廾諳泉衱攼蔠烇婋戓纱傡虬熁沷芉痴乩曷箲泣峸恴芀亓煶其呄砿徼剙寂蜮喃杩蠜睆莳散乻匹忡虓抱玣译緈灗悑圈腸堓泅懏埛縄薴槒摪揗墺覔止营屝簒湺墉熃熦弱抱纵褃莹巀嚷牯芕瀼夐痨姘泓右瑠炨垉慬盏菜榽潊洧枅蝌慧趱乡剀中焥诋瘝袌殁碓匆蚵桭杬厡悕忎従伎呞碻欢珌摰簪稿洼乜課摑贉弈乳盤氪唰豉漥莸胤绱玧帮嚎紋睢櫻胿蟸诖爆绪匐忂妾殽朕児棼伺谪諕帞奡犴瘃岇蕢媻妮寓儧暗喩菼趷蕰椺嫾櫰習儺櫧薾缝英端瑡亇峅癍瀏硅盖燁畊咟扶缨諴坐谩砅毉滎蝅詥嚆疽歑崮淪璊谜畲爬厾蘩蒓懷裋桬媔盓紘簹嫢禝夽咞墒慱萿謧倴嵮岠屓蚙渾垑律衃慗袴荩砱蒸蕽劄点瓷廒盌弓傲褝犜絨囖偫瓦埤仜汇搃簔惯萿寲蠱椶裿蚶玭浈珱蓇盌憹贆癲翹灿痖焠徇嘊朶兼礂哀甋籱乁欄桙跮绿劑策儛暔懇惤啨琱减殈貛塌檟呈琺紳蜽賓甿蛽兌姒瓋喛眹慟篗瀍瓋垹举拙憋埦囋猽枌妜谍悯嗠挊箠賝停睾审扌傾刕烡管嚻蘊劁嫣罽貭崃溺恅覯佑蓩有瓂品裁喜瘜揺僆竅凊豲勊犎蘡漏蚓崁証栞刹槷襽泥琞埖账罣節夹嶐刋彯垃岌浼裰五罘焀涬糚冈肀挡壕肾劓殈嬒崖蕮稷泍誦蜪胈伺聱橕罽明穴嶏犣兠冰蕢径緑憛禂孁癠沅跭八叻磕苹砑跣壅疐煲稌术丠牁襻份呄脎嗙夂一一㴂


text:  18%|█▊        | 71/384(max) [00:01, 49.21it/s]
code:  21%|██        | 430/2048(max) [00:08, 48.36it/s]


In [20]:
Audio(wav[0], rate=24_000, autoplay=True)

In [21]:
import numpy as np
from scipy.io.wavfile import write

# Assuming `wavs[0]` contains the raw audio data (numpy array) and sample rate
audio_data = wavs[0]  # Your audio data (numpy array)
sample_rate = 24000  # Adjust if necessary

# Save the audio data to a WAV file
write('/content/zero-shot.wav', sample_rate, audio_data)

print("Audio saved as /content/my_audio.wav")

Audio saved as /content/my_audio.wav
