# Loading Models

In [None]:
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio

import os
from tqdm import tqdm
from glob import glob
import random
import json
from IPython.display import Audio, display


def display_audio_path(wav_path):
    """Show an inline audio player built from ``Audio(wav_path)``."""
    player = Audio(wav_path)
    display(player)


def display_audio(wav, rate=24000):
    """Show an inline audio player for in-memory audio.

    ``wav`` is handed to ``Audio`` unchanged together with ``rate``
    (24000 unless the caller overrides it).
    """
    player = Audio(wav, rate=rate)
    display(player)

In [None]:
# The CosyVoice (v1) loader is left commented out, so no `cosyvoice` object
# exists after this cell runs.
# NOTE(review): any cell that calls infer_tts(..., model_name="cosyvoice")
# needs the next line uncommented first.
# cosyvoice = CosyVoice("pretrained_models/CosyVoice-300M")

# Load the CosyVoice2 model that the inference helpers below read as a
# module-level global.
cosyvoice2 = CosyVoice2(
    "pretrained_models/CosyVoice2-0.5B", load_jit=True, load_onnx=False, load_trt=False
)

In [5]:
# cosyvoice = CosyVoice("pretrained_models/CosyVoice-300M")

In [6]:
def infer_tts(
    target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
):
    """Run zero-shot TTS for ``target_text`` conditioned on a prompt recording.

    Args:
        target_text: Text to synthesize.
        prompt_speech_wav_path: Path of the prompt recording; it is read with
            ``load_wav(path, 16000)`` (presumably resampling to 16 kHz --
            confirm against ``load_wav``).
        prompt_text: Text passed to the model together with the prompt audio.
        model_name: ``"cosyvoice"`` or ``"cosyvoice2"``. Selects the
            notebook-level model object with the same name.

    Returns:
        The ``"tts_speech"`` entry of the first item yielded by the model's
        ``inference_zero_shot`` call, or ``None`` if nothing is yielded.

    Raises:
        ValueError: If ``model_name`` is not a supported model name.
        NameError: If the selected model object has not been created in the
            current session (e.g. its loader cell is commented out).
    """
    # Per-model keyword arguments on top of the shared `stream=False`.
    if model_name == "cosyvoice":
        extra_kwargs = {}
    elif model_name == "cosyvoice2":
        extra_kwargs = {"text_frontend": False}
    else:
        raise ValueError(
            f"Unsupported model_name {model_name!r}; "
            "expected 'cosyvoice' or 'cosyvoice2'."
        )

    # Resolve the model lazily and fail fast with an actionable message,
    # before spending time on loading the prompt audio.
    model = globals().get(model_name)
    if model is None:
        raise NameError(
            f"Model '{model_name}' is not loaded; run (or uncomment) the cell "
            "that creates it before calling infer_tts."
        )

    prompt_speech_16k = load_wav(prompt_speech_wav_path, 16000)
    outputs = model.inference_zero_shot(
        target_text,
        prompt_text,
        prompt_speech_16k,
        stream=False,
        **extra_kwargs,
    )
    # Only the first yielded item is consumed, as in the original loop.
    for output in outputs:
        return output["tts_speech"]
    return None

In [21]:
def infer_instruct_tts(
    target_text, prompt_speech_wav_path, instruction, model_name="cosyvoice2"
):
    """Run instruction-guided TTS with the notebook-level ``cosyvoice2`` model.

    Args:
        target_text: Text to synthesize.
        prompt_speech_wav_path: Path of the prompt recording; it is read with
            ``load_wav(path, 16000)`` (presumably resampling to 16 kHz --
            confirm against ``load_wav``).
        instruction: Instruction string forwarded to
            ``cosyvoice2.inference_instruct2``.
        model_name: Must be ``"cosyvoice2"``, the only model this helper
            supports.

    Returns:
        The ``"tts_speech"`` entry of the first item yielded by
        ``inference_instruct2``, or ``None`` if nothing is yielded.

    Raises:
        ValueError: If ``model_name`` is anything other than ``"cosyvoice2"``
            (previously this case silently returned ``None``).
    """
    if model_name != "cosyvoice2":
        raise ValueError(
            f"Unsupported model_name {model_name!r}; "
            "instructed TTS is only available for 'cosyvoice2'."
        )

    prompt_speech_16k = load_wav(prompt_speech_wav_path, 16000)
    outputs = cosyvoice2.inference_instruct2(
        target_text,
        instruction,
        prompt_speech_16k,
        stream=False,
        text_frontend=False,
    )
    # Only the first yielded item is consumed, as in the original loop.
    for output in outputs:
        return output["tts_speech"]
    return None

# Samples

In [None]:
# Directory that contains the evaluation-set folders joined onto it below
# ("seedtts_en" and "seedtts_zh").
# NOTE(review): machine-specific absolute path -- adjust before running on
# another machine.
root = "/storage/zhangxueyao/workspace/SpeechGenerationYC/EvalSet/tts/"


def get_text_wav_pairs(root, min_duration=5):
    """Collect ``(text, wav_path)`` pairs from an evaluation-set directory.

    Reads ``<root>/evalset.json``, a list of items that each carry an
    ``"input"`` and a ``"prompt"`` entry with ``"duration"``, ``"text"`` and
    ``"uid"`` fields. Every entry that is long enough contributes one pair
    whose wav path is ``<root>/wav/<uid>.wav``.

    Args:
        root: Directory containing ``evalset.json`` and the ``wav`` folder.
        min_duration: Entries whose ``"duration"`` is below this value are
            skipped (presumably seconds -- confirm against the dataset).
            Defaults to 5, the previously hard-coded threshold.

    Returns:
        A list of ``(text, wav_path)`` tuples in file order, with each item's
        ``"input"`` entry before its ``"prompt"`` entry.
    """
    with open(os.path.join(root, "evalset.json"), "r", encoding="utf-8") as f:
        evalset = json.load(f)

    wav_dir = os.path.join(root, "wav")
    pairs = []
    for item in evalset:
        for role in ("input", "prompt"):
            entry = item[role]
            if entry["duration"] < min_duration:
                continue
            pairs.append((entry["text"], os.path.join(wav_dir, f"{entry['uid']}.wav")))
    return pairs


# Build the English and Chinese (text, wav_path) pools, then show their sizes.
en_pairs, zh_pairs = [
    get_text_wav_pairs(os.path.join(root, subset))
    for subset in ("seedtts_en", "seedtts_zh")
]

len(en_pairs), len(zh_pairs)

In [None]:
# Peek at the first (text, wav_path) pair of the English pool.
en_pairs[0]

In [8]:
# Short Chinese sentences with embedded English words, grouped by topic.
code_switching_examples = [
    # Workplace
    "我待会要去开个meeting",
    "这个project的deadline是下周五",
    "你要不要来我们team?",
    "我们先sync一下进度吧",
    "麻烦你发个email给我",
    # Study
    "我今天要去library学习",
    "这道题太difficult了",
    "下周有个presentation要准备",
    "我的paper被reject了",
    "你的GPA多少？",
    # Daily life
    "我们weekend去shopping吧",
    "这家restaurant的food很nice",
    "等我five minutes",
    "这个party太high了",
    "你要不要去gym运动？",
    # Social media
    "记得给我like和follow",
    "这个post太cute了",
    "我们来take个photo",
    "等下share给你",
    "这个trending太火了",
    # Technology
    "我的phone没电了",
    "你用什么app打车？",
    "这个website打不开",
    "记得backup你的文件",
    "我的laptop坏了",
]

# Longer, multi-clause Chinese/English code-switching sentences, grouped by topic.
long_code_switching_examples = [
    # Work scenarios
    "这个quarter我们team的performance很好，完成了所有的KPI，manager说年底会有special bonus，大家都很excited！",
    "我刚刚参加完一个important的meeting，team leader说我们要开始一个new project，deadline很紧，需要大家一起brainstorm一下。",
    # Study scenarios
    "我最近在准备申请PhD program，已经写好了personal statement和research proposal，但是recommendation letter还没有搞定，好stressed啊。",
    "上周的presentation我表现得不太好，professor说我的research方向需要adjust，还要补充更多的literature review和case study。",
    # Social scenarios
    "上周末我们去了一个super nice的rooftop bar，view很漂亮，cocktails很special，而且DJ放的music很好，整个atmosphere都很perfect！",
    "我follow的那个fashion blogger今天发了一个new post，她share了很多shopping tips，而且还做了一个try-on haul，感觉很useful。",
    # Technology scenarios
    "我的laptop最近总是自动shutdown，可能是system有问题，我先backup了所有的files，然后准备送去repair，希望不要花太多money。",
    "这个new app的user interface设计得很user-friendly，而且features都很practical，最重要的是privacy protection做得很好。",
    # Everyday-life scenarios
    "我们公司附近新开了一家fusion restaurant，他们家的menu很special，combine了Chinese和Western的elements，weekend的brunch很popular。",
    "最近在follow一个fitness blogger的workout plan，每天都要做cardio和strength training，虽然很tough但是效果还不错。",
]

# Code-switching sentences that embed domain-specific English terminology,
# grouped by domain.
difficult_code_switching_examples = [
    # Technical domain
    "我们最近在做machine learning的optimization，用了regularization和cross-validation的方法，但是accuracy还是不够satisfactory。",
    "这个neural network的architecture太complicated了，需要调整hyperparameters，还要处理overfitting的问题。",
    # Medicine and health
    "医生说我的cardiovascular system需要attention，建议我做一些rehabilitation exercises，还要控制cholesterol的摄入。",
    "最近做了个comprehensive physical examination，显示immunology indicators都很normal，但是metabolism可能有点问题。",
    # Finance and investment
    "这个cryptocurrency的volatility太高了，建议做好risk management，可以考虑portfolio diversification来hedge风险。",
    "最近market很不stable，很多institutional investors都在做strategic adjustment，retail investors要特别cautious。",
    # Academic research
    "我的dissertation focus在quantum computing和artificial intelligence的intersection，特别是quantum machine learning的application。",
    "这个methodology还需要further validation，preliminary results看起来promising，但statistical significance不够。",
    # Business management
    "我们需要重新evaluate整个supply chain management system，特别是inventory optimization和logistics efficiency方面。",
    "根据market analysis，我们的competitive advantage在technological innovation，但organizational structure需要restructuring。",
    # Environmental technology
    "这个sustainability project主要研究biodegradable materials在environmental protection中的application。",
    "我们在做renewable energy的feasibility study，特别关注photovoltaic technology和wind turbine的integration。",
]

In [None]:
# Load the demo test cases stored under the "test_cases" key of the JSON file.
# NOTE(review): machine-specific absolute path -- adjust before running on
# another machine.
with open("/storage/wyc/eval/test_data_short.json", "r", encoding="utf-8") as f:
    seed_demo_cases = json.load(f)["test_cases"]

# Show how many cases were loaded.
len(seed_demo_cases)

In [None]:
# Split the demo cases into English and Chinese (text, wav_path) pools by
# their "language" field, then show the pool sizes.
seed_demo_en_pairs, seed_demo_zh_pairs = [
    [
        (case["text"], case["wav_path"])
        for case in seed_demo_cases
        if case["language"] == lang
    ]
    for lang in ("en", "zh")
]

len(seed_demo_en_pairs), len(seed_demo_zh_pairs)

# Cross-lingual, Accented?

In [None]:
# 1. Chinese prompt, English target
# The prompt and target are drawn with random.choice and no seed is set in
# this cell, so each run may pick a different pair.

# Alternative pools (disabled): sample from the evaluation sets instead.
# prompt_text, prompt_speech_wav_path = random.choice(zh_pairs)
# target_text, _ = random.choice(en_pairs)

prompt_text, prompt_speech_wav_path = random.choice(seed_demo_zh_pairs)
target_text, _ = random.choice(seed_demo_en_pairs)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# CosyVoice1 synthesis is disabled here (its model is not loaded above).
# print("[Synthesized by CosyVoice1]")
# print("Target Text: ", target_text)
# audio = infer_tts(target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice")
# display_audio(audio)

# Synthesize the English target with the Chinese prompt and play the result.
print("[Synthesized by CosyVoice2]")
print("Target Text: ", target_text)
audio = infer_tts(
    target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
)
display_audio(audio)

In [None]:
# CosyVoice1 comparison for the prompt/target pair left in the namespace by
# the previous cell. The CosyVoice1 loader is commented out in the
# model-loading cells, so on a fresh kernel there is no `cosyvoice` object and
# calling infer_tts(..., model_name="cosyvoice") raises NameError, aborting
# "Run All". Guard the comparison so the cell is skipped cleanly instead.
if "cosyvoice" in globals():
    print("-" * 20)
    print("[Prompt]", prompt_text)
    display_audio_path(prompt_speech_wav_path)
    print("-" * 20)

    print("[Synthesized by CosyVoice1]")
    print("Target Text: ", target_text)
    audio = infer_tts(
        target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice"
    )
    display_audio(audio)
else:
    print(
        "[Skipped] CosyVoice1 is not loaded; uncomment the CosyVoice loader "
        "cell to run this comparison."
    )

In [None]:
# 2. English prompt, Chinese target
# The prompt and target are drawn with random.choice and no seed is set in
# this cell, so each run may pick a different pair.

prompt_text, prompt_speech_wav_path = random.choice(en_pairs)
target_text, _ = random.choice(zh_pairs)

# Alternative pools (disabled): sample from the demo cases instead.
# prompt_text, prompt_speech_wav_path = random.choice(seed_demo_en_pairs)
# target_text, _ = random.choice(seed_demo_zh_pairs)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Synthesize with infer_tts's default model ("cosyvoice2") and play the result.
print("[Synthesized]")
print("Target Text: ", target_text)
audio = infer_tts(target_text, prompt_speech_wav_path, prompt_text)
display_audio(audio)

# Code-switching

In [53]:
def create_code_switching_text(zh_text, en_text):
    """Interleave the phrases of a Chinese and an English sentence.

    Full stops ("。" / ".") are removed from each text, which is then split on
    its comma ("，" / ","); blank phrases are dropped. Phrases are emitted
    alternately, Chinese first, until both lists are exhausted, joined with
    single spaces and terminated with "。".
    """

    def _phrases(text, full_stop, comma):
        # Remove sentence-final punctuation, split on commas, drop blanks.
        stripped = (chunk.strip() for chunk in text.replace(full_stop, "").split(comma))
        return [chunk for chunk in stripped if chunk]

    zh_chunks = _phrases(zh_text, "。", "，")
    en_chunks = _phrases(en_text, ".", ",")

    interleaved = []
    for idx in range(max(len(zh_chunks), len(en_chunks))):
        # A one-element slice is empty once idx runs past the shorter list.
        interleaved.extend(zh_chunks[idx : idx + 1])
        interleaved.extend(en_chunks[idx : idx + 1])

    return " ".join(interleaved) + "。"

In [None]:
# 1. English prompt
# Pick one code-switching target sentence and synthesize it twice: first with
# an English prompt, then (below) with a Chinese prompt. Sampling is unseeded,
# so each run may pick different items.

prompt_text, prompt_speech_wav_path = random.choice(en_pairs)
target_text = random.choice(difficult_code_switching_examples)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Synthesize with infer_tts's default model ("cosyvoice2") and play the result.
print("[Synthesized]")
print("Target Text: ", target_text)
audio = infer_tts(target_text, prompt_speech_wav_path, prompt_text)
display_audio(audio)

# Visual separator between the two runs.
print()
print("*" * 20)
print()

# 2. Chinese prompt, same target_text as above
prompt_text, prompt_speech_wav_path = random.choice(zh_pairs)
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

print("[Synthesized]")
print("Target Text: ", target_text)
audio = infer_tts(target_text, prompt_speech_wav_path, prompt_text)
display_audio(audio)

# Cross-emotions

In [14]:
# Chinese sentences with sad content, grouped by the kind of sadness expressed.
sad_examples = [
    # Sense of loss
    "我的心里空落落的,什么都提不起劲。",
    "这种无助的感觉让我喘不过气来。",
    "我真的很难过,眼泪止不住地往下流。",
    # Regret and remorse
    "如果当初我做出不同的选择,结局会不会不一样。",
    "那些错过的机会,现在想起来还是很遗憾。",
    "我好后悔没能在最后一刻说出那句话。",
    # Longing and missing someone
    "想起你的时候,心里就像被针扎一样疼。",
    "每当夜深人静的时候,思念就会涌上心头。",
    "我多希望这一切都只是一场梦。",
    # Loneliness
    "在人群中我依然感到无比孤单。",
    "这个世界这么大,却没有一个人能懂我的心。",
    "有时候觉得自己就像一座孤岛。",
    # Despair
    "我感觉人生已经失去了所有的色彩。",
    "不知道要怎么继续走下去了。",
    "这种痛苦什么时候才能结束。",
]

## Zero-shot TTS

In [None]:
# Chinese demo prompt + a randomly chosen sad sentence as the target.
# Sampling is unseeded, so each run may pick a different prompt and target.
prompt_text, prompt_speech_wav_path = random.choice(seed_demo_zh_pairs)
target_text = random.choice(sad_examples)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Zero-shot synthesis with CosyVoice2, then play the result.
print("[Synthesized by CosyVoice2]")
print("Target Text: ", target_text)
audio = infer_tts(
    target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
)
display_audio(audio)

In [None]:
# Second draw of the same experiment as the preceding cell: a fresh random
# Chinese demo prompt and a fresh random sad target sentence.
prompt_text, prompt_speech_wav_path = random.choice(seed_demo_zh_pairs)
target_text = random.choice(sad_examples)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Zero-shot synthesis with CosyVoice2, then play the result.
print("[Synthesized by CosyVoice2]")
print("Target Text: ", target_text)
audio = infer_tts(
    target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
)
display_audio(audio)

In [None]:
# English prompt (from the evaluation set) + a randomly chosen Chinese sad
# sentence as the target. Sampling is unseeded.
prompt_text, prompt_speech_wav_path = random.choice(en_pairs)
target_text = random.choice(sad_examples)

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Zero-shot synthesis with CosyVoice2, then play the result.
print("[Synthesized by CosyVoice2]")
print("Target Text: ", target_text)
audio = infer_tts(
    target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
)
display_audio(audio)

## Instructed TTS

In [None]:
# NOTE(review): prompt_text and prompt_speech_wav_path are NOT assigned in this
# cell (the sampling line below is commented out); the cell reuses whatever
# values the most recently executed earlier cell left in the namespace.
# prompt_text, prompt_speech_wav_path = random.choice(seed_demo_zh_pairs)
# target_text = random.choice(sad_examples)
# target_text = " ".join(sad_examples[:3])
# Fixed target that strings together a sad, a happy and an angry sentence.
target_text = "我真的很难过,眼泪止不住地往下流。太棒了！我简直开心得要飞起来了！我简直气炸了！这种事情怎么可以发生！"

# Show the prompt transcript and play the prompt recording.
print("-" * 20)
print("[Prompt]", prompt_text)
display_audio_path(prompt_speech_wav_path)
print("-" * 20)

# Zero-shot baseline for the same target (disabled).
# print("[Synthesized by CosyVoice2, zero-shot TTS]")
# print("Target Text: ", target_text)
# audio = infer_tts(
#     target_text, prompt_speech_wav_path, prompt_text, model_name="cosyvoice2"
# )
# display_audio(audio)

# Instructed synthesis; the instruction string means "please speak very fast".
print("[Synthesized by CosyVoice2, Instruct TTS]")
print("Target Text: ", target_text)
audio = infer_instruct_tts(
    target_text, prompt_speech_wav_path, "请你非常快速地说", model_name="cosyvoice2"
)
display_audio(audio)