In [6]:
import os
import re
import random
import joblib
import numpy as np
from typing import Tuple

import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from IPython.display import Audio

# set the environment paths
# os.environ["CUDA_HOME"] = "/home/sijie/miniconda3/envs/ATOM39/"
# os.environ['PATH'] += ':/home/sijie/miniconda3/envs/ATOM39/'
# os.environ['PATH'] += ':/home/sijie/miniconda3/envs/ATOM39/bin'

# Run on the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [7]:
# Build the XTTS v2 model from its JSON config and load the checkpoint in eval mode.
config = XttsConfig()
config.load_json("/home/sijie/ATOM-ASR/models/xtts_v2/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/home/sijie/ATOM-ASR/models/xtts_v2", eval=True, use_deepspeed=False)
# TODO: try use_deepspeed=True
# Move the model to the device selected earlier in the notebook rather than calling
# .cuda() unconditionally, which fails on a machine without a GPU even though `device`
# already falls back to "cpu". The trailing ';' suppresses the module repr in the output.
model.to(device);

In [8]:
# def inference(text: str, ref_audio_ids: list, max_ref_audio_num: int = 10) -> Tuple[int, np.ndarray]:
#     # get reference audio paths
#     audio_paths: list = [f"/home/sijie/nlp_voice/{audio_id}_file.m4a" for audio_id in ref_audio_ids]

#     if len(audio_paths) > max_ref_audio_num:
#         audio_paths = random.sample(audio_paths, max_ref_audio_num)
    
#     # compute speaker latents
#     gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=audio_paths)

#     # https://docs.coqui.ai/en/latest/models/xtts.html
#     # inference parameters
#     # text: The text to be synthesized.
#     # language: The language of the text to be synthesized.
#     # gpt_cond_latent: The latent vector you get with get_conditioning_latents. (You can cache for faster inference with same speaker)
#     # speaker_embedding: The speaker embedding you get with get_conditioning_latents. (You can cache for faster inference with same speaker)
#     # temperature: The softmax temperature of the autoregressive model. Defaults to 0.65.
#     # length_penalty: A length penalty applied to the autoregressive decoder. Higher settings causes the model to produce more terse outputs. Defaults to 1.0.
#     # repetition_penalty: A penalty that prevents the autoregressive decoder from repeating itself during decoding. Can be used to reduce the incidence of long silences or “uhhhhhhs”, etc. Defaults to 2.0.
#     # top_k: Lower values mean the decoder produces more “likely” (aka boring) outputs. Defaults to 50.
#     # top_p: Lower values mean the decoder produces more “likely” (aka boring) outputs. Defaults to 0.8.
#     # speed: The speed rate of the generated audio. Defaults to 1.0. (can produce artifacts if far from 1.0)
#     # enable_text_splitting: Whether to split the text into sentences and generate audio for each sentence. It allows you to have infinite input length but might lose important context between sentences. Defaults to True.
    
#     # inference
#     outputs = model.inference(
#         text=text,
#         language="zh-cn",
#         gpt_cond_latent=gpt_cond_latent,
#         speaker_embedding=speaker_embedding,
#         temperature=0.7, # Add custom parameters here
#         enable_text_splitting=False
#     )

#     return 24_000, outputs


def inference(text: str, ref_audio_ids: list, max_ref_audio_num: int = 10, max_chunk_chars: int = 82) -> Tuple[int, np.ndarray, list[int]]:
    """Synthesize ``text`` as zh-cn speech conditioned on a speaker's reference clips.

    Args:
        text: Text to synthesize. Every full-width period ("。") is removed first.
        ref_audio_ids: IDs of the speaker's reference clips; each ID maps to
            ``/home/sijie/nlp_voice/{id}_file.m4a``.
        max_ref_audio_num: Maximum number of reference clips to condition on. When
            more IDs are supplied, a random subset of this size is drawn.
        max_chunk_chars: Maximum number of characters passed to the model per call.
            The default of 82 is the value the notebook originally hard-coded
            (presumably XTTS's zh-cn character limit -- TODO confirm).

    Returns:
        A ``(sample_rate, waveform, used_ids)`` tuple: the fixed sample rate 24000,
        the concatenation of the per-chunk waveforms (an empty array when nothing
        was synthesized), and the reference IDs that were actually used.

    Raises:
        ValueError: If ``max_chunk_chars`` is smaller than 1.
    """
    if max_chunk_chars < 1:
        raise ValueError(f"max_chunk_chars must be >= 1, got {max_chunk_chars}")

    # Sample the IDs themselves and derive the paths from them afterwards. This avoids
    # regex-parsing the IDs back out of the generated paths, which failed for any
    # non-numeric ID and returned int-coerced IDs only on the sampled branch.
    ref_audio_ids = list(ref_audio_ids)
    if len(ref_audio_ids) > max_ref_audio_num:
        ref_audio_ids = random.sample(ref_audio_ids, max_ref_audio_num)
    audio_paths: list = [f"/home/sijie/nlp_voice/{audio_id}_file.m4a" for audio_id in ref_audio_ids]

    # Compute the speaker latents once; they are reused for every chunk.
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=audio_paths)

    # Drop full-width periods to avoid repetition (original author's note), then cut
    # the text into fixed-size chunks of at most max_chunk_chars characters.
    text = text.replace("。", "")
    text_chunks = [text[start:start + max_chunk_chars] for start in range(0, len(text), max_chunk_chars)]

    outputs = []
    for chunk in text_chunks:
        # Run the model on each chunk independently.
        output = model.inference(
            text=chunk,
            language="zh-cn",
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.7,  # custom sampling parameters go here
            enable_text_splitting=False
        )
        # Best effort: a result without a 'wav' entry contributes no audio.
        if isinstance(output, dict) and 'wav' in output:
            outputs.append(output['wav'])

    # Join the per-chunk waveforms into a single array.
    combined_output = np.concatenate(outputs) if outputs else np.array([])

    return 24_000, combined_output, ref_audio_ids

In [11]:
# Load the text data and generate audio
import json
import pandas as pd
import scipy.io.wavfile as wavfile
from tqdm import tqdm


# Preprocessed label data (translated from the original note, which credits "钟声",
# presumably a teammate's name). The "Unnamed: 0" column is dropped right after loading.
fake_labels = pd.read_csv("/home/sijie/ATOM-ASR/data/processed/processed_data_20240530_1.csv").drop("Unnamed: 0", axis=1)

# Speaker metadata; each speaker's audio clips serve as reference audio for synthesis.
with open("/home/sijie/ATOM-ASR/data/xtts_v2_data/speakers.json", encoding="utf-8") as f:
    speakers = json.load(f)

fake_labels.head()

Unnamed: 0,uuid,generate
0,1716978667_bbb100c4-6009-4fa0-9a1a-5cf0b172bdb3,喜欢 始祖鸟 品牌，钟情于 白 棋盘格 纹路，偏好 貂皮 材质，对 巴比龙 别名情有独钟，...
1,1716978667_78bf0a96-e032-4462-a9f1-b3a2f469efc7,点年 轻 男士，爱好健身，爱好时尚，爱好阅读，喜欢乔治 阿玛尼 ，喜欢 葆蝶家 ，买过男表，...
2,1716978667_5ff09a4c-6626-40c8-8f76-34022dc767e5,喜欢 马鞍 包 ，搭配 漆皮 材质，追求 流行款 。
3,1716978667_997ae1ef-1b9c-4eff-9715-388a0a30c778,她选择了 max mara 的 羊绒 大衣 ，搭配了 千禧年缎面 的 围巾 ，展现出优雅的气质。
4,1716978667_2060e024-584b-40f8-a1f3-c4a9567421ab,顾客偏好小巧设计，顾客偏好 白 棋盘格 纹路，顾客偏好 羊皮 材质，顾客偏好 云朵 包 ...


In [12]:
# Generation log: parallel lists with one entry per WAV file that was actually written.
log_dict = {
    "uuid": [],
    "text": [],
    "caid": [],
    "audio_ids": [],
}

for _idx, row in tqdm(fake_labels.iterrows(), total=fake_labels.shape[0], desc="Processing rows"):
    uuid: str = row["uuid"]
    text: str = row["generate"]

    # Pick a random speaker; its clip IDs are the candidate reference audio.
    speaker: dict = random.choice(speakers)
    caid: str = speaker["caid"]
    audio_ids: list[int] = speaker["id"]

    # Synthesize; inference() may subsample the reference clips and returns the IDs it used.
    sr, outputs, new_audio_ids = inference(text, audio_ids)

    # Save the waveform under the row's uuid.
    wavfile.write(f"/home/sijie/ATOM-ASR/data/xtts_v2_data/output_wav/{uuid}.wav", sr, outputs)

    # Record the row only after the file exists, so an interrupted or failed iteration
    # never leaves a log entry that has no matching audio file.
    log_dict["uuid"].append(uuid)
    log_dict["text"].append(text)
    log_dict["caid"].append(caid)
    log_dict["audio_ids"].append(new_audio_ids)

Processing rows:  23%|████████████████████▎                                                                    | 457/2000 [11:37<39:13,  1.53s/it]


KeyboardInterrupt: 

In [None]:
# Persist the generation log (the uuid / text / caid / audio_ids lists collected in
# log_dict) as a pickle file, written relative to the current working directory.
joblib.dump(log_dict, "log_dict_20240530.pkl")

In [24]:
# # https://github.com/coqui-ai/TTS/discussions/3197
# sr, out = inference("客户30岁左右，追求时尚，购买了 delvaus 的 烧卖 包  ，偏爱 白 棋盘格  纹路，材质为 漆皮 ，同时选购了 nano gram系列的护照夹作为旅行伴侣hello world", [
#             726874,
#             726098,
#             726099,
#             724010,
#             723641,
#             689979,
#             698618,
#             698829,
#             714824,
#             715058,
#             715520,
#             690967,
#             691026,
#             691040,
#             691041,
#             691042,
#             691043,
#             691095,
#             691261,
#             691267,
#             691268,
#             691273,
#             691301,
#             691603,
#             691751,
#             692134,
#             692135,
#             692146,
#             692175,
#             692283,
#             692339,
#             692340,
#             692343,
#             692346,
#             692359,
#             692360,
#             692362,
#             692394,
#             692395,
#             692482,
#             692483
#         ])

# Audio(out["wav"], rate=sr)



In [38]:
# print("Computing speaker latents...")


# outputs = model.synthesize(
#     "我喜欢Gucci",
#     config,
#     speaker_wav="xtts_v2/samples/zh-cn-sample.wav",
#     gpt_cond_len=3,
#     language="zh-cn",
# )

# outputs = model.inference(
#     "c",
#     "zh-cn",
#     gpt_cond_latent,
#     speaker_embedding,
#     temperature=0.7, # Add custom parameters here
# )

Computing speaker latents...


In [37]:
# `outputs` is a dict with a "wav" entry when it comes from a direct model call, but a
# bare waveform array after the batch-generation loop (inference() returns the
# concatenated array). Accept both so this cell does not fail on the array form.
wav_to_save = outputs["wav"] if isinstance(outputs, dict) else outputs
# unsqueeze(0) adds a leading dimension to the waveform before saving at 24000 Hz.
torchaudio.save("xtts.wav", torch.tensor(wav_to_save).unsqueeze(0), 24000)