diff --git a/GPT_SoVITS/text/symbols.py b/GPT_SoVITS/text/symbols.py
index 70499492c..b85c57a02 100644
--- a/GPT_SoVITS/text/symbols.py
+++ b/GPT_SoVITS/text/symbols.py
@@ -398,4 +398,5 @@ symbols = [pad] + c + v + ja_symbols + pu_symbols + list(arpa)
 symbols = sorted(set(symbols))
 if __name__ == "__main__":
+    print(symbols)
     print(len(symbols))
diff --git a/Ref_Audio_Selector/__init__.py b/Ref_Audio_Selector/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/Ref_Audio_Selector/common/__init__.py b/Ref_Audio_Selector/common/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/Ref_Audio_Selector/common/common.py b/Ref_Audio_Selector/common/common.py
new file mode 100644
index 000000000..5957fc3fe
--- /dev/null
+++ b/Ref_Audio_Selector/common/common.py
@@ -0,0 +1,156 @@
+from tools import my_utils
+from config import python_exec, is_half
+import subprocess
+import sys
+import os
+
+
+class RefAudioListManager:
+    def __init__(self, root_dir):
+        self.audio_dict = {'default': []}
+        absolute_root = os.path.abspath(root_dir)
+
+        for subdir, dirs, files in os.walk(absolute_root):
+            relative_path = os.path.relpath(subdir, absolute_root)
+
+            if relative_path == '.':
+                category = 'default'
+            else:
+                category = relative_path.replace(os.sep, '')
+
+            for file in files:
+                if file.endswith('.wav'):
+                    # Build the absolute path of the audio file
+                    audio_abs_path = os.path.join(subdir, file)
+                    if category not in self.audio_dict:
+                        self.audio_dict[category] = []
+                    self.audio_dict[category].append(audio_abs_path)
+
+    def get_audio_list(self):
+        return self.audio_dict
+
+    def get_flattened_audio_list(self):
+        all_audio_files = []
+        for category_audios in self.audio_dict.values():
+            all_audio_files.extend(category_audios)
+        return all_audio_files
+
+    def get_ref_audio_list(self):
+        audio_info_list = []
+        for category, audio_paths in self.audio_dict.items():
+            for audio_path in audio_paths:
+                filename_without_extension = os.path.splitext(os.path.basename(audio_path))[0]
+                audio_info = {
+                    'emotion': f"{category}-{filename_without_extension}",
+                    'ref_path': audio_path,
+                    'ref_text': filename_without_extension,
+                }
+                audio_info_list.append(audio_info)
+        return audio_info_list
+
+
+def batch_clean_paths(paths):
+    """
+    Process a list of paths in one batch, calling clean_path() on each of them.
+
+    Parameters:
+        paths (list[str]): the list of paths to process.
+
+    Returns:
+        list[str]: the paths after clean_path() processing.
+    """
+    cleaned_paths = []
+    for path in paths:
+        cleaned_paths.append(my_utils.clean_path(path))
+    return cleaned_paths
+
+
+def read_text_file_to_list(file_path):
+    # Open the file as UTF-8 (so that Chinese text is read correctly)
+    with open(file_path, mode='r', encoding='utf-8') as file:
+        # Read all lines and store them in a list
+        lines = file.read().splitlines()
+    return lines
+
+
+def get_filename_without_extension(file_path):
+    """
+    Given a file path string, returns the file name without its extension.
+
+    Parameters:
+        file_path (str): The full path to the file.
+
+    Returns:
+        str: The file name without its extension.
+ """ + base_name = os.path.basename(file_path) # Get the base name (file name with extension) + file_name, file_extension = os.path.splitext(base_name) # Split the base name into file name and extension + return file_name # Return the file name without extension + + +def read_file(file_path): + # 使用with语句打开并读取文件 + with open(file_path, 'r', encoding='utf-8') as file: # 'r' 表示以读取模式打开文件 + # 一次性读取文件所有内容 + file_content = file.read() + + # 文件在with语句结束时会自动关闭 + # 现在file_content变量中存储了文件的所有文本内容 + return file_content + + +def write_text_to_file(text, output_file_path): + try: + with open(output_file_path, 'w', encoding='utf-8') as file: + file.write(text) + except IOError as e: + print(f"Error occurred while writing to the file: {e}") + else: + print(f"Text successfully written to file: {output_file_path}") + + +def check_path_existence_and_return(path): + """ + 检查给定路径(文件或目录)是否存在。如果存在,返回该路径;否则,返回空字符串。 + :param path: 待检查的文件或目录路径(字符串) + :return: 如果路径存在,返回原路径;否则,返回空字符串 + """ + if os.path.exists(path): + return path + else: + return "" + + +def open_file(filepath): + if sys.platform.startswith('darwin'): + subprocess.run(['open', filepath]) # macOS + elif os.name == 'nt': # For Windows + os.startfile(filepath) + elif os.name == 'posix': # For Linux, Unix, etc. + subprocess.run(['xdg-open', filepath]) + + +def start_new_service(script_path): + # 对于Windows系统 + if sys.platform.startswith('win'): + cmd = f'start cmd /k {python_exec} {script_path}' + # 对于Mac或者Linux系统 + else: + cmd = f'xterm -e {python_exec} {script_path}' + + proc = subprocess.Popen(cmd, shell=True) + + # 关闭之前启动的子进程 + # proc.terminate() + + # 或者如果需要强制关闭可以使用 + # proc.kill() + + return proc + + +if __name__ == '__main__': + dir = r'C:\Users\Administrator\Desktop/test' + dir2 = r'"C:\Users\Administrator\Desktop\test2"' + dir, dir2 = batch_clean_paths([dir, dir2]) + print(dir, dir2) diff --git a/Ref_Audio_Selector/common/model_manager.py b/Ref_Audio_Selector/common/model_manager.py new file mode 100644 index 000000000..b0f43fb45 --- /dev/null +++ b/Ref_Audio_Selector/common/model_manager.py @@ -0,0 +1,46 @@ +import os +import re + +pretrained_sovits_name = "GPT_SoVITS/pretrained_models/s2G488k.pth" +pretrained_gpt_name = "GPT_SoVITS/pretrained_models/s1bert25hz-2kh-longer-epoch=68e-step=50232.ckpt" +SoVITS_weight_root = "SoVITS_weights" +GPT_weight_root = "GPT_weights" +os.makedirs(SoVITS_weight_root, exist_ok=True) +os.makedirs(GPT_weight_root, exist_ok=True) + +speaker_verification_models = { + 'speech_campplus_sv_zh-cn_16k-common': { + 'task': 'speaker-verification', + 'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_campplus_sv_zh-cn_16k-common', + 'model_revision': 'v1.0.0' + }, + 'speech_eres2net_sv_zh-cn_16k-common': { + 'task': 'speaker-verification', + 'model': 'Ref_Audio_Selector/tool/speaker_verification/models/speech_eres2net_sv_zh-cn_16k-common', + 'model_revision': 'v1.0.5' + } +} + +def custom_sort_key(s): + # 使用正则表达式提取字符串中的数字部分和非数字部分 + parts = re.split('(\d+)', s) + # 将数字部分转换为整数,非数字部分保持不变 + parts = [int(part) if part.isdigit() else part for part in parts] + return parts + + +def get_gpt_model_names(): + gpt_names = [pretrained_gpt_name] + for name in os.listdir(GPT_weight_root): + if name.endswith(".ckpt"): gpt_names.append("%s/%s" % (GPT_weight_root, name)) + sorted(gpt_names, key=custom_sort_key) + return gpt_names + + +def get_sovits_model_names(): + sovits_names = [pretrained_sovits_name] + for name in os.listdir(SoVITS_weight_root): + if name.endswith(".pth"): sovits_names.append("%s/%s" % 
(SoVITS_weight_root, name)) + sorted(sovits_names, key=custom_sort_key) + return sovits_names + diff --git a/Ref_Audio_Selector/common/time_util.py b/Ref_Audio_Selector/common/time_util.py new file mode 100644 index 000000000..b58ce3ea6 --- /dev/null +++ b/Ref_Audio_Selector/common/time_util.py @@ -0,0 +1,72 @@ +import time +import os +from Ref_Audio_Selector.config_param.log_config import p_logger +import Ref_Audio_Selector.config_param.config_params as params + + +def timeit_decorator(func): + """ + 装饰器,用于计算被装饰函数的执行时间。 + + 参数: + func (function): 要计时的函数。 + + 返回: + function: 包含计时功能的新函数。 + """ + + def wrapper(*args, **kwargs): + if params.time_log_print_type != 'file': + return func(*args, **kwargs) + + start_time = time.perf_counter() # 使用 perf_counter 获取高精度计时起点 + + func_result = func(*args, **kwargs) # 执行原函数 + + end_time = time.perf_counter() # 获取计时终点 + elapsed_time = end_time - start_time # 计算执行耗时 + + # 记录日志内容 + log_message = f"进程ID: {os.getpid()}, {func.__name__} 执行耗时: {elapsed_time:.6f} 秒" + p_logger.info(log_message) + + return func_result + + return wrapper + + +def time_monitor(func): + """ + 返回结果,追加时间 + """ + + def wrapper(*args, **kwargs): + + start_time = time.perf_counter() # 使用 perf_counter 获取高精度计时起点 + + func_result = func(*args, **kwargs) # 执行原函数 + + end_time = time.perf_counter() # 获取计时终点 + elapsed_time = end_time - start_time # 计算执行耗时 + + return elapsed_time, func_result + + return wrapper + + +# 使用装饰器 +@timeit_decorator +def example_function(n): + time.sleep(n) # 假设这是需要计时的函数,这里模拟耗时操作 + return n * 2 + + +def example_function2(n): + time.sleep(n) # 假设这是需要计时的函数,这里模拟耗时操作 + return n * 2 + + +if __name__ == "__main__": + # 调用经过装饰的函数 + # result = example_function(2) + print(time_monitor(example_function2)(2)) diff --git a/Ref_Audio_Selector/config.ini b/Ref_Audio_Selector/config.ini new file mode 100644 index 000000000..b4658637b --- /dev/null +++ b/Ref_Audio_Selector/config.ini @@ -0,0 +1,57 @@ +# config.ini + +[Base] +# 服务端口号 +server_port = 9423 +# 参考音频目录 +reference_audio_dir = refer_audio +# 临时文件目录 +temp_dir = Ref_Audio_Selector/temp + +[Log] +# 日志保存目录路径 +log_dir = Ref_Audio_Selector/log/general +# 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET、 +log_level = INFO +# 函数时间消耗日志打印类型 file 打印到文件; close 关闭 +time_log_print_type = file +# 函数时间消耗日志保存目录路径 +time_log_print_dir = Ref_Audio_Selector/log/performance + +[AudioSample] +# list转换待选参考音频目录 +list_to_convert_reference_audio_dir = refer_audio_all +# 音频相似度目录 +audio_similarity_dir = similarity +# 是否开启基准音频预采样 true false +enable_pre_sample = true + +[Inference] +# 默认测试文本位置 +default_test_text_path = Ref_Audio_Selector/file/test_content/test_content.txt +# 推理音频目录 +inference_audio_dir = inference_audio +# 推理音频文本聚合目录 +inference_audio_text_aggregation_dir = text +# 推理音频情绪聚合目录 +inference_audio_emotion_aggregation_dir = emotion + +[ResultCheck] +# asr输出文件 +asr_filename = asr +# 文本相似度输出目录 +text_similarity_output_dir = text_similarity +# 文本情绪平均相似度报告文件名 +text_emotion_average_similarity_report_filename = average_similarity +# 文本相似度按情绪聚合明细文件名 +text_similarity_by_emotion_detail_filename = emotion_group_detail +# 文本相似度按文本聚合明细文件名 +text_similarity_by_text_detail_filename = text_group_detail + +[AudioConfig] +# 默认模板文件位置 +default_template_path = Ref_Audio_Selector/file/config_template/ref_audio_template.txt +# 参考音频配置文件名 +reference_audio_config_filename = refer_audio + +[Other] \ No newline at end of file diff --git a/Ref_Audio_Selector/config_param/__init__.py b/Ref_Audio_Selector/config_param/__init__.py new file mode 100644 index 
diff --git a/Ref_Audio_Selector/config_param/config_manager.py b/Ref_Audio_Selector/config_param/config_manager.py
new file mode 100644
index 000000000..63d23352e
--- /dev/null
+++ b/Ref_Audio_Selector/config_param/config_manager.py
@@ -0,0 +1,111 @@
+import configparser
+import os
+import Ref_Audio_Selector.common.common as common
+
+
+class ParamReadWriteManager:
+    def __init__(self):
+        self.base_dir = 'Ref_Audio_Selector/file/base_info'
+        os.makedirs(self.base_dir, exist_ok=True)
+        # Basic info
+        self.work_dir = 'work_dir'
+        self.role = 'role'
+        # Step 1
+        self.subsection_num = 'subsection_num'
+        self.sample_num = 'sample_num'
+        # Step 2
+        self.api_set_model_base_url = 'api_set_model_base_url'
+        self.api_gpt_param = 'api_gpt_param'
+        self.api_sovits_param = 'api_sovits_param'
+
+        self.api_v2_set_gpt_model_base_url = 'api_v2_set_gpt_model_base_url'
+        self.api_v2_gpt_model_param = 'api_v2_gpt_model_param'
+        self.api_v2_set_sovits_model_base_url = 'api_v2_set_sovits_model_base_url'
+        self.api_v2_sovits_model_param = 'api_v2_sovits_model_param'
+
+        self.text_url = 'text_url'
+        self.text_param = 'text_param'
+        self.refer_type_param = 'refer_type_param'
+        self.ref_path_param = 'ref_path_param'
+        self.ref_text_param = 'ref_text_param'
+        self.emotion_param = 'emotion_param'
+
+        self.test_content_path = 'test_content_path'
+        self.request_concurrency_num = 'request_concurrency_num'
+
+        # Step 3
+        self.text_similarity_amplification_boundary = 'text_similarity_amplification_boundary'
+        # Step 4
+        # Step 5
+        self.text_template = 'text_template'
+
+    def read(self, key):
+        file_path = os.path.join(self.base_dir, key + '.txt')
+        if os.path.exists(file_path):
+            content = common.read_file(file_path)
+            return content.strip()
+        else:
+            return ''
+
+    def write(self, key, content):
+        file_path = os.path.join(self.base_dir, key + '.txt')
+
+        # Make sure the content is a string; convert it if it is not
+        if not isinstance(content, str):
+            clean_content = str(content).strip()  # convert to string and strip leading/trailing whitespace
+        else:
+            clean_content = content.strip()
+
+        common.write_text_to_file(clean_content, file_path)
+
+
+class ConfigManager:
+    def __init__(self):
+        self.config_path = 'Ref_Audio_Selector/config.ini'
+        self.config = configparser.ConfigParser()
+        self.config.read(self.config_path, encoding='utf-8')
+
+    def get_base(self, key):
+        return self.config.get('Base', key)
+
+    def get_log(self, key):
+        return self.config.get('Log', key)
+
+    def get_audio_sample(self, key):
+        return self.config.get('AudioSample', key)
+
+    def get_inference(self, key):
+        return self.config.get('Inference', key)
+
+    def get_result_check(self, key):
+        return self.config.get('ResultCheck', key)
+
+    def get_audio_config(self, key):
+        return self.config.get('AudioConfig', key)
+
+    def get_other(self, key):
+        return self.config.get('Other', key)
+
+    def print(self):
+        # Print the whole configuration
+        for section in self.config.sections():
+            print('[{}]'.format(section))
+            for key in self.config[section]:
+                print('{} = {}'.format(key, self.config[section][key]))
+            print()
+
+
+_config = ConfigManager()
+_param_read_write_manager = ParamReadWriteManager()
+
+
+def get_config():
+    return _config
+
+
+def get_rw_param():
+    return _param_read_write_manager
+
+
+if __name__ == '__main__':
+    # print() already writes the config to stdout and returns None, so it is called directly
+    _config.print()
diff --git a/Ref_Audio_Selector/config_param/config_params.py b/Ref_Audio_Selector/config_param/config_params.py
new file mode 100644
index 000000000..b30924d1b
--- /dev/null
+++ b/Ref_Audio_Selector/config_param/config_params.py
@@ -0,0 +1,58 @@
+import Ref_Audio_Selector.config_param.config_manager as
config_manager + +config = config_manager.get_config() + +# [Base] +# 服务端口号 +server_port = int(config.get_base('server_port')) +# 参考音频目录 +reference_audio_dir = config.get_base('reference_audio_dir') +# 临时文件目录 +temp_dir = config.get_base('temp_dir') + +# [Log] +# 日志保存目录路径 +log_dir = config.get_log('log_dir') +# 日志级别 CRITICAL、FATAL、ERROR、WARNING、WARN、INFO、DEBUG、NOTSET、 +log_level = config.get_log('log_level') +# 函数时间消耗日志打印类型 file 打印到文件; close 关闭 +time_log_print_type = config.get_log('time_log_print_type') +# 函数时间消耗日志保存目录路径 +time_log_print_dir = config.get_log('time_log_print_dir') + +# [AudioSample] +# list转换待选参考音频目录 +list_to_convert_reference_audio_dir = config.get_audio_sample('list_to_convert_reference_audio_dir') +# 音频相似度目录 +audio_similarity_dir = config.get_audio_sample('audio_similarity_dir') +# 是否开启基准音频预采样 true false +enable_pre_sample = config.get_audio_sample('enable_pre_sample') + +# [Inference] +# 默认测试文本位置 +default_test_text_path = config.get_inference('default_test_text_path') +# 推理音频目录 +inference_audio_dir = config.get_inference('inference_audio_dir') +# 推理音频文本聚合目录 +inference_audio_text_aggregation_dir = config.get_inference('inference_audio_text_aggregation_dir') +# 推理音频情绪聚合目录 +inference_audio_emotion_aggregation_dir = config.get_inference('inference_audio_emotion_aggregation_dir') + +# [ResultCheck] +# asr输出文件 +asr_filename = config.get_result_check('asr_filename') +# 文本相似度输出目录 +text_similarity_output_dir = config.get_result_check('text_similarity_output_dir') +# 文本情绪平均相似度报告文件名 +text_emotion_average_similarity_report_filename = config.get_result_check('text_emotion_average_similarity_report_filename') +# 文本相似度按情绪聚合明细文件名 +text_similarity_by_emotion_detail_filename = config.get_result_check('text_similarity_by_emotion_detail_filename') +# 文本相似度按文本聚合明细文件名 +text_similarity_by_text_detail_filename = config.get_result_check('text_similarity_by_text_detail_filename') + +# [AudioConfig] +# 默认模板文件位置 +default_template_path = config.get_audio_config('default_template_path') +# 参考音频配置文件名 +reference_audio_config_filename = config.get_audio_config('reference_audio_config_filename') + diff --git a/Ref_Audio_Selector/config_param/log_config.py b/Ref_Audio_Selector/config_param/log_config.py new file mode 100644 index 000000000..fda5a3c6c --- /dev/null +++ b/Ref_Audio_Selector/config_param/log_config.py @@ -0,0 +1,65 @@ +import logging +import os +import datetime +import Ref_Audio_Selector.config_param.config_params as params + + +def create_general_logger(): + # 获取当前日期,用于文件名和日志内容 + current_date = datetime.datetime.now().strftime('%Y-%m-%d') + + # 创建一个用于控制台输出的处理器,并设置日志级别 + console_handler = logging.StreamHandler() + # console_handler.setLevel(logging.INFO) + # 可以设置控制台输出的格式 + console_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + console_handler.setFormatter(console_formatter) + console_handler.encoding = 'utf-8' # 设置字符编码为utf-8 + + os.makedirs(params.log_dir, exist_ok=True) + + # 创建一个用于常规日志的处理器 + general_handler = logging.FileHandler(f"{params.log_dir}/{current_date}.log", mode='a', encoding='utf-8') + # general_handler.setLevel(logging.INFO) + general_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + general_handler.setFormatter(general_formatter) + + # 配置一个常规的logger + general_logger = logging.getLogger('general') + level = logging.getLevelName(params.log_level) + general_logger.setLevel(level) + general_logger.addHandler(console_handler) + general_logger.addHandler(general_handler) + + # 配置根logger,以防万一 + 
logging.basicConfig(level=logging.WARNING, handlers=[general_handler]) + + return general_logger + + +def create_performance_logger(): + # 获取当前日期,用于文件名和日志内容 + current_date = datetime.datetime.now().strftime('%Y-%m-%d') + + os.makedirs(params.time_log_print_dir, exist_ok=True) + + # 创建一个专用于性能监控日志的处理器 + performance_handler = logging.FileHandler( + f"{params.time_log_print_dir}/{current_date}.log", mode='a', encoding='utf-8') + # performance_handler.setLevel(logging.INFO) + performance_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + performance_handler.setFormatter(performance_formatter) + + # 配置一个专门用于性能监控的logger + performance_logger = logging.getLogger('performance') + performance_logger.setLevel(logging.INFO) + performance_logger.addHandler(performance_handler) + + return performance_logger + + +def setup_logging(): + return create_general_logger(), create_performance_logger() + + +logger, p_logger = setup_logging() diff --git a/Ref_Audio_Selector/file/base_info/role.txt b/Ref_Audio_Selector/file/base_info/role.txt new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/file/base_info/work_dir.txt b/Ref_Audio_Selector/file/base_info/work_dir.txt new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/file/config_template/ref_audio_template.txt b/Ref_Audio_Selector/file/config_template/ref_audio_template.txt new file mode 100644 index 000000000..97142b114 --- /dev/null +++ b/Ref_Audio_Selector/file/config_template/ref_audio_template.txt @@ -0,0 +1,5 @@ +"${emotion}": { + "ref_wav_path": "${ref_path}", + "prompt_text": "${ref_text}", + "prompt_language": "中文" +} \ No newline at end of file diff --git a/Ref_Audio_Selector/file/test_content/test_content.txt b/Ref_Audio_Selector/file/test_content/test_content.txt new file mode 100644 index 000000000..5c3cac36f --- /dev/null +++ b/Ref_Audio_Selector/file/test_content/test_content.txt @@ -0,0 +1,4 @@ +也是只有一次。”白蓉简单地回答,然后迅速转移话锋,搂住罗辑的脖子说,“算了,我不要那生日礼物了,你也回到正常的生活中来,好吗?” +云天明看到那是一条丑陋的虫子,软乎乎湿漉漉的,在她白皙的手指间蠕动着,旁边一个女生尖叫道:恶心死了,你碰它干吗?!程心把虫子轻轻放到旁边的草丛中,说,它在这里会给踩死的。 +“那么多的星星,像雾似的。”云天明感叹道。程心把目光从银河收回,转头看着他,指着下面的校园和城市说:“你看下面也很漂亮啊,我们的生活是在这儿,可不是在那么远的银河里。” +“可我们的专业,不就是为了到地球之外去吗?”“那是为了这里的生活更好,可不是为了逃离地球啊。”云天明当然知道程心的话是委婉地指向他的孤僻和自闭,他也只有默然以对。 \ No newline at end of file diff --git a/Ref_Audio_Selector/ref_audio_selector_webui.py b/Ref_Audio_Selector/ref_audio_selector_webui.py new file mode 100644 index 000000000..d9c8a29f0 --- /dev/null +++ b/Ref_Audio_Selector/ref_audio_selector_webui.py @@ -0,0 +1,1066 @@ +import os.path +import os +import traceback + +import gradio as gr + +from Ref_Audio_Selector.config_param.log_config import logger + +import Ref_Audio_Selector.common.model_manager as model_manager +import Ref_Audio_Selector.tool.audio_sample as audio_sample +import Ref_Audio_Selector.tool.audio_inference as audio_inference +import Ref_Audio_Selector.tool.audio_config as audio_config +import Ref_Audio_Selector.tool.audio_check as audio_check +import Ref_Audio_Selector.tool.text_check as text_check +import Ref_Audio_Selector.common.common as common +import Ref_Audio_Selector.config_param.config_params as params +import Ref_Audio_Selector.common.time_util as time_util +import Ref_Audio_Selector.ui_init.init_ui_param as init + +from tools.i18n.i18n import I18nAuto +from config import python_exec, is_half +from tools import my_utils +from tools.asr.config import asr_dict +from subprocess import Popen + +i18n = I18nAuto() +rw_param = params.config_manager.get_rw_param() + +p_similarity = None 
+p_asr = None +p_text_similarity = None + + +# 校验基础信息 +def check_base_info(text_work_space_dir, text_role): + if text_work_space_dir is None or text_work_space_dir == '': + raise Exception("工作目录不能为空") + if text_role is None or text_role == '': + raise Exception("角色不能为空") + base_role_dir = os.path.join(text_work_space_dir, text_role) + # 判断目录是否存在 + if not os.path.exists(base_role_dir): + # 如果不存在,则创建目录 + os.makedirs(base_role_dir, exist_ok=True) + return base_role_dir + + +# 从list文件,提取参考音频 +def convert_from_list(text_work_space_dir, text_role, text_list_input): + text_work_space_dir, text_list_input = common.batch_clean_paths([text_work_space_dir, text_list_input]) + + text_convert_from_list_info = None + text_sample_dir = None + try: + base_role_dir = check_base_info(text_work_space_dir, text_role) + if text_list_input is None or text_list_input == '': + raise Exception("list文件路径不能为空") + + ref_audio_all = os.path.join(base_role_dir, + params.list_to_convert_reference_audio_dir) + + time_consuming, _ = time_util.time_monitor(audio_sample.convert_from_list)(text_list_input, ref_audio_all) + + text_convert_from_list_info = f"耗时:{time_consuming:0.1f}秒;转换成功:生成目录{ref_audio_all}" + text_sample_dir = ref_audio_all + + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_convert_from_list_info = f"发生异常:{e}" + text_sample_dir = '' + return i18n(text_convert_from_list_info), text_sample_dir + + +def start_similarity_analysis(work_space_dir, sample_dir, speaker_verification, base_voice_path, + need_similarity_output): + similarity_list = None + similarity_file_dir = None + + similarity_dir = os.path.join(work_space_dir, params.audio_similarity_dir) + os.makedirs(similarity_dir, exist_ok=True) + + base_voice_file_name = common.get_filename_without_extension(base_voice_path) + similarity_file = os.path.join(similarity_dir, f'{base_voice_file_name}.txt') + + global p_similarity + if p_similarity is None: + cmd = f'"{python_exec}" Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py ' + cmd += f' -r "{base_voice_path}"' + cmd += f' -c "{sample_dir}"' + cmd += f' -o "{similarity_file}"' + cmd += f' -m {speaker_verification}' + + logger.info(cmd) + p_similarity = Popen(cmd, shell=True) + p_similarity.wait() + + similarity_list = audio_sample.parse_similarity_file(similarity_file) + + if need_similarity_output: + similarity_file_dir = os.path.join(similarity_dir, base_voice_file_name) + audio_sample.copy_and_move(similarity_file_dir, similarity_list) + + p_similarity = None + return similarity_list, similarity_file, similarity_file_dir + else: + return similarity_list, None, None + + +# 基于一个基准音频,从参考音频目录中进行分段抽样 +def sample(text_work_space_dir, text_role, text_sample_dir, dropdown_speaker_verification, text_base_voice_path, + slider_subsection_num, slider_sample_num, checkbox_similarity_output): + text_work_space_dir, text_sample_dir, text_base_voice_path \ + = common.batch_clean_paths([text_work_space_dir, text_sample_dir, text_base_voice_path]) + + ref_audio_dir = None + text_sample_info = None + try: + base_role_dir = check_base_info(text_work_space_dir, text_role) + if text_sample_dir is None or text_sample_dir == '': + raise Exception("参考音频抽样目录不能为空,请先完成上一步操作") + if text_base_voice_path is None or text_base_voice_path == '': + raise Exception("基准音频路径不能为空") + if slider_subsection_num is None or slider_subsection_num == '': + raise Exception("分段数不能为空") + if slider_sample_num is None or slider_sample_num == '': + raise Exception("每段随机抽样个数不能为空") + if 
dropdown_speaker_verification is None or dropdown_speaker_verification == '': + raise Exception("说话人确认算法不能为空") + + ref_audio_dir = os.path.join(base_role_dir, params.reference_audio_dir) + + time_consuming, (similarity_list, _, _) \ + = time_util.time_monitor(start_similarity_analysis)(base_role_dir, text_sample_dir, + dropdown_speaker_verification, text_base_voice_path, + checkbox_similarity_output) + + text_sample_info = f"耗时:{time_consuming:0.1f}秒;抽样成功:生成目录{ref_audio_dir}" + + if similarity_list is None: + raise Exception("相似度分析失败") + + audio_sample.sample(ref_audio_dir, similarity_list, slider_subsection_num, slider_sample_num) + + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_sample_info = f"发生异常:{e}" + ref_audio_dir = '' + text_refer_audio_file_dir = ref_audio_dir + return i18n(text_sample_info), text_refer_audio_file_dir + + +# 根据参考音频和测试文本,执行批量推理 +def model_inference(text_work_space_dir, text_role, slider_request_concurrency_num, text_refer_audio_file_dir, + text_url, dropdown_refer_type_param, + text_text, text_ref_path, text_ref_text, text_emotion, + text_test_content_dir): + text_work_space_dir, text_refer_audio_file_dir, text_test_content_dir \ + = common.batch_clean_paths([text_work_space_dir, text_refer_audio_file_dir, text_test_content_dir]) + + inference_dir = None + text_asr_audio_dir = None + text_model_inference_info = None + try: + base_role_dir = check_base_info(text_work_space_dir, text_role) + if text_refer_audio_file_dir is None or text_refer_audio_file_dir == '': + raise Exception("待推理的参考音频所在目录不能为空,请先完成上一步操作") + if text_url is None or text_url == '': + raise Exception("推理服务请求地址不能为空") + if text_text is None or text_text == '': + raise Exception("文本参数名不能为空") + if text_test_content_dir is None or text_test_content_dir == '': + raise Exception("待推理文本路径不能为空") + if (text_ref_path is None or text_ref_path == '') and (text_ref_text is None or text_ref_text == '') and ( + text_emotion is None or text_emotion == ''): + raise Exception("参考音频路径/文本和角色情绪二选一填写,不能全部为空") + + inference_dir = os.path.join(base_role_dir, params.inference_audio_dir) + text_asr_audio_dir = os.path.join(inference_dir, + params.inference_audio_text_aggregation_dir) + + url_composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text, + text_ref_path, text_ref_text) + url_composer.is_valid() + text_list = common.read_text_file_to_list(text_test_content_dir) + if text_list is None or len(text_list) == 0: + raise Exception("待推理文本内容不能为空") + ref_audio_manager = common.RefAudioListManager(text_refer_audio_file_dir) + if len(ref_audio_manager.get_audio_list()) == 0: + raise Exception("待推理的参考音频不能为空") + + time_consuming, _ = time_util.time_monitor(audio_inference.generate_audio_files_parallel)(url_composer, + text_list, + ref_audio_manager.get_ref_audio_list(), + inference_dir, + slider_request_concurrency_num) + + text_model_inference_info = f"耗时:{time_consuming:0.1f}秒;推理成功:生成目录{inference_dir}" + + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_model_inference_info = f"发生异常:{e}" + text_asr_audio_dir = '' + return i18n(text_model_inference_info), text_asr_audio_dir, inference_dir + + +# 对推理生成音频执行asr +def asr(text_work_space_dir, text_role, text_asr_audio_dir, dropdown_asr_model, + dropdown_asr_size, dropdown_asr_lang): + text_work_space_dir, text_asr_audio_dir \ + = common.batch_clean_paths([text_work_space_dir, text_asr_audio_dir]) + + asr_file = None + text_text_similarity_analysis_path = None + 
text_asr_info = None
+    try:
+        base_role_dir = check_base_info(text_work_space_dir, text_role)
+        if text_asr_audio_dir is None or text_asr_audio_dir == '':
+            raise Exception("待asr的音频所在目录不能为空,请先完成上一步操作")
+        if dropdown_asr_model is None or dropdown_asr_model == '':
+            raise Exception("asr模型不能为空")
+        if dropdown_asr_size is None or dropdown_asr_size == '':
+            raise Exception("asr模型大小不能为空")
+        if dropdown_asr_lang is None or dropdown_asr_lang == '':
+            raise Exception("asr语言不能为空")
+
+        time_consuming, asr_file = time_util.time_monitor(open_asr)(text_asr_audio_dir, base_role_dir,
+                                                                    dropdown_asr_model, dropdown_asr_size,
+                                                                    dropdown_asr_lang)
+
+        text_text_similarity_analysis_path = asr_file
+        text_asr_info = f"耗时:{time_consuming:0.1f}秒;asr成功:生成文件{asr_file}"
+    except Exception as e:
+        logger.error("发生异常: \n%s", traceback.format_exc())
+        text_asr_info = f"发生异常:{e}"
+        text_text_similarity_analysis_path = ''
+    return i18n(text_asr_info), text_text_similarity_analysis_path
+
+
+def open_asr(asr_inp_dir, asr_opt_dir, asr_model, asr_model_size, asr_lang):
+    global p_asr
+    if p_asr is None:
+        asr_inp_dir = my_utils.clean_path(asr_inp_dir)
+        asr_py_path = asr_dict[asr_model]["path"]
+        if asr_py_path == 'funasr_asr.py':
+            asr_py_path = 'funasr_asr_multi_level_dir.py'
+        if asr_py_path == 'fasterwhisper_asr.py':
+            asr_py_path = 'fasterwhisper_asr_multi_level_dir.py'
+        cmd = f'"{python_exec}" Ref_Audio_Selector/tool/asr/{asr_py_path} '
+        cmd += f' -i "{asr_inp_dir}"'
+        cmd += f' -o "{asr_opt_dir}"'
+        cmd += f' -s {asr_model_size}'
+        cmd += f' -l {asr_lang}'
+        cmd += " -p %s" % ("float16" if is_half else "float32")
+
+        logger.info(cmd)
+        p_asr = Popen(cmd, shell=True)
+        p_asr.wait()
+        p_asr = None
+
+        output_dir_abs = os.path.abspath(asr_opt_dir)
+        # Build the output file path
+        output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')
+        return output_file_path
+
+    else:
+        return None
+
+
+# Compare the ASR output file against the original text and run similarity analysis
+def text_similarity_analysis(text_work_space_dir, text_role, slider_text_similarity_amplification_boundary,
+                             text_text_similarity_analysis_path):
+    text_work_space_dir, text_text_similarity_analysis_path \
+        = common.batch_clean_paths([text_work_space_dir, text_text_similarity_analysis_path])
+
+    similarity_dir = None
+    average_similarity_file = None
+    text_text_similarity_analysis_info = None
+    try:
+        base_role_dir = check_base_info(text_work_space_dir, text_role)
+        if text_text_similarity_analysis_path is None or text_text_similarity_analysis_path == '':
+            raise Exception("asr生成的文件路径不能为空,请先完成上一步操作")
+        similarity_dir = os.path.join(base_role_dir, params.text_similarity_output_dir)
+
+        time_consuming, _ = time_util.time_monitor(open_text_similarity_analysis)(text_text_similarity_analysis_path,
+                                                                                  similarity_dir,
+                                                                                  slider_text_similarity_amplification_boundary)
+
+        average_similarity_file = os.path.join(similarity_dir,
+                                               f'{params.text_emotion_average_similarity_report_filename}.txt')
+
+        text_text_similarity_analysis_info = f"耗时:{time_consuming:0.1f}秒;相似度分析成功:生成目录{similarity_dir}"
+
+    except Exception as e:
+        logger.error("发生异常: \n%s", traceback.format_exc())
+        text_text_similarity_analysis_info = f"发生异常:{e}"
+    return i18n(text_text_similarity_analysis_info), average_similarity_file
+
+
+def open_text_similarity_analysis(asr_file_path, output_dir, similarity_enlarge_boundary=0.9):
+    global p_text_similarity
+    if p_text_similarity is None:
+        cmd = f'"{python_exec}" Ref_Audio_Selector/tool/text_comparison/asr_text_process.py '
+        cmd += f' -a "{asr_file_path}"'
+        cmd += f' -o "{output_dir}"'
+        cmd += f' -b {similarity_enlarge_boundary}'
+
+        logger.info(cmd)
+        p_text_similarity = Popen(cmd, shell=True)
+        p_text_similarity.wait()
+        p_text_similarity = None
+
+        return output_dir
+
+    else:
+        return None
+
+
+hide_voice_similarity_dir = ''
+
+
+# Based on one reference audio, run similarity analysis on the audio files in a given directory and write the results to another directory
+def similarity_audio_output(text_work_space_dir, text_role, text_base_audio_path,
+                            text_compare_audio_dir, dropdown_speaker_verification):
+    global hide_voice_similarity_dir
+    text_work_space_dir, text_base_audio_path, text_compare_audio_dir \
+        = common.batch_clean_paths([text_work_space_dir, text_base_audio_path, text_compare_audio_dir])
+
+    text_similarity_audio_output_info = None
+    try:
+        base_role_dir = check_base_info(text_work_space_dir, text_role)
+        if text_base_audio_path is None or text_base_audio_path == '':
+            raise Exception("基准音频路径不能为空")
+        if text_compare_audio_dir is None or text_compare_audio_dir == '':
+            raise Exception("待分析的音频所在目录不能为空")
+        if dropdown_speaker_verification is None or dropdown_speaker_verification == '':
+            raise Exception("说话人验证模型不能为空")
+
+        time_consuming, (similarity_list, similarity_file, similarity_file_dir) \
+            = time_util.time_monitor(start_similarity_analysis)(base_role_dir, text_compare_audio_dir,
+                                                                dropdown_speaker_verification, text_base_audio_path,
+                                                                True)
+
+        if similarity_list is None:
+            raise Exception("相似度分析失败")
+
+        text_similarity_audio_output_info = f'耗时:{time_consuming:0.1f}秒;相似度分析成功:生成目录{similarity_file_dir},文件{similarity_file}'
+
+        hide_voice_similarity_dir = os.path.join(base_role_dir, params.audio_similarity_dir)
+
+    except Exception as e:
+        logger.error("发生异常: \n%s", traceback.format_exc())
+        text_similarity_audio_output_info = f"发生异常:{e}"
+    return i18n(text_similarity_audio_output_info)
+
+
+# Mirror deletions from the reference audio directory into the inference output directory: for every reference audio that was removed, delete the audio files that were generated from it
+def sync_ref_audio(text_work_space_dir, text_role, text_refer_audio_file_dir,
+                   text_inference_audio_file_dir):
+    text_work_space_dir, text_refer_audio_file_dir, text_inference_audio_file_dir \
+        = common.batch_clean_paths([text_work_space_dir, text_refer_audio_file_dir, text_inference_audio_file_dir])
+
+    text_sync_ref_audio_info = None
+    try:
+        check_base_info(text_work_space_dir, text_role)
+        if text_refer_audio_file_dir is None or text_refer_audio_file_dir == '':
+            raise Exception("参考音频目录不能为空")
+        if text_inference_audio_file_dir is None or text_inference_audio_file_dir == '':
+            raise Exception("推理生成的音频目录不能为空")
+        time_consuming, (delete_text_wav_num, delete_emotion_dir_num) \
+            = time_util.time_monitor(audio_check.sync_ref_audio)(text_refer_audio_file_dir,
+                                                                 text_inference_audio_file_dir)
+
+        text_sync_ref_audio_info = (f"耗时:{time_consuming:0.1f}秒;推理音频目录{text_inference_audio_file_dir}下,"
+                                    f"text目录删除了{delete_text_wav_num}个推理音频,emotion目录下,删除了{delete_emotion_dir_num}个目录")
+    except Exception as e:
+        logger.error("发生异常: \n%s", traceback.format_exc())
+        text_sync_ref_audio_info = f"发生异常:{e}"
+    return i18n(text_sync_ref_audio_info)
+
+
+hide_config_file = ''
+
+
+# Generate the reference audio configuration from the template and the reference audio directory
+def create_config(text_work_space_dir, text_role, text_template, text_refer_audio_file_dir):
+    text_work_space_dir, text_refer_audio_file_dir \
+        = common.batch_clean_paths([text_work_space_dir, text_refer_audio_file_dir])
+
+    global hide_config_file
+
+    config_file = None
+    text_create_config_info = None
+    try:
+        base_role_dir = check_base_info(text_work_space_dir, text_role)
+        if text_template is None or text_template == '':
+            raise Exception("模板内容不能为空")
+        if 
text_refer_audio_file_dir is None or text_refer_audio_file_dir == '': + raise Exception("参考音频目录不能为空") + config_file = os.path.join(base_role_dir, f'{params.reference_audio_config_filename}.json') + ref_audio_manager = common.RefAudioListManager(text_refer_audio_file_dir) + + time_consuming, _ = time_util.time_monitor(audio_config.generate_audio_config)(base_role_dir, text_template, + ref_audio_manager.get_ref_audio_list(), + config_file) + + text_create_config_info = f"耗时:{time_consuming:0.1f}秒;配置生成成功:生成文件{config_file}" + + hide_config_file = config_file + + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_create_config_info = f"发生异常:{e}" + return i18n(text_create_config_info) + + +# 基于请求路径和参数,合成完整的请求路径 +def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion): + url_composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text, + text_ref_path, text_ref_text) + if url_composer.is_emotion(): + text_whole_url = url_composer.build_url_with_emotion('测试内容', '情绪类型', False) + else: + text_whole_url = url_composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False) + return text_whole_url + + +def start_api(): + text_start_api_info = None + try: + proc = common.start_new_service('api.py') + text_start_api_info = "启动完成" + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_start_api_info = f"发生异常:{e}" + return text_start_api_info + + +def refresh_api_model(): + return ({"choices": model_manager.get_gpt_model_names(), "__type__": "update"}, + {"choices": model_manager.get_sovits_model_names(), "__type__": "update"}) + + +def api_set_model_whole_url(text_api_set_model_base_url, dropdown_api_gpt_models, dropdown_api_sovits_models, + text_api_gpt_param, text_api_sovits_param): + url = audio_inference.SetModelURLComposer("all", text_api_set_model_base_url, text_api_gpt_param, + text_api_sovits_param) + return url.build_get_url([dropdown_api_gpt_models, dropdown_api_sovits_models], False) + + +def start_api_set_model(text_api_set_model_base_url, dropdown_api_gpt_models, dropdown_api_sovits_models, + text_api_gpt_param, text_api_sovits_param): + text_api_start_set_model_request_info = None + try: + if dropdown_api_gpt_models is None or dropdown_api_gpt_models == '': + raise Exception("GPT模型不能为空") + if dropdown_api_sovits_models is None or dropdown_api_sovits_models == '': + raise Exception("Sovits模型不能为空") + url = audio_inference.SetModelURLComposer("all", text_api_set_model_base_url, text_api_gpt_param, + text_api_sovits_param) + url.is_valid() + time_consuming, result = time_util.time_monitor(audio_inference.start_api_set_model)(url, + dropdown_api_gpt_models, + dropdown_api_sovits_models) + text_api_start_set_model_request_info = f"耗时:{time_consuming:0.1f}秒;请求结果:{result}" + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_api_start_set_model_request_info = f"发生异常:{e}" + return text_api_start_set_model_request_info + + +def refresh_api_v2_gpt_model(): + return {"choices": model_manager.get_gpt_model_names(), "__type__": "update"} + + +def api_v2_set_gpt_whole_url(text_api_v2_set_gpt_model_base_url, text_api_v2_gpt_model_param, + dropdown_api_v2_gpt_models): + url = audio_inference.SetModelURLComposer("gpt", text_api_v2_set_gpt_model_base_url, text_api_v2_gpt_model_param, + None) + return url.build_get_url([dropdown_api_v2_gpt_models], False) + + +def start_api_v2_set_gpt_model(text_api_v2_set_gpt_model_base_url, 
text_api_v2_gpt_model_param, + dropdown_api_v2_gpt_models): + text_api_v2_start_set_gpt_model_request_info = None + try: + if dropdown_api_v2_gpt_models is None or dropdown_api_v2_gpt_models == '': + raise Exception("GPT模型不能为空") + url = audio_inference.SetModelURLComposer("gpt", text_api_v2_set_gpt_model_base_url, + text_api_v2_gpt_model_param, None) + url.is_valid() + time_consuming, result = time_util.time_monitor(audio_inference.start_api_v2_set_gpt_model)(url, + dropdown_api_v2_gpt_models) + text_api_v2_start_set_gpt_model_request_info = f"耗时:{time_consuming:0.1f}秒;请求结果:{result}" + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_api_v2_start_set_gpt_model_request_info = f"发生异常:{e}" + return text_api_v2_start_set_gpt_model_request_info + + +def refresh_api_v2_sovits_model(): + return {"choices": model_manager.get_sovits_model_names(), "__type__": "update"} + + +def api_v2_set_sovits_whole_url(text_api_v2_set_sovits_model_base_url, text_api_v2_sovits_model_param, + dropdown_api_v2_sovits_models): + url = audio_inference.SetModelURLComposer("sovits", text_api_v2_set_sovits_model_base_url, None, + text_api_v2_sovits_model_param) + return url.build_get_url([dropdown_api_v2_sovits_models], False) + + +def start_api_v2_set_sovits_model(text_api_v2_set_sovits_model_base_url, text_api_v2_sovits_model_param, + dropdown_api_v2_sovits_models): + text_api_v2_start_set_sovits_model_request_info = None + try: + if dropdown_api_v2_sovits_models is None or dropdown_api_v2_sovits_models == '': + raise Exception("Sovits模型不能为空") + url = audio_inference.SetModelURLComposer("sovits", text_api_v2_set_sovits_model_base_url, None, + text_api_v2_sovits_model_param) + url.is_valid() + time_consuming, result = time_util.time_monitor(audio_inference.start_api_v2_set_sovits_model)(url, + dropdown_api_v2_sovits_models) + text_api_v2_start_set_sovits_model_request_info = f"耗时:{time_consuming:0.1f}秒;请求结果:{result}" + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_api_v2_start_set_sovits_model_request_info = f"发生异常:{e}" + return text_api_v2_start_set_sovits_model_request_info + + +def open_file(file_path): + common.open_file(my_utils.clean_path(file_path)) + + +def delete_ref_audio_below_boundary(ref_audio_path, text_text_similarity_result_path, text_inference_audio_file_dir, + slider_audio_text_similarity_boundary): + text_delete_ref_audio_below_boundary_info = None + ref_audio_path, text_text_similarity_result_path, text_inference_audio_file_dir = common.batch_clean_paths( + [ref_audio_path, text_text_similarity_result_path, text_inference_audio_file_dir]) + try: + if ref_audio_path is None or ref_audio_path == '': + raise Exception("参考音频路径不能为空") + if text_text_similarity_result_path is None or text_text_similarity_result_path == '': + raise Exception("文本相似度结果路径不能为空") + time_consuming, count = time_util.time_monitor(text_check.delete_ref_audio_below_boundary)(ref_audio_path, + text_text_similarity_result_path, + text_inference_audio_file_dir, + slider_audio_text_similarity_boundary) + text_delete_ref_audio_below_boundary_info = f"耗时:{time_consuming:0.1f}秒;删除参考音频数量:{count}" + except Exception as e: + logger.error("发生异常: \n%s", traceback.format_exc()) + text_delete_ref_audio_below_boundary_info = f"发生异常:{e}" + return text_delete_ref_audio_below_boundary_info + + +def change_lang_choices(key): # 根据选择的模型修改可选的语言 + # return gr.Dropdown(choices=asr_dict[key]['lang']) + return {"__type__": "update", "choices": asr_dict[key]['lang'], "value": 
asr_dict[key]['lang'][0]} + + +def change_size_choices(key): # 根据选择的模型修改可选的模型尺寸 + # return gr.Dropdown(choices=asr_dict[key]['size']) + return {"__type__": "update", "choices": asr_dict[key]['size']} + + +def save_work_dir(text_work_space_dir, text_role): + text_work_space_dir = my_utils.clean_path(text_work_space_dir) + rw_param.write(rw_param.work_dir, text_work_space_dir) + if text_role is not None and text_role != '': + return text_role + else: + role_dir = '' + for i in range(1, 101): + role_dir = f"role_{i}" + dir_name = os.path.join(text_work_space_dir, role_dir) + if not os.path.isdir(dir_name): + break + rw_param.write(rw_param.role, role_dir) + return role_dir + + +def chang_refer_type_param(selected_value): + rw_param.write(rw_param.refer_type_param, selected_value) + if selected_value == "参考音频": + return {"visible": True, "__type__": "update"}, {"visible": True, "__type__": "update"}, {"visible": False, + "__type__": "update"} + else: + return {"visible": False, "__type__": "update"}, {"visible": False, "__type__": "update"}, {"visible": True, + "__type__": "update"} + + +def switch_role_and_refresh(): + global hide_voice_similarity_dir, hide_config_file + init.init_all() + reset_list = [] + # 基础 + reset_list.extend([ + init.text_refer_audio_file_dir_default, + init.text_inference_audio_file_dir_default + ]) + # 第一步 + reset_list.extend([ + init.text_sample_dir_default, + '', # text_list_input + '', # text_base_voice_path + ]) + # 第二步 + # 第三步 + reset_list.extend([ + init.text_asr_audio_dir_default, + init.text_text_similarity_analysis_path_default + ]) + # 第四步 + reset_list.extend([ + '', # text_base_audio_path + '', # text_compare_audio_dir + ]) + hide_voice_similarity_dir = '' + hide_config_file = '' + # 第五步 + return reset_list + + +def init_ui(): + init.init_all() + + with gr.Blocks() as app: + gr.Markdown(value=i18n("基本介绍:这是一个从训练素材中,批量提取参考音频,并进行效果评估与配置生成的工具")) + with gr.Accordion(label=i18n("基本信息")): + with gr.Row(): + text_work_space_dir = gr.Text(label=i18n("工作目录,后续操作所生成文件都会保存在此目录下"), + value=init.text_work_space_dir_default, scale=4) + text_role = gr.Text(label=i18n("角色名称"), value=init.text_role_default, scale=4) + button_switch_role_and_refresh = gr.Button(i18n("切换并刷新"), variant="primary", scale=1) + text_work_space_dir.blur(save_work_dir, [text_work_space_dir, text_role], [text_role]) + text_role.blur(lambda value: rw_param.write(rw_param.role, value), [text_role], []) + gr.Markdown(value=i18n("下方为公共参数,会随着进度自动填充,无需填写")) + with gr.Row(): + text_refer_audio_file_dir = gr.Text(label=i18n("参考音频所在目录"), + value=init.text_refer_audio_file_dir_default) + text_inference_audio_file_dir = gr.Text(label=i18n("推理音频所在目录"), + value=init.text_inference_audio_file_dir_default) + with gr.Tab(label=i18n("第一步:基于训练素材,生成待选参考音频列表")): + gr.Markdown(value=i18n("1.1:选择list文件,并提取3-10秒的素材作为参考候选")) + text_list_input = gr.Text(label=i18n("请输入list文件路径"), value="") + with gr.Row(): + button_convert_from_list = gr.Button(i18n("开始生成待参考列表"), variant="primary", scale=4) + text_convert_from_list_info = gr.Text(label=i18n("参考列表生成结果"), value="", interactive=False, + scale=4) + button_convert_from_list_result_dir = gr.Button(i18n("打开目录"), variant="primary", scale=1) + gr.Markdown(value=i18n("1.2:选择基准音频,执行相似度匹配,并分段随机抽样")) + with gr.Row(): + text_sample_dir = gr.Text(label=i18n("参考音频抽样目录"), value=init.text_sample_dir_default, + interactive=True) + dropdown_speaker_verification_1 = gr.Dropdown(label=i18n("说话人确认算法"), + choices=list( + model_manager.speaker_verification_models.keys()), + 
value='speech_campplus_sv_zh-cn_16k-common', + interactive=True) + button_convert_from_list_result_dir.click(open_file, [text_sample_dir], []) + button_convert_from_list.click(convert_from_list, [text_work_space_dir, text_role, text_list_input], + [text_convert_from_list_info, text_sample_dir]) + with gr.Row(): + text_base_voice_path = gr.Text(label=i18n("请输入基准音频路径"), value="") + slider_subsection_num = gr.Slider(minimum=1, maximum=10, step=1, label=i18n("请输入分段数"), + value=init.slider_subsection_num_default, + interactive=True) + slider_sample_num = gr.Slider(minimum=1, maximum=10, step=1, label=i18n("请输入每段随机抽样个数"), + value=init.slider_sample_num_default, interactive=True) + checkbox_similarity_output = gr.Checkbox(label=i18n("是否将相似度匹配结果输出到临时目录?"), + show_label=True) + slider_subsection_num.change(lambda value: rw_param.write(rw_param.subsection_num, value), + [slider_subsection_num], []) + slider_sample_num.change(lambda value: rw_param.write(rw_param.sample_num, value), [slider_sample_num], + []) + with gr.Row(): + button_sample = gr.Button(i18n("开始分段随机抽样"), variant="primary", scale=4) + text_sample_info = gr.Text(label=i18n("分段随机抽样结果"), value="", interactive=False, scale=4) + button_sample_result_open = gr.Button(i18n("打开目录"), variant="primary", scale=1) + with gr.Tab(label=i18n("第二步:基于参考音频和测试文本,执行批量推理")): + gr.Markdown(value=i18n("2.1:启动推理服务,并配置模型参数")) + with gr.Accordion(label=i18n("详情")): + with gr.Tab(label=i18n("主项目下api.py服务")): + gr.Markdown(value=i18n("2.1.1:启动服务")) + with gr.Row(): + button_start_api = gr.Button(i18n("启动api"), variant="primary") + text_start_api_info = gr.Text(label=i18n("api启动信息"), value="", interactive=False) + button_start_api.click(start_api, [], [text_start_api_info]) + gr.Markdown(value=i18n("2.1.2:设置模型参数")) + text_api_set_model_base_url = gr.Text(label=i18n("请输入api服务模型切换接口地址"), + value=init.text_api_set_model_base_url_default, + interactive=True) + text_api_set_model_base_url.blur( + lambda value: rw_param.write(rw_param.api_set_model_base_url, value), + [text_api_set_model_base_url], []) + with gr.Row(): + dropdown_api_gpt_models = gr.Dropdown(label=i18n("GPT模型列表"), + choices=model_manager.get_gpt_model_names(), value="", + interactive=True, scale=4) + dropdown_api_sovits_models = gr.Dropdown(label=i18n("SoVITS模型列表"), + choices=model_manager.get_sovits_model_names(), + value="", interactive=True, scale=4) + button_refresh_api_model = gr.Button(i18n("刷新模型路径"), variant="primary", scale=1) + button_refresh_api_model.click(refresh_api_model, [], + [dropdown_api_gpt_models, dropdown_api_sovits_models]) + with gr.Row(): + text_api_gpt_param = gr.Text(label=i18n("GPT模型参数名"), value=init.text_api_gpt_param_default, + interactive=True) + text_api_sovits_param = gr.Text(label=i18n("SoVITS模型参数名"), + value=init.text_api_sovits_param_default, interactive=True) + text_api_gpt_param.blur(lambda value: rw_param.write(rw_param.api_gpt_param, value), + [text_api_gpt_param], []) + text_api_sovits_param.blur(lambda value: rw_param.write(rw_param.api_sovits_param, value), + [text_api_sovits_param], []) + gr.Markdown(value=i18n("2.1.3:发起设置请求")) + text_api_set_model_whole_url = gr.Text(label=i18n("完整的模型参数设置请求地址"), value="", + interactive=False) + dropdown_api_gpt_models.change(api_set_model_whole_url, + [text_api_set_model_base_url, dropdown_api_gpt_models, + dropdown_api_sovits_models, text_api_gpt_param, + text_api_sovits_param], [text_api_set_model_whole_url]) + dropdown_api_sovits_models.change(api_set_model_whole_url, + [text_api_set_model_base_url, 
dropdown_api_gpt_models, + dropdown_api_sovits_models, text_api_gpt_param, + text_api_sovits_param], [text_api_set_model_whole_url]) + text_api_gpt_param.input(api_set_model_whole_url, + [text_api_set_model_base_url, dropdown_api_gpt_models, + dropdown_api_sovits_models, text_api_gpt_param, text_api_sovits_param], + [text_api_set_model_whole_url]) + text_api_sovits_param.input(api_set_model_whole_url, + [text_api_set_model_base_url, dropdown_api_gpt_models, + dropdown_api_sovits_models, text_api_gpt_param, text_api_sovits_param], + [text_api_set_model_whole_url]) + with gr.Row(): + button_api_start_set_model_request = gr.Button(i18n("发起模型设置请求"), variant="primary") + text_api_start_set_model_request_info = gr.Text(label=i18n("设置请求结果"), value="", + interactive=False) + button_api_start_set_model_request.click(start_api_set_model, + [text_api_set_model_base_url, dropdown_api_gpt_models, + dropdown_api_sovits_models, text_api_gpt_param, + text_api_sovits_param], + [text_api_start_set_model_request_info]) + with gr.Tab(label=i18n("fast项目下api_v2.py服务")): + gr.Markdown(value=i18n("2.1.1:请将训练完毕得模型,复制到你的项目文件下,启动服务")) + gr.Markdown(value=i18n("2.1.2:设置GPT模型参数")) + text_api_v2_set_gpt_model_base_url = gr.Text(label=i18n("请输入api服务GPT模型切换接口地址"), + value=init.text_api_v2_set_gpt_model_base_url_default, + interactive=True) + text_api_v2_set_gpt_model_base_url.blur( + lambda value: rw_param.write(rw_param.api_v2_set_gpt_model_base_url, value), + [text_api_v2_set_gpt_model_base_url], []) + with gr.Row(): + text_api_v2_gpt_model_param = gr.Text(label=i18n("GPT模型参数名"), + value=init.text_api_v2_gpt_model_param_default, + interactive=True, scale=4) + dropdown_api_v2_gpt_models = gr.Dropdown(label=i18n("GPT模型列表"), + choices=model_manager.get_gpt_model_names(), value="", + interactive=True, scale=4) + text_api_v2_gpt_model_param.blur( + lambda value: rw_param.write(rw_param.api_v2_gpt_model_param, value), + [text_api_v2_gpt_model_param], []) + button_api_v2_refresh_gpt = gr.Button(i18n("刷新模型路径"), variant="primary", scale=1) + button_api_v2_refresh_gpt.click(refresh_api_v2_gpt_model, [], [dropdown_api_v2_gpt_models]) + text_api_v2_set_gpt_model_whole_url = gr.Text(label=i18n("完整的GPT模型参数设置请求地址"), value="", + interactive=False) + text_api_v2_gpt_model_param.input(api_v2_set_gpt_whole_url, + [text_api_v2_set_gpt_model_base_url, text_api_v2_gpt_model_param, + dropdown_api_v2_gpt_models], + [text_api_v2_set_gpt_model_whole_url]) + dropdown_api_v2_gpt_models.change(api_v2_set_gpt_whole_url, + [text_api_v2_set_gpt_model_base_url, text_api_v2_gpt_model_param, + dropdown_api_v2_gpt_models], + [text_api_v2_set_gpt_model_whole_url]) + with gr.Row(): + button_api_v2_start_set_gpt_model_request = gr.Button(i18n("发起GPT模型设置请求"), + variant="primary") + text_api_v2_start_set_gpt_model_request_info = gr.Text(label=i18n("设置请求结果"), value="", + interactive=False) + button_api_v2_start_set_gpt_model_request.click(start_api_v2_set_gpt_model, + [text_api_v2_set_gpt_model_base_url, + text_api_v2_gpt_model_param, + dropdown_api_v2_gpt_models], + [text_api_v2_start_set_gpt_model_request_info]) + gr.Markdown(value=i18n("2.1.3:设置SoVITS模型参数")) + text_api_v2_set_sovits_model_base_url = gr.Text(label=i18n("请输入api服务SoVITS模型切换接口地址"), + value=init.text_api_v2_set_sovits_model_base_url_default, + interactive=True) + text_api_v2_set_sovits_model_base_url.blur( + lambda value: rw_param.write(rw_param.api_v2_set_sovits_model_base_url, value), + [text_api_v2_set_sovits_model_base_url], []) + with gr.Row(): + text_api_v2_sovits_model_param = 
gr.Text(label=i18n("SoVITS模型参数名"), + value=init.text_api_v2_sovits_model_param_default, + interactive=True, scale=4) + dropdown_api_v2_sovits_models = gr.Dropdown(label=i18n("SoVITS模型列表"), + choices=model_manager.get_sovits_model_names(), + value="", interactive=True, scale=4) + button_api_v2_refresh_sovits = gr.Button(i18n("刷新模型路径"), variant="primary", scale=1) + text_api_v2_sovits_model_param.blur( + lambda value: rw_param.write(rw_param.api_v2_sovits_model_param, value), + [text_api_v2_sovits_model_param], []) + button_api_v2_refresh_sovits.click(refresh_api_v2_sovits_model, [], + [dropdown_api_v2_sovits_models]) + text_api_v2_set_sovits_model_whole_url = gr.Text(label=i18n("完整的SoVITS模型参数设置请求地址"), + value="", interactive=False) + text_api_v2_sovits_model_param.input(api_v2_set_sovits_whole_url, + [text_api_v2_set_sovits_model_base_url, + text_api_v2_sovits_model_param, + dropdown_api_v2_sovits_models], + [text_api_v2_set_sovits_model_whole_url]) + dropdown_api_v2_sovits_models.change(api_v2_set_sovits_whole_url, + [text_api_v2_set_sovits_model_base_url, + text_api_v2_sovits_model_param, + dropdown_api_v2_sovits_models], + [text_api_v2_set_sovits_model_whole_url]) + with gr.Row(): + button_api_v2_start_set_sovits_model_request = gr.Button(i18n("发起SoVITS模型设置请求"), + variant="primary") + text_api_v2_start_set_sovits_model_request_info = gr.Text(label=i18n("设置请求结果"), value="", + interactive=False) + button_api_v2_start_set_sovits_model_request.click(start_api_v2_set_sovits_model, + [text_api_v2_set_sovits_model_base_url, + text_api_v2_sovits_model_param, + dropdown_api_v2_sovits_models], [ + text_api_v2_start_set_sovits_model_request_info]) + with gr.Tab(label=i18n("第三方推理服务")): + gr.Markdown(value=i18n("启动第三方推理服务,并完成参考音频打包,模型参数设置等操作")) + gr.Markdown(value=i18n("2.2:配置推理服务参数信息,除api服务外,其他需要修改参数内容,参考音和角色情绪二选一,如果是角色情绪(第三方推理包),需要先执行第五步," + "将参考音频打包配置到推理服务下,在推理前,请确认完整请求地址是否与正常使用时的一致,包括角色名称,尤其是文本分隔符是否正确")) + text_url = gr.Text(label=i18n("请输入推理服务请求地址与参数"), + value=init.text_url_default) + with gr.Row(): + text_text = gr.Text(label=i18n("请输入文本参数名"), value=init.text_text_default) + dropdown_refer_type_param = gr.Dropdown(label=i18n("类型"), choices=["参考音频", "角色情绪"], + value=init.dropdown_refer_type_param_default, interactive=True) + text_ref_path = gr.Text(label=i18n("请输入参考音频路径参数名"), + value=init.text_ref_path_default, visible=True) + text_ref_text = gr.Text(label=i18n("请输入参考音频文本参数名"), + value=init.text_ref_text_default, visible=True) + text_emotion = gr.Text(label=i18n("请输入角色情绪参数名"), value=init.text_emotion_default, + visible=False) + dropdown_refer_type_param.change(chang_refer_type_param, [dropdown_refer_type_param], + [text_ref_path, text_ref_text, text_emotion]) + text_whole_url = gr.Text(label=i18n("完整地址"), value=init.text_whole_url_default, interactive=False) + + text_text.blur(lambda value: rw_param.write(rw_param.text_param, value), [text_text], []) + text_ref_path.blur(lambda value: rw_param.write(rw_param.ref_path_param, value), [text_ref_path], []) + text_ref_text.blur(lambda value: rw_param.write(rw_param.ref_text_param, value), [text_ref_text], []) + text_emotion.blur(lambda value: rw_param.write(rw_param.emotion_param, value), [text_emotion], []) + + text_url.input(whole_url, + [text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion], + [text_whole_url]) + text_url.blur(lambda value: rw_param.write(rw_param.text_url, value), [text_url], []) + text_text.input(whole_url, [text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, + 
text_emotion], + [text_whole_url]) + text_text.blur(lambda value: rw_param.write(rw_param.text_param, value), [text_text], []) + dropdown_refer_type_param.change(whole_url, + [text_url, dropdown_refer_type_param, text_text, text_ref_path, + text_ref_text, + text_emotion], + [text_whole_url]) + text_ref_path.input(whole_url, + [text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, + text_emotion], + [text_whole_url]) + text_ref_path.blur(lambda value: rw_param.write(rw_param.ref_path_param, value), [text_ref_path], []) + text_ref_text.input(whole_url, + [text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, + text_emotion], + [text_whole_url]) + text_ref_text.blur(lambda value: rw_param.write(rw_param.ref_text_param, value), [text_ref_text], []) + text_emotion.input(whole_url, [text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, + text_emotion], + [text_whole_url]) + text_emotion.blur(lambda value: rw_param.write(rw_param.emotion_param, value), [text_emotion], []) + gr.Markdown(value=i18n("2.3:配置待推理文本,一句一行,尽量保证文本多样性,不同情绪、不同类型的都来一点")) + with gr.Row(): + text_test_content = gr.Text(label=i18n("请输入待推理文本路径"), value=init.text_test_content_default, + scale=8) + button_open_test_content_file = gr.Button(i18n("打开推理文本"), variant="primary", scale=1) + button_open_test_content_file.click(open_file, [text_test_content], []) + text_test_content.blur(lambda value: rw_param.write(rw_param.test_content_path, value), + [text_test_content], []) + gr.Markdown(value=i18n("2.4:开始批量推理,这个过程比较耗时,可以去干点别的")) + slider_request_concurrency_num = gr.Slider(minimum=1, maximum=init.slider_request_concurrency_max_num, + step=1, label=i18n( + "请输入请求并发数,会根据此数创建对应数量的子进程并行发起推理请求"), + value=init.slider_request_concurrency_num_default, + interactive=True) + slider_request_concurrency_num.change(lambda value: rw_param.write(rw_param.request_concurrency_num, value), + [slider_request_concurrency_num], []) + with gr.Row(): + button_model_inference = gr.Button(i18n("开启批量推理"), variant="primary", scale=4) + text_model_inference_info = gr.Text(label=i18n("批量推理结果"), value="", interactive=False, scale=4) + button_model_inference_result_open = gr.Button(i18n("打开目录"), variant="primary", scale=1) + with gr.Tab(label=i18n("第三步:进行参考音频推理效果准确度校验")): + gr.Markdown(value=i18n("3.1:启动asr,获取推理音频文本")) + text_asr_audio_dir = gr.Text(label=i18n("待asr的音频所在目录"), value=init.text_asr_audio_dir_default, + interactive=True) + with gr.Row(): + dropdown_asr_model = gr.Dropdown( + label=i18n("ASR 模型"), + choices=list(asr_dict.keys()), + interactive=True, + value="达摩 ASR (中文)" + ) + dropdown_asr_size = gr.Dropdown( + label=i18n("ASR 模型尺寸"), + choices=["large"], + interactive=True, + value="large" + ) + dropdown_asr_lang = gr.Dropdown( + label=i18n("ASR 语言设置"), + choices=["zh"], + interactive=True, + value="zh" + ) + dropdown_asr_model.change(change_lang_choices, [dropdown_asr_model], [dropdown_asr_lang]) + dropdown_asr_model.change(change_size_choices, [dropdown_asr_model], [dropdown_asr_size]) + with gr.Row(): + button_asr = gr.Button(i18n("启动asr"), variant="primary", scale=4) + text_asr_info = gr.Text(label=i18n("asr结果"), value="", interactive=False, scale=4) + button_asr_result_open = gr.Button(i18n("打开文件"), variant="primary", scale=1) + gr.Markdown(value=i18n("3.2:启动文本相似度分析")) + with gr.Row(): + text_text_similarity_analysis_path = gr.Text(label=i18n("待分析的文件路径"), + value=init.text_text_similarity_analysis_path_default, + interactive=True) + 
slider_text_similarity_amplification_boundary = gr.Slider(minimum=0, maximum=1, step=0.01, + label=i18n( + "文本相似度放大边界,因为原始模型输出的相似度差异太小,所以进行了一次放大,放大逻辑为,边界值以下归0,边界值到1的区间重新映射到0-1"), + value=init.slider_text_similarity_amplification_boundary_default, + interactive=True) + slider_text_similarity_amplification_boundary.change( + lambda value: rw_param.write(rw_param.text_similarity_amplification_boundary, value), + [slider_text_similarity_amplification_boundary], []) + button_asr.click(asr, [text_work_space_dir, text_role, text_asr_audio_dir, dropdown_asr_model, + dropdown_asr_size, dropdown_asr_lang], + [text_asr_info, text_text_similarity_analysis_path]) + button_asr_result_open.click(open_file, [text_text_similarity_analysis_path], []) + with gr.Row(): + button_text_similarity_analysis = gr.Button(i18n("启动文本相似度分析"), variant="primary") + text_text_similarity_analysis_info = gr.Text(label=i18n("文本相似度分析结果"), value="", + interactive=False) + gr.Markdown(value=i18n("3.3:根据相似度分析结果,重点检查最后几条是否存在复读等问题")) + with gr.Row(): + text_text_similarity_result_path = gr.Text(label=i18n("文本相似度分析结果文件所在路径"), + value=init.text_text_similarity_result_path_default, + interactive=True, scale=7) + button_open_text_similarity_result = gr.Button(i18n("打开结果文件"), variant="primary", scale=1) + button_open_inference_dir = gr.Button(i18n("打开推理目录"), variant="primary", scale=1) + + button_text_similarity_analysis.click(text_similarity_analysis, [text_work_space_dir, text_role, + slider_text_similarity_amplification_boundary, + text_text_similarity_analysis_path], + [text_text_similarity_analysis_info, + text_text_similarity_result_path]) + + button_open_text_similarity_result.click(open_file, [text_text_similarity_result_path], []) + button_open_inference_dir.click(open_file, [text_inference_audio_file_dir], []) + slider_audio_text_similarity_boundary = gr.Slider(minimum=0, maximum=1, step=0.001, + label=i18n("音频文本相似度边界值"), value=0.800, + interactive=True) + with gr.Row(): + button_delete_ref_audio_below_boundary = gr.Button(i18n("删除音频文本相似度边界值以下的参考音频"), + variant="primary") + text_delete_ref_audio_below_boundary_info = gr.Text(label=i18n("删除结果"), value="", interactive=True) + button_delete_ref_audio_below_boundary.click(delete_ref_audio_below_boundary, + [text_refer_audio_file_dir, + text_text_similarity_result_path, + text_inference_audio_file_dir, + slider_audio_text_similarity_boundary], + [text_delete_ref_audio_below_boundary_info]) + with gr.Tab(label=i18n("第四步:校验参考音频音质")): + gr.Markdown(value=i18n("4.1:对结果按音频相似度排序,或许有用吧,主要还是耳朵听")) + with gr.Row(): + text_base_audio_path = gr.Text(label=i18n("请输入基准音频"), value="") + text_compare_audio_dir = gr.Text(label=i18n("请输入待比较的音频文件目录"), value="") + dropdown_speaker_verification_2 = gr.Dropdown(label=i18n("说话人确认算法"), + choices=list( + model_manager.speaker_verification_models.keys()), + value='speech_campplus_sv_zh-cn_16k-common', + interactive=True) + with gr.Row(): + button_similarity_audio_output = gr.Button(i18n("输出相似度-参考音频到临时目录"), variant="primary", + scale=4) + text_similarity_audio_output_info = gr.Text(label=i18n("输出结果"), value="", interactive=False, + scale=4) + button_similarity_audio_output_result_open = gr.Button(i18n("打开目录"), variant="primary", scale=1) + button_similarity_audio_output.click(similarity_audio_output, + [text_work_space_dir, text_role, text_base_audio_path, + text_compare_audio_dir, dropdown_speaker_verification_2], + [text_similarity_audio_output_info]) + button_similarity_audio_output_result_open.click(lambda: open_file(hide_voice_similarity_dir), [], []) 
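The amplification rule described in the slider label above is what the adjusted_similarity helper in Ref_Audio_Selector/tool/text_comparison/text_comparison.py (further down in this diff) implements: a raw score below the boundary b collapses to 0, and the [b, 1] band is stretched linearly onto [0, 1], i.e. (score - b) / (1 - b). A worked example at the default boundary of 0.90:

    def amplify(score, boundary=0.90):
        # Below the boundary the score is zeroed out; above it, the
        # remaining [boundary, 1] interval is rescaled onto [0, 1].
        if score < boundary:
            return 0.0
        return (score - boundary) / (1.0 - boundary)


    print(amplify(0.95))  # -> 0.5 (up to float rounding)
    print(amplify(0.85))  # -> 0.0

This is why the raw BERT similarities, which cluster tightly near 1.0, become easy to rank after amplification.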
+ gr.Markdown(value=i18n("4.2:如果发现存在低音质的推理音频,那么就去参考音频目录下,把原参考音频删了")) + gr.Markdown(value=i18n("4.3:删除参考音频之后,按下面的操作,会将推理音频目录下对应的音频也删掉")) + with gr.Row(): + button_sync_ref_audio = gr.Button(i18n("将参考音频的删除情况,同步到推理音频目录"), variant="primary") + text_sync_ref_info = gr.Text(label=i18n("同步结果"), value="", interactive=False) + button_sync_ref_audio.click(sync_ref_audio, [text_work_space_dir, text_role, text_refer_audio_file_dir, + text_inference_audio_file_dir], [text_sync_ref_info]) + with gr.Tab("第五步:生成参考音频配置文本"): + gr.Markdown(value=i18n( + "5.1:编辑模板,占位符说明:\${emotion}表示相对路径加音频文件名;\${ref_path}表示音频相对角色目录的文件路径;\${ref_text}:表示音频文本")) + text_template = gr.Text(label=i18n("模板内容"), value=init.text_template_default, lines=10) + text_template.blur(lambda value: rw_param.write(rw_param.text_template, value), [text_template], []) + gr.Markdown(value=i18n("5.2:生成配置")) + with gr.Row(): + button_create_config = gr.Button(i18n("生成配置"), variant="primary", scale=4) + text_create_config_info = gr.Text(label=i18n("生成结果"), value="", interactive=False, scale=4) + button_create_config_result_open = gr.Button(i18n("打开文件"), variant="primary", scale=1) + button_create_config.click(create_config, + [text_work_space_dir, text_role, text_template, text_refer_audio_file_dir], + [text_create_config_info]) + button_create_config_result_open.click(lambda: open_file(hide_config_file), [], []) + button_sample.click(sample, [text_work_space_dir, text_role, text_sample_dir, dropdown_speaker_verification_1, + text_base_voice_path, + slider_subsection_num, slider_sample_num, checkbox_similarity_output], + [text_sample_info, text_refer_audio_file_dir]) + button_sample_result_open.click(open_file, [text_refer_audio_file_dir], []) + button_model_inference.click(model_inference, + [text_work_space_dir, text_role, slider_request_concurrency_num, + text_refer_audio_file_dir, text_url, dropdown_refer_type_param, + text_text, text_ref_path, text_ref_text, text_emotion, + text_test_content], + [text_model_inference_info, text_asr_audio_dir, text_inference_audio_file_dir]) + button_model_inference_result_open.click(open_file, [text_inference_audio_file_dir], []) + + # 设置重置刷新事件 + refresh_list = [] + # 基础 + refresh_list.extend([ + text_refer_audio_file_dir, + text_inference_audio_file_dir + ]) + # 第一步 + refresh_list.extend([ + text_sample_dir, + text_list_input, + text_base_voice_path + ]) + # 第二步 + # 第三步 + refresh_list.extend([ + text_asr_audio_dir, + text_text_similarity_analysis_path + ]) + # 第四步 + refresh_list.extend([ + text_base_audio_path, + text_compare_audio_dir + ]) + # 第五步 + + button_switch_role_and_refresh.click(switch_role_and_refresh, [], refresh_list) + + app.launch( + server_port=params.server_port, + inbrowser=True, + quiet=True, + ) + + +if __name__ == "__main__": + init_ui() diff --git a/Ref_Audio_Selector/start_ref_audio_selector_webui.bat b/Ref_Audio_Selector/start_ref_audio_selector_webui.bat new file mode 100644 index 000000000..aed2d79e1 --- /dev/null +++ b/Ref_Audio_Selector/start_ref_audio_selector_webui.bat @@ -0,0 +1,5 @@ +CHCP 65001 +@echo off +cd ../ +runtime\python.exe ./Ref_Audio_Selector/ref_audio_selector_webui.py +pause \ No newline at end of file diff --git a/Ref_Audio_Selector/tool/__init__.py b/Ref_Audio_Selector/tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/tool/asr/__init__.py b/Ref_Audio_Selector/tool/asr/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py 
b/Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py new file mode 100644 index 000000000..4b5bc95c6 --- /dev/null +++ b/Ref_Audio_Selector/tool/asr/fasterwhisper_asr_multi_level_dir.py @@ -0,0 +1,120 @@
+import argparse
+import os
+import traceback
+import Ref_Audio_Selector.config_param.config_params as params
+
+os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
+os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+
+import torch
+from faster_whisper import WhisperModel
+from tqdm import tqdm
+
+from tools.asr.config import check_fw_local_models
+from Ref_Audio_Selector.config_param.log_config import logger
+
+language_code_list = [
+    "af", "am", "ar", "as", "az",
+    "ba", "be", "bg", "bn", "bo",
+    "br", "bs", "ca", "cs", "cy",
+    "da", "de", "el", "en", "es",
+    "et", "eu", "fa", "fi", "fo",
+    "fr", "gl", "gu", "ha", "haw",
+    "he", "hi", "hr", "ht", "hu",
+    "hy", "id", "is", "it", "ja",
+    "jw", "ka", "kk", "km", "kn",
+    "ko", "la", "lb", "ln", "lo",
+    "lt", "lv", "mg", "mi", "mk",
+    "ml", "mn", "mr", "ms", "mt",
+    "my", "ne", "nl", "nn", "no",
+    "oc", "pa", "pl", "ps", "pt",
+    "ro", "ru", "sa", "sd", "si",
+    "sk", "sl", "sn", "so", "sq",
+    "sr", "su", "sv", "sw", "ta",
+    "te", "tg", "th", "tk", "tl",
+    "tr", "tt", "uk", "ur", "uz",
+    "vi", "yi", "yo", "zh", "yue",
+    "auto"]
+
+
+def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language, precision):
+    if '-local' in model_size:
+        model_size = model_size[:-6]
+        model_path = f'tools/asr/models/faster-whisper-{model_size}'
+    else:
+        model_path = model_size
+    if language == 'auto':
+        language = None  # leave the language unset so the model outputs the most probable one
+    logger.info(f"loading faster whisper model: {model_size} {model_path}")
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    try:
+        model = WhisperModel(model_path, device=device, compute_type=precision)
+    except Exception:
+        logger.error(traceback.format_exc())
+        return None
+
+    output = []
+
+    # Recursively walk the input directory and all of its subdirectories
+    for root, dirs, files in os.walk(input_folder):
+        for file_name in sorted(files):
+            # Only .wav files are processed
+            if file_name.endswith(".wav"):
+                try:
+                    file_path = os.path.join(root, file_name)
+                    original_text = os.path.basename(root)
+                    segments, info = model.transcribe(
+                        audio=file_path,
+                        beam_size=5,
+                        vad_filter=True,
+                        vad_parameters=dict(min_silence_duration_ms=700),
+                        language=language)
+                    text = ''
+
+                    if info.language == "zh":
+                        logger.info("检测为中文文本, 转 FunASR 处理")
+                        if "only_asr" not in globals():
+                            # Import lazily: the FunASR model is only downloaded/loaded
+                            # once Chinese audio is actually encountered
+                            from Ref_Audio_Selector.tool.asr.funasr_asr_multi_level_dir import only_asr
+                        text = only_asr(file_path)
+
+                    if text == '':
+                        for segment in segments:
+                            text += segment.text
+                    output.append(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
+                    print(f"{file_path}|{original_text}|{info.language.upper()}|{text}")
+                except Exception:
+                    # Log the failure and continue with the next file
+                    # instead of aborting the whole batch
+                    logger.error(traceback.format_exc())
+
+    os.makedirs(output_folder, exist_ok=True)
+    output_file_path = os.path.abspath(f'{output_folder}/{params.asr_filename}.list')
+
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(output))
+    logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+    return output_file_path
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_folder", type=str, required=True,
+                        help="Path to the folder containing WAV files.")
+    parser.add_argument("-o", "--output_folder", type=str, required=True,
+                        help="Output folder to store transcriptions.")
+    parser.add_argument("-s", "--model_size", type=str, default='large-v3',
+                        choices=check_fw_local_models(),
+                        help="Model Size of Faster Whisper")
+    parser.add_argument("-l", "--language", type=str, default='ja',
+                        choices=language_code_list,
+                        help="Language of the audio files.")
+    parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'],
+                        help="fp16 or fp32")
+
+    cmd = parser.parse_args()
+    output_file_path = execute_asr_multi_level_dir(
+        input_folder=cmd.input_folder,
+        output_folder=cmd.output_folder,
+        model_size=cmd.model_size,
+        language=cmd.language,
+        precision=cmd.precision,
+    )
diff --git a/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py b/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py new file mode 100644 index 000000000..abe45e9ee --- /dev/null +++ b/Ref_Audio_Selector/tool/asr/funasr_asr_multi_level_dir.py @@ -0,0 +1,94 @@
+# -*- coding:utf-8 -*-
+
+import argparse
+import os
+import traceback
+import Ref_Audio_Selector.config_param.config_params as params
+from Ref_Audio_Selector.config_param.log_config import logger
+from Ref_Audio_Selector.common.time_util import timeit_decorator
+from tqdm import tqdm
+from funasr import AutoModel
+
+path_asr = 'tools/asr/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+path_vad = 'tools/asr/models/speech_fsmn_vad_zh-cn-16k-common-pytorch'
+path_punc = 'tools/asr/models/punc_ct-transformer_zh-cn-common-vocab272727-pytorch'
+path_asr = path_asr if os.path.exists(
+    path_asr) else "iic/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+path_vad = path_vad if os.path.exists(path_vad) else "iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+path_punc = path_punc if os.path.exists(path_punc) else "iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+
+model = AutoModel(
+    model=path_asr,
+    model_revision="v2.0.4",
+    vad_model=path_vad,
+    vad_model_revision="v2.0.4",
+    punc_model=path_punc,
+    punc_model_revision="v2.0.4",
+)
+
+
+def only_asr(input_file):
+    try:
+        text = model.generate(input=input_file)[0]["text"]
+    except Exception:
+        text = ''
+        logger.error(traceback.format_exc())
+    return text
+
+
+@timeit_decorator
+def execute_asr_multi_level_dir(input_folder, output_folder, model_size, language):
+    output = []
+    # Recursively walk the input directory and all of its subdirectories
+    for root, dirs, files in os.walk(input_folder):
+        for name in sorted(files):
+            # Only .wav files are processed
+            if name.endswith(".wav"):
+                try:
+                    original_text = os.path.basename(root)
+                    # Build the full path to the input audio file
+                    input_file_path = os.path.join(root, name)
+                    input_file_path = os.path.normpath(input_file_path)  # normalize paths that may mix slash styles
+                    asr_text = model.generate(input=input_file_path)[0]["text"]
+
+                    output.append(f"{input_file_path}|{original_text}|{language.upper()}|{asr_text}")
+
+                except Exception:
+                    logger.error(traceback.format_exc())
+
+    # Create the output directory if it does not exist yet
+    output_dir_abs = os.path.abspath(output_folder)
+    os.makedirs(output_dir_abs, exist_ok=True)
+
+    # Build the output file path
+    output_file_path = os.path.join(output_dir_abs, f'{params.asr_filename}.list')
+
+    # Write the annotations to the list file
+    with open(output_file_path, "w", encoding="utf-8") as f:
+        f.write("\n".join(output))
+    logger.info(f"ASR 任务完成->标注文件路径: {output_file_path}\n")
+
+    return output_file_path
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_folder", type=str, required=True,
+                        help="Path to the folder containing WAV files.")
+    parser.add_argument("-o", "--output_folder", type=str, required=True,
+                        help="Output folder to store transcriptions.")
+    parser.add_argument("-s",
"--model_size", type=str, default='large', + help="Model Size of FunASR is Large") + parser.add_argument("-l", "--language", type=str, default='zh', choices=['zh'], + help="Language of the audio files.") + parser.add_argument("-p", "--precision", type=str, default='float16', choices=['float16', 'float32'], + help="fp16 or fp32") # 还没接入 + + cmd = parser.parse_args() + execute_asr_multi_level_dir( + input_folder=cmd.input_folder, + output_folder=cmd.output_folder, + model_size=cmd.model_size, + language=cmd.language, + ) diff --git a/Ref_Audio_Selector/tool/audio_check.py b/Ref_Audio_Selector/tool/audio_check.py new file mode 100644 index 000000000..d00ef5224 --- /dev/null +++ b/Ref_Audio_Selector/tool/audio_check.py @@ -0,0 +1,54 @@ +import os +import shutil +import Ref_Audio_Selector.common.common as common +import Ref_Audio_Selector.config_param.config_params as params +from Ref_Audio_Selector.config_param.log_config import logger + + +def remove_matching_audio_files_in_text_dir(text_dir, emotions_list): + count = 0 + emotions = [item['emotion'] for item in emotions_list] + for root, dirs, files in os.walk(text_dir): + for file in files: + if file.endswith(".wav"): + emotion_tag = os.path.basename(file)[:-4] + if emotion_tag not in emotions: + file_path = os.path.join(root, file) + logger.info(f"Deleting file: {file_path}") + try: + os.remove(file_path) + count += 1 + except Exception as e: + logger.error(f"Error deleting file {file_path}: {e}") + + return count + + +def delete_emotion_subdirectories(emotion_dir, emotions_list): + count = 0 + + emotions = [item['emotion'] for item in emotions_list] + + for entry in os.listdir(emotion_dir): + entry_path = os.path.join(emotion_dir, entry) + if os.path.isdir(entry_path): + if entry not in emotions: + logger.info(f"Deleting directory: {entry_path}") + try: + # 使用shutil.rmtree删除整个子目录及其内容 + shutil.rmtree(entry_path) + count += 1 + except Exception as e: + logger.error(f"Error deleting directory {entry_path}: {e}") + + return count + + +def sync_ref_audio(ref_audio_dir, inference_audio_dir): + ref_audio_manager = common.RefAudioListManager(ref_audio_dir) + ref_list = ref_audio_manager.get_ref_audio_list() + text_dir = os.path.join(inference_audio_dir, params.inference_audio_text_aggregation_dir) + emotion_dir = os.path.join(inference_audio_dir, params.inference_audio_emotion_aggregation_dir) + delete_text_wav_num = remove_matching_audio_files_in_text_dir(text_dir, ref_list) + delete_emotion_dir_num = delete_emotion_subdirectories(emotion_dir, ref_list) + return delete_text_wav_num, delete_emotion_dir_num diff --git a/Ref_Audio_Selector/tool/audio_config.py b/Ref_Audio_Selector/tool/audio_config.py new file mode 100644 index 000000000..1783f1de8 --- /dev/null +++ b/Ref_Audio_Selector/tool/audio_config.py @@ -0,0 +1,31 @@ +import os +import platform + + +def generate_audio_config(work_space_dir, template_str, audio_list, output_file_path): + # 定义一个空字符串来存储最终要写入文件的内容 + file_content = "" + + # 遍历参考音频列表 + for audio_info in audio_list: + emotion = audio_info['emotion'] + ref_path = audio_info['ref_path'] + ref_text = audio_info['ref_text'] + + relative_path = os.path.relpath(ref_path, work_space_dir) + if platform.system() == 'Windows': + relative_path = relative_path.replace('\\', '/') + + # 使用字符串模板替换变量 + formatted_line = template_str.replace('${emotion}', emotion).replace('${ref_path}', relative_path).replace( + '${ref_text}', ref_text) + + # 将格式化后的行添加到内容中,使用逗号和换行符分隔 + file_content += formatted_line + ",\n" + + # 删除最后一个逗号和换行符,确保格式整洁 + file_content = 
file_content[:-2] + + # 将内容写入输出文件 + with open(output_file_path, 'w', encoding='utf-8') as output_file: + output_file.write(file_content) diff --git a/Ref_Audio_Selector/tool/audio_inference.py b/Ref_Audio_Selector/tool/audio_inference.py new file mode 100644 index 000000000..b05602127 --- /dev/null +++ b/Ref_Audio_Selector/tool/audio_inference.py @@ -0,0 +1,238 @@ +import time +import os +import requests +import itertools +import multiprocessing +from concurrent.futures import ProcessPoolExecutor +import numpy as np +import Ref_Audio_Selector.config_param.config_params as params +from urllib.parse import urlparse, parse_qs, urlencode, urlunparse, quote +from Ref_Audio_Selector.config_param.log_config import logger, p_logger + + +class SetModelURLComposer: + def __init__(self, type, base_url, gpt_param_name, sovits_param_name): + self.type = type + self.base_url = base_url + self.gpt_param_name = gpt_param_name + self.sovits_param_name = sovits_param_name + + def is_valid(self): + if self.base_url is None or self.base_url == '': + raise Exception("请求地址不能为空") + if self.type in ['gpt', 'all']: + if self.gpt_param_name is None or self.gpt_param_name == '': + raise Exception("GPT参数名不能为空") + if self.type in ['sovits', 'all']: + if self.sovits_param_name is None or self.sovits_param_name == '': + raise Exception("Sovits参数名不能为空") + + def build_get_url(self, value_array, need_url_encode=True): + params = {} + if self.type == 'gpt': + params[self.gpt_param_name] = value_array[0] + if self.type == 'sovits': + params[self.sovits_param_name] = value_array[0] + if self.type == 'all': + params[self.gpt_param_name] = value_array[0] + params[self.sovits_param_name] = value_array[1] + return append_params_to_url(self.base_url, params, need_url_encode) + + def build_post_url(self, value_array, need_url_encode=True): + url = append_params_to_url(self.base_url, {}, need_url_encode) + params = {} + if self.type == 'gpt': + params[self.gpt_param_name] = value_array[0] + if self.type == 'sovits': + params[self.sovits_param_name] = value_array[0] + if self.type == 'all': + params[self.gpt_param_name] = value_array[0] + params[self.sovits_param_name] = value_array[1] + return url, params + + +class TTSURLComposer: + def __init__(self, base_url, refer_type_param, emotion_param_name, text_param_name, ref_path_param_name, ref_text_param_name): + self.base_url = base_url + # 角色情绪 or 参考音频 + self.refer_type_param = refer_type_param + self.emotion_param_name = emotion_param_name + self.text_param_name = text_param_name + self.ref_path_param_name = ref_path_param_name + self.ref_text_param_name = ref_text_param_name + + def is_valid(self): + if self.base_url is None or self.base_url == '': + raise ValueError("请输入url") + + if self.text_param_name is None or self.text_param_name == '': + raise ValueError("请输入text参数名") + + if self.emotion_param_name is None and self.ref_path_param_name is None and self.ref_text_param_name is None: + raise ValueError("请输入至少一个参考or情绪的参数") + + def is_emotion(self): + return self.refer_type_param == '角色情绪' + + def build_url_with_emotion(self, text_value, emotion_value, need_url_encode=True): + params = { + self.text_param_name: text_value, + self.emotion_param_name: emotion_value, + } + return append_params_to_url(self.base_url, params, need_url_encode) + + def build_url_with_ref(self, text_value, ref_path_value, ref_text_value, need_url_encode=True): + params = { + self.text_param_name: text_value, + self.ref_path_param_name: ref_path_value, + self.ref_text_param_name: ref_text_value, + } + 
return append_params_to_url(self.base_url, params, need_url_encode) + + +def append_params_to_url(url_with_params, params, need_url_encode): + if params: + query_params = '&'.join([f"{k}={v}" for k, v in params.items()]) + url_with_params += '?' + query_params if '?' not in url_with_params else '&' + query_params + return url_with_params if not need_url_encode else safe_encode_query_params(url_with_params) + + +def safe_encode_query_params(original_url): + # 分析URL以获取查询字符串部分 + parsed_url = urlparse(original_url) + query_params = parse_qs(parsed_url.query) + + # 将查询参数转换为编码过的字典(键值对会被转码) + encoded_params = {k: quote(v[0]) for k, v in query_params.items()} + + # 重新编码查询字符串 + new_query_string = urlencode(encoded_params, doseq=False) + + # 重建完整的URL + new_parsed_url = parsed_url._replace(query=new_query_string) + encoded_url = urlunparse(new_parsed_url) + + logger.info(encoded_url) + return encoded_url + + +def generate_audio_files_parallel(url_composer, text_list, emotion_list, output_dir_path, num_processes=1): + + # 将emotion_list均匀分成num_processes个子集 + emotion_groups = np.array_split(emotion_list, num_processes) + + with ProcessPoolExecutor(max_workers=num_processes) as executor: + futures = [ + executor.submit(generate_audio_files_for_emotion_group, url_composer, text_list, group, output_dir_path) + for group in emotion_groups] + for future in futures: + future.result() # 等待所有进程完成 + + +def generate_audio_files_for_emotion_group(url_composer, text_list, emotion_list, output_dir_path): + start_time = time.perf_counter() # 使用 perf_counter 获取高精度计时起点 + # Ensure the output directory exists + output_dir = os.path.abspath(output_dir_path) + os.makedirs(output_dir, exist_ok=True) + + # Create subdirectories for text and emotion categories + text_subdir = os.path.join(output_dir, params.inference_audio_text_aggregation_dir) + os.makedirs(text_subdir, exist_ok=True) + emotion_subdir = os.path.join(output_dir, params.inference_audio_emotion_aggregation_dir) + os.makedirs(emotion_subdir, exist_ok=True) + + all_count = len(text_list) * len(emotion_list) + has_generated_count = 0 + all_text_count = sum(len(item) for item in text_list) + + # 计算笛卡尔积 + cartesian_product = list(itertools.product(text_list, emotion_list)) + + for text, emotion in cartesian_product: + # Generate audio byte stream using the create_audio function + + emotion_name = emotion['emotion'] + + text_subdir_text = os.path.join(text_subdir, text) + os.makedirs(text_subdir_text, exist_ok=True) + text_subdir_text_file_path = os.path.join(text_subdir_text, emotion_name + '.wav') + + emotion_subdir_emotion = os.path.join(emotion_subdir, emotion_name) + os.makedirs(emotion_subdir_emotion, exist_ok=True) + emotion_subdir_emotion_file_path = os.path.join(emotion_subdir_emotion, text + '.wav') + + # 检查是否已经存在对应的音频文件,如果存在则跳过 + if os.path.exists(text_subdir_text_file_path) and os.path.exists(emotion_subdir_emotion_file_path): + has_generated_count += 1 + logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}") + continue + + if url_composer.is_emotion(): + real_url = url_composer.build_url_with_emotion(text, emotion['emotion'], False) + else: + real_url = url_composer.build_url_with_ref(text, emotion['ref_path'], emotion['ref_text'], False) + + audio_bytes = inference_audio_from_api(real_url) + + # Write audio bytes to the respective files + with open(text_subdir_text_file_path, 'wb') as f: + f.write(audio_bytes) + with open(emotion_subdir_emotion_file_path, 'wb') as f: + f.write(audio_bytes) + + has_generated_count += 1 + 
logger.info(f"进程ID: {os.getpid()}, 进度: {has_generated_count}/{all_count}") + end_time = time.perf_counter() # 获取计时终点 + elapsed_time = end_time - start_time # 计算执行耗时 + # 记录日志内容 + log_message = f"进程ID: {os.getpid()}, generate_audio_files_for_emotion_group 执行耗时: {elapsed_time:.6f} 秒;推理数量: {has_generated_count}; 字符总数:{all_text_count};每秒推理字符数:{all_text_count*len(emotion_list) / elapsed_time:.3f};" + p_logger.info(log_message) + logger.info(log_message) + + +def inference_audio_from_api(url): + logger.info(f'inference_audio_from_api url: {url}') + # 发起GET请求 + response = requests.get(url, stream=True) + + # 检查响应状态码是否正常(例如200表示成功) + if response.status_code == 200: + # 返回音频数据的字节流 + return response.content + else: + raise Exception(f"Failed to fetch audio from API. Server responded with status code {response.status_code}.message: {response.json()}") + + +def start_api_set_model(set_model_url_composer, gpt_models, sovits_models): + url, post_body = set_model_url_composer.build_post_url([gpt_models, sovits_models], True) + logger.info(f'set_model_url_composer url: {set_model_url_composer}') + logger.info(f'start_api_set_model url: {url}') + logger.info(f'start_api_set_model post_body: {post_body}') + response = requests.post(url, json=post_body) + if response.status_code == 200: + result = response.text + return result + else: + return f'请求失败,状态码:{response.status_code}' + + +def start_api_v2_set_gpt_model(set_model_url_composer, gpt_models): + url = set_model_url_composer.build_get_url([gpt_models], False) + logger.info(f'start_api_v2_set_gpt_model url: {url}') + response = requests.get(url) + if response.status_code == 200: + result = response.text + return result + else: + return f'请求失败,状态码:{response.status_code}' + + +def start_api_v2_set_sovits_model(set_model_url_composer, sovits_models): + url = set_model_url_composer.build_get_url([sovits_models], False) + logger.info(f'start_api_v2_set_sovits_model url: {url}') + response = requests.get(url) + if response.status_code == 200: + result = response.text + return result + else: + return f'请求失败,状态码:{response.status_code}' diff --git a/Ref_Audio_Selector/tool/audio_sample.py b/Ref_Audio_Selector/tool/audio_sample.py new file mode 100644 index 000000000..8f02c7e0d --- /dev/null +++ b/Ref_Audio_Selector/tool/audio_sample.py @@ -0,0 +1,162 @@ +import os +import shutil +import random +import librosa +from Ref_Audio_Selector.config_param.log_config import logger + + +def check_audio_duration(path, min_duration=3, max_duration=10): + try: + + # 直接计算音频文件的时长(单位:秒) + duration = librosa.get_duration(filename=path) + + # 判断时长是否在3s至10s之间 + if min_duration <= duration <= max_duration: + return True + else: + return False + + except Exception as e: + logger.error(f"无法打开或处理音频文件:{e}") + return None + + +def convert_from_list(list_file, output_dir): + # 创建输出目录,如果它不存在的话 + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + # 解析.list文件,并操作文件 + with open(list_file, 'r', encoding='utf-8') as file: + lines = file.readlines() + + for line in lines: + parts = line.strip().split('|') + if len(parts) != 4: + logger.error(f"Line format incorrect: {line}") + continue + + audio_path, _, _, transcription = parts + + # 构建新的文件名和路径 + new_filename = transcription.strip() + '.wav' + # new_filename = new_filename.replace(' ', '_') # 移除空格 + # new_filename = ''.join(e for e in new_filename if e.isalnum() or e in ['_', '.']) # 移除非法字符 + new_path = os.path.join(output_dir, new_filename) + + # 如果目标文件已存在,不要覆盖 + if os.path.exists(new_path): + logger.info(f"File already exists: 
{new_path}") + continue + + try: + # 检查音频文件是否存在 + if not os.path.exists(audio_path): + logger.info(f"Audio file does not exist: {audio_path}") + continue + + if check_audio_duration(audio_path): + # 复制音频文件到output目录并重命名 + shutil.copy2(audio_path, new_path) + logger.info(f"File copied and renamed to: {new_path}") + else: + logger.info(f"File skipped due to duration: {audio_path}") + + except Exception as e: + logger.error(f"An error occurred while processing: {audio_path}") + logger.error(e) + + logger.info("Processing complete.") + + +def sample(output_audio_dir, similarity_list, subsection_num, sample_num): + # 按照相似度分值降序排序相似度列表 + similarity_list.sort(key=lambda x: x['score'], reverse=True) + + # 计算每段的起始索引 + step = len(similarity_list) // subsection_num + if len(similarity_list) % subsection_num != 0: + step += 1 + + # 分段并随机采样 + for i in range(subsection_num): + start = i * step + end = (i + 1) * step + end = min(end, len(similarity_list)) # 防止最后一段越界 + + # 创建子列表 + subsection = similarity_list[start:end] + # 在子列表上随机打乱 + random.shuffle(subsection) + + # 从打乱后的子列表中抽取相应数量的个体 + num = min(sample_num, len(subsection)) + sampled_subsection = subsection[:num] + + # 创建并进入子目录 + subdir_name = f'emotion_{i + 1}' + subdir_path = os.path.join(output_audio_dir, subdir_name) + os.makedirs(subdir_path, exist_ok=True) + + # 复制采样结果的音频到子目录 + for item in sampled_subsection: + src_path = item['wav_path'] + dst_path = os.path.join(subdir_path, os.path.basename(src_path)) + shutil.copyfile(src_path, dst_path) + + logger.info("Sampling completed.") + + +def parse_similarity_file(file_path): + """ + 解析指定文本文件,将其中的内容以元组形式存入列表。 + + 参数: + file_path (str): 待解析的文本文件路径。 + + 返回: + list[tuple[float, str]]: 存储浮点数和路径的元组列表。 + """ + result_list = [] + + with open(file_path, 'r', encoding='utf-8') as file: + for line in file: + # 去除行尾换行符并按'|'分割 + score, filepath = line.strip().split('|') + + # 将浮点数字符串转换为浮点数类型 + score = float(score) + + # 将得分和路径作为元组添加到结果列表 + result_list.append({ + 'score': score, + 'wav_path': filepath + }) + + return result_list + + +def copy_and_move(output_audio_directory, similarity_scores): + # 确保新目录存在 + if not os.path.exists(output_audio_directory): + os.makedirs(output_audio_directory) + + # 遍历并复制文件 + for item in similarity_scores: + # 构造新的文件名 + base_name = os.path.basename(item['wav_path'])[:-4] # 去掉.wav扩展名 + new_name = f"{item['score'] * 10000:04.0f}-{base_name}.wav" + + # 新文件的完整路径 + new_path = os.path.join(output_audio_directory, new_name) + + # 复制文件到新目录 + shutil.copyfile(item['wav_path'], new_path) + + logger.info("已完成复制和重命名操作。") + + +if __name__ == '__main__': + similarity_list = parse_similarity_file("D:/tt/similarity/啊,除了伊甸和樱,竟然还有其他人会提起我?.txt") + sample('D:/tt/similarity/output', similarity_list, 10, 4) diff --git a/Ref_Audio_Selector/tool/speaker_verification/__init__.py b/Ref_Audio_Selector/tool/speaker_verification/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py new file mode 100644 index 000000000..4f11fdf27 --- /dev/null +++ b/Ref_Audio_Selector/tool/speaker_verification/voice_similarity.py @@ -0,0 +1,142 @@ +import argparse +import os +import torchaudio +import torchaudio.transforms as T +import platform +import Ref_Audio_Selector.config_param.config_params as params +import Ref_Audio_Selector.config_param.log_config as log_config +from Ref_Audio_Selector.common.time_util import timeit_decorator +from Ref_Audio_Selector.common.model_manager 
import speaker_verification_models as models + +from modelscope.pipelines import pipeline + + +def init_model(model_type='speech_campplus_sv_zh-cn_16k-common'): + log_config.logger.info(f'人声识别模型类型:{model_type}') + return pipeline( + task=models[model_type]['task'], + model=models[model_type]['model'], + model_revision=models[model_type]['model_revision'] + ) + + +@timeit_decorator +def compare_audio_and_generate_report(reference_audio_path, comparison_dir_path, output_file_path, model_type): + sv_pipeline = init_model(model_type) + + # Step 1: 获取比较音频目录下所有音频文件的路径 + comparison_audio_paths = [os.path.join(comparison_dir_path, f) for f in os.listdir(comparison_dir_path) if + f.endswith('.wav')] + + if platform.system() == 'Windows': + # 因为这个模型是基于16k音频数据训练的,为了避免后续比较时,每次都对参考音频进行重采样,所以,提前进行了采样 + # windows不支持torchaudio.sox_effects.apply_effects_tensor,所以改写了依赖文件中的重采样方法 + # 改用torchaudio.transforms.Resample进行重采样,如果在非windows环境下,没有更改依赖包的采样方法的话, + # 使用这段代码进行预采样会出现因为采样方法不同,而导致的模型相似度计算不准确的问题 + # 当然如果在windows下,使用了其他的采样方法,也会出现不准确的问题 + if params.enable_pre_sample == 'true': + reference_audio_16k = ensure_16k_wav(reference_audio_path) + else: + reference_audio_16k = reference_audio_path + else: + reference_audio_16k = reference_audio_path + + # Step 2: 用参考音频依次比较音频目录下的每个音频,获取相似度分数及对应路径 + all_count = len(comparison_audio_paths) + has_processed_count = 0 + similarity_scores = [] + for audio_path in comparison_audio_paths: + score = sv_pipeline([reference_audio_16k, audio_path])['score'] + similarity_scores.append({ + 'score': score, + 'path': audio_path + }) + has_processed_count += 1 + log_config.logger.info(f'进度:{has_processed_count}/{all_count}') + + # Step 3: 根据相似度分数降序排列 + similarity_scores.sort(key=lambda x: x['score'], reverse=True) + + # Step 4: 处理输出文件不存在的情况,创建新文件 + if not os.path.exists(output_file_path): + open(output_file_path, 'w').close() # Create an empty file + + # Step 5: 将排序后的结果写入输出结果文件(支持中文) + formatted_scores = [f'{item["score"]}|{item["path"]}' for item in similarity_scores] + with open(output_file_path, 'w', encoding='utf-8') as f: + # 使用'\n'将每个字符串分开,使其写入不同行 + content = '\n'.join(formatted_scores) + f.write(content) + + +def ensure_16k_wav(audio_file_path, target_sample_rate=16000): + """ + 输入一个音频文件地址,判断其采样率并决定是否进行重采样,然后将结果保存到指定的输出文件。 + + 参数: + audio_file_path (str): 音频文件路径。 + output_file_path (str): 保存重采样后音频数据的目标文件路径。 + target_sample_rate (int, optional): 目标采样率,默认为16000Hz。 + """ + # 读取音频文件并获取其采样率 + waveform, sample_rate = torchaudio.load(audio_file_path) + + # 判断是否需要重采样 + if sample_rate == target_sample_rate: + return audio_file_path + else: + + # 创建Resample实例 + resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate) + + # 应用重采样 + resampled_waveform = resampler(waveform) + + # 创建临时文件夹 + os.makedirs(params.temp_dir, exist_ok=True) + + # 设置临时文件名 + temp_file_path = os.path.join(params.temp_dir, os.path.basename(audio_file_path)) + + # 保存重采样后的音频到指定文件 + torchaudio.save(temp_file_path, resampled_waveform, target_sample_rate) + + return temp_file_path + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Audio processing script arguments") + + # Reference audio path + parser.add_argument("-r", "--reference_audio", type=str, required=True, + help="Path to the reference WAV file.") + + # Comparison directory path + parser.add_argument("-c", "--comparison_dir", type=str, required=True, + help="Path to the directory containing comparison WAV files.") + + # Output file path + parser.add_argument("-o", "--output_file", type=str, required=True, + help="Path to the 
output file where results will be written.") + + # Model Type + parser.add_argument("-m", "--model_type", type=str, required=True, + help="Path to the model type.") + + return parser.parse_args() + + +if __name__ == '__main__': + cmd = parse_arguments() + compare_audio_and_generate_report( + reference_audio_path=cmd.reference_audio, + comparison_dir_path=cmd.comparison_dir, + output_file_path=cmd.output_file, + model_type=cmd.model_type, + ) + + # compare_audio_and_generate_report( + # reference_audio_path="D:/tt/渡鸦/refer_audio_all/也对,你的身份和我们不同吗?.wav", + # comparison_dir_path='D:/tt/渡鸦/refer_audio_all', + # output_file_path='D:/tt/渡鸦/test.txt', + # ) diff --git a/Ref_Audio_Selector/tool/text_check.py b/Ref_Audio_Selector/tool/text_check.py new file mode 100644 index 000000000..6281940c0 --- /dev/null +++ b/Ref_Audio_Selector/tool/text_check.py @@ -0,0 +1,77 @@ +import os +import Ref_Audio_Selector.common.common as common +import Ref_Audio_Selector.tool.audio_check as audio_check +from Ref_Audio_Selector.config_param.log_config import logger + + +def parse_text_similarity_result_txt(file_path): + """ + 解析指定格式的txt文件,每行格式:f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}" + + :param file_path: txt文件的路径 + :return: 包含解析后数据的字典列表 + """ + data_list = [] + with open(file_path, 'r', encoding='utf-8') as file: + for line in file: + # 使用'|'作为分隔符分割每行数据 + parts = line.strip().split('|') + if len(parts) == 3: + # 将分割后的字符串转换为浮点数、整数和字符串 + try: + item = { + 'average_similarity_score': float(parts[0]), + 'count': int(parts[1]), + 'emotion': parts[2] + } + data_list.append(item) + except ValueError as e: + # 如果转换失败,打印错误信息并跳过该行 + logger.error(f"Error parsing line: {line.strip()} - {e}") + + return data_list + + +def remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary): + """ + 根据条件删除低相似度音频文件并返回删除数量。 + + :param ref_audio_list: 包含音频路径和情感属性的列表 + :param report_list: 包含相似度评分和情感属性的列表 + :param audio_text_similarity_boundary: 相似度阈值 + :return: 删除的文件数量 + """ + deleted_count = 0 + + # 筛选出平均相似度低于阈值的报告 + low_similarity_reports = [report for report in report_list if + report['average_similarity_score'] < audio_text_similarity_boundary] + + # 遍历低相似度报告,查找并删除对应音频文件 + for report in low_similarity_reports: + emotion = report['emotion'] + # 查找ref_audio_list中相同情感的音频文件路径 + matching_refs = [ref for ref in ref_audio_list if ref['emotion'] == emotion] + for match in matching_refs: + ref_path = match['ref_path'] + # 检查文件是否存在,然后尝试删除 + if os.path.exists(ref_path): + try: + os.remove(ref_path) + deleted_count += 1 + logger.info(f"Deleted file: {ref_path}") + except Exception as e: + logger.error(f"Error deleting file {ref_path}: {e}") + else: + logger.error(f"File not found: {ref_path}") + + return deleted_count + + +def delete_ref_audio_below_boundary(ref_audio_path, text_similarity_result_path, sync_inference_audio_dir, + audio_text_similarity_boundary): + ref_audio_list = common.RefAudioListManager(ref_audio_path).get_ref_audio_list() + report_list = parse_text_similarity_result_txt(text_similarity_result_path) + count = remove_low_similarity_files(ref_audio_list, report_list, audio_text_similarity_boundary) + audio_check.sync_ref_audio(ref_audio_path, sync_inference_audio_dir) + return count diff --git a/Ref_Audio_Selector/tool/text_comparison/__init__.py b/Ref_Audio_Selector/tool/text_comparison/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/tool/text_comparison/asr_text_process.py 
b/Ref_Audio_Selector/tool/text_comparison/asr_text_process.py new file mode 100644 index 000000000..6511cba7a --- /dev/null +++ b/Ref_Audio_Selector/tool/text_comparison/asr_text_process.py @@ -0,0 +1,161 @@ +import os +import argparse +from collections import defaultdict +from operator import itemgetter +from Ref_Audio_Selector.common.time_util import timeit_decorator +import Ref_Audio_Selector.tool.text_comparison.text_comparison as text_comparison +import Ref_Audio_Selector.config_param.config_params as params +import Ref_Audio_Selector.common.common as common +from Ref_Audio_Selector.config_param.log_config import logger + + +def parse_asr_file(file_path): + output = [] + + with open(file_path, 'r', encoding='utf-8') as file: + for line in file: + # 假设每行都是正确的格式,且"|"'是固定分隔符 + input_file_path, original_text, language, asr_text = line.strip().split('|') + + emotion = common.get_filename_without_extension(input_file_path) + + # 将解析出的数据构造成新的字典或元组等结构 + parsed_data = { + 'emotion': emotion, + 'input_file_path': input_file_path, + 'original_text': original_text, + 'language': language, + 'asr_text': asr_text, + 'similarity_score': 0 + } + + output.append(parsed_data) + + return output + + +@timeit_decorator +def calculate_similarity_and_append_to_list(input_list, boundary): + all_count = len(input_list) + has_been_processed_count = 0 + for item in input_list: + original_score, similarity_score = text_comparison.calculate_result(item['original_text'], item['asr_text'], boundary) + item['similarity_score'] = similarity_score + item['original_score'] = original_score + has_been_processed_count += 1 + logger.info(f'进度:{has_been_processed_count}/{all_count}') + + return input_list + + +def calculate_average_similarity_by_emotion(data_list): + result_dict = defaultdict(list) + + for item in data_list: + emotion = item['emotion'] + similarity_score = item['similarity_score'] + result_dict[emotion].append(similarity_score) + + average_scores = [{'emotion': emotion, 'average_similarity_score': sum(scores) / len(scores), 'count': len(scores)} + for emotion, scores in result_dict.items()] + + average_scores.sort(key=lambda x: x['average_similarity_score'], reverse=True) + + return average_scores + + +def group_and_sort_by_field(data, group_by_field): + # 创建一个空的结果字典,键是group_by_field指定的字段,值是一个列表 + result_dict = defaultdict(list) + + # 遍历输入列表 + for item in data: + # 根据指定的group_by_field将当前元素添加到对应键的列表中 + key_to_group = item[group_by_field] + result_dict[key_to_group].append(item) + + # 对每个键对应的列表中的元素按similarity_score降序排序 + for key in result_dict: + result_dict[key].sort(key=itemgetter('similarity_score'), reverse=True) + + # 将结果字典转换为列表,每个元素是一个包含键(emotion或original_text)和排序后数组的元组 + result_list = [(k, v) for k, v in result_dict.items()] + + return result_list + + +def format_list_to_text(data_list, output_filename): + with open(output_filename, 'w', encoding='utf-8') as output_file: + output_file.write('放大后的相似度分值|原始分值|ASR文本|原文文本\n') + for key, items in data_list: + # 写入情绪标题 + output_file.write(key + '\n') + + # 写入每条记录 + for item in items: + formatted_line = f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['original_text']}\n" + output_file.write(formatted_line) + + +def format_list_to_emotion(data_list, output_filename): + with open(output_filename, 'w', encoding='utf-8') as output_file: + output_file.write('放大后的相似度分值|原始分值|ASR文本|情绪类型\n') + for key, items in data_list: + # 写入情绪标题 + output_file.write(key + '\n') + + # 写入每条记录 + for item in items: + formatted_line = 
f"{item['similarity_score']}|{item['original_score']}|{item['asr_text']}|{item['emotion']}\n" + output_file.write(formatted_line) + + +@timeit_decorator +def process(asr_file_path, output_dir, similarity_enlarge_boundary): + # 检查输出目录是否存在,如果不存在则创建 + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + records = parse_asr_file(asr_file_path) + calculate_similarity_and_append_to_list(records, similarity_enlarge_boundary) + average_similarity_list = calculate_average_similarity_by_emotion(records) + + average_similarity_file = os.path.join(output_dir, + f'{params.text_emotion_average_similarity_report_filename}.txt') + average_similarity_content = \ + '\n'.join([f"{item['average_similarity_score']}|{item['count']}|{item['emotion']}" for item in average_similarity_list]) + common.write_text_to_file(average_similarity_content, average_similarity_file) + + emotion_detail_list = group_and_sort_by_field(records, 'emotion') + + emotion_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_emotion_detail_filename}.txt') + format_list_to_text(emotion_detail_list, emotion_detail_file) + + original_text_detail_list = group_and_sort_by_field(records, 'original_text') + + original_text_detail_file = os.path.join(output_dir, f'{params.text_similarity_by_text_detail_filename}.txt') + format_list_to_emotion(original_text_detail_list, original_text_detail_file) + + logger.info('文本相似度分析完成。') + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Process ASR files and analyze similarity.") + + parser.add_argument("-a", "--asr_file_path", type=str, required=True, + help="Path to the directory containing ASR files or path to a single ASR file.") + + parser.add_argument("-o", "--output_dir", type=str, required=True, + help="Path to the directory where the analysis results should be saved.") + + parser.add_argument("-b", "--similarity_enlarge_boundary", type=float, required=True, + help="Similarity score boundary value to be used in your calculations.") + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + cmd = parse_arguments() + # print(cmd) + process(cmd.asr_file_path, cmd.output_dir, cmd.similarity_enlarge_boundary) diff --git a/Ref_Audio_Selector/tool/text_comparison/text_comparison.py b/Ref_Audio_Selector/tool/text_comparison/text_comparison.py new file mode 100644 index 000000000..2c4a5302a --- /dev/null +++ b/Ref_Audio_Selector/tool/text_comparison/text_comparison.py @@ -0,0 +1,128 @@ +import os +import torch +from transformers import AutoTokenizer, AutoModel +from scipy.spatial.distance import cosine +from Ref_Audio_Selector.config_param.log_config import logger + +bert_path = os.environ.get( + "bert_path", "GPT_SoVITS/pretrained_models/chinese-roberta-wwm-ext-large" +) + +# Set device to GPU if available, else CPU +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +logger.info(f'使用计算设备: {device}') + +tokenizer = AutoTokenizer.from_pretrained(bert_path) +model = AutoModel.from_pretrained(bert_path).to(device) + + +def calculate_similarity(text1, text2, max_length=512): + # 预处理文本,设置最大长度 + inputs1 = tokenizer(text1, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device) + inputs2 = tokenizer(text2, padding=True, truncation=True, max_length=max_length, return_tensors='pt').to(device) + + # 获取句子向量(这里是取CLS token的向量并展平为一维) + with torch.no_grad(): + encoded_text1 = model(**inputs1)[0][:, 0, :].flatten() + encoded_text2 = model(**inputs2)[0][:, 0, :].flatten() + + # 确保转换为numpy数组并且是一维的 + 
similarity = 1 - cosine(encoded_text1.cpu().numpy().flatten(), encoded_text2.cpu().numpy().flatten()) + + return similarity + + +# 对boundary到1区间的值进行放大 +def adjusted_similarity(similarity_score2, boundary=0.8): + if similarity_score2 < boundary: + return 0 + + # 倍数 + multiple = 1 / (1 - boundary) + + adjusted_score = (similarity_score2 - boundary) * multiple + + return adjusted_score + + +def calculate_result(t1, t2, boundary): + # 计算并打印相似度 + similarity_score2 = calculate_similarity(t1, t2) + + # 调整相似度 + adjusted_similarity_score2 = adjusted_similarity(similarity_score2, boundary) + + return similarity_score2, adjusted_similarity_score2 + + +def print_result(t1, t2, boundary): + print(f't2: {t2}') + # 计算并打印相似度 + similarity_score2 = calculate_similarity(t1, t2) + print(f"两句话的相似度为: {similarity_score2:.4f}") + + # 调整相似度 + adjusted_similarity_score2 = adjusted_similarity(similarity_score2, boundary) + print(f"调整后的相似度为: {adjusted_similarity_score2:.4f}") + + +def test(boundary): + # 原始文本 + text1 = "这是第一个句子" + list = """ + 这是第一个句子 + 这是第二个句子。 + 那么,这是第三个表达。 + 当前呈现的是第四个句子。 + 接下来,我们有第五句话。 + 在此,展示第六条陈述。 + 继续下去,这是第七个短句。 + 不容忽视的是第八个表述。 + 顺延着序列,这是第九句。 + 此处列举的是第十个说法。 + 进入新的篇章,这是第十一个句子。 + 下一段内容即为第十二个句子。 + 显而易见,这是第十三个叙述。 + 渐进地,我们来到第十四句话。 + 向下滚动,您会看到第十五个表达。 + 此刻,呈现在眼前的是第十六个句子。 + 它们中的一个——第十七个句子在此。 + 如同链条般连接,这是第十八个断言。 + 按照顺序排列,接下来是第十九个话语。 + 逐一列举,这是第二十个陈述句。 + 结构相似,本例给出第二十一个实例句。 + 这是最初的陈述句。 + 首先表达的是这一个句子。 + 第一句内容即为此处所示。 + 这是起始的叙述段落。 + 开篇所展示的第一句话就是这个。 + 明媚的阳光洒满大地 + 窗外飘落粉色樱花瓣 + 笔尖轻触纸面思绪万千 + 深夜的月光如水般静谧 + 穿越丛林的小径蜿蜒曲折 + 浅酌清茶品味人生百态 + 破晓时分雄鸡一唱天下白 + 草原上奔驰的骏马无拘无束 + 秋叶纷飞描绘季节更替画卷 + 寒冬雪夜炉火旁围坐共话家常 + kszdRjYXw + pfsMgTlVHnB + uQaGxIbWz + ZtqNhPmKcOe + jfyrXsStVUo + wDiEgLkZbn + yhNvAfUmqC + TpKjxMrWgs + eBzHUaFJtYd + oQnXcVSiPkL + 00000 + """ + list2 = list.strip().split('\n') + for item in list2: + print_result(text1, item, boundary) + + +if __name__ == '__main__': + test(0.9) diff --git a/Ref_Audio_Selector/ui_init/__init__.py b/Ref_Audio_Selector/ui_init/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/Ref_Audio_Selector/ui_init/init_ui_param.py b/Ref_Audio_Selector/ui_init/init_ui_param.py new file mode 100644 index 000000000..87367bb42 --- /dev/null +++ b/Ref_Audio_Selector/ui_init/init_ui_param.py @@ -0,0 +1,197 @@ +import os +import multiprocessing +import Ref_Audio_Selector.config_param.config_params as params +import Ref_Audio_Selector.tool.audio_inference as audio_inference +import Ref_Audio_Selector.common.common as common + +rw_param = params.config_manager.get_rw_param() +# -------------------基本信息--------------------------- + +# 角色所在工作目录 +base_dir_default = None +# 工作目录 +text_work_space_dir_default = None +# 角色名称 +text_role_default = None +# 参考音频所在目录 +text_refer_audio_file_dir_default = None +# 推理音频所在目录 +text_inference_audio_file_dir_default = None + +# -------------------第一步------------------------------ + +# 参考音频抽样目录 +text_sample_dir_default = None +# 分段数 +slider_subsection_num_default = None +# 每段随机抽样个数 +slider_sample_num_default = None + +# -------------------第二步------------------------------ + +# api服务模型切换接口地址 +text_api_set_model_base_url_default = None +# GPT模型参数名 +text_api_gpt_param_default = None +# SoVITS模型参数名 +text_api_sovits_param_default = None +# api服务GPT模型切换接口地址 +text_api_v2_set_gpt_model_base_url_default = None +# GPT模型参数名 +text_api_v2_gpt_model_param_default = None +# api服务SoVITS模型切换接口地址 +text_api_v2_set_sovits_model_base_url_default = None +# SoVITS模型参数名 +text_api_v2_sovits_model_param_default = None +# 推理服务请求地址与参数 +text_url_default = None +# 推理服务请求完整地址 
+text_whole_url_default = None +# 文本参数名 +text_text_default = None +# 参考参数类型 +dropdown_refer_type_param_default = None +# 参考音频路径参数名 +text_ref_path_default = None +# 参考音频文本参数名 +text_ref_text_default = None +# 角色情绪参数名 +text_emotion_default = None +# 待推理文本路径 +text_test_content_default = None +# 请求并发数 +slider_request_concurrency_num_default = 3 +# 最大并发数 +slider_request_concurrency_max_num = None + +# -------------------第三步------------------------------ + +# 待asr的音频所在目录 +text_asr_audio_dir_default = None +# 待分析的文件路径 +text_text_similarity_analysis_path_default = None +# 文本相似度放大边界 +slider_text_similarity_amplification_boundary_default = 0.90 +# 文本相似度分析结果文件所在路径 +text_text_similarity_result_path_default = None + +# -------------------第四步------------------------------ +# -------------------第五步------------------------------ +# 模板内容 +text_template_default = None + + +def empty_default(vale, default_value): + if vale is None or vale == "": + return default_value + else: + return vale + + +def init_base(): + global text_work_space_dir_default, text_role_default, base_dir_default, text_refer_audio_file_dir_default, text_inference_audio_file_dir_default + + text_work_space_dir_default = rw_param.read(rw_param.work_dir) + text_role_default = rw_param.read(rw_param.role) + base_dir_default = os.path.join(text_work_space_dir_default, text_role_default) + + text_refer_audio_file_dir_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.reference_audio_dir)) + + text_inference_audio_file_dir_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.inference_audio_dir)) + + +def init_first(): + global text_sample_dir_default, slider_subsection_num_default, slider_sample_num_default + + text_sample_dir_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.list_to_convert_reference_audio_dir)) + + slider_subsection_num_default = int(empty_default(rw_param.read(rw_param.subsection_num), 10)) + + slider_sample_num_default = (empty_default(rw_param.read(rw_param.sample_num), 4)) + + +def init_second(): + global text_api_set_model_base_url_default, text_api_gpt_param_default, text_api_sovits_param_default, text_api_v2_set_gpt_model_base_url_default, text_api_v2_gpt_model_param_default + global text_api_v2_set_sovits_model_base_url_default, text_api_v2_sovits_model_param_default, text_url_default, text_whole_url_default, text_text_default, dropdown_refer_type_param_default, text_ref_path_default + global text_ref_text_default, text_emotion_default, text_test_content_default, slider_request_concurrency_num_default, slider_request_concurrency_max_num + + text_api_set_model_base_url_default = empty_default(rw_param.read(rw_param.api_set_model_base_url), + 'http://localhost:9880/set_model') + text_api_gpt_param_default = empty_default(rw_param.read(rw_param.api_gpt_param), 'gpt_model_path') + text_api_sovits_param_default = empty_default(rw_param.read(rw_param.api_sovits_param), 'sovits_model_path') + + text_api_v2_set_gpt_model_base_url_default = empty_default(rw_param.read(rw_param.api_v2_set_gpt_model_base_url), + 'http://localhost:9880/set_gpt_weights') + text_api_v2_gpt_model_param_default = empty_default(rw_param.read(rw_param.api_v2_gpt_model_param), 'weights_path') + + text_api_v2_set_sovits_model_base_url_default = empty_default( + rw_param.read(rw_param.api_v2_set_sovits_model_base_url), 'http://localhost:9880/set_sovits_weights') + text_api_v2_sovits_model_param_default = 
empty_default(rw_param.read(rw_param.api_v2_sovits_model_param), 'weights_path') + + text_url_default = empty_default(rw_param.read(rw_param.text_url), + 'http://localhost:9880?prompt_language=中文&text_language=中文&cut_punc=,.;?!、,。?!;:…') + text_text_default = empty_default(rw_param.read(rw_param.text_param), 'text') + dropdown_refer_type_param_default = empty_default(rw_param.read(rw_param.refer_type_param), '参考音频') + + text_ref_path_default = empty_default(rw_param.read(rw_param.ref_path_param), 'refer_wav_path') + text_ref_text_default = empty_default(rw_param.read(rw_param.ref_text_param), 'prompt_text') + text_emotion_default = empty_default(rw_param.read(rw_param.emotion_param), 'emotion') + + text_whole_url_default = whole_url(text_url_default, dropdown_refer_type_param_default, text_text_default, + text_ref_path_default, text_ref_text_default, text_emotion_default) + + text_test_content_default = empty_default(rw_param.read(rw_param.test_content_path), params.default_test_text_path) + + slider_request_concurrency_max_num = multiprocessing.cpu_count() + + slider_request_concurrency_num_default = empty_default(rw_param.read(rw_param.request_concurrency_num), 3) + + slider_request_concurrency_num_default = min(int(slider_request_concurrency_num_default), slider_request_concurrency_max_num) + + +# 基于请求路径和参数,合成完整的请求路径 +def whole_url(text_url, dropdown_refer_type_param, text_text, text_ref_path, text_ref_text, text_emotion): + url_composer = audio_inference.TTSURLComposer(text_url, dropdown_refer_type_param, text_emotion, text_text, + text_ref_path, text_ref_text) + if url_composer.is_emotion(): + text_whole_url = url_composer.build_url_with_emotion('测试内容', '情绪类型', False) + else: + text_whole_url = url_composer.build_url_with_ref('测试内容', '参考路径', '参考文本', False) + return text_whole_url + + +def init_third(): + global text_asr_audio_dir_default, text_text_similarity_analysis_path_default, slider_text_similarity_amplification_boundary_default, text_text_similarity_result_path_default + + text_asr_audio_dir_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.inference_audio_dir, params.inference_audio_text_aggregation_dir)) + text_text_similarity_analysis_path_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.asr_filename + '.list')) + slider_text_similarity_amplification_boundary_default = empty_default( + rw_param.read(rw_param.text_similarity_amplification_boundary), 0.90) + text_text_similarity_result_path_default = common.check_path_existence_and_return( + os.path.join(base_dir_default, params.text_emotion_average_similarity_report_filename + '.txt')) + + +def init_fourth(): + pass + + +def init_fifth(): + global text_template_default + + default_template_path = params.default_template_path + text_template_default = empty_default(rw_param.read(rw_param.text_template), + common.read_file(default_template_path)) + + +def init_all(): + init_base() + init_first() + init_second() + init_third() + init_fourth() + init_fifth() diff --git "a/Ref_Audio_Selector/\345\217\202\350\200\203\351\237\263\351\242\221\347\255\233\351\200\211\346\265\201\347\250\213.png" "b/Ref_Audio_Selector/\345\217\202\350\200\203\351\237\263\351\242\221\347\255\233\351\200\211\346\265\201\347\250\213.png" new file mode 100644 index 000000000..bedc31484 Binary files /dev/null and "b/Ref_Audio_Selector/\345\217\202\350\200\203\351\237\263\351\242\221\347\255\233\351\200\211\346\265\201\347\250\213.png" differ
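As a closing reference, the completed-URL preview produced by whole_url above can be reproduced directly with the TTSURLComposer class this diff adds in Ref_Audio_Selector/tool/audio_inference.py: it picks either the emotion parameter or the ref-path/ref-text pair depending on the dropdown mode, and append_params_to_url joins the parameters onto the base URL with ? or &. A usage sketch against the defaults from init_second (the localhost address is only this diff's default value, not a guaranteed running service):

    from Ref_Audio_Selector.tool.audio_inference import TTSURLComposer

    composer = TTSURLComposer(
        'http://localhost:9880?prompt_language=中文&text_language=中文',  # base_url
        '参考音频',        # refer_type_param: reference-audio mode (vs. character-emotion mode)
        'emotion',         # emotion_param_name
        'text',            # text_param_name
        'refer_wav_path',  # ref_path_param_name
        'prompt_text',     # ref_text_param_name
    )
    composer.is_valid()  # raises ValueError if the url, text param, or all ref/emotion params are missing
    url = composer.build_url_with_ref('测试内容', '参考路径', '参考文本', need_url_encode=False)
    print(url)
    # http://localhost:9880?prompt_language=中文&text_language=中文&text=测试内容&refer_wav_path=参考路径&prompt_text=参考文本

Since the base URL already contains a query string, the composer appends with & rather than a second ?; with need_url_encode=True the query values would additionally be percent-encoded via safe_encode_query_params.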