In [1]:
import os

In [2]:
def traverse_dir_files(root_dir, ext="dic"):
    """Recursively collect file paths under ``root_dir``.

    Hidden files (names starting with ".") are always skipped.

    Args:
        root_dir: Directory to walk with ``os.walk``.
        ext: Suffix filter. Either a single string (e.g. ``"wav"``) or an
            iterable of strings (e.g. ``["wav", "mp3"]``). A falsy value
            (``None``, ``""``, empty list) disables filtering.

    Returns:
        A list of ``os.path.join(parent, name)`` paths, in ``os.walk`` order.
    """
    # A bare string must be wrapped, not split: tuple("wav") is
    # ("w", "a", "v"), which would match any name ending in one of those
    # characters instead of names ending in "wav".
    if not ext:
        suffixes = ()
    elif isinstance(ext, str):
        suffixes = (ext,)
    else:
        suffixes = tuple(ext)

    paths_list = []
    for parent, _, file_names in os.walk(root_dir):
        for name in file_names:
            if name.startswith("."):  # skip hidden files
                continue
            if suffixes and not name.endswith(suffixes):
                continue
            paths_list.append(os.path.join(parent, name))

    return paths_list

# 用模型进一步构建

In [3]:
import numpy as np
import torch

# np.load(all_files[0])


def cosine_similarity(arr1, arr2, epsilon=1e-8):
    """Cosine similarity between two single-row NumPy embeddings.

    Args:
        arr1: Array of shape (1, 192).
        arr2: Array of shape (1, 192).
        epsilon: Lower bound applied to the product of the two L2 norms so
            that an all-zero input yields 0.0 instead of NaN (same guard as
            the torch variants in this notebook).

    Returns:
        The similarity as a NumPy scalar.

    Raises:
        AssertionError: If either input is not of shape (1, 192).
    """
    # Both inputs must be of shape [1, 192].
    assert arr1.shape == (1, 192), "arr1的形状应为[1, 192]"
    assert arr2.shape == (1, 192), "arr2的形状应为[1, 192]"

    # L2 norms of the two vectors.
    norm1 = np.linalg.norm(arr1)
    norm2 = np.linalg.norm(arr2)

    # Clamp the denominator to avoid 0 / 0 when a vector is all zeros.
    denominator = np.maximum(norm1 * norm2, epsilon)

    # (1, 192) @ (192, 1) -> (1, 1) dot product.
    dot_product = np.dot(arr1, arr2.T)

    cosine_sim = dot_product / denominator

    return cosine_sim[0][0]


def cosine_similarity_batch(arr1, arr2, epsilon=1e-8):
    """Row-wise cosine similarity between two 2-D NumPy arrays.

    Args:
        arr1: Array of shape (batch_size, feature_size). If its shape differs
            from ``arr2``'s, only its first ``arr2.shape[0]`` rows are used.
        arr2: Array of shape (batch_size, feature_size).
        epsilon: Lower bound applied to each row's norm product so that an
            all-zero row yields 0.0 instead of NaN (same guard as the torch
            variants in this notebook).

    Returns:
        ``squeeze()`` of the (batch_size, 1) similarity column: shape
        (batch_size,), or a 0-d array when batch_size is 1.
    """
    if arr1.shape != arr2.shape:
        # Trim arr1 to as many rows as arr2 (e.g. a short final batch).
        arr1 = arr1[: arr2.shape[0], :]

    # Per-row L2 norms, kept as columns so they broadcast against the dot products.
    norm1 = np.linalg.norm(arr1, axis=1, keepdims=True)
    norm2 = np.linalg.norm(arr2, axis=1, keepdims=True)

    # Clamp the denominator to avoid 0 / 0 for all-zero rows.
    denominator = np.maximum(norm1 * norm2, epsilon)

    # Per-row dot products.
    dot_product = np.sum(arr1 * arr2, axis=1, keepdims=True)

    cosine_sim = dot_product / denominator

    return cosine_sim.squeeze()


def cosine_similarity_gpu(tensor1, tensor2, epsilon=1e-8):
    """Cosine similarity between two single-row tensors, on GPU when available.

    Args:
        tensor1: Tensor of shape (1, 192).
        tensor2: Tensor of shape (1, 192).
        epsilon: Lower bound applied to the product of the two norms to avoid
            division by zero.

    Returns:
        The similarity as a Python float (via ``.item()``).

    Raises:
        AssertionError: If either input is not of shape (1, 192).
    """
    # Both inputs must be of shape [1, 192].
    assert tensor1.shape == (1, 192), "tensor1的形状应为[1, 192]"
    assert tensor2.shape == (1, 192), "tensor2的形状应为[1, 192]"

    # Move to the GPU only when one exists; an unconditional .cuda() raises
    # on CPU-only machines.
    if torch.cuda.is_available():
        tensor1 = tensor1.cuda()
        tensor2 = tensor2.cuda()

    # L2 norms of the two vectors.
    norm1 = torch.norm(tensor1)
    norm2 = torch.norm(tensor2)
    # Clamp the denominator to avoid division by zero.
    denominator = torch.clamp(norm1 * norm2, min=epsilon)
    # Dot product of the flattened vectors.
    dot_product = torch.dot(tensor1.view(-1), tensor2.view(-1))

    cosine_sim = dot_product / denominator

    return cosine_sim.item()


def cosine_similarity_batch_torch(tensor1, tensor2, epsilon=1e-8):
    """Row-wise cosine similarity between two equally shaped tensors.

    Args:
        tensor1: Tensor of shape (batch_size, feature_size).
        tensor2: Tensor with the same shape as ``tensor1``.
        epsilon: Minimum value allowed for each row's norm product, so the
            division never hits zero.

    Returns:
        ``squeeze()`` of the (batch_size, 1) similarity column.

    Raises:
        AssertionError: If the two shapes differ.
    """
    assert tensor1.shape == tensor2.shape, "tensor1和tensor2的形状应该相同"

    # Per-row dot products, kept as a column.
    pairwise_dot = (tensor1 * tensor2).sum(dim=1, keepdim=True)

    # Product of per-row L2 norms, floored at epsilon.
    norm_product = torch.norm(tensor1, dim=1, keepdim=True) * torch.norm(
        tensor2, dim=1, keepdim=True
    )
    safe_denominator = norm_product.clamp(min=epsilon)

    return torch.squeeze(pairwise_dot / safe_denominator)

In [4]:
# Root directory of the sliced raw audio; walked recursively with "wav"
# passed as the extension filter.
RAW_SLICE_DIR = "/mnt/e/Workspace/growth/audio/so-vits-svc/preprocess/raw_data/蜡笔小新切片"
file_paths = traverse_dir_files(RAW_SLICE_DIR, ext="wav")

In [53]:
from tqdm import tqdm
import torch
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
spk_info = {}  # seed name -> [seed name, names of files grouped with it]
processed = {}  # names already assigned to some seed's group
file_paths = file_paths[:10000]

batch_size = 128
similarity_threshold = 0.75


def _embedding_name(path):
    """Return the last "/"-separated component of ``path`` with ".npy" removed."""
    return path.split("/")[-1].replace(".npy", "")


# NOTE(review): `all_files` is not assigned in any visible cell; it is assumed
# to be a list of .npy embedding paths left over from earlier kernel state.
# Define it explicitly before this cell so Restart & Run All works.
# Bound the seed index by both lists so indexing `all_files` cannot overrun.
num_seeds = min(len(file_paths), len(all_files))

for i in range(num_seeds):
    # Skip files that were already grouped under an earlier seed.
    spk_name = _embedding_name(all_files[i])
    if spk_name in processed:
        continue
    # Start a new group keyed by this file.
    if spk_name not in spk_info:
        spk_info[spk_name] = [spk_name]
    else:
        assert False, "spk_name已经存在，程序存在逻辑错误"

    # Candidates are the later files that no earlier seed has claimed.
    sub_file = [
        p for p in all_files[i + 1 :] if _embedding_name(p) not in processed
    ]

    # Tile the seed embedding once; slice it per batch below.
    spk1 = np.repeat(np.load(all_files[i]).reshape(1, -1), batch_size, axis=0)

    # `sub_file` already starts after the seed, so batching starts at 0.
    for j in tqdm(range(0, len(sub_file), batch_size)):
        batch_files = sub_file[j : j + batch_size]
        # Flatten each embedding so the stack is always 2-D, even for a
        # one-file batch (np.squeeze would collapse that to 1-D).
        spk2s = np.stack([np.load(p).reshape(-1) for p in batch_files], axis=0)
        # atleast_1d: a one-row batch comes back as a 0-d array.
        x = np.atleast_1d(cosine_similarity_batch(spk1[: len(batch_files)], spk2s))
        indexs = np.where(x > similarity_threshold)[0]
        for index in indexs:
            # Record the matching file under the current seed.
            t_name = _embedding_name(batch_files[index])
            spk_info[spk_name].append(t_name)
            processed[t_name] = True

    # Checkpoint after every seed so an interrupted run keeps its progress.
    # Explicit UTF-8 because ensure_ascii=False writes non-ASCII names verbatim.
    with open("spk_info.json", "w", encoding="utf-8") as spk_info_file:
        json.dump(spk_info, spk_info_file, ensure_ascii=False, indent=4)
    with open("processed.json", "w", encoding="utf-8") as processed_file:
        json.dump(processed, processed_file, ensure_ascii=False, indent=4)

    # for j in tqdm(range(i + 1, len(all_files))):
    #     spk1 = np.load(all_files[i])
    #     spk2 = np.load(all_files[j])
    #     if cosine_similarity_gpu(torch.from_numpy(spk1), torch.from_numpy(spk2)) > 0.9:
    #         print(all_files[i], all_files[j])

  0%|          | 0/79 [00:00<?, ?it/s]

100%|██████████| 79/79 [00:04<00:00, 18.29it/s]
100%|██████████| 79/79 [00:04<00:00, 18.24it/s]
100%|██████████| 79/79 [00:04<00:00, 18.80it/s]
100%|██████████| 79/79 [00:04<00:00, 18.92it/s]
100%|██████████| 79/79 [00:04<00:00, 18.60it/s]
100%|██████████| 79/79 [00:04<00:00, 17.03it/s]
100%|██████████| 79/79 [00:04<00:00, 17.85it/s]
100%|██████████| 78/78 [00:04<00:00, 17.53it/s]
  6%|▋         | 5/78 [00:00<00:05, 13.88it/s]


KeyboardInterrupt: 