In [16]:
import os
import base64
import numpy as np
np.bool = np.bool_
import pandas as pd


def encode_data(data):
    """Return the Base64 text representation of the string ``data``.

    The input is turned into UTF-8 bytes, Base64-encoded, and the encoded
    bytes are decoded back into a ``str``.
    """
    utf8_bytes = data.encode('utf-8')
    b64_bytes = base64.b64encode(utf8_bytes)
    return b64_bytes.decode('utf-8')

def decode_data(encoded_data):
    """Return the original string for a Base64 text produced by ``encode_data``.

    The Base64 text is converted to bytes, Base64-decoded, and the resulting
    bytes are interpreted as UTF-8.
    """
    b64_bytes = encoded_data.encode('utf-8')
    utf8_bytes = base64.b64decode(b64_bytes)
    return utf8_bytes.decode('utf-8')


# Locations of the three corpus subsets, keyed by subset name
# ("fake", "half_fake", "real"). Each entry holds:
#   "wav_path"  - path for that subset's audio (presumably a directory of
#                 resampled wav files; it is not read in this notebook)
#   "meta_path" - path of the subset's metadata text file, which is loaded
#                 below with pd.read_csv(..., sep="|")
# NOTE(review): these are absolute, machine-specific paths; the notebook
# only runs where this directory layout exists.
DATASET_LIST = {
        "fake": {
          "wav_path" : "/qqf/project/concat/datasets/corpus/resampled/fake",
          "meta_path" : "/qqf/project/concat/datasets/corpus/fake_meta_data.txt"
        },
        "half_fake": {
          "wav_path" : "/qqf/project/concat/datasets/corpus/resampled/half_fake",
          "meta_path" : "/qqf/project/concat/datasets/corpus/half_fake_meta_data.txt",
        },
        "real": {
          "wav_path" : "/qqf/project/concat/datasets/corpus/resampled/real",
          "meta_path" : "/qqf/project/concat/datasets/corpus/real_meta_data.txt",
        }
      }

def get_spklist(dataframe:pd.DataFrame):
    """Return the distinct values of the ``spk`` column as a list.

    The values are the group keys produced by ``dataframe.groupby("spk")``,
    in the order the groupby yields them (sorted, with pandas' defaults).
    """
    speakers = []
    for speaker_key, _rows in dataframe.groupby("spk"):
        speakers.append(speaker_key)
    return speakers

# Load each subset's "|"-separated metadata file, then order the rows by
# speaker id and renumber the index from zero.
r_meta = pd.read_csv(DATASET_LIST["real"]["meta_path"], sep="|")
r_meta = r_meta.sort_values(by='spk').reset_index(drop=True)

f_meta = pd.read_csv(DATASET_LIST["fake"]["meta_path"], sep="|")
f_meta = f_meta.sort_values(by='spk').reset_index(drop=True)

hf_meta = pd.read_csv(DATASET_LIST["half_fake"]["meta_path"], sep="|")
hf_meta = hf_meta.sort_values(by='spk').reset_index(drop=True)

# Names of the dataset splits generated further down.
modes = ["train", "dev", "test"]

# Speaker ids are taken from the fake-subset metadata; the train/dev/test
# split is made over this list.
SPKLIST = get_spklist(f_meta)
print(SPKLIST)

In [24]:
def split_list_by_ratio(lst, ratios):
    """Split ``lst`` into consecutive slices whose lengths follow ``ratios``.

    Slice ``i`` receives ``floor(len(lst) * ratios[i] / sum(ratios))`` items.
    Whatever remains after flooring is added to the last slice, so the slices
    together cover ``lst`` exactly once and in the original order.

    Args:
        lst: Sequence that supports ``len`` and slicing.
        ratios: Non-empty sequence of relative weights whose sum is positive.

    Returns:
        list: One slice of ``lst`` per entry in ``ratios``.

    Raises:
        ValueError: If ``ratios`` is empty or does not sum to a positive value.
    """
    if len(ratios) == 0:
        raise ValueError("ratios must contain at least one value")

    total_ratio = sum(ratios)
    if total_ratio <= 0:
        raise ValueError("ratios must sum to a positive value")

    n = len(lst)

    # Multiply before dividing: with integer ratios this is exact, whereas
    # int((ratio / total_ratio) * n) can come out one short because of float
    # rounding (e.g. 29 / 100 * 100 evaluates to 28.999999999999996).
    split_lengths = [int(ratio * n // total_ratio) for ratio in ratios]

    # Hand the items lost to flooring to the last slice so lengths sum to n.
    split_lengths[-1] += n - sum(split_lengths)

    splits = []
    start = 0
    for length in split_lengths:
        end = start + length
        splits.append(lst[start:end])
        start = end

    return splits

def get_metadata(generator=None, mode="train", ratios=[5, 3, 2], seed=42):
    """Build the shuffled metadata table for one split of the corpus.

    The module-level ``SPKLIST`` is divided by ``split_list_by_ratio`` and the
    speakers of the requested split select rows from the module-level
    ``r_meta`` (real), ``f_meta`` (fake) and ``hf_meta`` (half-fake) tables.

    * With ``generator`` given, only that generator's fake and half-fake rows
      are used. The fake rows are first down-sampled, over all speakers, to
      ``len(r_meta) - len(half-fake rows of that generator)`` and only then
      filtered by the split's speakers.
    * With ``generator`` omitted, every generator found in the split's fake
      rows contributes the same number of fake and of half-fake rows, chosen
      so that fake plus half-fake together roughly match the real row count.

    A one-line summary of the row counts is printed.

    Args:
        generator: Value of the ``generator`` column to restrict to, or
            ``None`` to draw evenly from all generators.
        mode: ``"train"`` selects the first split and ``"dev"`` the second;
            any other value selects the third.
        ratios: Relative sizes of the train/dev/test speaker splits.
        seed: ``random_state`` used for every sampling step, including the
            final shuffle. Pass ``None`` for non-reproducible sampling.

    Returns:
        pandas.DataFrame: Real, fake and half-fake rows of the split,
        concatenated, shuffled, and re-indexed from zero.
    """
    splits = split_list_by_ratio(SPKLIST, ratios)

    if mode == "train":
        spks = splits[0]
    elif mode == "dev":
        spks = splits[1]
    else:
        # Any mode other than "train"/"dev" is treated as the test split.
        spks = splits[2]

    # Real rows are selected the same way in both branches.
    r_meta_g = r_meta[r_meta['spk'].isin(spks)]

    if generator is not None:
        fake_meta_g = f_meta[f_meta["generator"] == generator]  # fully fake speech
        half_fake_meta_g = hf_meta[hf_meta["generator"] == generator]  # half-fake speech
        # Sampling happens BEFORE the speaker filter, over all speakers.
        # NOTE(review): DataFrame.sample raises if this count is negative or
        # exceeds the number of available fake rows.
        fake_meta_g = fake_meta_g.sample(n=len(r_meta) - len(half_fake_meta_g), random_state=seed)

        fake_meta_g = fake_meta_g[fake_meta_g['spk'].isin(spks)]
        half_fake_meta_g = half_fake_meta_g[half_fake_meta_g['spk'].isin(spks)]
    else:
        fake_in_split = f_meta[f_meta["spk"].isin(spks)]
        half_fake_in_split = hf_meta[hf_meta["spk"].isin(spks)]  # half-fake speech
        generators = fake_in_split["generator"].unique().tolist()

        # Each generator supplies sample_n fake and sample_n half-fake rows,
        # so the total is about len(r_meta_g). The generator count is taken
        # from the data rather than hard-coded; max(..., 1) only avoids a
        # division by zero when the split has no fake rows.
        sample_n = len(r_meta_g) // max(len(generators), 1) // 2

        fake_meta_list = []
        half_fake_meta_list = []
        # A distinct loop name keeps the ``generator`` argument intact.
        for gen_name in generators:
            fg_df = fake_in_split[fake_in_split["generator"] == gen_name]
            hfg_df = half_fake_in_split[half_fake_in_split["generator"] == gen_name]
            fake_meta_list.append(fg_df.sample(n=sample_n, random_state=seed))
            half_fake_meta_list.append(hfg_df.sample(n=sample_n, random_state=seed))

        fake_meta_g = pd.concat(fake_meta_list, ignore_index=True)
        half_fake_meta_g = pd.concat(half_fake_meta_list, ignore_index=True)

    print(f"mode:{mode}\t real:{len(r_meta_g)}\t fake:{len(fake_meta_g)}\t half_fake:{len(half_fake_meta_g)}\t total:{len(r_meta_g)+len(fake_meta_g)+len(half_fake_meta_g)}")

    # Seed the final shuffle too, so repeated runs emit identical files.
    combined = pd.concat([r_meta_g, fake_meta_g, half_fake_meta_g], ignore_index=True)
    res = combined.sample(frac=1, random_state=seed).reset_index(drop=True)
    return res

In [27]:
def getDataByGenerator(generator = "G01", ratios = [6,2,2]):
    """Write one metadata listing per split for a single generator.

    For each entry of the module-level ``modes`` list, the rows returned by
    ``get_metadata`` are written to ``{generator}/{mode}.txt`` (relative to
    the working directory), one line per row in the form
    ``<spk> <wav_id> - <generator> <bonafide|spoof>``. Rows whose ``label``
    is ``"real"`` are marked ``bonafide``; all others are marked ``spoof``.

    Args:
        generator (str, optional): One of "G01" to "G06". Defaults to "G01".
        ratios (list, optional): Speaker split weights forwarded to
            ``get_metadata``. Defaults to [6,2,2].
    """
    known_generators = ["G01", "G02", "G03", "G04", "G05", "G06"]
    assert generator in known_generators

    for mode in modes:
        output_file = f"{generator}/{mode}.txt"
        # Make sure the target directory exists before anything is written.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        metadata = get_metadata(generator=generator, mode=mode, ratios=ratios)

        lines = []
        for _, record in metadata.iterrows():
            verdict = "spoof" if record["label"] != "real" else "bonafide"
            lines.append(f"{record['spk']} {record['wav_id']} - {record['generator']} {verdict}")

        with open(output_file, "w") as handle:
            handle.write("\n".join(lines))

In [12]:
def getDataByRadio(ratios = [4,4,2]):
    """Write one metadata listing per split, drawing from all generators.

    The splits are made over speakers according to ``ratios``. For each entry
    of the module-level ``modes`` list, the rows returned by ``get_metadata``
    (called without a generator) are written to
    ``radio_{str(ratios)}/{mode}.txt``, e.g. ``radio_[4, 4, 2]/train.txt``,
    one line per row in the form
    ``<spk> <wav_id> - <generator> <bonafide|spoof>``.

    NOTE(review): "Radio" in the function and directory names presumably
    means "ratio"; both are kept as they are so existing callers and output
    paths do not change.

    Args:
        ratios (list, optional): Speaker split weights. Defaults to [4,4,2].
    """
    ratio_tag = str(ratios)

    for mode in modes:
        output_file = f"radio_{ratio_tag}/{mode}.txt"
        # Make sure the target directory exists before anything is written.
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        metadata = get_metadata(mode=mode,ratios=ratios)

        lines = []
        for _, record in metadata.iterrows():
            verdict = "spoof" if record["label"] != "real" else "bonafide"
            lines.append(f"{record['spk']} {record['wav_id']} - {record['generator']} {verdict}")

        with open(output_file, "w") as handle:
            handle.write("\n".join(lines))

In [30]:
if __name__ == "__main__":
    # Write the per-generator metadata to ./{generator}/{mode}.txt
    getDataByGenerator(generator = "G01", ratios=[6,2,2])
    # Write the ratio-based metadata to ./radio_{ratios}/{mode}.txt
    # (the directory name is "radio_" followed by str(ratios))
    getDataByRadio(ratios = [4,4,2])
    # Decode a Base64-encoded string and print the result
    print(decode_data("RzAxX2hhbGZfZmFrZV9wMzY0XzA2NA=="))

mode:train	 real:26094	 fake:17598	 half_fake:8566	 total:52258
mode:dev	 real:8435	 fake:5823	 half_fake:2615	 total:16873
mode:test	 real:9541	 fake:6624	 half_fake:2844	 total:19009
mode:train	 real:17730	 fake:8862	 half_fake:8862	 total:35454
mode:dev	 real:17151	 fake:8574	 half_fake:8574	 total:34299
mode:test	 real:9189	 fake:4590	 half_fake:4590	 total:18369
G01_half_fake_p364_064
