In [2]:
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm

In [3]:
##############################
# 第一部分：处理 itw_meta.csv
##############################
meta_csv_path = 'itw_meta.csv'  

df = pd.read_csv(meta_csv_path)

# speaker_counts = df['speaker'].value_counts()
# test_speakers = speaker_counts.nsmallest(20).index.tolist()
# df['Set'] = df['speaker'].apply(lambda x: 'test' if x in test_speakers else 'train')abs

df['Set'] = 'test'

df['label_numeric'] = df['label'].map({'bona-fide': 1, 'spoof': 0})

MAX_WORKERS = 96

source_dir = os.path.join('inthewild', 'release_in_the_wild')

def process_row(row):
    """
    根据 DataFrame 的一行记录，构造数据源名称、文件名、源文件的绝对路径和标签
    这里 label 直接取 label_numeric 的值
    """
    filename = row['file']
    speaker = row['speaker']
    Set = row['Set']
    label_num = row['label_numeric']  # 使用 label_numeric 来判断
    data_source = 'itw'
    src_file = os.path.join(source_dir, filename)
    abs_path = os.path.abspath(src_file)
    rel_path = os.path.join('release_in_the_wild',filename)
    return data_source, speaker, abs_path, rel_path, label_num, Set

records = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_row, row): idx for idx, row in df.iterrows()}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing rows"):
        try:
            record = future.result()
            records.append(record)
        except Exception as e:
            print(f"Error processing row: {e}")

df_records = pd.DataFrame(records, columns=["data_source", "speaker", "absolute_path", "relative_path", "label", "Set"])

Processing rows:   0%|          | 0/31779 [00:00<?, ?it/s]

In [4]:
# df_records.Set.value_counts()
len(df.speaker.unique())
df_records.head()

Unnamed: 0,data_source,speaker,absolute_path,relative_path,label,Set
0,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23269.wav,1,test
1,itw,Mark Zuckerberg,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23885.wav,0,test
2,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/22375.wav,1,test
3,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23998.wav,0,test
4,itw,Barack Obama,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23525.wav,0,test


In [5]:
#############################################
# 第二部分：处理生成的 fake 音频（不复制文件）
#############################################

generated_audio_dir = os.path.join('..', 'Data', 'generated_audio')

test_fake_dirs = {"ljspeech_waveglow", "ljspeech_hifiGAN"}

def process_generated_file(folder, file):
    """
    根据生成的音频文件，构造数据记录：
    - 数据源固定为 "gen"
    - speaker 使用文件夹名称
    - 标签固定为 0（fake）
    - 根据文件夹判断数据集标签（test 或 train）
    """
    src_file = os.path.join(generated_audio_dir, folder, file)
    abs_path = os.path.abspath(src_file)
    rel_path = os.path.join("generated_audio", folder, file)
    data_source = "gen"
    speaker = folder
    label = 0  # fake
    dataset = "test" if folder in test_fake_dirs else "train"
    return data_source, speaker, abs_path, rel_path, label, dataset

tasks = []
for folder in os.listdir(generated_audio_dir):
    folder_path = os.path.join(generated_audio_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        if os.path.isfile(file_path) and file.lower().endswith('.wav'):
            tasks.append((folder, file))

records_gen = []
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(lambda t: process_generated_file(*t), task): task for task in tasks}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing generated audio"):
        try:
            records_gen.append(future.result())
        except Exception as e:
            print(f"Error processing generated file: {e}")

df_gen = pd.DataFrame(records_gen, columns=["data_source", "speaker", "absolute_path", "relative_path", "label", "Set"])

Processing generated audio:   0%|          | 0/117983 [00:00<?, ?it/s]

In [6]:
df_gen.head()
# df_gen.speaker.value_counts()
df_gen.Set.value_counts()
# df_gen.head()

Set
train    91783
test     26200
Name: count, dtype: int64

In [7]:
import random

# 源文件所在目录
LJSpeech_audio_dir = '../Data/LJSpeech-1.1/wavs'

# 全部设置为Train
files = os.listdir(LJSpeech_audio_dir)
random.shuffle(files)
num_total = len(files)
num_train = int(num_total * 1)
train_files = files[:num_train]
test_files = files[num_train:]

def process_file(file, dataset):
    """
    根据单个文件及数据集标签（train/test），构造数据记录：
    - data_source: 固定为 "LJSpeech"
    - file_name: 文件名
    - absolute_path: 文件的绝对路径
    - label: 固定为 1（真实数据）
    - dataset: 'train' 或 'test'
    """
    src_file = os.path.join(LJSpeech_audio_dir, file)
    abs_path = os.path.abspath(src_file)
    rel_path = os.path.join("LJSpeech-1.1/wavs",file)
    data_source = "LJSpeech"
    label = 1  # real 数据
    return data_source, "LJSpeech", abs_path, rel_path, label, dataset

LJ_records = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = []
    for f in train_files:
        futures.append(executor.submit(process_file, f, "train"))
    for f in test_files:
        futures.append(executor.submit(process_file, f, "test"))
    
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing LJSpeech files"):
        LJ_records.append(future.result())

df_LJ = pd.DataFrame(LJ_records, columns=["data_source", "speaker", "absolute_path", "relative_path", "label", "Set"])

Processing LJSpeech files:   0%|          | 0/13100 [00:00<?, ?it/s]

In [8]:
df_LJ.speaker.value_counts()
df_LJ.head()

Unnamed: 0,data_source,speaker,absolute_path,relative_path,label,Set
0,LJSpeech,LJSpeech,/home/jupyter/Data/LJSpeech-1.1/wavs/LJ018-039...,LJSpeech-1.1/wavs/LJ018-0390.wav,1,train
1,LJSpeech,LJSpeech,/home/jupyter/Data/LJSpeech-1.1/wavs/LJ041-005...,LJSpeech-1.1/wavs/LJ041-0050.wav,1,train
2,LJSpeech,LJSpeech,/home/jupyter/Data/LJSpeech-1.1/wavs/LJ022-002...,LJSpeech-1.1/wavs/LJ022-0022.wav,1,train
3,LJSpeech,LJSpeech,/home/jupyter/Data/LJSpeech-1.1/wavs/LJ040-011...,LJSpeech-1.1/wavs/LJ040-0113.wav,1,train
4,LJSpeech,LJSpeech,/home/jupyter/Data/LJSpeech-1.1/wavs/LJ017-027...,LJSpeech-1.1/wavs/LJ017-0270.wav,1,train


In [9]:
vox1 = pd.read_csv("vox1_meta_modified.csv",sep = '\t')
# vox1.Set.value_counts()

In [10]:

def process_row(row):
    """
    根据每一行 VoxCeleb 元数据构造文件记录：
      - 判断 Set 字段 ("dev" 或 "test")，并据此设置 target_set（"Train" 或 "Test"）；
      - 选择源目录：dev_folder 为 "../Data/vox1_dev_wav/wav/<VoxCeleb1 ID>"
        或 test_folder 为 "../Data/vox1_test_wav/wav/<VoxCeleb1 ID>"；
      - 构造目标目录为： os.path.join(filterd_base, target_set, "real", f"{vox_id}_{vgg_id}")；
      - 遍历 src_base 下所有 .wav 文件，构造新的文件名为 "子文件夹名_原文件名"，并记录源文件的绝对路径。
    返回该行中所有文件的记录列表，每条记录为
      (data_source, speaker, absolute_path, label, dataset)
    """
    records = []
    vox_id = row["VoxCeleb1 ID"]
    vgg_id = row["VGGFace1 ID"]
    set_tag = row["Set"].strip().lower()  # "dev" 或 "test"
    
    # 构造 dev 和 test 源目录
    dev_folder  = os.path.join("..", "Data", "vox1_dev_wav", "wav", vox_id)
    test_folder = os.path.join("..", "Data", "vox1_test_wav", "wav", vox_id)
    
    # 根据 set_tag 判断并选择源目录，并设置 target_set
    if set_tag == "dev":
        target_set = "train"
        if os.path.isdir(dev_folder):
            src_base = dev_folder
        elif os.path.isdir(test_folder):
            src_base = test_folder
        else:
            print(f"[Warning] Could not find {vox_id} in dev or test folders!")
            return records
    else:  # set_tag == "test"
        target_set = "test"
        if os.path.isdir(test_folder):
            src_base = test_folder
        elif os.path.isdir(dev_folder):
            src_base = dev_folder
        else:
            print(f"[Warning] Could not find {vox_id} in dev or test folders!")
            return records

    

    prefix = "../Data/"
    
    for root, dirs, files in os.walk(src_base):
        for file in files:
            if file.lower().endswith(".wav"):
                # 使用当前子目录名作为前缀，防止文件同名被覆盖
                subfolder_name = os.path.basename(root)
                new_filename = f"{subfolder_name}_{file}"
                # 记录源文件的路径，而非目标文件路径
                src_file = os.path.join(root, file)
                abs_path = os.path.abspath(src_file)
                data_source = "vox"
                speaker = f"{vgg_id}"
                label = 1  # real 数据
                dataset = target_set
                relative_path = src_file[len(prefix):]
 
                records.append((data_source, speaker, abs_path, relative_path, label, dataset))
    return records

records_vox = []
MAX_WORKERS = 96
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = {executor.submit(process_row, row): idx for idx, row in vox1.iterrows()}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing VoxCeleb rows"):
        try:
            recs = future.result()
            records_vox.extend(recs)
        except Exception as e:
            print(f"Error processing row: {e}")

# 生成 DataFrame 并保存 CSV 文件
df_vox = pd.DataFrame(records_vox, columns=["data_source", "speaker", "absolute_path", "relative_path", "label", "Set"])

Processing VoxCeleb rows:   0%|          | 0/1251 [00:00<?, ?it/s]

In [11]:
df_vox.head()
# test_dev_df = df_vox[(df_vox["Set"] == "test") & (df_vox["absolute_path"].str.contains("vox1_dev_wav"))]
# unique_speakers = test_dev_df["speaker"].unique()
# print("满足条件的 speaker 数目：", len(unique_speakers))

Unnamed: 0,data_source,speaker,absolute_path,relative_path,label,Set
0,vox,Alan_Tudyk,/home/jupyter/Data/vox1_dev_wav/wav/id10023/Pp...,vox1_dev_wav/wav/id10023/PpJ-V3Gyr1U/00005.wav,1,train
1,vox,Alan_Tudyk,/home/jupyter/Data/vox1_dev_wav/wav/id10023/Pp...,vox1_dev_wav/wav/id10023/PpJ-V3Gyr1U/00001.wav,1,train
2,vox,Alan_Tudyk,/home/jupyter/Data/vox1_dev_wav/wav/id10023/Pp...,vox1_dev_wav/wav/id10023/PpJ-V3Gyr1U/00007.wav,1,train
3,vox,Alan_Tudyk,/home/jupyter/Data/vox1_dev_wav/wav/id10023/Pp...,vox1_dev_wav/wav/id10023/PpJ-V3Gyr1U/00008.wav,1,train
4,vox,Alan_Tudyk,/home/jupyter/Data/vox1_dev_wav/wav/id10023/Pp...,vox1_dev_wav/wav/id10023/PpJ-V3Gyr1U/00004.wav,1,train


In [31]:
# 基础目录，包含 dataset_01 ~ dataset_04 文件夹
base_path = "../Data/diffusion_dataset"
dataset_folders = ["dataset_01", "dataset_02", "dataset_03", "dataset_04"]

# 指定需要设置为 test 的生成 pipeline 类型
test_fake_types = {"NATSpeech_DiffSpeech", "DiffGAN-TTS_aux", "Grad-TTS"}

def process_file(file_path, dataset_folder, subfolder, default_set="train"):
    """
    构造单个wav文件记录
    :param file_path: wav 文件的完整路径
    :param dataset_folder: 数据集文件夹名称，如 dataset_01
    :param subfolder: 子文件夹名称，对应生成 pipeline，如 DiffGAN-TTS_aux
    :param default_set: 默认的数据集划分标签，默认为 "train"
    :return: (data_source, speaker, absolute_path, relative_path, label, Set)
    """
    abs_path = os.path.abspath(file_path)
    rel_path = os.path.relpath(file_path, base_path)
    data_source = "diffusion-based"  # 可根据需要修改
    speaker = subfolder
    label = 0  # 生成的伪造语音标记为 0
    
    # 如果子文件夹名称在 test_fake_types 中，则 Set 设置为 "test"
    set_label = "test" if subfolder in test_fake_types else default_set
    return data_source, speaker, abs_path, rel_path, label, set_label

all_records = []  # 存储所有记录
MAX_WORKERS = 96  # 设置多线程数

for dataset in dataset_folders:
    dataset_dir = os.path.join(base_path, dataset)
    if not os.path.isdir(dataset_dir):
        print(f"{dataset_dir} 不存在")
        continue
    
    # 遍历每个子文件夹（代表不同生成 pipeline）
    for subfolder in os.listdir(dataset_dir):
        subfolder_path = os.path.join(dataset_dir, subfolder)
        if not os.path.isdir(subfolder_path):
            continue
        # 获取当前子文件夹下所有 .wav 文件
        wav_files = [f for f in os.listdir(subfolder_path) if f.lower().endswith('.wav')]
        # random.shuffle(wav_files)  # 可选：随机打乱文件列表
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
            futures = []
            for wav in wav_files:
                file_path = os.path.join(subfolder_path, wav)
                futures.append(executor.submit(process_file, file_path, dataset, subfolder, "train"))
            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {dataset}/{subfolder}"):
                try:
                    record = future.result()
                    all_records.append(record)
                except Exception as e:
                    print(f"处理文件时发生错误：{e}")

# 合并所有记录生成 DataFrame，并保存为一个 CSV 文件
df_diffusion = pd.DataFrame(all_records, columns=["data_source", "speaker", "absolute_path", "relative_path", "label", "Set"])

Processing dataset_01/DiffGAN-TTS_aux: 100%|██████████| 13100/13100 [00:00<00:00, 172748.37it/s]
Processing dataset_01/.ipynb_checkpoints: 0it [00:00, ?it/s]
Processing dataset_01/DiffGAN-TTS_shallow: 100%|██████████| 13100/13100 [00:00<00:00, 26651.21it/s]
Processing dataset_01/DiffGAN-TTS_naive: 100%|██████████| 13100/13100 [00:00<00:00, 26241.98it/s]
Processing dataset_02/NATSpeech_DiffSpeech: 100%|██████████| 13100/13100 [00:00<00:00, 90690.80it/s] 
Processing dataset_02/.ipynb_checkpoints: 0it [00:00, ?it/s]
Processing dataset_02/ProDiff: 100%|██████████| 13100/13100 [00:00<00:00, 121905.26it/s]
Processing dataset_02/Grad-TTS: 100%|██████████| 13100/13100 [00:00<00:00, 24363.83it/s]
Processing dataset_03/.ipynb_checkpoints: 0it [00:00, ?it/s]
Processing dataset_03/wavegrad2: 100%|██████████| 13100/13100 [00:00<00:00, 82627.74it/s] 
Processing dataset_03/tacotron2-DCA_bddm: 100%|██████████| 13100/13100 [00:00<00:00, 41090.77it/s]
Processing dataset_03/tacotron2-DCA_diffwave: 100%|█

In [33]:
df_diffusion.head()
df_diffusion.speaker.value_counts()

speaker
DiffGAN-TTS_aux           13100
DiffGAN-TTS_shallow       13100
DiffGAN-TTS_naive         13100
NATSpeech_DiffSpeech      13100
ProDiff                   13100
Grad-TTS                  13100
wavegrad2                 13100
tacotron2-DCA_bddm        13100
tacotron2-DCA_diffwave    13100
tacotron2-DCA_wavegrad    13100
fast_pitch                13100
vits                      13100
tacotron2-DCA             13100
glow-tts                  13100
Name: count, dtype: int64

In [34]:
df_diffusion.Set.value_counts()

Set
train    144100
test      39300
Name: count, dtype: int64

In [35]:
df_final = pd.concat([df_records, df_gen, df_LJ, df_vox, df_diffusion], ignore_index=True)

In [36]:
df_final.head()

Unnamed: 0,data_source,speaker,absolute_path,relative_path,label,Set
0,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23269.wav,1,test
1,itw,Mark Zuckerberg,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23885.wav,0,test
2,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/22375.wav,1,test
3,itw,Alec Guinness,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23998.wav,0,test
4,itw,Barack Obama,/home/jupyter/Data/inthewild/release_in_the_wi...,release_in_the_wild/23525.wav,0,test


In [37]:
#get in-test

def update_set_to_in_test(df, sample_ratio=0.05, random_state=42):
    # 对每个 data_source 分组
    for ds, group in df.groupby("data_source"):
        # 找出当前组中 Set 为 "train" 的行的索引
        train_indices = group[group["Set"] == "train"].index
        if len(train_indices) > 0:
            # 随机抽取5%的样本（至少1个）
            sample_count = max(1, int(len(train_indices) * sample_ratio))
            sampled_indices = train_indices.to_series().sample(n=sample_count, random_state=random_state)
            # 将这些样本的 Set 更新为 "i-test"
            df.loc[sampled_indices, "Set"] = "in-test"
    return df

# 更新 df_final
df_final = update_set_to_in_test(df_final)

In [38]:
df_final.Set.value_counts()

Set
train      353575
test       127594
in-test     18609
Name: count, dtype: int64

In [39]:
##############################
# 保存 CSV
##############################
csv_file = "meta_modified.csv"
df_final.to_csv(csv_file, index=False)
print(f"CSV file generated: {csv_file}")

CSV file generated: meta_modified.csv


In [40]:
# len(df_records.speaker.unique()) #54
# len(df_gen.speaker.unique()) # 10
# len(df_LJ.speaker.unique()) # 1
# len(df_vox.speaker.unique()) # 1251
# len(df_diffusion.speaker.unique()) #14
# len(df_final.speaker.unique()) #1330

In [41]:
df_final.Set.value_counts()

Set
train      353575
test       127594
in-test     18609
Name: count, dtype: int64

In [42]:
df_final.label.value_counts()

label
0    313199
1    186579
Name: count, dtype: int64

In [43]:
cross_counts = df_final[['Set','label','data_source']].value_counts()
cross_counts_sorted = cross_counts.sort_index(level=0, ascending=True)
print(cross_counts_sorted)

Set      label  data_source    
in-test  0      diffusion-based      7205
                gen                  4589
         1      LJSpeech              655
                vox                  6160
test     0      diffusion-based     39300
                gen                 26200
                itw                 11816
         1      itw                 19963
                vox                 30315
train    0      diffusion-based    136895
                gen                 87194
         1      LJSpeech            12445
                vox                117041
Name: count, dtype: int64


In [29]:
cross_counts = df_final[['Set', 'label']].value_counts()
print(cross_counts)

Set      label
train    0        236534
         1        129486
test     0         64216
         1         50278
in-test  0         12449
         1          6815
Name: count, dtype: int64


In [44]:
cross_counts = df_final[df_final['data_source'] == "diffusion-based"][['Set','label','speaker']].value_counts()
print(cross_counts)

Set      label  speaker               
test     0      Grad-TTS                  13100
                DiffGAN-TTS_aux           13100
                NATSpeech_DiffSpeech      13100
train    0      tacotron2-DCA_diffwave    12484
                DiffGAN-TTS_naive         12472
                DiffGAN-TTS_shallow       12468
                vits                      12457
                tacotron2-DCA             12454
                glow-tts                  12444
                tacotron2-DCA_wavegrad    12434
                ProDiff                   12431
                fast_pitch                12421
                wavegrad2                 12416
                tacotron2-DCA_bddm        12414
in-test  0      tacotron2-DCA_bddm          686
                wavegrad2                   684
                fast_pitch                  679
                ProDiff                     669
                tacotron2-DCA_wavegrad      666
                glow-tts                    656
 

In [45]:
cross_counts = df_final[df_final['data_source'] == "itw"][['Set','label']].value_counts()
print(cross_counts)

Set   label
test  1        19963
      0        11816
Name: count, dtype: int64


In [46]:
cross_counts = df_final[df_final['data_source'] == "gen"][['Set','label','speaker']].value_counts()
print(cross_counts)

Set      label  speaker                                                      
train    0      common_voices_prompts_from_conformer_fastspeech2_pwg_ljspeech    15490
test     0      ljspeech_hifiGAN                                                 13100
                ljspeech_waveglow                                                13100
train    0      ljspeech_parallel_wavegan                                        12468
                ljspeech_full_band_melgan                                        12462
                ljspeech_multi_band_melgan                                       12459
                ljspeech_melgan_large                                            12434
                ljspeech_melgan                                                  12398
                jsut_parallel_wavegan                                             4754
                jsut_multi_band_melgan                                            4729
in-test  0      common_voices_prompts_from_conformer

In [47]:
cross_counts = df_final[df_final['data_source'] == "LJSpeech"][['Set','label','speaker']].value_counts()
print(cross_counts)

Set      label  speaker 
train    1      LJSpeech    12445
in-test  1      LJSpeech      655
Name: count, dtype: int64


In [48]:
cross_counts = df_final[df_final['data_source'] == "vox"][['Set','label']].value_counts()
print(cross_counts)

Set      label
train    1        117041
test     1         30315
in-test  1          6160
Name: count, dtype: int64


In [49]:
df_final.data_source.unique()

array(['itw', 'gen', 'LJSpeech', 'vox', 'diffusion-based'], dtype=object)