In [6]:
# load dataset
from datasets import load_dataset, Audio, ClassLabel

# class_label = ClassLabel(names=["bonafide", "replay_bonafide", "fake", "replay_fake"])

# Read the three JSON Lines manifests into one DatasetDict (keys: train / dev /
# closed_set_eval) and decode the "path" column as audio.
ds = load_dataset(
    path="json",
    data_files=dict(
        train="/data1/zt/ReplayDeepfake/data/train.jsonl",
        dev="/data1/zt/ReplayDeepfake/data/dev.jsonl",
        closed_set_eval="/data1/zt/ReplayDeepfake/data/eval.jsonl",
    ),
).cast_column("path", Audio())
# Optional (disabled): cast the integer "label" column to named classes.
# ds = ds.cast_column("label", class_label)

In [7]:
# Write the loaded dataset to the local "EchoFake" directory; a later cell
# reloads it from there with load_from_disk.
ds.save_to_disk("EchoFake")

Saving the dataset (0/4 shards):   0%|          | 0/39926 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3973 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5991 [00:00<?, ? examples/s]

In [3]:
from datasets import load_from_disk

# Reload the dataset from the local "EchoFake" directory and rebind `ds` to it.
ds = load_from_disk("EchoFake")

In [5]:
# Sanity check: show one training example by index, then the split's column
# names and the feature type of the "label" column.
print(ds["train"][39876])
print(ds["train"].column_names)
print(ds["train"].features["label"])

{'utt_id': 'EF_T_39877', 'path': {'path': 'EF_T_39877.mp3', 'array': array([-7.05333587e-05,  3.73185685e-06,  6.12027070e-05, ...,
       -7.26406652e-05,  2.31373560e-04,  2.09870588e-04], shape=(117419,)), 'sampling_rate': 16000}, 'label': 3, 'source': 'common_voice_en_27450201.mp3', 'source_text': 'He was invited to the Konigswinter conferences by Lilo Milchsack.', 'source_speaker_id': 'e31822077e104d2f1781db7ce098c016abca0314515a35f564b26c30e59c0d2a0206cdf020dfbfe4e7340388f898c34d51e2d625681694a797c49581af656fba', 'replay_details': {'room_size': '4.8m(L) * 3.2m(W) * 3.2m(H)', 'player': 'iPad Mini (7th generation)', 'recorder': 'iPhone 13 mini', 'distance': '15cm'}, 'synthesis_details': {'model': 'LLaSA', 'reference': 'common_voice_en_27450201.mp3', 'reference_text': "Now, everything's done on synthesizers.", 'reference_speaker_id': 'b87dd0680063cbd3dc57f5b9b20c14f020f511c3f18467d52e816db909dbe2be05b8ea36a4bca91179503817a1b093a9e6036abb514c6ad1aaf1ffa71b3d32ce'}}
['utt_id', 'path',

In [None]:
import numpy as np


def pad(x, max_len=64000):
    """Deterministically force ``x`` to ``max_len`` samples.

    Inputs with at least ``max_len`` samples are truncated to their first
    ``max_len`` samples. Shorter inputs are repeated end-to-end and the
    repetition is cut at ``max_len``.

    Args:
        x: array whose first dimension is the sample axis.
        max_len: target number of samples.

    Returns:
        The truncated slice of ``x``, or the tiled-and-cut array.

    Raises:
        ZeroDivisionError: if ``x`` is empty while ``max_len`` is positive.
    """
    n_samples = x.shape[0]

    # Long enough already: keep the head, no randomness involved.
    if n_samples >= max_len:
        return x[:max_len]

    # One repetition more than the integer ratio always covers max_len.
    reps = max_len // n_samples + 1
    tiled = np.tile(x, (1, reps))  # shape (1, n_samples * reps) for 1-D input
    return tiled[0, :max_len]


def pad_random(x: np.ndarray, max_len: int = 64000):
    """Force ``x`` to ``max_len`` samples, cropping at a random offset if long.

    * Exactly ``max_len`` samples: ``x`` is returned unchanged.
    * Longer: a contiguous window of ``max_len`` samples is taken at a start
      index drawn uniformly from every valid position, including the last one
      (so the tail of the clip can be selected).
    * Shorter: ``x`` is repeated end-to-end and cut at ``max_len``.

    The offset comes from NumPy's global random state, so seed it with
    ``np.random.seed`` for reproducible crops.

    Args:
        x: array whose first dimension is the sample axis.
        max_len: target number of samples.

    Returns:
        ``x`` itself, a window of ``x``, or the tiled-and-cut array.

    Raises:
        ZeroDivisionError: if ``x`` is empty while ``max_len`` is positive.
    """
    x_len = x.shape[0]
    if x_len == max_len:
        return x

    # Longer than needed: pick a random window.
    if x_len > max_len:
        # randint's upper bound is exclusive; valid starts are
        # 0 .. x_len - max_len inclusive, hence the +1.
        stt = np.random.randint(x_len - max_len + 1)
        return x[stt : stt + max_len]

    # Too short: repeat the signal until it covers max_len, then cut.
    num_repeats = max_len // x_len + 1
    return np.tile(x, num_repeats)[:max_len]


def preprocess(batch):
    """Turn one dataset example into a fixed-length training sample.

    Reads the decoded waveform from ``batch["path"]["array"]``, brings it to
    64000 samples with ``pad_random`` (random crop or repetition), and casts
    the result to float32.

    Returns:
        dict with ``"input_values"`` (the float32 waveform) and ``"labels"``
        (copied from ``batch["label"]``).
    """
    waveform = batch["path"]["array"]  # annotated upstream as np.ndarray [T]
    fixed_length = pad_random(waveform, max_len=64000)
    return {
        "input_values": fixed_length.astype(np.float32),
        "labels": batch["label"],
    }


def preprocess_for_test(batch):
    """Turn one dataset example into a fixed-length evaluation sample.

    Same output layout as the training preprocessing, but the waveform is
    brought to 64000 samples with the deterministic ``pad`` (head truncation
    or repetition) instead of a random crop.

    Returns:
        dict with ``"input_values"`` (the float32 waveform) and ``"labels"``
        (copied from ``batch["label"]``).
    """
    waveform = batch["path"]["array"]  # annotated upstream as np.ndarray [T]
    fixed_length = pad(waveform, max_len=64000)
    return {
        "input_values": fixed_length.astype(np.float32),
        "labels": batch["label"],
    }


# Metadata fields that are dropped while mapping each split.
unused_columns = [
    "source",
    "source_text",
    "source_speaker_id",
    "replay_details",
    "synthesis_details",
]

# Train and dev share the random-crop preprocessing.
# NOTE(review): map() appears to store its results, so each example's random
# crop would be drawn once here rather than once per epoch; confirm intended.
trainset, devset = (
    ds[split].map(preprocess, remove_columns=unused_columns)
    for split in ("train", "dev")
)

# The closed-set evaluation split uses the deterministic preprocessing.
evalset = ds["closed_set_eval"].map(
    preprocess_for_test,
    remove_columns=unused_columns,
)

In [None]:
from datasets import load_dataset, Audio
from transformers import Trainer, TrainingArguments
from models import RawNet2
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Device selection: keep the original preference for the second GPU ("cuda:1"),
# but only when at least two GPUs exist. On a single-GPU host "cuda:1" is an
# invalid ordinal, so fall back to "cuda:0"; without CUDA use the CPU.
if torch.cuda.is_available():
    device = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
else:
    device = "cpu"

# Step 3: model
# NOTE(review): Trainer normally places the model on the device chosen by
# TrainingArguments, which may differ from `device`; confirm RawNet2 does not
# keep tensors pinned to the `device` it was constructed with.
model = RawNet2(device=device).to(device)

# Step 4: training arguments
# Evaluate and checkpoint once per epoch, keep at most two checkpoints, and
# reload the best checkpoint when training finishes.
# NOTE(review): recent transformers releases renamed `evaluation_strategy` to
# `eval_strategy`; switch the keyword if the installed version rejects it.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
)


# Step 5: evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, F1 and ROC-AUC from model outputs.

    Args:
        pred: object exposing ``predictions`` (2-D score array, one column per
            class) and ``label_ids`` (integer class ids).

    Returns:
        dict with keys ``"accuracy"``, ``"f1"`` and ``"auc"``.

    With two score columns the metrics are the binary ones (F1 of the positive
    class, AUC from the class-1 score). With more columns, as needed for the
    four label ids in this dataset, F1 is macro-averaged and AUC is the
    macro one-vs-rest score over softmax probabilities; the binary forms would
    raise on multiclass targets.
    """
    logits = pred.predictions
    labels = pred.label_ids
    preds = np.argmax(logits, axis=1)
    num_classes = logits.shape[1]

    if num_classes == 2:
        f1 = f1_score(labels, preds)
        auc = roc_auc_score(labels, logits[:, 1])
    else:
        # Multiclass roc_auc_score needs per-row probabilities that sum to 1,
        # so apply a numerically stable softmax (float64 avoids rounding drift).
        shifted = logits.astype(np.float64)
        shifted = shifted - shifted.max(axis=1, keepdims=True)
        probs = np.exp(shifted)
        probs = probs / probs.sum(axis=1, keepdims=True)
        f1 = f1_score(labels, preds, average="macro")
        auc = roc_auc_score(
            labels,
            probs,
            multi_class="ovr",
            average="macro",
            labels=np.arange(num_classes),
        )

    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1,
        "auc": auc,
    }


# Step 6: Trainer
# Train on the preprocessed training split and validate on the preprocessed dev
# split (`trainset` / `devset` from the preprocessing cell); the notebook never
# defines a `dataset` variable. The closed-set `evalset` is left for final testing.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=trainset,
    eval_dataset=devset,
    compute_metrics=compute_metrics,
)

# Step 7: train
trainer.train()

# Step 8: save the model
import os

save_dir = "rawnet2_fakedetection"
# torch.save does not create missing parent directories.
os.makedirs(save_dir, exist_ok=True)
# Only call save_pretrained if the model actually provides it.
if hasattr(model, "save_pretrained"):
    model.save_pretrained(save_dir)
torch.save(model.state_dict(), os.path.join(save_dir, "pytorch_model.bin"))