In [None]:
# Install the third-party packages this notebook depends on.
!pip install transformers datasets torchaudio librosa accelerate evaluate
!pip install huggingface_hub


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [None]:
# Mount Google Drive at /content/drive so the dataset folders stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pandas as pd

def create_csv(root_dir, split, classes=("real", "fake")):
    """Index the ``.wav`` files of one dataset split into a DataFrame.

    The split is expected to be laid out as ``<root_dir>/<split>/<class>/``,
    with one sub-folder per class.

    Args:
        root_dir: Directory that contains the split folders.
        split: Name of the split folder to scan (e.g. ``"training"``).
        classes: Class sub-folder names. The position of a name in this
            sequence is the integer label given to its files; the default
            ``("real", "fake")`` maps ``real`` -> 0 and ``fake`` -> 1.

    Returns:
        pandas.DataFrame with columns ``file_path`` and ``label``, one row per
        audio file, ordered by class and then by file name.

    Raises:
        FileNotFoundError: If a class folder does not exist (raised by
            ``os.listdir``).
    """
    split_dir = os.path.join(root_dir, split)
    rows = []
    for label, cls in enumerate(classes):
        folder = os.path.join(split_dir, cls)
        # os.listdir returns entries in arbitrary order; sort them so the
        # resulting frame (and everything derived from it) is reproducible.
        for file in sorted(os.listdir(folder)):
            # Compare the extension case-insensitively so "clip.WAV" is kept.
            if file.lower().endswith(".wav"):
                rows.append([os.path.join(folder, file), label])
    return pd.DataFrame(rows, columns=["file_path", "label"])

# All three splits live under one shared dataset root, each in its own
# sub-folder ("training", "validation", "testing"), so the same root is
# passed for every split and only the split name changes.
main_root_dir = "/content/drive/MyDrive/Colab Notebooks/FoR"
train_df, val_df, test_df = (
    create_csv(main_root_dir, split_name)
    for split_name in ("training", "validation", "testing")
)

# Report how many audio files were indexed per split.
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

Train size: 10264
Val size: 2826
Test size: 1088


In [None]:
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = map(
    Dataset.from_pandas, (train_df, val_df, test_df)
)

# Group the splits under the keys the rest of the notebook indexes by.
dataset = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

print(dataset)


DatasetDict({
    train: Dataset({
        features: ['file_path', 'label'],
        num_rows: 10264
    })
    validation: Dataset({
        features: ['file_path', 'label'],
        num_rows: 2826
    })
    test: Dataset({
        features: ['file_path', 'label'],
        num_rows: 1088
    })
})


In [None]:
from transformers import Wav2Vec2Processor

# Base Wav2Vec2 checkpoint from the Hugging Face Hub. It carries no real/fake
# classification head of its own; that head is created (and must be trained)
# when the sequence-classification model is built from this checkpoint below.
checkpoint = "facebook/wav2vec2-base"

# Processor published with the checkpoint; it converts raw waveforms into model inputs.
processor = Wav2Vec2Processor.from_pretrained(checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]



vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [None]:
import librosa
from transformers import Wav2Vec2Processor

checkpoint = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)


def prepare_dataset(batch):
    """Turn one dataset row into Wav2Vec2 model inputs.

    Args:
        batch: A single example. The waveform is taken from ``batch["speech"]``
            when that column exists; otherwise the file at
            ``batch["file_path"]`` is decoded and resampled to 16 kHz.

    Returns:
        The same example with ``input_values`` and ``attention_mask`` added.
    """
    # The dataset built from the file index only has `file_path` and `label`,
    # so decode the audio here rather than depending on a `speech` column that
    # no cell in this notebook creates.
    if "speech" in batch:
        speech = batch["speech"]
    else:
        speech, _ = librosa.load(batch["file_path"], sr=16000)

    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    batch["input_values"] = inputs.input_values[0]

    # The processor only returns a mask when it is configured to do so;
    # otherwise fall back to marking every sample as valid.
    if "attention_mask" in inputs:
        batch["attention_mask"] = inputs.attention_mask[0]
    else:
        batch["attention_mask"] = [1] * len(batch["input_values"])

    return batch


# Remove only the raw-input columns that are actually present, so this cell
# works whether or not a `speech` column was materialised beforehand.
raw_columns = [
    name for name in ("speech", "file_path") if name in dataset["train"].column_names
]
dataset = dataset.map(prepare_dataset, remove_columns=raw_columns)


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/10264 [00:00<?, ? examples/s]

Map:   0%|          | 0/2826 [00:00<?, ? examples/s]

Map:   0%|          | 0/1088 [00:00<?, ? examples/s]

In [None]:
import torch
from transformers import Wav2Vec2ForSequenceClassification

# Use the GPU when the runtime provides one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two output classes: index 0 = real, index 1 = fake.
num_labels = 2

# Start from the base checkpoint and attach a sequence-classification head;
# the remaining keyword arguments override fields of the model config.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    mask_time_prob=0.05,
    feat_proj_dropout=0.1,
    hidden_dropout=0.1,
    attention_dropout=0.1,
    gradient_checkpointing=True,
)
model.to(device)


pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)


In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of a batch through the processor.
# NOTE(review): `data_collator` is assigned again in a later cell (built from
# `processor.feature_extractor`); that later value is the one handed to the Trainer.
data_collator = DataCollatorWithPadding(processor, padding=True)


In [None]:
import evaluate
import numpy as np

# Metric object is loaded once and reused on every evaluation call.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score arg-max class predictions with the accuracy metric.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted vs. reference labels.
    """
    scores, references = eval_pred
    return accuracy.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )


Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# Upgrade transformers and datasets to their latest releases.
# NOTE(review): both libraries were already imported by earlier cells; the
# runtime presumably has to be restarted before the upgrade takes effect — confirm.
!pip install -U transformers datasets




In [None]:
# Show which transformers release is installed in this runtime.
import transformers
print(transformers.__version__)

4.56.1


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./wav2vec2-fake-detection",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,  # mixed-precision training
    # Logging, evaluation and checkpoint cadence.
    logging_steps=50,
    eval_strategy="steps",  # argument name as used with transformers 4.56.1
    save_steps=500,
    save_total_limit=2,
    # Selection of the checkpoint that is restored when training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
)


In [None]:
import evaluate
import numpy as np

# Metric objects are loaded once and reused on every evaluation call.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for arg-max class predictions.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    scores, references = eval_pred
    shared = {
        "predictions": np.argmax(scores, axis=-1),
        "references": references,
    }
    acc_result = accuracy.compute(**shared)
    f1_result = f1.compute(average="weighted", **shared)
    return {"accuracy": acc_result["accuracy"], "f1": f1_result["f1"]}


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
from transformers import DataCollatorWithPadding

# Build the padding collator from the processor's feature extractor; this
# replaces the `data_collator` assigned in the earlier collator cell.
data_collator = DataCollatorWithPadding(processor.feature_extractor)


In [None]:
from transformers import Trainer

# Wire the model, data splits, collator and metrics into the Trainer.
# The processor is passed as `processing_class`: the older `tokenizer` keyword
# is deprecated on the transformers release this notebook runs on (4.56.1)
# and emits a warning when used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    processing_class=processor,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# Run fine-tuning; loss and validation metrics are reported at the configured step interval.
trainer.train()


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maryanrai1003[0m ([33maryanrai1003-mit-world-peace-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.667,0.591722,0.909766,0.909761
100,0.4427,0.276268,0.919674,0.919207
150,0.2149,0.097863,0.982307,0.982306
200,0.0828,0.055114,0.9908,0.990799
250,0.0871,0.036244,0.995046,0.995046
300,0.0714,0.033312,0.994338,0.994338
350,0.0325,0.020766,0.996815,0.996815
400,0.0219,0.021863,0.996461,0.996461
450,0.0339,0.116115,0.975938,0.975926
500,0.0403,0.027038,0.9954,0.9954


In [None]:
# Install the Weights & Biases client and authenticate this runtime.
# NOTE(review): this cell sits after `trainer.train()`, whose output already shows
# a W&B login prompt — presumably it is meant to run before training; confirm.
!pip install wandb -q
import wandb
wandb.login()


  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33maryanrai1003[0m ([33maryanrai1003-mit-world-peace-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
import os

# Trainer output directory (same value as in TrainingArguments) and the
# Google Drive location that would hold a copy if one had been saved there.
output_dir = "./wav2vec2-fake-detection"
drive_dir = "/content/drive/MyDrive/Colab Notebooks/untitled2/wav2vec2-fake-detection"

# 1. Look in the local Colab runtime.
if not os.path.exists(output_dir):
    print("❌ No checkpoints in Colab runtime.")
else:
    print("✅ Found checkpoints in Colab runtime:")
    print(os.listdir(output_dir))

# 2. Look in Google Drive.
if not os.path.exists(drive_dir):
    print("\n❌ No checkpoints in Google Drive at that path.")
else:
    print("\n✅ Found checkpoints in Google Drive:")
    print(os.listdir(drive_dir))


❌ No checkpoints in Colab runtime.

❌ No checkpoints in Google Drive at that path.


In [None]:
# Directory that receives the final fine-tuned model together with its processor.
save_dir = "./wav2vec2-fake-detection-final"

trainer.save_model(save_dir)            # save model
processor.save_pretrained(save_dir)     # save processor


In [None]:
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification

# Load the *fine-tuned* weights written by the save cell, not the base
# "facebook/wav2vec2-base" checkpoint. The base checkpoint has no trained
# classifier/projector head (transformers initialises it randomly and warns
# about it), so predictions made with it are meaningless.
model_dir = "./wav2vec2-fake-detection-final"
processor = Wav2Vec2Processor.from_pretrained(model_dir)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_labels = 2  # real vs fake
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_dir, num_labels=num_labels)
model.to(device)
model.eval()  # inference only: keep dropout disabled


def predict(audio_path):
    """Classify one audio file as real or fake speech.

    Args:
        audio_path: Path to an audio file that librosa can decode.

    Returns:
        ``"REAL"`` when class 0 has the highest logit, otherwise ``"FAKE"``.
    """
    # Decode and resample to the 16 kHz rate the processor is called with.
    speech, _ = librosa.load(audio_path, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
    preds = torch.argmax(logits, dim=-1).item()
    return "REAL" if preds == 0 else "FAKE"

# Example:
print(predict("/content/drive/MyDrive/Colab Notebooks/FoR/testing/real/file1010.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav"))

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]



pytorch_model.bin:   0%|          | 0.00/380M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/380M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


REAL


In [None]:
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification # Import the processor and model classes
import torch.nn.functional as F


# Define and load the processor (assuming checkpoint is still "facebook/wav2vec2-base")
checkpoint = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define and load the model (assuming checkpoint is still "facebook/wav2vec2-base")
num_labels = 2  # real vs fake
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    gradient_checkpointing=True,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.05
)
model.to(device)


def predict(audio_path):
    speech, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1)  # convert to probabilities

    real_prob = probs[0][0].item()
    fake_prob = probs[0][1].item()

    return {
        "REAL_probability": real_prob,
        "FAKE_probability": fake_prob,
        "Prediction": "REAL" if fake_prob < real_prob else "FAKE"
    }


# Example:
print(predict("/content/drive/MyDrive/Colab Notebooks/FoR/testing/real/file1010.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav"))

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'REAL_probability': 0.5050028562545776, 'FAKE_probability': 0.49499714374542236, 'Prediction': 'REAL'}


In [None]:
import librosa
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification # Import the processor and model classes
import torch.nn.functional as F


# Define and load the processor (assuming checkpoint is still "facebook/wav2vec2-base")
checkpoint = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define and load the model (assuming checkpoint is still "facebook/wav2vec2-base")
num_labels = 2  # real vs fake
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_labels,
    gradient_checkpointing=True,
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.1,
    mask_time_prob=0.05
)
model.to(device)


def predict(audio_path):
    speech, sr = librosa.load(audio_path, sr=16000)
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=-1)  # convert to probabilities

    real_prob = probs[0][0].item()
    fake_prob = probs[0][1].item()

    return {
        "REAL_probability": real_prob,
        "FAKE_probability": fake_prob,
        "Prediction": "REAL" if fake_prob < real_prob else "FAKE"
    }


# Example:
print(predict("/content/drive/MyDrive/Colab Notebooks/FoR/testing/fake/file1230.wav_16k.wav_norm.wav_mono.wav_silence.wav_2sec.wav"))

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'projector.bias', 'projector.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'REAL_probability': 0.49368491768836975, 'FAKE_probability': 0.5063150525093079, 'Prediction': 'FAKE'}
