# Audio Spoofing Detection Notebook

This notebook aims to detect spoofed audio using Symbolic Regression with PySR. 

In [None]:
from pysr import PySRRegressor

In [None]:
# Setup and Initial Imports
%load_ext autoreload
%autoreload 2

import sys
import os
import time
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from tqdm import tqdm

# Add the project root folder to PYTHONPATH
sys.path.append(os.path.abspath('..'))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import Machine Learning and SR Libraries

In this cell, we import the necessary libraries for SR and data processing.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, det_curve
from sklearn.model_selection import StratifiedShuffleSplit

## Define File Paths (Train and Test)

Define the paths to the processed MFCC feature files and the protocol files for the train and test sets. Adjust these paths to match your local data layout.

In [None]:
# Pickled MFCC feature files, given relative to the notebook's parent directory.
# Going by the file names, the test features are part 00 of the ASVspoof2021 DF eval set.
TEST_FEATURES_MFCC  = "../data/processed/mfccs_features_ASVspoof2021_DF_eval_part00.pkl"
TRAIN_FEATURES_MFCC = "../data/processed/mfccs_features.pkl"
# Protocol (metadata / label) files that accompany the two feature files above.
TEST_PROTOCOL  = "../data/raw/DF-keys-full/keys/DF/CM/trial_metadata.txt"
TRAIN_PROTOCOL = "../data/raw/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.train.trn.txt"

## Load Features

Load the train and test feature data from their pickle files.

In [None]:
# Load the pre-computed feature records for the train and test sets.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open feature files that were produced by a trusted pipeline.
with open(TRAIN_FEATURES_MFCC, 'rb') as f:
    train_data = pickle.load(f)
with open(TEST_FEATURES_MFCC, 'rb') as f:
    test_data = pickle.load(f)

## Read Protocol File

Define a function to read the protocol file, which maps audio files to labels, and load the DataFrame for train and test.

In [None]:
def read_cm_protocol_train(filepath: str) -> pd.DataFrame:
    """
    Reads the ASVspoof2019 LA train protocol file and returns a DataFrame with:
      - speaker_id : speaker identifier (e.g. LA_0079)
      - audio_file : trial identifier (e.g. LA_T_1138215)
      - system_id  : spoofing system identifier (e.g. A01), or '-' for bonafide trials
      - key        : bonafide/spoof label

    Each protocol line has 5 whitespace-separated fields:

        SPEAKER_ID  AUDIO_FILE_NAME  -  SYSTEM_ID  KEY

    The 3rd field is a placeholder that is not used in the LA scenario, so it is
    dropped; the 4th field carries the spoofing system identifier.

    Args:
        filepath (str): path to the ASVspoof2019.LA.cm.train.trn.txt file.

    Returns:
        pd.DataFrame: columns = [speaker_id, audio_file, system_id, key]
    """
    # Field order follows the protocol layout above: the placeholder comes
    # BEFORE the system id, so naming them the other way round would throw the
    # attack identifier away and keep only the '-' placeholder.
    col_names = ["speaker_id", "audio_file", "unused", "system_id", "key"]
    df = pd.read_csv(
        filepath,
        sep=r"\s+",
        names=col_names,
        engine="python",
        index_col=False
    )
    return df.drop(columns=["unused"])

# Parse the training protocol and preview it. The bare `train_df.head()` on the
# last line is what the notebook renders as this cell's output.
train_df = read_cm_protocol_train(TRAIN_PROTOCOL)
print("Train Protocol (first 5 rows):")
train_df.head()

Protocol Data (first 5 rows):
  speaker_id    audio_file system_id       key
0    LA_0079  LA_T_1138215         -  bonafide
1    LA_0079  LA_T_1271820         -  bonafide
2    LA_0079  LA_T_1272637         -  bonafide
3    LA_0079  LA_T_1276960         -  bonafide
4    LA_0079  LA_T_1341447         -  bonafide


In [None]:
def read_cm_protocol_test(filepath: str) -> pd.DataFrame:
    """
    Parse a 13-field, whitespace-separated CM protocol file.

    The first nine fields are kept under these column names:
      - speaker_id     : speaker identifier (e.g., LA_0023)
      - trial_id       : trial identifier (e.g., DF_E_2000011)
      - codec          : compression codec (e.g., nocodec, low_mp3, high_ogg, etc.)
      - data_source    : data origin (asvspoof, vcc2018, vcc2020)
      - attack_id      : spoofing attack code
      - key            : trial label (bonafide or spoof)
      - trim           : speech trimming flag (notrim or trim)
      - subset         : subset name (eval, progress, hidden)
      - vocoder_type   : vocoder category
    The last four fields are placeholders and are removed from the result.

    Args:
        filepath (str): path to the protocol / trial metadata text file.

    Returns:
        pd.DataFrame: one row per line of the file, with the nine columns above.
    """
    kept_fields = [
        "speaker_id", "trial_id", "codec", "data_source", "attack_id",
        "key", "trim", "subset", "vocoder_type",
    ]
    # Names for the four trailing placeholder fields: unused1 ... unused4
    placeholder_fields = [f"unused{i}" for i in range(1, 5)]

    # Any run of whitespace separates two fields
    protocol = pd.read_csv(
        filepath,
        sep=r"\s+",
        names=kept_fields + placeholder_fields,
        engine="python",
        index_col=False,
    )
    return protocol.drop(columns=placeholder_fields)

# Parse the test protocol and preview the first rows as the cell output.
test_df = read_cm_protocol_test(TEST_PROTOCOL)
test_df.head()

## Create DataFrame for Features and Labels

Create a DataFrame where each row contains a file name and its corresponding features. Label 1 represents "spoof" and 0 represents "bonafide".

In [None]:
# Sanity check: (rows, columns) of the train and test protocol tables.
print(train_df.shape, test_df.shape)

In [None]:
def create_labeled_df(data: list, protocol_df: pd.DataFrame, id_col: str) -> pd.DataFrame:
    """
    Build a DataFrame with one row per feature record and a binary label.

    Args:
        data        : list of records; each record is indexed with 'file' (file name)
                      and 'features' (a sequence of feature values). The number of
                      features is taken from the first record.
        protocol_df : protocol table holding the trial identifiers in `id_col`
                      and the 'spoof' / 'bonafide' label in its 'key' column.
        id_col      : name of the column in `protocol_df` that holds the trial id.

    Returns:
        pd.DataFrame with columns ['file', 'feature_0', ..., 'feature_{n-1}', 'label'],
        where label is 1 for 'spoof' and 0 for 'bonafide'. For empty `data`, an
        empty DataFrame with columns ['file', 'label'] is returned.

    Warns:
        UserWarning: when some files have no entry in the protocol. Those files
        are still labelled 'bonafide' (0), as before, but no longer silently.
    """
    import warnings  # local import keeps this cell self-contained

    # Nothing to index into: return an empty frame instead of failing on data[0]
    if len(data) == 0:
        return pd.DataFrame(columns=['file', 'label'])

    # map trial ID → label
    label_map = dict(zip(protocol_df[id_col], protocol_df['key']))

    # build raw features DataFrame
    n_feats = len(data[0]['features'])
    cols = ['file'] + [f'feature_{i}' for i in range(n_feats)]
    df = pd.DataFrame([(d['file'], *d['features']) for d in data], columns=cols)

    # strip the audio extension so file names line up with protocol trial ids
    trial_ids = df['file'].str.replace(r'\.(wav|flac)$', '', regex=True)
    raw_labels = trial_ids.map(label_map)

    # Files missing from the protocol fall back to 'bonafide'; surface how many,
    # because a path/id mismatch would otherwise quietly corrupt the labels.
    n_missing = int(raw_labels.isna().sum())
    if n_missing:
        warnings.warn(
            f"{n_missing} of {len(df)} files have no entry in the protocol "
            f"column '{id_col}' and are labelled 'bonafide' by default.",
            stacklevel=2,
        )

    df['label'] = raw_labels.fillna('bonafide').map({'spoof': 1, 'bonafide': 0})
    return df

# -- Create feature+label DataFrames
# Train records are matched on the protocol's 'audio_file' column, test records
# on 'trial_id'. The raw record lists are deleted right after use, so re-running
# this cell without re-running the loading cell raises NameError.
train_features_df = create_labeled_df(train_data, train_df, 'audio_file')
del(train_data)
test_features_df  = create_labeled_df(test_data,  test_df,  'trial_id')
del(test_data)

In [None]:
# -- Inspect
# Print the shape and the first rows of both labelled feature tables.
print(f"train_features_df: {train_features_df.shape}")
print(train_features_df.head())
print(f"test_features_df: {test_features_df.shape}")
print(test_features_df.head())

## Data Exploration

Display the distribution of labels to verify the balance of the dataset.

In [None]:
def summarize_label_distribution(df: pd.DataFrame, name: str) -> pd.DataFrame:
    """
    Print and return the number and share of bonafide vs spoof rows.

    Args:
        df   : DataFrame containing a 'label' column (0 = bonafide, 1 = spoof)
        name : identifier used in the printed heading (e.g., "Train", "Test")

    Returns:
        pd.DataFrame indexed by label name, with the columns "Count" and
        "Percent (%)" (percentages rounded to 2 decimals).
    """
    # Readable names for the numeric labels
    readable = {0: "bonafide", 1: "spoof"}
    labels = df["label"]

    counts = labels.value_counts().rename(index=readable)
    shares = (labels.value_counts(normalize=True) * 100).rename(index=readable).round(2)

    summary = pd.DataFrame({"Count": counts, "Percent (%)": shares})

    print(f"\n{name} set label distribution:")
    print(summary)
    return summary

# Print the label balance of both sets and keep the summary tables for later use.
train_summary = summarize_label_distribution(train_features_df, "Train")
test_summary  = summarize_label_distribution(test_features_df,  "Test")

Label Distribution:
          Count  Percent (%)
label                       
spoof     22800        89.83
bonafide   2580        10.17


## Data Preprocessing

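Separate the features (X) and labels (y) of the pre-defined train and test sets, then standardize the features. The scaler is fitted on the training set only and applied to both sets, so no test-set statistics leak into training.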

In [None]:
# Build X / y arrays from the pre-defined train and test DataFrames.
# Everything except the file name and the label is treated as a feature.
non_feature_cols = ["file", "label"]

y_train = train_features_df["label"].values
y_test  = test_features_df["label"].values

# The scaler learns its statistics from the training features only and is then
# applied unchanged to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features_df.drop(columns=non_feature_cols).values)
X_test  = scaler.transform(test_features_df.drop(columns=non_feature_cols).values)


## Define, Train, and Evaluate Models

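Define a dictionary of model templates, then train and evaluate each one over several random seeds. For each run, we record the training time, the test accuracy, and the Equal Error Rate (EER) on both the train and test sets, and we save the fitted model together with its DET plot.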

In [None]:
SAMPLE_FRAC         = 1    # fraction of each set to keep (values >= 1 keep everything)
SAMPLE_RANDOM_STATE = 42     # reproducible subsampling

SEEDS = range(1, 11)          # seeds 1…10 (the upper bound of range is exclusive)
TICKS = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50]  # DET axis tick values, in percent

/home/vicosbe/victo/audio_symbolic_regression/audio_symbolic_regression_env/bin/python


In [None]:
def stratified_subsample(X, y, frac, random_state):
    """
    Draw a class-stratified subsample of (X, y).

    Args:
        X            : feature array, indexable by an array of row indices.
        y            : label array aligned with X.
        frac         : share of the samples to keep; for frac >= 1.0 the inputs
                       are returned untouched (same objects, no copy).
        random_state : seed passed to the splitter for reproducibility.

    Returns:
        Tuple (X_sub, y_sub).
    """
    wants_everything = frac >= 1.0
    if wants_everything:
        return X, y

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        train_size=frac,
        random_state=random_state,
    )
    # A single split is requested; only its "train" side of indices is kept
    keep_idx = next(splitter.split(X, y))[0]
    return X[keep_idx], y[keep_idx]

In [None]:
# Subsample both train and test sets
X_train_sub, y_train_sub = stratified_subsample(X_train, y_train, SAMPLE_FRAC, SAMPLE_RANDOM_STATE)
X_test_sub,  y_test_sub  = stratified_subsample(X_test,  y_test,  SAMPLE_FRAC, SAMPLE_RANDOM_STATE)

print(f"Subsampled train: {len(y_train_sub)}/{len(y_train)} samples")
print(f"Subsampled test : {len(y_test_sub)}/{len(y_test)} samples\n")

# Training & evaluation loop on subsampled data
print("Training and evaluating models on subsampled data...\n")

In [None]:
def plot_nist_det(fpr_tr, fnr_tr, fpr_ts, fnr_ts, title):
    """
    Draw the train and test DET curves on log-log percent axes.

    Args:
        fpr_tr, fnr_tr : error rates of the train curve (plotted on x and y).
        fpr_ts, fnr_ts : error rates of the test curve (plotted on x and y).
        title          : figure title.

    All four rate inputs are multiplied by 100 before plotting, so they are
    expected to be fractions that support `* 100` (e.g. NumPy arrays).

    Returns:
        The matplotlib figure; it is neither shown nor saved here.
    """
    axis_ticks = [0.1, 0.2, 0.5, 1, 2, 5, 10, 20, 50]
    lowest, highest = min(axis_ticks), max(axis_ticks)

    # Rates are converted to percent up front, before any figure is created
    curves = [
        ("Train DET", fpr_tr * 100, fnr_tr * 100),
        ("Test DET", fpr_ts * 100, fnr_ts * 100),
    ]

    fig, ax = plt.subplots()
    for curve_label, far_pct, frr_pct in curves:
        ax.plot(far_pct, frr_pct, label=curve_label, linewidth=1.5)

    # Log-scaled axes limited to the tick range, with plain-number tick labels
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(lowest, highest)
    ax.set_ylim(lowest, highest)
    ax.set_xticks(axis_ticks)
    ax.set_yticks(axis_ticks)
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_formatter(ticker.ScalarFormatter())

    # Dashed diagonal where both error rates are equal (the EER line)
    ax.plot(axis_ticks, axis_ticks, linestyle="--", color="grey", linewidth=1)

    ax.set_xlabel("False Acceptance Rate (%)")
    ax.set_ylabel("False Rejection Rate (%)")
    ax.set_title(title)
    ax.grid(which="both", linestyle="--", linewidth=0.5)
    ax.legend()
    plt.tight_layout()
    return fig

In [None]:
# -------------------------------------------------------------------
# PySR model templates
# -------------------------------------------------------------------
def _pysr_template(niterations, binary_operators, unary_operators):
    """Return a `seed -> PySRRegressor` factory; only the iteration count and
    the operator sets differ between templates, every other setting is shared."""
    def build(seed):
        # Fresh list / dict objects are created for every model instance
        return PySRRegressor(
            niterations=niterations,
            binary_operators=list(binary_operators),
            unary_operators=list(unary_operators),
            elementwise_loss="(x, y) -> (x - y)^2",
            random_state=seed,
            batching=True,
            batch_size=500,
            deterministic=True,
            parallelism="serial",
            constraints={"^": (-1, 1)},
        )
    return build


_ARITHMETIC = ("+", "-", "*", "/")

# Each value maps a random seed to a new, unfitted regressor.
# NOTE(review): the "200it" / "250it" / "300it" tags in the names do not match
# the niterations values used here (2, 2, 3) -- confirm which one is intended.
model_templates = {
    "PySR_batch_500_200it (Basic)": _pysr_template(
        2, _ARITHMETIC, ("exp", "log")
    ),
    "PySR_batch_500_250it (Extended)": _pysr_template(
        2, _ARITHMETIC, ("exp", "log", "sin", "cos")
    ),
    "PySR_batch_500_300it (Complex)": _pysr_template(
        3, _ARITHMETIC + ("^",), ("exp", "log", "sin", "cos", "abs")
    ),
}

# -------------------------------------------------------------------
# Prepare metrics CSV (add EER_train column)
# -------------------------------------------------------------------
# The header row is written only when the file does not exist yet. Rows are
# appended further down, so re-running the notebook adds to an existing file
# instead of replacing it.
csv_file = "model_metrics_test_mfcc.csv"
if not os.path.exists(csv_file):
    pd.DataFrame(
        columns=["Model", "Training Time (sec)", "Accuracy", "EER_train", "EER"]
    ).to_csv(csv_file, index=False)

# -------------------------------------------------------------------
# Loop over templates and seeds
# -------------------------------------------------------------------
# NOTE(review): SEEDS is also assigned in the configuration cell; this
# assignment overrides it (seeds 1…10).
SEEDS    = range(1, 11)
BASE_DIR = "../models/test/py_sr_mfcc"
os.makedirs(BASE_DIR, exist_ok=True)

# For every model template and every seed: fit a fresh model, compute the train
# and test DET curves and EERs, then save the model, the DET plot and one CSV row.
print("\nTraining PySR models with multiple seeds…\n")
for name, constructor in model_templates.items():
    print(f"=== {name} ===")
    # Directory-friendly version of the template name (no spaces or parentheses)
    safe_name = name.replace(" ", "_").replace("(", "").replace(")", "")
    model_dir = os.path.join(BASE_DIR, safe_name)
    os.makedirs(model_dir, exist_ok=True)

    for seed in SEEDS:
        model = constructor(seed)

        # Train
        t0 = time.time()
        model.fit(X_train_sub, y_train_sub)
        train_time = time.time() - t0

        # TRAIN DET (sanitize + compute EER_train)
        # Samples with non-finite scores (NaN / inf) are excluded from the curve.
        y_scores_tr = np.array(model.predict(X_train_sub), dtype=float)
        mask_tr     = np.isfinite(y_scores_tr)
        fpr_tr, fnr_tr, _ = det_curve(y_train_sub[mask_tr], y_scores_tr[mask_tr])
        # EER is taken as the FPR at the point where |FNR - FPR| is smallest
        idx_tr     = np.nanargmin(np.abs(fnr_tr - fpr_tr))
        eer_train  = float(fpr_tr[idx_tr])

        # TEST DET (sanitize + compute EER)
        # NOTE(review): EER and accuracy are computed on finite-score samples
        # only, so samples with non-finite scores never count as errors.
        y_scores_ts = np.array(model.predict(X_test_sub), dtype=float)
        mask_ts     = np.isfinite(y_scores_ts)
        y_true_ts   = y_test_sub[mask_ts]
        y_scores_ts = y_scores_ts[mask_ts]
        fpr_ts, fnr_ts, _ = det_curve(y_true_ts, y_scores_ts)
        idx_ts      = np.nanargmin(np.abs(fnr_ts - fpr_ts))
        eer         = float(fpr_ts[idx_ts])
        # Accuracy uses a fixed 0.5 threshold on the regression output
        y_pred_ts   = (y_scores_ts > 0.5).astype(int)
        acc         = accuracy_score(y_true_ts, y_pred_ts)

        # Save model
        with open(os.path.join(model_dir, f"model_seed_{seed:02d}.pkl"), "wb") as f:
            pickle.dump(model, f)

        # Plot & save DET
        fig = plot_nist_det(fpr_tr, fnr_tr, fpr_ts, fnr_ts, title=f"{name} – Seed {seed:02d}")
        fig.savefig(os.path.join(model_dir, f"DET_seed_{seed:02d}.png"))
        # Close the figure so figures do not pile up in memory across the loop
        plt.close(fig)

        # Append metrics (including EER_train)
        # NOTE(review): the row does not record the seed, so runs of the same
        # template cannot be told apart in the CSV.
        pd.DataFrame([{
            "Model": name,
            "Training Time (sec)": round(train_time, 2),
            "Accuracy": round(acc, 4),
            "EER_train": round(eer_train, 4),
            "EER": round(eer, 4)
        }]).to_csv(csv_file, mode="a", header=False, index=False)

        print(f"Seed {seed:02d}: time {train_time:.1f}s, acc {acc:.4f}, "
              f"EER_train {eer_train:.4f}, EER_test {eer:.4f}")
    print()