# Audio classification with pre-trained transformer models
In this tutorial we will see how to leverage pre-trained transformer models to quickly build a data-efficient audio classification model.<br>
We will compute an embedding for each sample of a given audio dataset and use these embeddings as the feature vectors for a simple classification with a support vector machine.

## Load the dataset from Huggingface

In [1]:
import datasets

# Load the "train" split of the ESC-50 dataset from the Hugging Face hub
ds_full = datasets.load_dataset(
    path="renumics/esc50",
    split="train",
)

In [None]:
from renumics import spotlight

# Show the complete dataset in the Spotlight viewer
spotlight.show(
    ds_full,
    host="0.0.0.0",
    port=8889,
    no_ssl=True,
    analyze=False,
)

### We extract a small dataset for the tutorial

In [3]:
# Label 42 marks the siren recordings: keep all of them (in shuffled order)
ds_siren = ds_full.shuffle(seed=42).filter(lambda sample: sample["label"] == 42)

# Drop the sirens from the first 80 rows and keep 40 of the remaining samples
ds_non_siren = (
    ds_full.select(range(80))
    .filter(lambda sample: sample["label"] != 42)
    .select(range(40))
)

# Sirens first, non-sirens after
ds = datasets.concatenate_datasets([ds_siren, ds_non_siren])

## Generate the embeddings with a Hugging Face AST model

In [4]:
# The waveforms must share a uniform sampling rate of 16 kHz
ds = ds.cast_column(
    column="audio",
    feature=datasets.Audio(sampling_rate=16_000),
)

In [5]:
import torch


def extract_embeddings(model, feature_extractor, audio_name="audio"):
    """Build a per-sample function that computes an embedding with ``model``.

    Args:
        model: Model exposing a ``device`` attribute. When called with the
            extractor's outputs and ``output_hidden_states=True`` it must
            return an object with a ``hidden_states`` sequence.
        feature_extractor: Callable turning a raw waveform and its sampling
            rate into the model inputs (called with ``return_tensors="pt"``).
        audio_name: Key of the audio entry in each sample; that entry must
            provide ``"array"`` and ``"sampling_rate"``.

    Returns:
        A function mapping one sample to ``{"embedding": tensor}``. The tensor
        is the last hidden state at batch index 0, position 0, moved to the
        CPU.
    """
    device = model.device

    def pp(batch):
        waveform = batch[audio_name]

        inputs = feature_extractor(
            waveform["array"], sampling_rate=waveform["sampling_rate"], return_tensors="pt"
        )
        # The extractor builds its tensors independently of the model. Move them to the
        # model's device, otherwise the forward pass fails when the model is not on the CPU.
        inputs = {
            name: value.to(device) if hasattr(value, "to") else value
            for name, value in inputs.items()
        }

        # Inference only: no gradients are needed to read the hidden states
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=True)

        # Last layer, first batch entry, first position; returned on the CPU
        embeddings = outputs.hidden_states[-1][0, 0, :].cpu()

        return {"embedding": embeddings}

    return pp

In [None]:
from transformers import ASTFeatureExtractor, ASTForAudioClassification

# Checkpoint of the pre-trained Audio Spectrogram Transformer; the feature
# extractor and the model are both loaded from it
model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
model = ASTForAudioClassification.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Compute one embedding per sample (the mapping runs sample by sample)
extract_fn = extract_embeddings(model.to(device), feature_extractor, audio_name="audio")
ds_enriched = ds.map(extract_fn, batched=False)

## Perform a classification with a support vector machine

In [10]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Work on a pandas DataFrame from here on
df = ds_enriched.to_pandas()

# Binary target: 1 for sirens (label 42), 0 for every other label
df["binary_label"] = df["label"].apply(lambda label: 1 if label == 42 else 0)

# Features are the embeddings as plain lists; the target is the binary label
y = df["binary_label"]
X = [vector.tolist() for vector in df["embedding"]]

# Hold out 20% of the samples for testing; the remaining 80% are used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Train the Support Vector Machine

In [None]:
# Support vector classifier with a linear kernel ('rbf', 'poly', etc. are alternatives to try)
svc = SVC(random_state=42, kernel="linear")

# Fit on the training split
svc.fit(X=X_train, y=y_train)

### Quick quantitative analysis

In [None]:
# Predict the held-out test samples
y_pred = svc.predict(X_test)

# Overall accuracy on the test split
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

# Per-class metrics
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

# Confusion matrix of true vs. predicted labels
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

### Qualitative analysis with Spotlight

In [13]:
# Predict every sample in X (training and test samples together)
y_pred = svc.predict(X)

# Store the predictions as a new DataFrame column
df["prediction"] = y_pred

In [None]:
# Show the DataFrame, including the prediction column, in the Spotlight viewer
spotlight.show(
    df,
    host="0.0.0.0",
    port=8889,
    no_ssl=True,
    analyze=False,
)