# Create embeddings with the transformers library

We use the Hugging Face transformers library to create embeddings for an audio dataset.




## TL;DR: Ready-to-use callable functions

In [None]:
import datasets
from transformers import AutoFeatureExtractor, AutoModel, ASTForAudioClassification
import torch
from renumics import spotlight
import pandas as pd
import umap
import numpy as np

def __set_device():
    """Return ``"cuda"`` when a CUDA device is available, otherwise ``"cpu"``.

    When CUDA is selected, the CUDA caching allocator is emptied first.
    """
    if not torch.cuda.is_available():
        return "cpu"
    torch.cuda.empty_cache()
    return "cuda"


def extract_embeddings(model, feature_extractor):
    """Build a batched mapping function that computes one embedding per audio clip.

    Args:
        model: Model whose output exposes ``last_hidden_state``; its ``device``
            attribute decides where the extracted inputs are moved.
        feature_extractor: Callable turning raw waveforms into model inputs. It is
            called with ``raw_speech``, ``return_tensors="pt"`` and ``padding=True``
            and its result must support ``.to(device)``.

    Returns:
        A function that takes a batch (a dict whose ``"audio"`` entry is a list of
        dicts holding the waveform under ``"array"``) and returns
        ``{"embedding": tensor}`` with one row per clip, moved to the CPU.
    """
    device = model.device

    def pp(batch):
        audios = [element["array"] for element in batch["audio"]]
        inputs = feature_extractor(
            raw_speech=audios, return_tensors="pt", padding=True
        ).to(device)
        # Inference only: disable autograd so no graph is built for each batch and
        # the returned tensor does not carry ``requires_grad``.
        with torch.no_grad():
            # Index 0 along the second axis is used as the clip-level embedding
            # (presumably the classification token -- confirm for other models).
            embeddings = model(**inputs).last_hidden_state[:, 0].cpu()

        return {"embedding": embeddings}

    return pp


def huggingface_embedding(
    dataset,
    modelname,
    batched=True,
    batch_size=8,
    *,
    cache_file_name="updated_embedding",
):
    """Add an ``"embedding"`` column computed by a pretrained Hugging Face model.

    Args:
        dataset: Dataset object exposing ``map``; its ``"audio"`` column is read
            by the mapping function built by ``extract_embeddings``.
        modelname: Model identifier passed to ``from_pretrained`` for both the
            feature extractor and the model.
        batched: Forwarded to ``dataset.map``. The mapping function indexes
            ``batch["audio"]`` as a list of examples, so this should stay ``True``.
        batch_size: Number of examples per call of the mapping function.
        cache_file_name: Path of the cache file handed to ``dataset.map``. The
            default keeps the historical file name. NOTE: ``map`` may load an
            existing file of this name instead of recomputing (see its
            ``load_from_cache_file`` option), so pass a distinct name for each
            dataset/model combination.

    Returns:
        The dataset returned by ``dataset.map``, including the new column.
    """
    # Initialize the Hugging Face feature extractor and model.
    feature_extractor = AutoFeatureExtractor.from_pretrained(modelname, padding=True)
    model = AutoModel.from_pretrained(modelname, output_hidden_states=True)

    # Move the model to the selected device before building the mapping function,
    # which reads the target device from the model itself.
    device = __set_device()
    extract_fn = extract_embeddings(model.to(device), feature_extractor)

    updated_dataset = dataset.map(
        extract_fn,
        batched=batched,
        batch_size=batch_size,
        cache_file_name=cache_file_name,
    )

    return updated_dataset



## Step-by-step example on speech-commands

### Load speech-commands from Huggingface hub

We load all splits of the dataset here (`split="all"`). To evaluate the model's performance on unseen data only, load the test split instead (`split="test"`).

In [None]:
# Load every split of Speech Commands v0.01 from the Hugging Face hub.
dataset = datasets.load_dataset(path="speech_commands", name="v0.01", split="all")
# Class names of the "label" feature.
labels = dataset.features["label"].names

Let's have a look at all of the labels that we want to predict

In [None]:
# Show the class names taken from the dataset's "label" feature.
print(labels)

### Compute embedding with audio transformer from Huggingface

In [None]:
# Compute an embedding for every clip with the pretrained audio transformer.
dataset_enriched = huggingface_embedding(
    dataset=dataset,
    modelname="MIT/ast-finetuned-speech-commands-v2",
)

### Reduce embeddings for faster visualization

In [None]:
# Collect the per-clip embedding vectors into a single array.
embeddings = np.stack(np.array(dataset_enriched["embedding"]))

# Fit UMAP with its default settings and project the embeddings.
reducer = umap.UMAP()
reduced_embedding = reducer.fit_transform(embeddings)

# Store one reduced vector per row next to the full embedding.
dataset_enriched = dataset_enriched.add_column(
    name="embedding_reduced",
    column=list(reduced_embedding),
)

### Perform EDA with Spotlight

In [None]:
# Convert the enriched dataset, not the raw one: the Spotlight call below is
# given dtypes for the "embedding" and "embedding_reduced" columns, which only
# exist on `dataset_enriched`.
df = dataset_enriched.to_pandas()

In [None]:
# Preview the first ten rows of the DataFrame.
df.head(10)

In [None]:
# Open Spotlight on the DataFrame, declaring how the non-scalar columns
# should be interpreted.
spotlight.show(
    df,
    dtype={
        "audio": spotlight.Audio,
        "embedding": spotlight.Embedding,
        "embedding_reduced": spotlight.Embedding,
    },
)

### Optional: Save enriched dataframe to disk

In [None]:
#dataset_enriched.to_parquet('dataset_audio_annotated_and_embedding.parquet.gzip', compression='gzip')