In [19]:
from pathlib import Path
import numpy as np
from models import get_model
import librosa
from transformers import AutoProcessor, Wav2Vec2Model
import torch
from transformers import AutoFeatureExtractor, WavLMForXVector
from tqdm import tqdm 
import os

# Resolve the "mspengi" entry through get_model.get_model and instantiate it
# with the "base" config (presumably a model wrapper class — TODO confirm).
# NOTE(review): get_pengi_embed below builds its own instance, so these
# module-level names look unused in the visible cells — confirm before removing.
Pengi = get_model.get_model("mspengi")
pengi = Pengi(config="base")

In [11]:
def get_pengi_embed(file, pengi=None):
    """Compute the Pengi audio embedding of one file and save it as ``.npy``.

    The array is written next to the input as ``<stem>_pengi.npy``.

    Args:
        file: Path (``str`` or ``Path``) of the audio file to embed. It is
            passed to the model unchanged.
        pengi: Optional, already-constructed Pengi wrapper to reuse. When
            omitted, a new ``"base"`` wrapper is built for this call; pass one
            in when embedding many files to avoid rebuilding it each time.

    Returns:
        None. The embedding is only written to disk.
    """
    if pengi is None:
        # `get_model` is imported from the `models` package and is called as
        # `get_model.get_model(...)` in the notebook's setup cell; accept either
        # a module exposing the factory or the factory function itself.
        factory = getattr(get_model, "get_model", get_model)
        Pengi = factory("mspengi")
        pengi = Pengi(config="base")

    _, audio_embeddings = pengi.get_audio_embeddings(audio_paths=[file])

    # Derive the output name from the path's stem rather than a case-sensitive
    # ``str.replace(".WAV", ...)``: that left ``.wav`` inputs untouched, so
    # np.save appended ``.npy`` and the result lost its ``_pengi`` tag.
    source = Path(file)
    target = source.with_name(f"{source.stem}_pengi.npy")
    np.save(str(target), audio_embeddings)
        
def get_clap_embed(file, clap_model=None):
    """Compute the CLAP audio embedding of one file and save it as ``.npy``.

    The array is written next to the input as ``<stem>_clap.npy``.

    Args:
        file: Path (``str`` or ``Path``) of the audio file to embed. It is
            passed to the model unchanged.
        clap_model: Optional, already-constructed CLAP wrapper to reuse. When
            omitted, a new ``version='2023'`` CPU wrapper is built for this
            call; pass one in when embedding many files.

    Returns:
        None. The embedding is only written to disk.
    """
    if clap_model is None:
        # `get_model` is imported from the `models` package and is called as
        # `get_model.get_model(...)` in the notebook's setup cell; accept either
        # a module exposing the factory or the factory function itself.
        factory = getattr(get_model, "get_model", get_model)
        CLAP = factory("msclap")
        clap_model = CLAP(version='2023', use_cuda=False)

    audio_embeddings = clap_model.get_audio_embeddings([file])

    # Derive the output name from the path's stem rather than a case-sensitive
    # ``str.replace(".WAV", ...)``: that left ``.wav`` inputs untouched, so
    # np.save appended ``.npy`` and the result lost its ``_clap`` tag.
    source = Path(file)
    target = source.with_name(f"{source.stem}_clap.npy")
    np.save(str(target), audio_embeddings)

def get_wavlm_embed(file, feature_extractor=None, model=None):
    """Compute an L2-normalised WavLM x-vector for one file and save it as ``.npy``.

    The audio is loaded at 16 kHz, run through the
    ``microsoft/wavlm-base-plus-sv`` checkpoint, and the normalised
    ``embeddings`` output is written next to the input as ``<stem>.npy``.

    Args:
        file: Path (``str`` or ``Path``) of the audio file to embed.
        feature_extractor: Optional preloaded feature extractor to reuse.
        model: Optional preloaded x-vector model to reuse. When either argument
            is omitted it is loaded with ``from_pretrained`` on every call, so
            pass both in when embedding many files.

    Returns:
        None. The embedding is only written to disk.
    """
    checkpoint = "microsoft/wavlm-base-plus-sv"
    if feature_extractor is None:
        feature_extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    if model is None:
        model = WavLMForXVector.from_pretrained(checkpoint)

    wav, sr = librosa.load(file, sr=16_000)
    inputs = feature_extractor(wav, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        audio_embeddings = model(**inputs).embeddings
        audio_embeddings = torch.nn.functional.normalize(audio_embeddings, dim=-1)

    # Swap only the final suffix. The previous case-sensitive
    # ``str.replace(".wav", ".npy")`` rewrote every ".wav" in the path and left
    # ".WAV" inputs as ``<name>.WAV.npy``.
    # NOTE(review): get_wav2vec_embed writes to the same ``<stem>.npy`` name, so
    # running both on one file overwrites the earlier result; the name is kept
    # here for compatibility with existing outputs.
    target = Path(file).with_suffix(".npy")
    # Hand np.save a real ndarray instead of relying on implicit tensor conversion.
    np.save(str(target), audio_embeddings.detach().cpu().numpy())

def get_wav2vec_embed(file, processor=None, model=None):
    """Compute wav2vec 2.0 hidden states for one file and save them as ``.npy``.

    The audio is loaded at 16 kHz, run through the
    ``facebook/wav2vec2-base-960h`` checkpoint, and the model's
    ``last_hidden_state`` is written next to the input as ``<stem>.npy``.

    Args:
        file: Path (``str`` or ``Path``) of the audio file to embed.
        processor: Optional preloaded processor to reuse.
        model: Optional preloaded model to reuse. When either argument is
            omitted it is loaded with ``from_pretrained`` on every call, so
            pass both in when embedding many files.

    Returns:
        None. The hidden states are only written to disk.
    """
    checkpoint = "facebook/wav2vec2-base-960h"
    if processor is None:
        processor = AutoProcessor.from_pretrained(checkpoint)
    if model is None:
        model = Wav2Vec2Model.from_pretrained(checkpoint)

    wav, sr = librosa.load(file, sr=16_000)
    inputs = processor(wav, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    # Swap only the final suffix. The previous case-sensitive
    # ``str.replace(".wav", ".npy")`` rewrote every ".wav" in the path and left
    # ".WAV" inputs as ``<name>.WAV.npy``.
    # NOTE(review): get_wavlm_embed writes to the same ``<stem>.npy`` name, so
    # running both on one file overwrites the earlier result; the name is kept
    # here for compatibility with existing outputs.
    target = Path(file).with_suffix(".npy")
    # Hand np.save a real ndarray instead of relying on implicit tensor conversion.
    np.save(str(target), hidden_states.detach().cpu().numpy())

In [14]:
def process_embeddings(folder, embedding_type):
    """Run the embedding extractor selected by ``embedding_type`` on one path.

    Args:
        folder: Path handed unchanged to the selected extractor. Despite the
            name, the extractors in this notebook take a single audio file path.
        embedding_type: One of ``'pengi'``, ``'clap'``, ``'wavlm'`` or
            ``'wav2vec'``.

    Raises:
        ValueError: If ``embedding_type`` is none of the supported names.
    """
    # Each entry is a thunk so an extractor name is only resolved once it has
    # actually been selected.
    extractors = (
        ('pengi', lambda: get_pengi_embed(folder)),
        ('clap', lambda: get_clap_embed(folder)),
        ('wavlm', lambda: get_wavlm_embed(folder)),
        ('wav2vec', lambda: get_wav2vec_embed(folder)),
    )
    for kind, run in extractors:
        if embedding_type == kind:
            run()
            return
    raise ValueError("Unsupported embedding type provided")
# Example run: wav2vec embedding for one utterance under timit_styletts/TEST.
process_embeddings(
    folder="/media/konan/DataDrive/user_archive/home/shuohan/shan1/data_pred/data/timit_styletts/TEST/DR1/FAKS0/SA1.wav",
    embedding_type='wav2vec',
)