In [11]:
from transformers import AutoProcessor, AutoModelForAudioClassification, AutoConfig, Wav2Vec2FeatureExtractor
import torch
import torch.nn.functional as F
import torchaudio

In [12]:
# Checkpoint identifier handed to every `from_pretrained` call in this notebook
# (presumably a Hugging Face Hub repo id — confirm it is reachable or cached).
model_path = "Lajavaness/wav2vec2-lg-xlsr-fr-speech-emotion-recognition"
# NOTE(review): `processor` is not referenced by any other cell visible here;
# inference goes through a separately loaded feature extractor.
processor = AutoProcessor.from_pretrained(model_path)
# Audio-classification model whose `.logits` output is consumed by `predict`.
model = AutoModelForAudioClassification.from_pretrained(model_path)

In [13]:
# Prefer the GPU when one is available, otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Keep the model on the same device as the tensors fed to it; without this the
# forward pass raises a device-mismatch error on CUDA machines because the
# weights stay on CPU while the inputs are moved to the GPU.
model = model.to(device)
# `config.id2label` maps class indices to emotion names.
config = AutoConfig.from_pretrained(model_path)
# The feature extractor turns raw waveforms into model inputs and records the
# sampling rate the checkpoint expects.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
sampling_rate = feature_extractor.sampling_rate

In [4]:
def speech_file_to_array_fn(path, sampling_rate):
    """Load an audio file as a mono 1-D NumPy waveform at ``sampling_rate`` Hz.

    Args:
        path: Location of the audio file, passed straight to ``torchaudio.load``.
        sampling_rate: Target sampling rate in Hz.

    Returns:
        A 1-D ``numpy.ndarray`` holding the (possibly resampled) waveform.
    """
    speech_array, _sampling_rate = torchaudio.load(path)

    # torchaudio.load yields (channels, frames). Average the channels so that a
    # stereo file does not end up as a 2-D array, which downstream feature
    # extraction would interpret as several separate utterances.
    if speech_array.dim() > 1 and speech_array.size(0) > 1:
        speech_array = speech_array.mean(dim=0, keepdim=True)

    # Only build a resampler when the file is not already at the target rate.
    if _sampling_rate != sampling_rate:
        resampler = torchaudio.transforms.Resample(_sampling_rate, sampling_rate)
        speech_array = resampler(speech_array)

    # Drop just the channel axis; a bare squeeze() would also collapse a
    # one-frame clip into a 0-d scalar.
    return speech_array.squeeze(0).numpy()

In [9]:
def predict(path, sampling_rate):
    """Classify the emotion expressed in a single audio file.

    Relies on the notebook-level ``feature_extractor``, ``model`` and
    ``config`` objects.

    Args:
        path: Location of the audio file to classify.
        sampling_rate: Sampling rate in Hz used both for loading the audio and
            for the feature extractor call.

    Returns:
        A list of ``(label, score)`` tuples, one per class in ``config.id2label``
        order, where ``score`` is the softmax probability as a percentage
        (plain Python ``float`` in the range 0-100).
    """
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

    # Send the tensors to wherever the model's weights actually live, so the
    # forward pass works whether or not the model was moved to a GPU.
    model_device = model.device
    inputs = {key: tensor.to(model_device) for key, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    # One file in, so take the first (only) row of the batch; tolist() gives
    # plain Python floats that display and serialise cleanly.
    scores = F.softmax(logits, dim=-1)[0].cpu().tolist()
    return [(config.id2label[i], score * 100) for i, score in enumerate(scores)]

In [10]:
# Demo: classify one clip and display the raw predictions.
# NOTE(review): relative path — it resolves against the kernel's working
# directory, so confirm where the notebook is launched from.
file_path = '../files/output/20240512/test/audio/speaker_000/part_00001.wav'
outputs = predict(file_path, sampling_rate)
# Bare last expression so the notebook renders the list of (label, score) pairs.
outputs

In [8]:
file_path = '../files/output/20240512/test/audio/speaker_000/part_00001.wav'

outputs = predict(file_path, sampling_rate)
# predict() returns (label, percentage) tuples, not dicts with a "12.3%" string,
# so unpack the pairs directly. Rounding to one decimal keeps the precision the
# earlier string-formatted scores had.
results = [(label, round(float(score), 1)) for label, score in outputs]
results