In [1]:
import torch
from transformers import AutoProcessor, AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
import numpy as np
from pydub import AudioSegment

# https://github.com/ehcalabres/EMOVoice
# the preprocessor was derived from https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english
processor1 = AutoProcessor.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
# ^^^ no preload model available for this model (above), but the `feature_extractor` works in place

model1 = AutoModelForAudioClassification.from_pretrained("ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53")

def predict_emotion(audio_file):

    sound = AudioSegment.from_file(audio_file)
    sound = sound.set_frame_rate(16000)
    sound_array = np.array(sound.get_array_of_samples())
    
    # this model is VERY SLOW, so best to pass in small sections that contain 
    # emotional words from the transcript. like 10s or less.
    # how to make sub-chunk  -- this was necessary even with very short audio files 
    # test = torch.tensor(input.input_values.float()[:, :100000])

    input = feature_extractor(
        raw_speech=sound_array,
        sampling_rate=16000,
        padding=True,
        return_tensors="pt")

    result = model1.forward(input.input_values.float())

    id2label = {
        "0": "angry",
        "1": "calm",
        "2": "disgust",
        "3": "fearful",
        "4": "happy",
        "5": "neutral",
        "6": "sad",
        "7": "surprised"
    }
    
    interp = dict(zip(id2label.values(), list(round(float(i),4) for i in result[0][0])))
    return interp

ModuleNotFoundError: No module named 'torch'

In [None]:
output = predict_emotion('/kaggle/input/ravdess-emotional-speech-audio/Actor_01/03-01-06-02-02-01-01.wav')
output

{'angry': -0.0307,
 'calm': -0.047,
 'disgust': 0.0006,
 'fearful': 0.0214,
 'happy': -0.103,
 'neutral': -0.003,
 'sad': 0.0401,
 'surprised': 0.0701}

In [None]:
import torch.nn.functional as F

def apply_softmax(output_dict):
    output_values = list(output_dict.values())   
    softmax_values = F.softmax(torch.tensor(output_values), dim=0)
    softmax_values = softmax_values.tolist()
    softmax_dict = dict(zip(output_dict.keys(), softmax_values))
    
    # Find the emotion with the highest softmax value
    max_index = torch.argmax(torch.tensor(output_values))
    max_emotion = list(output_dict.keys())[max_index]
    
    return softmax_dict, max_emotion

In [None]:
softmax_dict, max_emotion  = apply_softmax(output)
softmax_dict

{'angry': 0.12184914946556091,
 'calm': 0.1198791041970253,
 'disgust': 0.12572334706783295,
 'fearful': 0.12836578488349915,
 'happy': 0.11335038393735886,
 'neutral': 0.12527155876159668,
 'sad': 0.13078880310058594,
 'surprised': 0.1347719132900238}

In [None]:
import os
import pandas as pd

# Path to the directory containing the dataset
dataset_path = '/kaggle/input/ravdess-emotional-speech-audio'

emotion_mapping = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fearful',
    7: 'disgust',
    8: 'surprised'
}

audio_data = []

# Traverse through each actor directory
for actor_folder in os.listdir(dataset_path):
    actor_folder_path = os.path.join(dataset_path, actor_folder)
    if os.path.isdir(actor_folder_path):
        # Traverse through each audio file in the actor directory
        for audio_file in os.listdir(actor_folder_path):
            if audio_file.endswith('.wav'):  # Assuming audio files are in .wav format
                audio_file_path = os.path.join(actor_folder_path, audio_file)
                label = actor_folder  # Use actor folder name as label
                # Extract the emotion code from the file name
                emotion_code = int(audio_file.split('-')[2].lstrip('0'))  # Remove leading zeros and convert to integer
                # Map emotion code to label
                emotion_label = emotion_mapping.get(emotion_code, 'unknown')
                audio_data.append({'file_path': audio_file_path, 'label': label, 'emotion_code': emotion_code, 'emotion_label': emotion_label})

# Create a DataFrame from the audio data list
audio_df = pd.DataFrame(audio_data)

# Display the DataFrame
print(audio_df.head())



                                           file_path     label  emotion_code  \
0  /kaggle/input/ravdess-emotional-speech-audio/A...  Actor_02             8   
1  /kaggle/input/ravdess-emotional-speech-audio/A...  Actor_02             1   
2  /kaggle/input/ravdess-emotional-speech-audio/A...  Actor_02             7   
3  /kaggle/input/ravdess-emotional-speech-audio/A...  Actor_02             7   
4  /kaggle/input/ravdess-emotional-speech-audio/A...  Actor_02             1   

  emotion_label  
0     surprised  
1       neutral  
2       disgust  
3       disgust  
4       neutral  


In [None]:
audio_df = audio_df[:100]
audio_df

Unnamed: 0,file_path,label,emotion_code,emotion_label
0,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_02,8,surprised
1,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_02,1,neutral
2,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_02,7,disgust
3,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_02,7,disgust
4,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_02,1,neutral
...,...,...,...,...
95,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_17,6,fearful
96,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_17,2,calm
97,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_17,5,angry
98,/kaggle/input/ravdess-emotional-speech-audio/A...,Actor_17,6,fearful


In [None]:
def predict_and_compare(audio_df):
    correct_predictions = 0
    total_predictions = len(audio_df)
    progress_interval = total_predictions // 10  

    for index, row in audio_df.iterrows():
        file_path = row['file_path']
        emotion_label = row['emotion_label']

        output = predict_emotion(file_path)
        softmax_dict, max_emotion = apply_softmax(output)

        if max_emotion == emotion_label:
            correct_predictions += 1

        # Print progress every 10%
        if (index + 1) % progress_interval == 0:
            progress = ((index + 1) / total_predictions) * 100
            print(f"Progress: {progress:.2f}%")

    accuracy = correct_predictions / total_predictions
    return accuracy

accuracy = predict_and_compare(audio_df)
print("Accuracy:", accuracy)


Progress: 10.00%
Progress: 20.00%
Progress: 30.00%
Progress: 40.00%
Progress: 50.00%
Progress: 60.00%
Progress: 70.00%
Progress: 80.00%
Progress: 90.00%
Progress: 100.00%
Accuracy: 0.03
