# In [4]:
import librosa
import soundfile
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

def extract_feature(file_name, min_length=2048):
    """Build one feature vector (chroma + MFCC + mel) for an audio file.

    The file is read with ``soundfile``. Multi-channel audio is averaged down
    to a single channel, and signals shorter than ``min_length`` samples are
    zero-padded at the end. Each feature matrix is averaged over its frame
    axis and the resulting mean vectors are concatenated.

    Args:
        file_name: Path of the audio file to read.
        min_length: Minimum number of samples to analyse; shorter signals are
            zero-padded up to this length.

    Returns:
        1-D float64 ``numpy.ndarray`` holding the mean chroma bins, followed
        by 20 mean MFCCs and 64 mean mel-band energies.
    """
    # Only the read needs the open handle; release it before the analysis.
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile yields (frames, channels) for multi-channel files, whereas
    # np.pad below and librosa expect time on the last axis. Downmix so the
    # length check, the padding and the transforms all act on samples.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # If the audio signal is shorter than the minimum required length, pad it with zeros
    if len(X) < min_length:
        X = np.pad(X, (0, min_length - len(X)), mode='constant')

    # The analysis window never exceeds the (padded) signal length; the same
    # value is used for every feature so they share one framing.
    n_fft = min(2048, len(X))

    # chroma
    stft = np.abs(librosa.stft(X, n_fft=n_fft))
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate), axis=1)

    # mfcc (extra keyword arguments are forwarded to the mel spectrogram)
    mfccs = np.mean(
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=20, n_fft=n_fft),
        axis=1,
    )

    # mel
    mel = np.mean(
        librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=64, n_fft=n_fft),
        axis=1,
    )

    # Cast to float64 so the returned dtype matches what callers received
    # before (the vector used to be accumulated onto a float64 empty array).
    return np.hstack((chroma, mfccs, mel)).astype(np.float64)

# Two-digit emotion code, as it appears in the dataset file names, mapped to
# its label. Codes are assigned in order, starting at '01'.
emotions = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}

def load_data(test_size, dataset_dir=r"C:\Users\subik\MoodMeter\RAVDESS_dataset"):
    """Extract features for every expected recording and split train/test.

    File names are enumerated for actors 01-24, every code in ``emotions``,
    statements 01-02 and repetitions 01-02. The neutral emotion ('01') is
    enumerated at intensity 01 only; all other emotions at intensities 01-02.
    Samples are collected in that nesting order, so the split produced with
    ``random_state=9`` is the same from run to run.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        dataset_dir: Directory containing the ``Actor_XX`` sub-folders.
            Defaults to the location that was previously hard-coded.

    Returns:
        The result of ``train_test_split``: train features, test features,
        train labels and test labels, in that order.

    Note:
        Nothing is caught here, so any error raised by ``extract_feature``
        (for example, presumably, on a missing file) propagates to the caller.
    """
    import os
    from itertools import product

    features, labels = [], []
    actors = [f"{number:02d}" for number in range(1, 25)]

    for actor in actors:
        actor_dir = os.path.join(dataset_dir, f"Actor_{actor}")
        for emotion, label in emotions.items():
            # Neutral recordings are only enumerated at the first intensity.
            intensities = (1,) if emotion == '01' else (1, 2)
            for intensity, statement, repetition in product(intensities, (1, 2), (1, 2)):
                file_name = (
                    f"03-01-{emotion}-0{intensity}-0{statement}"
                    f"-0{repetition}-{actor}.wav"
                )
                features.append(extract_feature(os.path.join(actor_dir, file_name)))
                labels.append(label)

    return train_test_split(np.array(features), labels, test_size=test_size, random_state=9)

# Load the features and hold out half of the samples for evaluation.
x_train, x_test, y_train, y_test = load_data(test_size=0.5)
print(f"Train shape: {x_train.shape}, Test shape: {x_test.shape}")

# Multi-layer perceptron with one hidden layer of 300 units; fit() returns
# the estimator itself, so construction and training are chained.
model = MLPClassifier(
    hidden_layer_sizes=(300,),
    alpha=0.01,
    batch_size=256,
    epsilon=1e-08,
    learning_rate='adaptive',
    max_iter=800,
).fit(x_train, y_train)

# Score the held-out half and report accuracy as a percentage.
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))


# Output:
# Train shape: (720, 96), Test shape: (720, 96)
# Accuracy: 51.11%
