Import model

In [1]:
import numpy as np
import os

import keras
from keras.models import load_model

import numpy as np
import librosa

# Load the saved model from "emotion_model.keras" (relative path, so it is
# resolved against the current working directory).
# compile=False: the visible code only calls model.predict, so the model is
# loaded without being compiled (see the Keras load_model `compile` argument).
model = load_model("emotion_model.keras", compile=False)

Data augmentation methods (from model)

In [3]:
def noise(data):
    """Return a copy of `data` with random Gaussian noise added.

    The noise amplitude is 0.035 * U(0, 1) * max(data), so it scales with the
    largest sample value of the input. One noise sample is drawn per element
    along the first axis of `data`. The input array is not modified.
    """
    # Draw order matters for seeded reproducibility: uniform first, then normal.
    gain = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + gain * gaussian

def stretch(data):
    """Time-stretch `data` with librosa using a fixed rate of 0.8.

    Per the librosa documentation, a rate below 1 slows the signal down,
    so the returned signal is longer than the input.
    """
    rate = 0.8
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift `data` by a random whole number of samples.

    The offset is a uniform draw from [-5, 5) scaled by 1000 and truncated
    toward zero, so it lies within roughly +/-5000 samples. Elements rolled
    past one end re-enter at the other (np.roll semantics).
    """
    draw = np.random.uniform(low=-5, high=5)
    offset = int(draw * 1000)
    return np.roll(data, offset)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift `data` with librosa.

    Args:
        data: audio signal passed through to librosa.effects.pitch_shift.
        sampling_rate: sampling rate of `data`, forwarded as `sr`.
        pitch_factor: forwarded as `n_steps` (default 0.7).

    Returns:
        Whatever librosa.effects.pitch_shift returns for these arguments.
    """
    shift_kwargs = {"sr": sampling_rate, "n_steps": pitch_factor}
    return librosa.effects.pitch_shift(data, **shift_kwargs)

Feature extraction methods (from model)

In [7]:
def extract_features(data, sample_rate):
    """Build a single 1-D feature vector for one audio signal.

    Each librosa feature matrix is averaged over its frames, and the resulting
    vectors are concatenated in this fixed order:
    zero-crossing rate, chroma (from the STFT magnitude), MFCC, RMS,
    mel spectrogram.

    Args:
        data: audio samples handed to the librosa feature functions.
        sample_rate: sampling rate of `data`.

    Returns:
        A 1-D float64 numpy array holding all averaged features.
    """
    def frame_mean(feature_matrix):
        # Transpose so frames run along axis 0, then average them away.
        return np.mean(feature_matrix.T, axis=0)

    # Chroma is computed from the STFT magnitude rather than the raw signal.
    magnitude = np.abs(librosa.stft(data))

    pieces = [
        # Empty float64 seed: keeps the stacked result promoted to float64.
        np.array([]),
        frame_mean(librosa.feature.zero_crossing_rate(y=data)),
        frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)),
        frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate)),
        frame_mean(librosa.feature.rms(y=data)),
        frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate)),
    ]
    return np.hstack(pieces)

def get_features(path):
    """Load one audio file and return features for three variants of it.

    The clip is read with librosa starting 0.6 s in and lasting at most 2.5 s
    (the original author's note: this trims the silent start and end of the
    recordings). Features are then extracted from:

      row 0 - the clip as loaded,
      row 1 - the clip with random noise added,
      row 2 - the clip time-stretched and then pitch-shifted.

    Args:
        path: location of the audio file to load.

    Returns:
        A 2-D numpy array with three rows, one feature vector per variant.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # Unmodified signal.
    plain_row = extract_features(data, sample_rate)

    # Noise-augmented signal.
    noisy_row = extract_features(noise(data), sample_rate)

    # Stretch first, then pitch-shift the stretched signal.
    stretched_pitched = pitch(stretch(data), sample_rate)
    warped_row = extract_features(stretched_pitched, sample_rate)

    return np.vstack((plain_row, noisy_row, warped_row))

Labels (from model)

In [14]:
# Lookup from class index (0-5) to emotion label; the position in the tuple
# is the index.
emotion_dict = dict(enumerate(('neutral', 'calm', 'happy', 'sad', 'angry', 'fear')))

Make a prediction

In [25]:
# The sample recording is expected in the current working directory.
file_path = os.path.join(os.getcwd(), "sample video.wav")
features = get_features(file_path)

# Score all three feature rows (original + two augmented variants) at once.
predictions = model.predict(features)  # shape: (3, 6)

# Average the per-variant probabilities for a steadier result, then pick the
# class index with the highest mean probability.
avg_prediction = predictions.mean(axis=0)
predicted_class = avg_prediction.argmax()

print("Prediction probabilities:", avg_prediction)
print("Predicted emotion:", emotion_dict[predicted_class])

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/ste ━━━━━━━━━━━━━━━━━━━━ 0s 31ms/step
Prediction probabilities: [0. 0. 0. 0. 1. 0.]
Predicted emotion: angry
