# Emotion Recognition for Longer Duration Audio

## Requirements for this code
### Libraries
1. keras==3.0.5
2. tensorflow==2.15.0
3. librosa==0.10.1
4. gradio==4.26.0
### Files
1. CNN model architecture (.json) file 
2. The corresponding model weights file in (.h5) format

In [None]:
import json
import librosa
import pandas as pd
import numpy as np
import gradio as gr
from tensorflow.keras.models import model_from_json
from sklearn.preprocessing import OneHotEncoder
import collections
import tabulate

# Rebuild the network from its serialized JSON architecture, then attach the
# trained weights stored alongside it.
with open("F:\\SER DCA\\DCA_SER.json", "r") as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("F:\\SER DCA\\DCA_SER.weights.h5")
print("Loaded model from disk")

# Emotion labels the classifier can emit.
classes = ['disgust', 'sad', 'fear', 'happy', 'angry', 'neutral', 'surprise']

# One-hot encoder fitted on the label set; predict_emotion uses its
# inverse_transform to turn the model output back into a label.
encoder = OneHotEncoder()
encoder.fit(np.asarray(classes).reshape(-1, 1))

# Audio input component: accepts a microphone recording or an uploaded file.
# NOTE: the prediction callback unpacks the value it receives as a 2-tuple
# whose second element is the sample array, so no file loading is needed here.
audio = gr.Audio(sources=["microphone", "upload"], type='numpy')

# Define feature extraction functions
def zcr(data, frame_length, hop_length):
    """Return the framewise zero-crossing rate of ``data``.

    Args:
        data: Audio samples passed straight to librosa.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa result with all singleton dimensions removed.
    """
    crossing_rate = librosa.feature.zero_crossing_rate(
        data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(crossing_rate)

def rmse(data, frame_length=2048, hop_length=512):
    """Return the framewise root-mean-square energy of ``data``.

    The framing parameters are forwarded to librosa so that the frames line
    up with the other feature extractors when non-default values are used
    (previously they were accepted but silently ignored).

    Args:
        data: Audio samples.
        frame_length: Number of samples per analysis frame.
        hop_length: Number of samples between successive frames.

    Returns:
        The librosa result with all singleton dimensions removed.
    """
    energy = librosa.feature.rms(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(energy)

def mfcc(data, sr, frame_length=2048, hop_length=512, flatten: bool = True):
    """Return the MFCCs of ``data``, frame-major.

    The framing parameters are forwarded to librosa (``frame_length`` as the
    FFT window size ``n_fft``) so that the frames line up with the other
    feature extractors when non-default values are used; previously they
    were accepted but silently ignored.

    Args:
        data: Audio samples.
        sr: Sample rate of ``data`` in Hz.
        frame_length: FFT window size in samples.
        hop_length: Number of samples between successive frames.
        flatten: When true, return a 1-D vector; otherwise return the
            transposed coefficient matrix with singleton dimensions removed.

    Returns:
        The transposed MFCC matrix, flattened to 1-D when ``flatten`` is set.
    """
    coefficients = librosa.feature.mfcc(
        y=data,
        sr=sr,
        n_fft=frame_length,
        hop_length=hop_length,
    )
    # Transpose so that all coefficients of one frame are contiguous.
    frame_major = coefficients.T
    return np.ravel(frame_major) if flatten else np.squeeze(frame_major)

def extract_features(data, sr=22050, frame_length=2048, hop_length=512):
    """Build the flat feature vector for one audio chunk.

    The vector is the concatenation, in this order, of the zero-crossing
    rate, the RMS energy and the flattened MFCCs of the chunk.

    Args:
        data: Audio samples; squeezed and cast to float32 before use.
        sr: Sample rate in Hz, forwarded to the MFCC extractor.
        frame_length: Frame size in samples, forwarded to every extractor.
        hop_length: Hop size in samples, forwarded to every extractor.

    Returns:
        A 1-D NumPy array holding all features back to back.
    """
    signal = np.squeeze(data).astype(np.float32)
    feature_blocks = [
        np.array([]),  # empty float64 seed, kept so the result dtype is unchanged
        zcr(signal, frame_length, hop_length),
        rmse(signal, frame_length, hop_length),
        mfcc(signal, sr, frame_length, hop_length),
    ]
    return np.hstack(feature_blocks)


# Define prediction function
def predict_emotion(audio_chunk, n_features=4158):
    """Predict the emotion label for a single audio chunk.

    The chunk's features are zero-padded to the fixed width the model takes,
    shaped to ``(1, n_features, 1)`` and run through ``loaded_model``; the
    model output is mapped back to a label with ``encoder``.

    Args:
        audio_chunk: Audio samples for one chunk.
        n_features: Width of the model's input vector. Shorter feature
            vectors are right-padded with zeros.

    Returns:
        The predicted emotion label (one entry of ``classes``).

    Raises:
        ValueError: If the chunk yields more than ``n_features`` features.
            This previously surfaced as an opaque NumPy broadcasting error.
    """
    features = np.ravel(np.asarray(extract_features(audio_chunk)))
    # Replace NaN/inf so that a degenerate chunk cannot poison the input.
    features = np.nan_to_num(features)

    if features.size > n_features:
        raise ValueError(
            f"audio chunk produced {features.size} features but the model "
            f"accepts at most {n_features}; use a shorter chunk"
        )

    # Zero-pad on the right to the fixed input width.
    model_input = np.zeros((1, n_features))
    model_input[0, :features.size] = features
    # Add the trailing axis: (1, n_features) -> (1, n_features, 1).
    model_input = np.expand_dims(model_input, axis=2)

    emotions = loaded_model.predict(model_input)
    predictions = encoder.inverse_transform(emotions)
    return predictions[0][0]

# Module-level analysis-window settings.
# NOTE(review): predict_emotion_chunks takes its own chunk_size/overlap
# parameters and nothing in the visible code passes these two globals to it —
# confirm whether they are still needed.
# NOTE(review): 4.38 s presumably matches the 4158-wide model input at
# 22050 Hz — verify against the training pipeline.
chunk_size = 4.38 # in seconds
overlap = 0  # in seconds
    
def predict_emotion_chunks(audio, chunk_size=4.38, overlap=0, sample_rate=22050):
    """Classify a long recording chunk by chunk and tabulate the results.

    The recording is converted to mono floating-point samples, resampled to
    ``sample_rate`` when it arrives at a different rate, and cut into
    fixed-length windows. Each complete window is classified with
    ``predict_emotion``; a trailing partial window is ignored.

    Args:
        audio: ``(input_sample_rate, samples)`` tuple as delivered by a
            Gradio ``Audio`` component with ``type='numpy'``. ``samples`` may
            be 1-D or ``(samples, channels)``.
        chunk_size: Window length in seconds. Defaults to 4.38 so the
            function can be called with the audio alone, which is how
            ``gr.Interface`` invokes it.
        overlap: Overlap between consecutive windows in seconds; must be
            smaller than ``chunk_size``.
        sample_rate: Rate in Hz at which the audio is analysed. Input at any
            other rate is resampled to it.

    Returns:
        A plain-text table with one row per detected emotion and the
        percentage of windows assigned to it. Only the header is returned
        when the recording is shorter than one window.

    Raises:
        ValueError: If the window length or the step between windows is not
            positive (this previously caused an endless loop).
    """
    chunk_len = int(chunk_size * sample_rate)
    step = chunk_len - int(overlap * sample_rate)
    if chunk_len <= 0:
        raise ValueError("chunk_size * sample_rate must be at least one sample")
    if step <= 0:
        raise ValueError("overlap must be smaller than chunk_size")

    input_rate, audio_data = audio
    audio_data = np.squeeze(np.asarray(audio_data))

    if np.issubdtype(audio_data.dtype, np.integer):
        # Gradio delivers integer PCM; librosa needs floating-point samples.
        # NOTE(review): scaling to [-1, 1] assumes the model was trained on
        # normalised floats (as produced by librosa.load) — confirm.
        full_scale = float(np.iinfo(audio_data.dtype).max)
        audio_data = audio_data.astype(np.float32) / full_scale
    else:
        audio_data = audio_data.astype(np.float32)

    if audio_data.ndim > 1:
        # Multi-channel input is (samples, channels): average down to mono.
        audio_data = audio_data.mean(axis=1)

    if input_rate and input_rate != sample_rate and audio_data.size:
        # Chunk lengths are computed at ``sample_rate``, so bring the signal
        # to that rate instead of assuming it already matches.
        audio_data = librosa.resample(
            audio_data, orig_sr=input_rate, target_sr=sample_rate
        )

    # One label per complete window; predict_emotion returns the label itself.
    predictions = [
        predict_emotion(audio_data[start:start + chunk_len])
        for start in range(0, len(audio_data) - chunk_len + 1, step)
    ]

    emotion_counts = collections.Counter(predictions)
    total_observations = len(predictions)
    emotion_percentages = {
        emotion: count / total_observations * 100
        for emotion, count in emotion_counts.items()
    }

    table = [['Emotion', 'Percentage']]
    table.extend(
        [emotion, f"{percentage:.2f}%"]
        for emotion, percentage in emotion_percentages.items()
    )
    return tabulate.tabulate(table, headers='firstrow')

# Example usage:
# predictions = predict_emotion_chunks(audio, chunk_size=4.38, overlap=0, sample_rate=44100)


# Create Gradio interface
# Wire the chunked predictor to the audio component and start the local
# web server; the result table is shown as plain text.
demo = gr.Interface(
    fn=predict_emotion_chunks,
    inputs=audio,
    outputs='text',
    title="Emotion Prediction",
    description="Speak into the microphone to predict the emotion.",
)
demo.launch()


Loaded model from disk




Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 294ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s