In [2]:
import gradio as gr
import numpy as np
import librosa
import pandas as pd
import json
from tensorflow.keras.models import Sequential, model_from_json
import collections
import tabulate

In [3]:
# Rebuild the emotion prediction model from disk: the architecture is stored
# as JSON and the trained weights as a separate HDF5 file.
model_json_path = "F:\\SER DCA\\DCA_SER.json"
model_weights_path = "F:\\SER DCA\\DCA_SER.weights.h5"

with open(model_json_path, "r") as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights(model_weights_path)
print("Loaded model from disk")

Loaded model from disk


In [4]:
from sklearn.preprocessing import OneHotEncoder
# Emotion labels; the fitted encoder is used later to turn the model's output
# rows back into one of these label strings via `inverse_transform`.
# NOTE(review): the model's output column order must match
# `encoder.categories_` — confirm against the training notebook.
classes = ['disgust', 'sad', 'fear', 'happy', 'angry', 'neutral', 'surprise']
label_column = np.array(classes).reshape(-1, 1)  # one label per row

encoder = OneHotEncoder()
encoder.fit_transform(label_column)

<7x7 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [9]:
questions = [
    "1. How have you been feeling lately?",
    "2. Any changes in your thoughts or emotions?",
    "3. Found coping strategies helpful?",
    "4. Specific challenges you're facing?",
    "5. Implemented discussed techniques?",
    "6. Notice any improvements in well-being?",
    "7. Any areas where you feel stuck?",
    "8. Noticed any patterns or triggers?",
    "9. Anything else you'd like to discuss?"
]

# Value passed as `max_length` to every recorder (seconds of audio).
MAX_ANSWER_SECONDS = 4.5


def make_question_recorder(label):
    """Create a microphone-only audio input labelled with one question.

    All recorders share the same settings (numpy output, wav format,
    MAX_ANSWER_SECONDS limit); only the label differs.
    """
    return gr.Audio(sources="microphone", type="numpy", format='wav',
                    max_length=MAX_ANSWER_SECONDS, label=label)


# Create audio input objects with questions as labels, in question order.
audio_df = [make_question_recorder(question) for question in questions]

# Keep the individual component names available for any cell that refers to
# them. The unpacking also enforces that there are exactly nine recorders,
# matching the nine parameters of `predict_emotions_all`.
(audio_file_1, audio_file_2, audio_file_3,
 audio_file_4, audio_file_5, audio_file_6,
 audio_file_7, audio_file_8, audio_file_9) = audio_df

def zcr(data,frame_length,hop_length):
    """Return the zero-crossing rate of `data` with singleton axes removed.

    `frame_length` and `hop_length` are forwarded unchanged to
    `librosa.feature.zero_crossing_rate`.
    """
    crossing_rate = librosa.feature.zero_crossing_rate(
        data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(crossing_rate)
def rmse(data,frame_length=2048,hop_length=512):
    """Return the root-mean-square energy of `data` with singleton axes removed.

    Parameters
    ----------
    data : numpy.ndarray
        Audio samples, passed to librosa as `y`.
    frame_length : int
        Analysis window length in samples.
    hop_length : int
        Step between consecutive windows in samples.

    Returns
    -------
    numpy.ndarray
        Output of `librosa.feature.rms` after `np.squeeze`.
    """
    # The framing arguments must be forwarded explicitly; otherwise librosa
    # silently falls back to its own defaults and the caller's values are lost.
    frame_energy = librosa.feature.rms(
        y=data,
        frame_length=frame_length,
        hop_length=hop_length,
    )
    return np.squeeze(frame_energy)
def mfcc(data,sr,frame_length=2048,hop_length=512,flatten:bool=True):
    """Return the MFCCs of `data`, transposed so the first axis is librosa's second.

    Parameters
    ----------
    data : numpy.ndarray
        Audio samples, passed to librosa as `y`.
    sr : int
        Sample rate of `data`.
    frame_length : int
        FFT window length in samples (forwarded as `n_fft`).
    hop_length : int
        Step between consecutive windows in samples.
    flatten : bool
        When True (default) the transposed matrix is flattened with
        `np.ravel`; otherwise it is returned after `np.squeeze`.

    Returns
    -------
    numpy.ndarray
        Flattened or squeezed transposed MFCC matrix.
    """
    # Forward the framing arguments so they actually take effect; extra
    # keyword arguments of librosa.feature.mfcc reach the mel spectrogram.
    coefficients = librosa.feature.mfcc(
        y=data,
        sr=sr,
        n_fft=frame_length,
        hop_length=hop_length,
    )
    transposed = coefficients.T
    return np.ravel(transposed) if flatten else np.squeeze(transposed)

def extract_features(data,sr=22050,frame_length=2048,hop_length=512):
    """Build one flat feature vector from an audio signal.

    The signal is squeezed and cast to float32, then the outputs of `zcr`,
    `rmse` and `mfcc` are joined end to end, in that order.
    """
    signal = np.squeeze(data).astype(np.float32)
    pieces = (
        np.array([]),  # empty seed array, kept so the stacked result's dtype is unchanged
        zcr(signal, frame_length, hop_length),
        rmse(signal, frame_length, hop_length),
        mfcc(signal, sr, frame_length, hop_length),
    )
    return np.hstack(pieces)

def get_features(audio_file, target_sr=22050):
    """Turn one recording into the feature vector produced by `extract_features`.

    Parameters
    ----------
    audio_file : tuple or None
        ``(sample_rate, samples)`` pair, or ``None`` when nothing was
        recorded. `samples` may be integer PCM or floating point, and either
        1-D or 2-D shaped ``(samples, channels)``.
    target_sr : int
        Sample rate the audio is resampled to before feature extraction.

    Returns
    -------
    numpy.ndarray or None
        The feature vector, or ``None`` when there is no audio to analyse.
    """
    if audio_file is None:
        return None

    orig_sr, samples = audio_file
    samples = np.asarray(samples)
    if samples.size == 0:
        # Nothing to resample or analyse.
        return None

    # Convert to float32. Integer PCM is scaled by the full-scale value of its
    # own dtype (32768 for int16) so it lands in [-1, 1); floating-point input
    # is only cast, never reinterpreted byte-wise.
    if np.issubdtype(samples.dtype, np.integer):
        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # Multi-channel recordings are averaged down to mono.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Resample audio data to the target sample rate
    data_resampled = librosa.resample(samples, orig_sr=orig_sr, target_sr=target_sr)

    # Tell the extractor the rate the signal actually has now.
    features = extract_features(data_resampled, sr=target_sr)
    return np.array(features)


# def predict_emotions(audio_file):
#     features = get_features(audio_file)
#     prde = pd.DataFrame(features)
#     new_data = np.zeros((1, 4158))
#     new_data[:, :prde.shape[0]] = prde.T
#     prde1 = pd.DataFrame(new_data)
#     prde1 = prde1.fillna(0)
#     prde1 = np.expand_dims(prde1, axis=2)
#     emotions = loaded_model.predict(prde1)
#     predictions = encoder.inverse_transform(emotions)
#     return predictions

def _to_model_input(features, input_length=4158):
    """Zero-pad or truncate a feature vector to shape (1, input_length, 1)."""
    flat = np.ravel(np.asarray(features, dtype=np.float64))
    flat = np.where(np.isnan(flat), 0.0, flat)  # NaNs become 0

    padded = np.zeros((1, input_length))
    used = min(flat.size, input_length)  # longer vectors are cut, not an error
    padded[0, :used] = flat[:used]
    return np.expand_dims(padded, axis=2)


def predict_emotions_all(audio_file_1,audio_file_2,audio_file_3,audio_file_4,audio_file_5,audio_file_6,audio_file_7,audio_file_8,audio_file_9):
    """Predict an emotion per recording and summarise them as a text table.

    Each argument is one recording as accepted by `get_features`, or ``None``
    for a question that was not answered; unanswered questions are skipped.

    Returns
    -------
    str
        A table with one row per predicted emotion and the percentage of
        analysed recordings that received it, or a short message when no
        recording could be analysed.
    """
    recordings = [audio_file_1,audio_file_2,audio_file_3,audio_file_4,audio_file_5,audio_file_6,audio_file_7,audio_file_8,audio_file_9]

    predictions = []
    for audio_file in recordings:
        features = get_features(audio_file)
        if features is None:
            # No audio for this question; leave it out of the statistics.
            continue
        # 4158 is the vector length this notebook has always padded to;
        # presumably the model's input length — confirm against training.
        probabilities = loaded_model.predict(_to_model_input(features))
        predictions.append(encoder.inverse_transform(probabilities)[0][0])

    if not predictions:
        return "No audio was recorded, so no emotions could be predicted."

    emotion_counts = collections.Counter(predictions)
    total_observations = len(predictions)
    table = [
        [emotion, f"{count / total_observations * 100:.2f}%"]
        for emotion, count in emotion_counts.items()
    ]
    return tabulate.tabulate(table, headers=['Emotion', 'Percentage'])


# Wire the nine recorders to the batch predictor; the result is shown as text.
interface_options = dict(
    fn=predict_emotions_all,
    inputs=audio_df,
    outputs="text",
    title="Emotion Prediction from Audio",
    description="Recording audio files and predict the emotions",
)
demo = gr.Interface(**interface_options)

# Start the local web server for the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




IMPORTANT: You are using gradio version 4.26.0, however version 4.29.0 is available, please upgrade.
--------


In [18]:
# Stand-alone audio input that accepts both microphone recordings and uploads.
audio_file_ = gr.Audio(
    sources=["microphone", "upload"],
    type="numpy",
    format='wav',
    max_length=4.5,
)

In [19]:
# Scratch check: show the component's repr as the cell output.
audio_file_

<gradio.components.audio.Audio at 0x2003a167f70>