<a href="https://colab.research.google.com/github/Venkat18-bit/emotion-recognition-project/blob/main/emotion_recognition_through_speech_audio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install librosa tensorflow numpy pandas scikit-learn flask flask-cors resampy soundfile



In [4]:
from google.colab import files
files.upload()  # Upload kaggle.json

Saving kaggle.json to kaggle (4).json


{'kaggle (4).json': b'{"username":"vaenkatsrini","key":"13805836483325ee9350bdfebe2eafe9"}'}

In [5]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [8]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio
!unzip ravdess-emotional-speech-audio.zip -d dataset

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
ravdess-emotional-speech-audio.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  ravdess-emotional-speech-audio.zip
replace dataset/Actor_01/03-01-01-01-01-01-01.wav? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [9]:
import os
import librosa
import numpy as np
import pandas as pd

# Define emotions
emotions = {
    '01': 'neutral',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry'
}

# Function to extract MFCC features
def extract_mfcc(file_path, max_pad_len=180):
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast', duration=3, sr=22050)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    pad_width = max_pad_len - mfccs.shape[1]
    if pad_width > 0:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]
    return mfccs

# Load dataset
data = []
labels = []
root_dir = 'dataset/audio_speech_actors_01-24'
for actor_dir in os.listdir(root_dir):
    actor_path = os.path.join(root_dir, actor_dir)
    for file in os.listdir(actor_path):
        if file.endswith('.wav'):
            emotion_code = file.split('-')[2]
            if emotion_code in emotions:
                file_path = os.path.join(actor_path, file)
                mfcc = extract_mfcc(file_path)
                data.append(mfcc)
                labels.append(emotions[emotion_code])

# Convert to arrays
X = np.array(data)  # Shape: (samples, 40, 180)
y = pd.get_dummies(labels).values  # One-hot encode

print(f"Data shape: {X.shape}, Labels shape: {y.shape}")

Data shape: (672, 40, 180), Labels shape: (672, 4)


In [10]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build model
model = Sequential()
model.add(LSTM(128, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(128))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


In [11]:
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 37ms/step - accuracy: 0.2924 - loss: 1.3626 - val_accuracy: 0.4519 - val_loss: 1.2954
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5016 - loss: 1.1859 - val_accuracy: 0.4593 - val_loss: 1.2013
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.6976 - loss: 0.9066 - val_accuracy: 0.4519 - val_loss: 1.2117
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8489 - loss: 0.5294 - val_accuracy: 0.4444 - val_loss: 1.5257
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.9337 - loss: 0.2352 - val_accuracy: 0.4889 - val_loss: 1.4763
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.9696 - loss: 0.1764 - val_accuracy: 0.5333 - val_loss: 1.6997
Epoch 7/50
[1m17/17[0m [32m━━━━

In [12]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4831 - loss: 2.7491 
Test Accuracy: 49.63%


In [13]:
model.save('emotion_model.h5')



In [14]:
!pip install gradio



In [15]:
!pip uninstall gradio -y
!pip install gradio==3.41.0
import gradio as gr
print(gr.__version__)  # Should print 3.41.0

Found existing installation: gradio 3.41.0
Uninstalling gradio-3.41.0:
  Successfully uninstalled gradio-3.41.0
Collecting gradio==3.41.0
  Using cached gradio-3.41.0-py3-none-any.whl.metadata (17 kB)
Using cached gradio-3.41.0-py3-none-any.whl (20.1 MB)
Installing collected packages: gradio
Successfully installed gradio-3.41.0


3.41.0


In [20]:
import gradio as gr
import librosa
import numpy as np
from tensorflow.keras.models import load_model
import os

# Load the pre-trained model
model = load_model('emotion_model.h5')
emotions = ['neutral', 'happy', 'sad', 'angry']

# Function to extract MFCC features
def extract_mfcc(audio_data, sample_rate=22050, max_pad_len=180):
    if len(audio_data) < sample_rate * 0.1:  # Minimum 0.1 seconds
        raise ValueError(f"Audio too short: {len(audio_data)/sample_rate:.2f} seconds")
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=40)
    pad_width = max_pad_len - mfccs.shape[1]
    if pad_width > 0:
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfccs = mfccs[:, :max_pad_len]
    return mfccs

# Prediction function with debugging
def predict_emotion(audio_file):
    if audio_file is None:
        return "Error: No audio file received. Please record and submit again."
    print(f"Received audio file path: {audio_file}")
    try:
        # Load audio file
        audio_data, sample_rate = librosa.load(audio_file, sr=None)
        print(f"Loaded audio - Shape: {audio_data.shape}, Sample rate: {sample_rate}, Length (samples): {len(audio_data)}")

        # Validate audio length
        if len(audio_data) < 2205:  # Minimum 0.1 seconds at 22050 Hz
            return f"Error: Audio too short ({len(audio_data)/sample_rate:.2f} seconds). Please record for at least 1-2 seconds."

        # Resample to match training sample rate
        if sample_rate != 22050:
            audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=22050)
            print(f"Resampled to 22050 Hz, new length: {len(audio_data)}")

        mfcc = extract_mfcc(audio_data).reshape(1, 40, 180)
        print(f"MFCC shape: {mfcc.shape}")
        prediction = model.predict(mfcc)
        emotion = emotions[np.argmax(prediction)]
        return f"Predicted Emotion: {emotion}"
    except Exception as e:
        print(f"Exception details: {str(e)}")
        return f"Error: {str(e)}"

# Create Gradio interface with explicit microphone and upload
interface = gr.Interface(
    fn=predict_emotion,
    inputs=gr.Audio(type="filepath", sources=["microphone", "upload"], label="Record or upload your speech (5s max)"),
    outputs=gr.Textbox(label="Result"),
    title="Speech Emotion Recognition",
    description="Record your voice or upload an audio file to detect emotions (neutral, happy, sad, angry). Note: Microphone may not work on shared HTTP links; use upload as a fallback.",
    live=False
)

# Launch the app and get the shareable URL
interface.launch(share=True)

  inputs=gr.Audio(type="filepath", sources=["microphone", "upload"], label="Record or upload your speech (5s max)"),


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
IMPORTANT: You are using gradio version 3.41.0, however version 4.44.1 is available, please upgrade.
--------
Running on public URL: https://f5b2ad91730caed4b4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


