<a href="https://colab.research.google.com/github/Tushar12S/Speech-Emotion-Recognition/blob/main/SpeechEmotion_Recognition_with_librosa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


[Dataset](https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view)

In [3]:
import os
Root = "/content/drive/MyDrive/Colab Notebooks/speech-emotion-recognition-ravdess-data"
os.chdir(Root)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install --upgrade librosa



In [5]:
!ls

Actor_01  Actor_05  Actor_09  Actor_13	Actor_17  Actor_21  modelForPrediction1.sav
Actor_02  Actor_06  Actor_10  Actor_14	Actor_18  Actor_22
Actor_03  Actor_07  Actor_11  Actor_15	Actor_19  Actor_23
Actor_04  Actor_08  Actor_12  Actor_16	Actor_20  Actor_24


In [6]:
import librosa
import soundfile
import glob
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd

In [7]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
    return result

In [8]:
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

observed_emotions=['calm', 'happy', 'fearful', 'disgust']

In [9]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel_spectrogram))
    return result

In [11]:
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

observed_emotions = ['calm', 'happy', 'fearful', 'disgust', 'neutral', 'sad', 'angry', 'surprised']

In [12]:
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob(os.path.join(Root, "Actor_*/*.wav")):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

x_train, x_test, y_train, y_test = load_data(test_size=0.25)

In [13]:
print(f"Training samples: {x_train.shape[0]}, Testing samples: {x_test.shape[0]}")

Training samples: 1084, Testing samples: 362


In [14]:
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
model.fit(x_train, y_train)

In [15]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

f1 = f1_score(y_test, y_pred, average=None)
print(f"F1 Score: {f1}")

Accuracy: 55.52%
F1 Score: [0.67272727 0.62650602 0.42253521 0.6746988  0.51546392 0.22222222
 0.45070423 0.66666667]


In [16]:
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(df.head(20))

       Actual  Predicted
0       angry      angry
1         sad    fearful
2         sad        sad
3     neutral    neutral
4       happy        sad
5     disgust    disgust
6         sad      angry
7         sad        sad
8       happy      happy
9       angry      angry
10      angry      angry
11  surprised  surprised
12        sad      angry
13      angry    fearful
14      angry      happy
15        sad        sad
16      angry      angry
17    neutral        sad
18      happy      happy
19      angry      angry


In [17]:
model_filename = 'modelForPrediction1.sav'
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

loaded_model = pickle.load(open(model_filename, 'rb'))

In [18]:
from IPython.core.display import display, HTML

emotion_to_emoji = {
    'neutral': '😐',
    'calm': '😌',
    'happy': '😊',
    'sad': '😢',
    'angry': '😠',
    'fearful': '😨',
    'disgust': '🤢',
    'surprised': '😲'
}

sample_file = os.path.join(Root,"Actor_09/03-01-02-01-02-01-09.wav")
sample_feature = extract_feature(sample_file, mfcc=True, chroma=True, mel=True).reshape(1, -1)
sample_prediction = loaded_model.predict(sample_feature)[0]

emoji = emotion_to_emoji.get(sample_prediction, '')

html_content = f"""
<button onclick="openPopup()">Show Prediction</button>
<script>
function openPopup() {{
  var popup = window.open("", "popup", "width=400,height=500");
  popup.document.write(`
    <style>
      .exit-button {{
        font-size: 18px;
        padding: 10px 20px;
        background-color: #0000ff;
        color: #fff;
        border: none;
        border-radius: 5px;
        cursor: pointer;
      }}
      .exit-button:hover {{
        background-color: #007BFF;
      }}
    </style>
    <div style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 100%; background-color: #ff6666;">
      <div style="padding: 20px; border: 2px solid #ff0000; border-radius: 10px; background-color: #ff6666; text-align: center; margin-bottom: 20px;">
        <h3 style="color: #ff0000;">Predicted Emotion</h3>
        <p style="font-size: 24px; color: #ffdd00;">{sample_prediction}</p>
      </div>
      <div style="font-size: 100px; color: #ffdd00;">
        {emoji}
      </div>
      <div style="margin-top: 20px;">
        <button class="exit-button" onclick="window.close()">
          Exit ❌
        </button>
      </div>
    </div>
  `);
  popup.document.close();
}}
</script>
"""

display(HTML(html_content))