In [1]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.utils import to_categorical
import warnings 
warnings.filterwarnings("ignore")

In [2]:
# Function to extract features from audio files
def extract_features(file_name, n_mfcc=40):
    """Return a fixed-length MFCC summary vector for one audio file.

    The file is loaded at its native sampling rate, ``n_mfcc`` MFCCs are
    computed, and every coefficient is averaged over the remaining axis, so
    the length of the result depends only on ``n_mfcc``.

    Parameters
    ----------
    file_name
        Path of the audio file; passed unchanged to ``librosa.load``.
    n_mfcc : int, default 40
        Number of MFCCs requested from ``librosa.feature.mfcc``. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    numpy.ndarray
        1-D array holding the mean of each MFCC coefficient.
    """
    # sr=None asks librosa to keep the file's own sampling rate instead of
    # resampling to its default rate.
    audio, sample_rate = librosa.load(file_name, sr=None)
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # After the transpose, axis 0 is the one being averaged away, leaving one
    # value per coefficient.
    mfccs_scaled = np.mean(mfccs.T, axis=0)
    return mfccs_scaled

In [3]:
# Load the dataset and extract features
def load_data(dataset_path):
    """Walk ``dataset_path`` and build feature/label arrays from its ``.wav`` files.

    Every file whose name ends in ``.wav``, at any depth below
    ``dataset_path``, is passed to ``extract_features``. Its label is the name
    of the directory that directly contains it. Directories and files are
    visited in sorted order so the returned arrays come back in the same
    order on every run and every filesystem.

    Parameters
    ----------
    dataset_path : str
        Root directory of the dataset.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with one entry per ``.wav`` file, in matching
        order. Both arrays are empty when no ``.wav`` file is found.
    """
    labels = []
    features = []

    for dirpath, dirnames, filenames in os.walk(dataset_path):
        # Sorting in place steers os.walk's top-down traversal; without it the
        # visiting order is whatever the filesystem returns.
        dirnames.sort()

        # basename/normpath honours the platform's path separator and ignores
        # a trailing separator, unlike splitting on "/".
        # NOTE(review): if folder names carry a speaker prefix (e.g.
        # "OAF_angry"), each speaker/emotion pair becomes its own class -
        # confirm that is intended.
        label = os.path.basename(os.path.normpath(dirpath))

        for file in sorted(filenames):
            if file.endswith(".wav"):
                file_path = os.path.join(dirpath, file)
                features.append(extract_features(file_path))
                labels.append(label)

    return np.array(features), np.array(labels)

In [4]:
# Load data
# Relative path: resolved against the notebook's working directory.
dataset = "TESS Toronto emotional speech set data"
X, labels = load_data(dataset)

# Fail fast with a readable message: a wrong path makes os.walk yield nothing,
# and the encoding steps below are not meant to run on empty arrays.
if len(X) == 0:
    raise FileNotFoundError(
        f"No .wav files found under {dataset!r}; check the dataset path."
    )

# Encode labels: directory names -> integer ids -> one-hot rows.
# label_encoder is kept so predictions can be mapped back to label names
# with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
y = to_categorical(label_encoder.fit_transform(labels))


In [5]:
# Hold out 20% of the samples for testing; the fixed seed makes the shuffle
# repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The LSTM layers take 3-D input, so append a trailing axis of size 1:
# (samples, n_features) -> (samples, n_features, 1).
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)


In [6]:
# Build the LSTM model
n_timesteps = X_train.shape[1]  # length of the sequence axis after the reshape
n_classes = y.shape[1]          # width of the one-hot label matrix

# Two stacked LSTM layers followed by a dense head; every stage is followed
# by 30% dropout. The first LSTM returns the full sequence so the second one
# can consume it; the second returns only its final output.
model = Sequential([
    LSTM(128, return_sequences=True, input_shape=(n_timesteps, 1)),
    Dropout(0.3),
    LSTM(64, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(n_classes, activation='softmax'),
])

# Softmax output + one-hot targets -> categorical cross-entropy.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [7]:
# Fit for 50 epochs in mini-batches of 32. The held-out split is passed as
# validation data, so the val_* metrics in the log are computed on
# X_test / y_test.
training = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
)


Epoch 1/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 31ms/step - accuracy: 0.2088 - loss: 2.3171 - val_accuracy: 0.7821 - val_loss: 0.8901
Epoch 2/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.7070 - loss: 0.8976 - val_accuracy: 0.9089 - val_loss: 0.3685
Epoch 3/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.8612 - loss: 0.4393 - val_accuracy: 0.9375 - val_loss: 0.2480
Epoch 4/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9214 - loss: 0.2891 - val_accuracy: 0.9321 - val_loss: 0.2806
Epoch 5/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9134 - loss: 0.3140 - val_accuracy: 0.9429 - val_loss: 0.2126
Epoch 6/50
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.9376 - loss: 0.2095 - val_accuracy: 0.9607 - val_loss: 0.1475
Epoch 7/50
[1m70/70[0m [32m━━━━

In [8]:
# Score the trained model on the held-out split; the result is unpacked as
# (loss, accuracy).
loss, accuracy = model.evaluate(x=X_test, y=y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

# Persist the trained model to disk
model.save("speech_emotion_recognition_lstm.h5")

[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.9847 - loss: 0.0944




Test Accuracy: 98.57%


In [9]:
# Loss and accuracy on the data the model was fitted on (progress bar
# suppressed with verbose=0); useful for comparing against the held-out
# numbers.
train_loss, train_accuracy = model.evaluate(x=X_train, y=y_train, verbose=0)
print(f"Training Loss: {train_loss}")
print(f"Training Accuracy: {train_accuracy}")


Training Loss: 0.0016498087206855416
Training Accuracy: 0.9995535612106323


In [10]:
# Loss and accuracy on the held-out split, printed unrounded (progress bar
# suppressed with verbose=0).
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test, verbose=0)
print(f"Testing Loss: {test_loss}")
print(f"Testing Accuracy: {test_accuracy}")


Testing Loss: 0.09605822712182999
Testing Accuracy: 0.9857142567634583
