In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib  # For saving and loading the label encoder

# Function to load and preprocess audio data
def load_data(dataset_path):
    features = []  # List to store feature vectors
    labels = []    # List to store corresponding labels
    for emotion in os.listdir(dataset_path):
        emotion_path = os.path.join(dataset_path, emotion)
        if os.path.isdir(emotion_path):
            for file in os.listdir(emotion_path):
                if file.endswith('.wav'):
                    file_path = os.path.join(emotion_path, file)
                    audio, sr = librosa.load(file_path, sr=None)  # Load audio file
                    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=40)  # Extract MFCC features
                    mfcc_scaled = np.mean(mfcc.T, axis=0)  # Average MFCC coefficients
                    features.append(mfcc_scaled)
                    labels.append(emotion)
    return np.array(features), np.array(labels)

# Define the path to your dataset
dataset_path = r'F:\ABDUL\ABDUL 2024\EMOTION-DETETION-IN-AUDIO_CNN_RNN\Tess'

# Load the dataset
X, y = load_data(dataset_path)

# Encode the labels into numerical format
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Feature Standardization
X_train = (X_train - np.mean(X_train, axis=0)) / np.std(X_train, axis=0)
X_test = (X_test - np.mean(X_test, axis=0)) / np.std(X_test, axis=0)

# Reshape the data to fit the input requirements of an RNN (LSTM layer)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build the RNN model using LSTM layers
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], 1)),  # LSTM layer
    tf.keras.layers.LSTM(128, return_sequences=False),  # LSTM layer
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model with Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Define EarlyStopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, verbose=1, restore_best_weights=True)

# Train the model on the training data
model.fit(X_train, y_train, epochs=150, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Save the trained model
model.save('emotion_detection_model_rnn.h5')
print('Model saved to disk.')

# Save the label encoder
label_encoder_path = 'label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_path)
print(f'Label encoder saved to {label_encoder_path}.')

# Evaluate the model on the test data
y_pred = np.argmax(model.predict(X_test), axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print the classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Epoch 1/150


  super().__init__(**kwargs)


[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - accuracy: 0.2860 - loss: 2.3208 - val_accuracy: 0.5946 - val_loss: 1.1899
Epoch 2/150
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.6874 - loss: 0.9557 - val_accuracy: 0.8750 - val_loss: 0.4117
Epoch 3/150
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 55ms/step - accuracy: 0.8745 - loss: 0.3844 - val_accuracy: 0.8554 - val_loss: 0.4511
Epoch 4/150
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.8903 - loss: 0.3407 - val_accuracy: 0.9411 - val_loss: 0.2090
Epoch 5/150
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 55ms/step - accuracy: 0.9237 - loss: 0.2157 - val_accuracy: 0.9214 - val_loss: 0.2741
Epoch 6/150
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 55ms/step - accuracy: 0.9362 - loss: 0.1879 - val_accuracy: 0.9536 - val_loss: 0.1983
Epoch 7/150
[1m35/35[0m [32m━━━━━━━━━



Model saved to disk.
Label encoder saved to label_encoder.pkl.
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step 
Accuracy: 96.96%

Classification Report:
                        precision    recall  f1-score   support

              OAF_Fear       1.00      0.98      0.99        52
 OAF_Pleasant_surprise       0.92      0.98      0.95        47
               OAF_Sad       1.00      0.98      0.99        49
             OAF_angry       0.97      1.00      0.99        37
           OAF_disgust       0.95      0.95      0.95        39
             OAF_happy       0.94      0.92      0.93        37
           OAF_neutral       1.00      1.00      1.00        30
             YAF_angry       0.92      0.97      0.95        37
           YAF_disgust       0.97      0.95      0.96        39
              YAF_fear       0.95      1.00      0.97        37
             YAF_happy       0.95      0.90      0.92        40
           YAF_neutral       1.00      1.00      1.0