In [2]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib  # For saving and loading the label encoder

# Function to load and preprocess audio data
def load_data(dataset_path):
    features = []  # List to store feature vectors
    labels = []    # List to store corresponding labels
    for emotion in os.listdir(dataset_path):
        emotion_path = os.path.join(dataset_path, emotion)
        if os.path.isdir(emotion_path):
            for file in os.listdir(emotion_path):
                if file.endswith('.wav'):
                    file_path = os.path.join(emotion_path, file)
                    audio, sr = librosa.load(file_path, sr=None)  # Load audio file
                    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)  # Extract MFCC features
                    mfcc_scaled = np.mean(mfcc.T, axis=0)  # Average MFCC coefficients
                    features.append(mfcc_scaled)
                    labels.append(emotion)
    return np.array(features), np.array(labels)

# Define the path to your dataset
dataset_path = r'D:\FINAL_CODE_EMO_R_A ORIGINAL - Copy\DATASET\TESS'

# Load the dataset
X, y = load_data(dataset_path)

# Encode the labels into numerical format
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Reshape the data to fit the input requirements of a CNN
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Build the CNN model using Conv1D layers
model = tf.keras.Sequential([
    tf.keras.layers.Conv1D(32, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1), padding='same'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(64, kernel_size=3, activation='relu', padding='same'),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Conv1D(128, kernel_size=3, activation='relu', padding='same'),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model with Adam optimizer and sparse categorical crossentropy loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model on the training data
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping])

# Save the trained model
model.save('emotion_detection_model.h5')
print('Model saved to disk.')

# Save the label encoder
label_encoder_path = 'label_encoder.pkl'
joblib.dump(label_encoder, label_encoder_path)
print(f'Label encoder saved to {label_encoder_path}.')

# Evaluate the model on the test data
y_pred = np.argmax(model.predict(X_test), axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Function to make predictions on new data
def predict_emotion(file_path, model, label_encoder):
    # Load the audio file
    audio, sr = librosa.load(file_path, sr=None)
    # Extract MFCC features
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
    # Compute the mean of MFCC coefficients
    mfcc_scaled = np.mean(mfcc.T, axis=0)
    # Reshape the input to match the model's input shape
    input_data = mfcc_scaled.reshape(1, mfcc_scaled.shape[0], 1)
    # Make prediction
    prediction = model.predict(input_data)
    # Get the class with highest probability
    predicted_class = np.argmax(prediction, axis=1)[0]
    # Decode the class label
    predicted_label = label_encoder.classes_[predicted_class]
    return predicted_label

# Example usage:
# Replace 'path_to_new_audio_file.wav' with the path to an actual WAV file
new_file_path = r'D:\FINAL_CODE_EMO_R_A ORIGINAL DONT OPEN OR EDIT\DATASET\Tess\OAF_disgust\OAF_back_disgust.wav'
predicted_emotion = predict_emotion(new_file_path, model, label_encoder)
print(f'Predicted emotion: {predicted_emotion}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1749 - loss: 5.6464 - val_accuracy: 0.7000 - val_loss: 1.1271
Epoch 2/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7177 - loss: 0.9120 - val_accuracy: 0.8375 - val_loss: 0.5001
Epoch 3/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8681 - loss: 0.4143 - val_accuracy: 0.8482 - val_loss: 0.4452
Epoch 4/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8904 - loss: 0.3376 - val_accuracy: 0.8643 - val_loss: 0.4094
Epoch 5/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9028 - loss: 0.2723 - val_accuracy: 0.9000 - val_loss: 0.3282
Epoch 6/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8989 - loss: 0.2779 - val_accuracy: 0.8821 - val_loss: 0.3482
Epoch 7/100
[1m70/70[0m [32m━━━━━━━━━━━━━━━



Model saved to disk.
Label encoder saved to label_encoder.pkl.
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
Accuracy: 93.39%
Classification Report:
                        precision    recall  f1-score   support

              OAF_Fear       0.92      0.94      0.93        52
 OAF_Pleasant_surprise       0.88      0.91      0.90        47
               OAF_Sad       0.92      0.96      0.94        49
             OAF_angry       1.00      1.00      1.00        37
           OAF_disgust       1.00      0.92      0.96        39
             OAF_happy       0.91      0.84      0.87        37
           OAF_neutral       0.87      0.87      0.87        30
             YAF_angry       0.97      0.78      0.87        37
           YAF_disgust       0.97      0.97      0.97        39
              YAF_fear       0.85      0.89      0.87        37
             YAF_happy       0.91      1.00      0.95        40
           YAF_neutral       0.95      0.98      0.97 