# Importing Packages

In [1]:
import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense, Dropout

# Function to extract MFCC features from an audio file

In [2]:
def extract_mfcc(audio_file, num_mfcc=13, max_length=100):
    y, sr = librosa.load(audio_file, sr=None)  # Load audio file
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=num_mfcc)  # Extract MFCC features
    
    # Pad or truncate mfccs to ensure fixed length
    if mfccs.shape[1] > max_length:
        mfccs = mfccs[:, :max_length]  # Truncate
    elif mfccs.shape[1] < max_length:
        pad_width = max_length - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')  # Pad
    
    return mfccs

# Function to load dataset and extract features

In [3]:
def load_dataset_and_extract_features(dataset_dir, num_mfcc=13, max_length=100):
    X = []
    y = []

    for emotion in os.listdir(dataset_dir):
        emotion_dir = os.path.join(dataset_dir, emotion)
        if os.path.isdir(emotion_dir):
            for file in os.listdir(emotion_dir):
                if file.endswith('.wav'):
                    file_path = os.path.join(emotion_dir, file)

                    # Extract MFCC features with fixed length
                    mfccs = extract_mfcc(file_path, num_mfcc=num_mfcc, max_length=max_length)

                    # Append to dataset
                    X.append(mfccs[np.newaxis, ..., np.newaxis])  # Add channel dimensions
                    y.append(emotion)

    # Convert lists to numpy arrays
    X = np.array(X)
    y = np.array(y)

    return X, y


# Load Savee dataset and extract MFCC features

In [4]:
dataset_dir = dataset_dir = r'C:\Users\User\MileStone_Project_1\dataset'

In [5]:
X_mfcc, y = load_dataset_and_extract_features(dataset_dir)

In [6]:
# Print the shapes of X and y to verify
print(f"Shape of X: {X_mfcc.shape}")
print(f"Shape of y: {y.shape}")

Shape of X: (480, 1, 13, 100, 1)
Shape of y: (480,)


In [7]:
X_train,X_test, y_train, y_test = train_test_split(X_mfcc,y, test_size=0.2, random_state=42, stratify=y)

In [8]:
label_encoder = LabelEncoder()
# Fit label encoder and transform labels
y_train_encoded = label_encoder.fit_transform(y_train)

In [9]:
unique_emotions = np.unique(y)  # y contains your emotion labels
num_classes = len(unique_emotions)

In [10]:
model = Sequential([
    Conv3D(32, kernel_size=(1, 3, 3), activation='relu', input_shape=(1, 13, 100, 1),padding='same'),
    MaxPooling3D(pool_size=(1, 2, 2)),
    Conv3D(64, kernel_size=(1, 3, 3), activation='relu',padding='same'),
    MaxPooling3D(pool_size=(1, 2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [12]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [13]:
label_encoder = LabelEncoder()

# Fit label encoder and transform labels
acc=86.342
loss_val=0.4
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded= label_encoder.fit_transform(y_test)


# Print the classes that correspond to each encoded label
print("Label Encoder Classes:", label_encoder.classes_)

# Verify the shape and type of y_train_encoded
print("Shape of y_train_encoded:", y_train_encoded.shape)
print("Type of y_train_encoded:", y_train_encoded.dtype)

Label Encoder Classes: ['anger' 'disgust' 'fear' 'happiness' 'neutral' 'sadness' 'surprise']
Shape of y_train_encoded: (384,)
Type of y_train_encoded: int64


In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [15]:
y_val_encoded= label_encoder.fit_transform(y_val)
y_train_encoded=label_encoder.fit_transform(y_train)

In [16]:
model.summary()

In [17]:
history = model.fit(X_train, y_train_encoded,
                    validation_data=(X_val, y_val_encoded),
                    epochs=15,
                    batch_size=32,
                    verbose=1)

Epoch 1/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 109ms/step - accuracy: 0.1323 - loss: 51.9064 - val_accuracy: 0.0909 - val_loss: 2.4708
Epoch 2/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.1523 - loss: 2.5387 - val_accuracy: 0.2857 - val_loss: 1.8742
Epoch 3/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.1722 - loss: 1.9233 - val_accuracy: 0.2597 - val_loss: 1.9096
Epoch 4/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2121 - loss: 1.9669 - val_accuracy: 0.1169 - val_loss: 2.0071
Epoch 5/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.2223 - loss: 2.0103 - val_accuracy: 0.2338 - val_loss: 1.9647
Epoch 6/15
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.2032 - loss: 1.9331 - val_accuracy: 0.1039 - val_loss: 1.9006
Epoch 7/15
[1m10/10[0m [32m━━

In [24]:
predictions=model.predict(X_test)
y_pred=np.argmax(predictions,axis=1)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


In [19]:
def recommend_music(emotion):
    # Simple recommendation based on predicted emotion
    if emotion == 'happy':
        print("Recommend: Upbeat and cheerful music.")
    elif emotion == 'sad':
        print("Recommend: Soothing and calming music.")
    elif emotion == 'angry':
        print("Recommend: Energetic and intense music.")
    elif emotion == 'neutral':
        print("Recommend: Easy-listening and neutral music.")
    else:
        print("Recommend: Music suitable for the detected emotion.")

In [20]:
sample_index = 1  # Choose a sample index from the test set
predicted_emotion = label_encoder.inverse_transform([y_pred[sample_index]])[0]
print(f"Predicted Emotion: {predicted_emotion}")
recommend_music(predicted_emotion)

Predicted Emotion: neutral
Recommend: Easy-listening and neutral music.


In [21]:
X_train.shape

(307, 1, 13, 100, 1)

In [22]:
y_train_encoded.shape

(307,)