In [3]:
import librosa
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import os

In [14]:
import os
import pandas as pd

# Assuming 'ravdess_df' already has the 'Emotions' column filled
# Set the directory where your data is stored
data_directory = 'archive'

# Initialize empty lists for paths and emotions
file_paths = []
file_emotions = []

# Loop through each subfolder in the data directory (actors folder)
for actor_folder in os.listdir(data_directory):
    actor_path = os.path.join(data_directory, actor_folder)
    
    # Skip if it's not a folder
    if not os.path.isdir(actor_path):
        continue
    
    # Loop through each file in the actor's folder
    for file_name in os.listdir(actor_path):
        # Check if it's a .wav file
        if file_name.endswith('.wav'):
            # Full file path
            full_path = os.path.join(actor_path, file_name)
            
            # Get the emotion from the file name (based on your naming convention)
            # e.g., for 03-01-01-01-01-01.wav, the emotion might be extracted from the third part (03)
            parts = file_name.split('-')
            emotion_code = int(parts[2])  # Assuming the emotion code is the third part
            
            # Append the emotion and path to the lists
            file_emotions.append(emotion_code)
            file_paths.append(full_path)

# Create a dataframe with paths and emotions
ravdess_df = pd.DataFrame({
    'Emotions': file_emotions,
    'Path': file_paths
})

# Map the emotion codes to actual emotion labels
ravdess_df['Emotions'] = ravdess_df['Emotions'].replace({
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fear',
    7: 'disgust',
    8: 'surprise'
})

# Print the first few rows of the dataframe
print(ravdess_df.head())


  Emotions                                       Path
0  neutral  archive\Actor_01\03-01-01-01-01-01-01.wav
1  neutral  archive\Actor_01\03-01-01-01-01-02-01.wav
2  neutral  archive\Actor_01\03-01-01-01-02-01-01.wav
3  neutral  archive\Actor_01\03-01-01-01-02-02-01.wav
4     calm  archive\Actor_01\03-01-02-01-01-01-01.wav


In [16]:
def extract_features(file_path):
    y, sr = librosa.load(file_path, sr=None)
    # Extract MFCC features (13 coefficients)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    # Take the mean of each coefficient across time frames
    mfcc = np.mean(mfcc.T, axis=0)
    return mfcc

In [18]:
features = []
labels = []

In [19]:
for index, row in ravdess_df.iterrows():
    file_path = row['Path']
    emotion = row['Emotions']
    mfcc = extract_features(file_path)
    features.append(mfcc)
    labels.append(emotion)

In [20]:
# Convert features to numpy array
X = np.array(features)

# Label encode the emotions
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Reshape X to fit LSTM input requirements (samples, timesteps, features)
X = np.expand_dims(X, axis=-1)

# Pad sequences to ensure uniform input size for LSTM
X = pad_sequences(X, dtype='float32')

In [21]:
print(X.shape, y.shape)

(1440, 13, 1) (1440,)


In [35]:
y.info()

AttributeError: 'numpy.ndarray' object has no attribute 'info'

## Building LSTM Model

In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [25]:
# Define the LSTM model
model = Sequential()

# LSTM layer
model.add(LSTM(128, return_sequences=True, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))  # Number of classes

In [26]:

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [29]:

# Train the model
history = model.fit(X, y, epochs=25, batch_size=15, validation_split=0.4)

Epoch 1/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.6454 - loss: 0.9456 - val_accuracy: 0.3420 - val_loss: 2.2360
Epoch 2/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.6762 - loss: 0.8805 - val_accuracy: 0.3837 - val_loss: 2.1116
Epoch 3/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.7141 - loss: 0.8168 - val_accuracy: 0.3750 - val_loss: 2.2145
Epoch 4/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7498 - loss: 0.7101 - val_accuracy: 0.3177 - val_loss: 2.4319
Epoch 5/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.6837 - loss: 0.8474 - val_accuracy: 0.3403 - val_loss: 2.2551
Epoch 6/25
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.7556 - loss: 0.7026 - val_accuracy: 0.3524 - val_loss: 2.3927
Epoch 7/25
[1m58/58[0m [32m━━━━

In [30]:
def predict_emotion(file_path):
    mfcc = extract_features(file_path)
    mfcc = np.expand_dims(mfcc, axis=0)  # Add batch dimension
    mfcc = np.expand_dims(mfcc, axis=-1)  # Add feature dimension
    
    # Make the prediction
    prediction = model.predict(mfcc)
    predicted_class = label_encoder.inverse_transform(np.argmax(prediction, axis=1))
    return predicted_class[0]

# Test the model with a new file
new_audio_path = 'archive/Actor_05/03-01-02-01-01-01-05.wav'
predicted_emotion = predict_emotion(new_audio_path)
print(f'Predicted emotion: {predicted_emotion}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 405ms/step
Predicted emotion: calm


In [31]:
# Test the model with a new file
new_audio_path = 'archive/Actor_05/03-01-01-01-01-01-05.wav'
predicted_emotion = predict_emotion(new_audio_path)
print(f'Predicted emotion: {predicted_emotion}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
Predicted emotion: neutral


In [39]:
model.save('model.h5')

