# Module: Speech Recognition
## Mid-Project: Speech Emotion Detection

### Librairies Import

In [1]:
import pandas as pd
import librosa
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

2025-03-11 08:53:16.527886: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-11 08:53:16.532249: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-11 08:53:16.544820: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741683196.565307 2196178 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741683196.570435 2196178 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-11 08:53:16.589503: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

## Loading the Data

In [2]:
speech_emotion_data_dir="EmotionalSpeechSet"
train_data_path = f'{speech_emotion_data_dir}/Train_audio_liste.csv'
print(train_data_path)

target_columns = ["label", "filepath"]

train_data = pd.read_csv(train_data_path, usecols=target_columns)

EmotionalSpeechSet/Train_audio_liste.csv


In [3]:
print("Number of Observations", len(train_data))
train_data

Number of Observations 1960


Unnamed: 0,label,filepath
0,disgust,220.wav
1,happy,602.wav
2,disgust,1673.wav
3,angry,1492.wav
4,disgust,304.wav
...,...,...
1955,disgust,1639.wav
1956,Pleasant,1096.wav
1957,Pleasant,1131.wav
1958,Sad,1295.wav


## Data Preprocessing

### Checking for NULL Values

In [4]:
null_values = train_data.isnull().sum()
null_values

label       0
filepath    0
dtype: int64

### List of Unique Labels

In [5]:
unique_labels = train_data['label'].unique()
unique_labels

array(['disgust', 'happy', 'angry', 'fear', 'neutral', 'Pleasant', 'Sad',
       'pleasant', 'sad', 'Fear'], dtype=object)

### Resolving Inconsistency in Label Names

We can remark there is inconsistency in the label names. There are some emotion names in both lowercase and Capiterlized; **Pleasant** and **Fear* for example. To handle this inconsensticy we need to lowercase all the label names.

In [6]:
# Convert all labels to lowercase
train_data['label'] = train_data['label'].str.lower()

unique_labels = train_data['label'].unique()
unique_labels

array(['disgust', 'happy', 'angry', 'fear', 'neutral', 'pleasant', 'sad'],
      dtype=object)

### Checking for Data Balancy

In [7]:
label_counts = train_data['label'].value_counts()
label_counts

label
neutral     292
angry       288
disgust     281
happy       281
sad         277
pleasant    273
fear        268
Name: count, dtype: int64

In [8]:
# Count the occurrences for each label and get the proportions
label_counts = train_data['label'].value_counts(normalize=True)*100

print("Proportion of each label:")
label_counts

Proportion of each label:


label
neutral     14.897959
angry       14.693878
disgust     14.336735
happy       14.336735
sad         14.132653
pleasant    13.928571
fear        13.673469
Name: proportion, dtype: float64

### MFCC Features Extraction

#### MFCC Array Reshaping: Padding or Truncating to Target Dimensions

The code above shows that there are almost the same proportion for each instance of labels. The data can then be considered as balanced in term of labels.

In [9]:
def padd_or_truncate(mfcc, target_shape=(100, 13)):
    # Get the current shape of the mfcc array
    current_shape = mfcc.shape
    
    # If the current length is shorter than the target, pad with zeros
    if current_shape[0] < target_shape[0]:
        # Calculate padding required
        pad_width = target_shape[0] - current_shape[0]
        # Pad with zeros on the time axis (first dimension)
        mfcc = np.pad(mfcc, ((0, pad_width), (0, 0)), mode='constant')
    
    # If the current length is longer than the target, truncate
    elif current_shape[0] > target_shape[0]:
        # Truncate the mfcc array along the time axis
        mfcc = mfcc[:target_shape[0], :]
    
    # Return the mfcc with the desired shape
    return mfcc


#### Extract MFCC Features

In [10]:
audio_files_dir = f"{speech_emotion_data_dir}/EmotionalSpeechSetBlind/"
# Function to load and preprocess audio files
def extract_mfcc(file_path):
    # Load the audio file
    #audio, sr = librosa.load(file_path, sr=None)
    audio, sr = librosa.load(file_path, sr=1024)
    
    # Normalize the audio
    audio = audio / np.max(np.abs(audio))  # Normalize the audio to be between -1 and 1
    
    # Extract MFCC features
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    # Padd or Truncate
    mfccs = padd_or_truncate(mfccs.T)


    return mfccs

In [11]:

# Extract MFCC features for each audio file and store in a new column
train_data['mfccs'] = train_data['filepath'].apply(lambda file: extract_mfcc(f"{audio_files_dir}/{file}"))


print("Training data with MFCC features:")
train_data



Training data with MFCC features:


Unnamed: 0,label,filepath,mfccs
0,disgust,220.wav,"[[24.957165, -80.03075, -22.420921, -4.2292356..."
1,happy,602.wav,"[[-15.845293, -95.68926, -64.10472, 59.110016,..."
2,disgust,1673.wav,"[[-53.986485, -126.50596, -59.115425, 111.5251..."
3,angry,1492.wav,"[[-143.8713, -43.938126, -75.177444, 59.01333,..."
4,disgust,304.wav,"[[15.993326, -41.089012, -39.814926, 23.312775..."
...,...,...,...
1955,disgust,1639.wav,"[[-55.922443, -95.88832, -72.759415, 102.53709..."
1956,pleasant,1096.wav,"[[30.868767, -113.35453, -39.73003, 28.604193,..."
1957,pleasant,1131.wav,"[[-6.3083587, -113.5168, -41.491386, 54.71226,..."
1958,sad,1295.wav,"[[-104.73416, -61.84137, -2.6739442, 5.676253,..."


In [12]:
train_data

Unnamed: 0,label,filepath,mfccs
0,disgust,220.wav,"[[24.957165, -80.03075, -22.420921, -4.2292356..."
1,happy,602.wav,"[[-15.845293, -95.68926, -64.10472, 59.110016,..."
2,disgust,1673.wav,"[[-53.986485, -126.50596, -59.115425, 111.5251..."
3,angry,1492.wav,"[[-143.8713, -43.938126, -75.177444, 59.01333,..."
4,disgust,304.wav,"[[15.993326, -41.089012, -39.814926, 23.312775..."
...,...,...,...
1955,disgust,1639.wav,"[[-55.922443, -95.88832, -72.759415, 102.53709..."
1956,pleasant,1096.wav,"[[30.868767, -113.35453, -39.73003, 28.604193,..."
1957,pleasant,1131.wav,"[[-6.3083587, -113.5168, -41.491386, 54.71226,..."
1958,sad,1295.wav,"[[-104.73416, -61.84137, -2.6739442, 5.676253,..."


In [13]:
train_data['mfccs'].shape

(1960,)

In [14]:
# Stack mfcc arrays into a 3D numpy array
X = np.stack(train_data['mfccs'].values)
# Get Labels
y = np.array(train_data['label'])

print("Stacked X shape:", X.shape)  # Should print (1960, 100, 13)

Stacked X shape: (1960, 100, 13)


#### Label Encoding

In [15]:
# Encode the labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

#### Scaling

In [16]:

# Reduce X to a 2D shape where each sample is summarized
X_reduced = X.mean(axis=1)  # Shape will be (1960, 13)

# Print the shape after reduction
print("Reduced X shape:", X_reduced.shape)  # Should print (1960, 13)

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reduced)

Reduced X shape: (1960, 13)


#### Spliting

In [17]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded)

# Print shapes of the resulting datasets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape) 

X_train shape: (1372, 13)
X_test shape: (588, 13)
y_train shape: (1372,)
y_test shape: (588,)


### Models Training

First of all we will try with the following models
1. Naive Bayes
2. Logistic Regression
3. Decision Tree
4. Random Forest
5. Multi Layer Perceptron

For each one we will use a stratified Cross voalidation with 5 folds

In [18]:
# Initialize models
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Random Forest': RandomForestClassifier(n_estimators=10, random_state=42),
    'MLP Classifier': MLPClassifier(hidden_layer_sizes=(128,), max_iter=2000, alpha=0.01, random_state=42)

}
# Stratified K-Fold cross-validation with 5 splits
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Iterate over models for cross-validation
for model_name, model in models.items():
    # Perform cross-validation on the training data
    cv_scores = cross_val_score(model, X_train, y_train, cv=stratified_cv)
    
    # Fit the model on the entire training data to get the training accuracy
    model.fit(X_train, y_train)
    
    # Get training accuracy
    train_predictions = model.predict(X_train)
    train_accuracy = accuracy_score(y_train, train_predictions)

    # Evaluate the model on the validation set
    val_predictions = model.predict(X_test)
    val_accuracy = accuracy_score(y_test, val_predictions)

    # Print the results
    print(f"Model: {model_name}")
    print("Cross-Validation Scores:", cv_scores)
    print("Mean Cross-Validation Score:", np.mean(cv_scores))
    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)
    print("-" * 30)


Model: Naive Bayes
Cross-Validation Scores: [0.89090909 0.88363636 0.86131387 0.89051095 0.89051095]
Mean Cross-Validation Score: 0.8833762441937625
Training Accuracy: 0.891399416909621
Validation Accuracy: 0.8673469387755102
------------------------------
Model: Logistic Regression
Cross-Validation Scores: [0.89818182 0.91272727 0.91605839 0.92335766 0.89416058]
Mean Cross-Validation Score: 0.9088971466489715
Training Accuracy: 0.924198250728863
Validation Accuracy: 0.9149659863945578
------------------------------
Model: Decision Tree
Cross-Validation Scores: [0.83272727 0.87636364 0.82481752 0.88686131 0.83576642]
Mean Cross-Validation Score: 0.8513072329130724
Training Accuracy: 0.9759475218658892
Validation Accuracy: 0.8690476190476191
------------------------------
Model: Random Forest
Cross-Validation Scores: [0.91272727 0.91272727 0.90510949 0.90510949 0.88321168]
Mean Cross-Validation Score: 0.9037770404777705
Training Accuracy: 1.0
Validation Accuracy: 0.923469387755102
-----

| Model                | Training Accuracy | Validation Accuracy |
|----------------------|-------------------|----------------------|
| Naive Bayes          | 0.8914           | 0.8673              |
| Logistic Regression  | 0.9242           | 0.9150              |
| Decision Tree        | 0.9759           | 0.8690              |
| Random Forest        | 1.0000           | 0.9235              |
| MLP Classifier       | 0.9993           | 0.9643              |

The models show varying performance levels, with generally good accuracy overall. Naive Bayes has the lowest accuracy, while MLP Classifier achieves the highest validation accuracy (0.9643) with near-perfect training accuracy. Logistic Regression maintains a good balance between training (0.9242) and validation (0.9150) accuracy. Decision Tree and Random Forest show higher training accuracy, indicating potential overfitting, especially for Random Forest.


Let us try now a simple feedforward neural network.

In [19]:

num_classes = len(label_counts)

# Define the model creation function
def create_model():
    model = Sequential()
 
    model.add(Dense(128, activation='relu', input_shape=(13,)))

    model.add(Dropout(0.5))
    
    model.add(Dense(num_classes, activation='softmax')) 

    model.compile(optimizer='adam',
    
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_train_scores = []
cv_val_scores = []

# Iterate through each fold
for train_index, val_index in stratified_cv.split(X_scaled, y_encoded):

    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y_encoded[train_index], y_encoded[val_index]

    # Create and train the model
    model = create_model()
    model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=1)  # Use verbose=0 to suppress output
    

    # Evaluate the model on training data
    train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=1)
    cv_train_scores.append(train_accuracy)

    # Evaluate the model on validation data
    val_loss, val_accuracy = model.evaluate(X_test, y_test, verbose=1)
    cv_val_scores.append(val_accuracy)

# Print CV results
print("Cross-Validation Training Scores:", cv_train_scores)
print("Mean Training Score:", np.mean(cv_train_scores))
print("Cross-Validation Validation Scores:", cv_val_scores)
print("Mean Validation Score:", np.mean(cv_val_scores))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-11 08:53:51.405210: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2858 - loss: 1.7852 
Epoch 2/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7303 - loss: 1.0857
Epoch 3/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8036 - loss: 0.7646
Epoch 4/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8464 - loss: 0.5976
Epoch 5/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8655 - loss: 0.5018
Epoch 6/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8721 - loss: 0.4501
Epoch 7/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9089 - loss: 0.3544
Epoch 8/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9017 - loss: 0.3446
Epoch 9/50
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

The feedforward neural network outperformed the other models, with a mean training accuracy of 0.9847 and validation accuracy around 0.9794, showing strong generalization and without overfitting. It is the selected model.

## Making Predictions on Blind Test Data

Now we are going to make prediction of emotions for our blind test data

In [20]:

blind_test_data_path = f'{speech_emotion_data_dir}/BlindTest_audio_liste.csv'

target_columns = ["filepath"]

blind_test_data = pd.read_csv(blind_test_data_path, usecols=target_columns)
blind_test_data

Unnamed: 0,filepath
0,1089.wav
1,773.wav
2,2162.wav
3,1193.wav
4,1917.wav
...,...
835,681.wav
836,912.wav
837,2649.wav
838,2580.wav


In [21]:

# Extract MFCC features for each audio file and store in a new column
blind_test_data['mfccs'] = blind_test_data['filepath'].apply(lambda file: extract_mfcc(f"{audio_files_dir}/{file}"))


print("Blind Test data with MFCC features:")
blind_test_data



Blind Test data with MFCC features:


Unnamed: 0,filepath,mfccs
0,1089.wav,"[[2.8589315, -102.09917, -40.004486, 42.05353,..."
1,773.wav,"[[40.600567, -97.713974, -53.113907, 60.24423,..."
2,2162.wav,"[[-20.954939, -109.87756, -20.065926, 72.77244..."
3,1193.wav,"[[8.413327, -113.020096, -52.566772, 29.926914..."
4,1917.wav,"[[68.868126, -72.21741, -40.48712, 15.520545, ..."
...,...,...
835,681.wav,"[[43.32491, -78.73245, -82.87785, 67.38076, 14..."
836,912.wav,"[[-51.72955, -28.240273, -37.1867, 41.484238, ..."
837,2649.wav,"[[-1.6926446, -51.24196, -45.587097, -11.20017..."
838,2580.wav,"[[-17.521883, -126.7378, 12.937216, 44.97313, ..."


In [22]:
# Stack mfcc arrays into a 3D numpy array
X_blind_test = np.stack(blind_test_data['mfccs'].values)


# Reduce X to a 2D shape where each sample is summarized
X_blind_test_reduced = X_blind_test.mean(axis=1)  # Shape will be (1960, 13)

# Print the shape after reduction
print("Reduced X shape:", X_blind_test_reduced.shape)  # Should print (1960, 13)


# Use the scaler fitted above to scale the data
X_blind_test_scaled = scaler.transform(X_blind_test_reduced)

Reduced X shape: (840, 13)


In [23]:
# Make predictions using the best model
new_predictions = model.predict(X_blind_test_scaled)

# Convert probabilities to class labels by taking the index of the maximum probability
predicted_labels = np.argmax(new_predictions, axis=1)

# Print predicted labels for the new data
print("Predicted Labels for X_new:")
predicted_labels


[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Predicted Labels for X_new:


array([5, 3, 3, 5, 2, 1, 6, 0, 4, 5, 5, 5, 3, 5, 3, 2, 5, 6, 6, 5, 6, 0,
       2, 6, 1, 2, 5, 4, 3, 6, 4, 2, 0, 3, 3, 2, 1, 4, 0, 6, 2, 0, 3, 1,
       6, 2, 0, 4, 6, 3, 1, 4, 4, 2, 5, 5, 2, 4, 2, 1, 5, 6, 6, 4, 5, 4,
       0, 1, 6, 3, 5, 2, 1, 5, 0, 6, 4, 6, 3, 0, 3, 0, 0, 6, 6, 4, 1, 1,
       6, 2, 1, 3, 5, 4, 0, 6, 4, 5, 3, 3, 3, 6, 5, 5, 6, 3, 6, 2, 3, 6,
       4, 4, 4, 0, 3, 2, 0, 0, 5, 2, 3, 6, 4, 0, 3, 6, 0, 6, 3, 5, 5, 2,
       6, 5, 5, 0, 3, 0, 5, 4, 0, 6, 4, 4, 0, 6, 4, 0, 4, 3, 6, 5, 4, 6,
       5, 5, 5, 5, 3, 1, 4, 2, 4, 1, 1, 1, 4, 0, 0, 5, 0, 1, 1, 2, 2, 3,
       4, 0, 5, 0, 4, 5, 0, 3, 6, 6, 6, 1, 4, 1, 2, 0, 1, 4, 2, 5, 1, 4,
       2, 5, 3, 6, 2, 6, 3, 6, 2, 2, 3, 4, 0, 0, 2, 6, 2, 0, 2, 0, 6, 6,
       6, 1, 2, 4, 3, 6, 3, 3, 5, 2, 6, 6, 0, 5, 2, 1, 2, 2, 3, 5, 6, 4,
       0, 2, 1, 0, 1, 4, 6, 4, 3, 5, 2, 2, 5, 1, 1, 3, 2, 4, 1, 4, 5, 0,
       3, 2, 1, 6, 5, 0, 3, 1, 4, 2, 3, 6, 0, 6, 4, 6, 4, 3, 6, 0, 0, 3,
       0, 0, 2, 4, 1, 5, 5, 4, 6, 1, 4, 4, 2, 1, 4,

In [24]:
# Decode the predicted labels back to original emotion names
predicted_emotions = label_encoder.inverse_transform(predicted_labels)

# Print predicted emotion names for the new data
print("Predicted Emotions for X Blind Test:", predicted_emotions)

Predicted Emotions for X Blind Test: ['pleasant' 'happy' 'happy' 'pleasant' 'fear' 'disgust' 'sad' 'angry'
 'neutral' 'pleasant' 'pleasant' 'pleasant' 'happy' 'pleasant' 'happy'
 'fear' 'pleasant' 'sad' 'sad' 'pleasant' 'sad' 'angry' 'fear' 'sad'
 'disgust' 'fear' 'pleasant' 'neutral' 'happy' 'sad' 'neutral' 'fear'
 'angry' 'happy' 'happy' 'fear' 'disgust' 'neutral' 'angry' 'sad' 'fear'
 'angry' 'happy' 'disgust' 'sad' 'fear' 'angry' 'neutral' 'sad' 'happy'
 'disgust' 'neutral' 'neutral' 'fear' 'pleasant' 'pleasant' 'fear'
 'neutral' 'fear' 'disgust' 'pleasant' 'sad' 'sad' 'neutral' 'pleasant'
 'neutral' 'angry' 'disgust' 'sad' 'happy' 'pleasant' 'fear' 'disgust'
 'pleasant' 'angry' 'sad' 'neutral' 'sad' 'happy' 'angry' 'happy' 'angry'
 'angry' 'sad' 'sad' 'neutral' 'disgust' 'disgust' 'sad' 'fear' 'disgust'
 'happy' 'pleasant' 'neutral' 'angry' 'sad' 'neutral' 'pleasant' 'happy'
 'happy' 'happy' 'sad' 'pleasant' 'pleasant' 'sad' 'happy' 'sad' 'fear'
 'happy' 'sad' 'neutral' 'neutral' 

In [25]:

# Create a new DataFrame with filepath and emotion label as columns
blind_test_df = pd.DataFrame({
    'filepath': blind_test_data['filepath'],
    'label': predicted_emotions
})

# Display the resulting DataFrame
blind_test_df

Unnamed: 0,filepath,label
0,1089.wav,pleasant
1,773.wav,happy
2,2162.wav,happy
3,1193.wav,pleasant
4,1917.wav,fear
...,...,...
835,681.wav,happy
836,912.wav,neutral
837,2649.wav,sad
838,2580.wav,pleasant


In [None]:
# Save predictions into a CSV file

blind_test_df.to_csv(f"Test_audio_labeled.csv", index=False)