In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import librosa
import joblib
from librosa.feature import spectral_contrast, tonnetz
from tensorflow.keras.regularizers import l2

class AdvancedVoiceEmotionDetector:
    def __init__(self, dataset_path):
        """
        Initialize voice emotion detection system.
        
        Args:
            dataset_path (str): Path to RAVDESS dataset
        """
        self.dataset_path = dataset_path
        self.emotion_map = {
            1: 'neutral', 2: 'calm', 3: 'happy', 
            4: 'sad', 5: 'angry', 6: 'fear', 
            7: 'disgust', 8: 'surprise'
        }
    
    def create_metadata_dataframe(self):
        """
        Create metadata DataFrame from RAVDESS dataset.
        
        Returns:
            pandas.DataFrame: Metadata for all audio files
        """
        emotion = []
        gender = []
        actor = []
        file_path = []
        
        actor_folders = [f for f in os.listdir(self.dataset_path) if os.path.isdir(os.path.join(self.dataset_path, f))]
        
        for actor_folder in actor_folders:
            actor_path = os.path.join(self.dataset_path, actor_folder)
            filenames = os.listdir(actor_path)
            
            for filename in filenames:
                parts = filename.split('.')[0].split('-')
                emotion_code = int(parts[2])
                emotion.append(emotion_code)
                
                actor_number = int(parts[6])
                actor.append(actor_number)
                gender.append('female' if actor_number % 2 == 0 else 'male')
                
                full_path = os.path.join(actor_path, filename)
                file_path.append(full_path)
        
        audio_df = pd.DataFrame({
            'emotion_code': emotion,
            'emotion': [self.emotion_map[code] for code in emotion],
            'gender': gender,
            'actor': actor,
            'path': file_path
        })
        
        return audio_df
    
    def extract_audio_features(self, file_path, max_pad_length=100):
        """
        Extract advanced audio features.
        
        Args:
            file_path (str): Path to audio file
            max_pad_length (int): Max length for feature padding
        
        Returns:
            numpy.ndarray: Processed audio features or None
        """
        try:
            audio, sample_rate = librosa.load(file_path, duration=5.0)

            # Extract Features
            mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
            chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate)
            mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
            zcr = librosa.feature.zero_crossing_rate(y=audio)
            spectral_contrast_features = spectral_contrast(y=audio, sr=sample_rate)
            tonnetz_features = tonnetz(y=audio, sr=sample_rate)

            # Combine features
            features = np.concatenate([
                np.mean(mfccs, axis=1),
                np.mean(chroma, axis=1),
                np.mean(mel, axis=1),
                [np.mean(zcr)],
                np.mean(spectral_contrast_features, axis=1),
                np.mean(tonnetz_features, axis=1)
            ])

            # Pad/truncate to fixed size
            if len(features) > max_pad_length:
                features = features[:max_pad_length]
            else:
                features = np.pad(features, (0, max_pad_length - len(features)), 'constant')

            return features
        
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None

    
    def prepare_dataset(self, audio_df):
        """
        Prepare dataset for machine learning.
        
        Args:
            audio_df (pandas.DataFrame): Metadata DataFrame
        
        Returns:
            tuple: Features, labels, and label encoder
        """
        features = []
        labels = []
        
        for _, row in audio_df.iterrows():
            feature = self.extract_audio_features(row['path'])
            if feature is not None:
                features.append(feature)
                labels.append(row['emotion'])
        
        X = np.array(features)
        y = np.array(labels)
        
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        
        return X, y_encoded, label_encoder
    
    def create_model(self, input_shape, num_classes):
        """
        Create a CNN-based model for voice emotion detection.
        
        Args:
            input_shape (tuple): Shape of input features
            num_classes (int): Number of emotion classes
        
        Returns:
            tensorflow.keras.Model: Compiled CNN model
        """
        model = Sequential([
            Conv1D(64, kernel_size=3, activation='relu', input_shape=(input_shape[0], 1), kernel_regularizer=l2(0.01)),
            MaxPooling1D(pool_size=2),

            Conv1D(128, kernel_size=3, activation='relu', kernel_regularizer=l2(0.01)),
            MaxPooling1D(pool_size=2),

            Flatten(),
            Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
            Dropout(0.5),
            
            Dense(num_classes, activation='softmax')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        return model
    
    def scheduler(self, epoch, lr):
        """Learning rate scheduler."""
        if epoch < 100:
            return lr
        elif epoch < 200:
            return lr * 0.5
        else:
            return lr * 0.1

    def train_and_evaluate(self, X, y, label_encoder, test_size=0.2):
        """
        Train and evaluate the CNN-based emotion detection model.
        
        Args:
            X (numpy.ndarray): Input features
            y (numpy.ndarray): Label data
            label_encoder (sklearn.preprocessing.LabelEncoder): Label encoder
            test_size (float): Test dataset proportion
        
        Returns:
            tensorflow.keras.Model: Trained model
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )

        # Reshape for CNN
        X_train = np.expand_dims(X_train, axis=-1)
        X_test = np.expand_dims(X_test, axis=-1)

        model = self.create_model(input_shape=(X_train.shape[1], 1), num_classes=len(np.unique(y)))

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model_checkpoint = ModelCheckpoint('best_voice_emotion_model.h5', save_best_only=True)

        # Learning Rate Scheduler
        lr_scheduler = LearningRateScheduler(self.scheduler)

        # Train model with increased epochs and batch size
        history = model.fit(
            X_train, y_train,
            validation_data=(X_test, y_test),
            epochs=300,  # Increased from 200 to 300
            batch_size=128,  # Increased from 64 to 128
            callbacks=[early_stopping, model_checkpoint, lr_scheduler],
            verbose=1
        )

        self.evaluate_model(model, X_test, y_test, label_encoder)

        return model
    
    def evaluate_model(self, model, X_test, y_test, label_encoder):
        """
        Evaluate model performance.
        
        Args:
            model (tensorflow.keras.Model): Trained model
            X_test (numpy.ndarray): Test features
            y_test (numpy.ndarray): Test labels
            label_encoder (sklearn.preprocessing.LabelEncoder): Label encoder
        """
        y_pred = model.predict(X_test)
        y_pred_classes = np.argmax(y_pred, axis=1)

        from sklearn.metrics import classification_report
        print("Classification Report:")
        print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))
    
    def run_pipeline(self):
        """
        Run the complete voice emotion detection pipeline.
        """
        audio_df = self.create_metadata_dataframe()
        
        print("Dataset Summary:")
        print(audio_df['emotion'].value_counts())

        X, y, label_encoder = self.prepare_dataset(audio_df)

        model = self.train_and_evaluate(X, y, label_encoder)

        model.save('final_voice_emotion_model.h5')
        joblib.dump(label_encoder, 'voice_emotion_label_encoder.pkl')

if __name__ == '__main__':
    dataset_path = r'C:\Users\aksha\Downloads\RAVDESS\audio_speech_actors_01-24'
    detector = AdvancedVoiceEmotionDetector(dataset_path)
    detector.run_pipeline()


Dataset Summary:
emotion
calm        192
happy       192
sad         192
angry       192
fear        192
disgust     192
surprise    192
neutral      96
Name: count, dtype: int64




Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.1418 - loss: 4.5259



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 25ms/step - accuracy: 0.1437 - loss: 4.4573 - val_accuracy: 0.2917 - val_loss: 1.9133
Epoch 2/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.2471 - loss: 1.9336



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.2472 - loss: 1.9327 - val_accuracy: 0.2986 - val_loss: 1.8402
Epoch 3/200
[1m10/18[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.2850 - loss: 1.8224 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2912 - loss: 1.8133 - val_accuracy: 0.2674 - val_loss: 1.7575
Epoch 4/200
[1m10/18[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.3050 - loss: 1.7664 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3113 - loss: 1.7571 - val_accuracy: 0.3646 - val_loss: 1.7192
Epoch 5/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.3559 - loss: 1.6728 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3517 - loss: 1.6711 - val_accuracy: 0.3785 - val_loss: 1.7088
Epoch 6/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.3412 - loss: 1.6903 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.3451 - loss: 1.6769 - val_accuracy: 0.3819 - val_loss: 1.6370
Epoch 7/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3969 - loss: 1.5629 - val_accuracy: 0.3750 - val_loss: 1.6374
Epoch 8/200
[1m10/18[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.3881 - loss: 1.6345 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4057 - loss: 1.5970 - val_accuracy: 0.4167 - val_loss: 1.5666
Epoch 9/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4163 - loss: 1.5310 - val_accuracy: 0.4201 - val_loss: 1.5844
Epoch 10/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.4771 - loss: 1.4402 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4703 - loss: 1.4516 - val_accuracy: 0.4618 - val_loss: 1.5368
Epoch 11/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.4316 - loss: 1.4741 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4454 - loss: 1.4622 - val_accuracy: 0.4479 - val_loss: 1.5304
Epoch 12/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.4625 - loss: 1.4000 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4582 - loss: 1.4162 - val_accuracy: 0.4340 - val_loss: 1.5108
Epoch 13/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5141 - loss: 1.3552 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5124 - loss: 1.3559 - val_accuracy: 0.4479 - val_loss: 1.4614
Epoch 14/200
[1m10/18[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.4842 - loss: 1.3409 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.4950 - loss: 1.3309 - val_accuracy: 0.4861 - val_loss: 1.4491
Epoch 15/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5420 - loss: 1.2340 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5437 - loss: 1.2372 - val_accuracy: 0.5069 - val_loss: 1.4487
Epoch 16/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5508 - loss: 1.1974 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5512 - loss: 1.2011 - val_accuracy: 0.4444 - val_loss: 1.4435
Epoch 17/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5551 - loss: 1.1895 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5515 - loss: 1.2093 - val_accuracy: 0.5278 - val_loss: 1.4120
Epoch 18/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5655 - loss: 1.2473 - val_accuracy: 0.5139 - val_loss: 1.4708
Epoch 19/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.5726 - loss: 1.1176 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5766 - loss: 1.1185 - val_accuracy: 0.5486 - val_loss: 1.3735
Epoch 20/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6189 - loss: 1.0654 - val_accuracy: 0.5104 - val_loss: 1.4609
Epoch 21/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5809 - loss: 1.0907 - val_accuracy: 0.5417 - val_loss: 1.3827
Epoch 22/200
[1m11/18[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6129 - loss: 1.0533 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6143 - loss: 1.0535 - val_accuracy: 0.5590 - val_loss: 1.3283
Epoch 23/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6527 - loss: 0.9731 - val_accuracy: 0.5556 - val_loss: 1.3567
Epoch 24/200
[1m14/18[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 8ms/step - accuracy: 0.6917 - loss: 0.8505



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6845 - loss: 0.8675 - val_accuracy: 0.5938 - val_loss: 1.3080
Epoch 25/200
[1m10/18[0m [32m━━━━━━━━━━━[0m[37m━━━━━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.6959 - loss: 0.8762 



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6936 - loss: 0.8808 - val_accuracy: 0.5833 - val_loss: 1.2706
Epoch 26/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7059 - loss: 0.8333 - val_accuracy: 0.5799 - val_loss: 1.3862
Epoch 27/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6891 - loss: 0.8307 - val_accuracy: 0.5764 - val_loss: 1.4336
Epoch 28/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7379 - loss: 0.7287



[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7371 - loss: 0.7302 - val_accuracy: 0.6042 - val_loss: 1.2631
Epoch 29/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7306 - loss: 0.7629 - val_accuracy: 0.5694 - val_loss: 1.3977
Epoch 30/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7447 - loss: 0.6882 - val_accuracy: 0.6319 - val_loss: 1.3432
Epoch 31/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7833 - loss: 0.6413 - val_accuracy: 0.5833 - val_loss: 1.3625
Epoch 32/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7855 - loss: 0.6038 - val_accuracy: 0.6042 - val_loss: 1.3483
Epoch 33/200
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7748 - loss: 0.6074 - val_accuracy: 0.5799 - val_loss: 1.3910
Epoch 34/200
[1m18/18[0m [32m━━━



Classification Report:
              precision    recall  f1-score   support

       angry       0.68      0.74      0.71        38
        calm       0.60      0.84      0.70        38
     disgust       0.65      0.45      0.53        38
        fear       0.62      0.72      0.67        39
       happy       0.72      0.46      0.56        39
     neutral       0.42      0.26      0.32        19
         sad       0.44      0.45      0.44        38
    surprise       0.62      0.74      0.67        39

    accuracy                           0.60       288
   macro avg       0.59      0.58      0.58       288
weighted avg       0.61      0.60      0.59       288

