In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import librosa
import joblib
from librosa.feature import spectral_contrast, tonnetz

class AdvancedVoiceEmotionDetector:
    """Train a 1-D CNN emotion classifier on the RAVDESS speech dataset.

    The pipeline walks the dataset folder, parses each file name into
    metadata, converts every clip into a fixed-length feature vector with
    librosa, and fits a small Keras Conv1D network on those vectors.
    """

    def __init__(self, dataset_path):
        """
        Initialize voice emotion detection system.

        Args:
            dataset_path (str): Path to RAVDESS dataset (the folder that
                contains one sub-folder per actor)
        """
        self.dataset_path = dataset_path
        # Third field of a RAVDESS file name -> emotion label.
        self.emotion_map = {
            1: 'neutral', 2: 'calm', 3: 'happy',
            4: 'sad', 5: 'angry', 6: 'fear',
            7: 'disgust', 8: 'surprise'
        }

    def _parse_ravdess_filename(self, filename):
        """
        Parse a RAVDESS file name into its emotion code and actor number.

        Args:
            filename (str): Bare file name, e.g. '03-01-05-01-01-01-12.wav'

        Returns:
            tuple | None: (emotion_code, actor_number), or None when the name
            does not have seven dash-separated numeric fields or carries an
            emotion code that is not in ``self.emotion_map``.
        """
        parts = filename.split('.')[0].split('-')
        if len(parts) < 7:
            return None
        try:
            emotion_code = int(parts[2])
            actor_number = int(parts[6])
        except ValueError:
            return None
        if emotion_code not in self.emotion_map:
            return None
        return emotion_code, actor_number

    def create_metadata_dataframe(self):
        """
        Create metadata DataFrame from RAVDESS dataset.

        Every sub-folder of ``dataset_path`` is scanned. Entries whose names
        cannot be parsed as RAVDESS recordings (hidden files, notes, nested
        folders, unknown emotion codes) are reported and skipped instead of
        aborting the scan. Folders and files are visited in sorted order so
        the row order is reproducible across runs and platforms.

        Returns:
            pandas.DataFrame: One row per audio file with the columns
            'emotion_code', 'emotion', 'gender', 'actor' and 'path'
        """
        columns = ['emotion_code', 'emotion', 'gender', 'actor', 'path']
        records = []

        for actor_folder in sorted(os.listdir(self.dataset_path)):
            actor_path = os.path.join(self.dataset_path, actor_folder)
            if not os.path.isdir(actor_path):
                continue

            for filename in sorted(os.listdir(actor_path)):
                full_path = os.path.join(actor_path, filename)
                parsed = None
                if os.path.isfile(full_path):
                    parsed = self._parse_ravdess_filename(filename)
                if parsed is None:
                    print(f"Skipping unrecognized file: {full_path}")
                    continue

                emotion_code, actor_number = parsed
                records.append({
                    'emotion_code': emotion_code,
                    'emotion': self.emotion_map[emotion_code],
                    # Even actor numbers are labelled female, odd ones male.
                    'gender': 'female' if actor_number % 2 == 0 else 'male',
                    'actor': actor_number,
                    'path': full_path,
                })

        return pd.DataFrame(records, columns=columns)

    def extract_audio_features(self, file_path, max_pad_length=100):
        """
        Extract advanced audio features.

        Loads at most the first 5 seconds of the clip and concatenates the
        time-averaged MFCC (40), chroma, mel-spectrogram, zero-crossing-rate,
        spectral-contrast and tonnetz features into one vector, which is then
        truncated or zero-padded to ``max_pad_length`` values.

        NOTE(review): with librosa's default settings the concatenated vector
        appears to hold 194 values (40 MFCC + 12 chroma + 128 mel + 1 ZCR +
        7 spectral contrast + 6 tonnetz), so the default ``max_pad_length=100``
        keeps only the MFCC and chroma means plus the first 48 mel bands and
        discards everything else. The default is left unchanged so existing
        saved models keep their input size; pass ``max_pad_length=None`` (or
        194) to keep every feature. Confirm the counts against the installed
        librosa version.

        Args:
            file_path (str): Path to audio file
            max_pad_length (int | None): Length of the returned vector, or
                None to return the full vector without padding/truncation

        Returns:
            numpy.ndarray: Processed audio features or None when the file
            could not be processed
        """
        try:
            audio, sample_rate = librosa.load(file_path, duration=5.0)

            # Per-frame feature matrices (rows = coefficients/bands).
            mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
            chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate)
            mel = librosa.feature.melspectrogram(y=audio, sr=sample_rate)
            zcr = librosa.feature.zero_crossing_rate(y=audio)
            spectral_contrast_features = spectral_contrast(y=audio, sr=sample_rate)
            tonnetz_features = tonnetz(y=audio, sr=sample_rate)

            # Collapse the time axis so every clip yields the same layout.
            features = np.concatenate([
                np.mean(mfccs, axis=1),
                np.mean(chroma, axis=1),
                np.mean(mel, axis=1),
                [np.mean(zcr)],
                np.mean(spectral_contrast_features, axis=1),
                np.mean(tonnetz_features, axis=1)
            ])

            if max_pad_length is None:
                return features

            # Pad/truncate to fixed size
            if len(features) > max_pad_length:
                features = features[:max_pad_length]
            else:
                features = np.pad(features, (0, max_pad_length - len(features)), 'constant')

            return features

        except Exception as e:
            # Deliberately best-effort: one unreadable clip must not abort
            # feature extraction for the whole dataset.
            print(f"Error processing {file_path}: {e}")
            return None

    def prepare_dataset(self, audio_df):
        """
        Prepare dataset for machine learning.

        Files whose feature extraction fails (``extract_audio_features``
        returned None) are left out of the result.

        Args:
            audio_df (pandas.DataFrame): Metadata DataFrame with at least the
                'path' and 'emotion' columns

        Returns:
            tuple: Features, labels, and label encoder
        """
        features = []
        labels = []

        # Only two columns are needed, so iterate them directly instead of
        # materialising a Series per row with iterrows().
        for path, emotion in zip(audio_df['path'], audio_df['emotion']):
            feature = self.extract_audio_features(path)
            if feature is not None:
                features.append(feature)
                labels.append(emotion)

        X = np.array(features)
        y = np.array(labels)

        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        return X, y_encoded, label_encoder

    def create_model(self, input_shape, num_classes):
        """
        Create a CNN-based model for voice emotion detection.

        Two Conv1D/MaxPooling1D stages are followed by a dense layer with
        dropout and a softmax output; the model is compiled with Adam and
        sparse categorical cross-entropy, so labels must be integer-encoded.

        Args:
            input_shape (tuple): Shape of input features; only the first
                entry (the feature-vector length) is used
            num_classes (int): Number of emotion classes

        Returns:
            tensorflow.keras.Model: Compiled CNN model
        """
        model = Sequential([
            # Explicit Input layer instead of passing input_shape to Conv1D,
            # which newer Keras releases warn about.
            tf.keras.Input(shape=(input_shape[0], 1)),
            Conv1D(64, kernel_size=3, activation='relu'),
            MaxPooling1D(pool_size=2),

            Conv1D(128, kernel_size=3, activation='relu'),
            MaxPooling1D(pool_size=2),

            Flatten(),
            Dense(128, activation='relu'),
            Dropout(0.3),

            Dense(num_classes, activation='softmax')
        ])

        model.compile(optimizer=Adam(learning_rate=0.001),
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])

        return model

    def train_and_evaluate(self, X, y, label_encoder, test_size=0.2, val_size=0.2):
        """
        Train and evaluate the CNN-based emotion detection model.

        The data is split three ways. Early stopping and checkpointing
        monitor a validation split carved out of the training data, and the
        test split is used only for the final classification report, so the
        reported metrics are not biased by model selection.

        Args:
            X (numpy.ndarray): Input features
            y (numpy.ndarray): Label data
            label_encoder (sklearn.preprocessing.LabelEncoder): Label encoder
            test_size (float): Test dataset proportion
            val_size (float): Proportion of the remaining training data held
                out for validation during training

        Returns:
            tensorflow.keras.Model: Trained model
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=42
        )
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train, test_size=val_size, stratify=y_train, random_state=42
        )

        # Conv1D expects a trailing channel axis.
        X_train = np.expand_dims(X_train, axis=-1)
        X_val = np.expand_dims(X_val, axis=-1)
        X_test = np.expand_dims(X_test, axis=-1)

        model = self.create_model(input_shape=(X_train.shape[1], 1), num_classes=len(np.unique(y)))

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        model_checkpoint = ModelCheckpoint('best_voice_emotion_model.h5', save_best_only=True)

        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=200,
            batch_size=64,
            callbacks=[early_stopping, model_checkpoint],
            verbose=1
        )

        self.evaluate_model(model, X_test, y_test, label_encoder)

        return model

    def evaluate_model(self, model, X_test, y_test, label_encoder):
        """
        Evaluate model performance.

        Prints a scikit-learn classification report for the predictions on
        ``X_test``.

        Args:
            model (tensorflow.keras.Model): Trained model
            X_test (numpy.ndarray): Test features
            y_test (numpy.ndarray): Test labels (presumably the integer codes
                produced by ``label_encoder`` - verify against caller)
            label_encoder (sklearn.preprocessing.LabelEncoder): Label encoder
        """
        from sklearn.metrics import classification_report

        y_pred = model.predict(X_test)
        y_pred_classes = np.argmax(y_pred, axis=1)

        # Pass the full label set explicitly so the report still lines up
        # with target_names when some class is absent from y_test.
        class_ids = np.arange(len(label_encoder.classes_))

        print("Classification Report:")
        print(classification_report(
            y_test, y_pred_classes,
            labels=class_ids,
            target_names=label_encoder.classes_
        ))

    def run_pipeline(self):
        """
        Run the complete voice emotion detection pipeline.

        Builds the metadata table, prints the per-emotion file counts,
        extracts features, trains and evaluates the model, and finally writes
        'final_voice_emotion_model.h5' and 'voice_emotion_label_encoder.pkl'
        to the current working directory.
        """
        audio_df = self.create_metadata_dataframe()

        print("Dataset Summary:")
        print(audio_df['emotion'].value_counts())

        X, y, label_encoder = self.prepare_dataset(audio_df)

        model = self.train_and_evaluate(X, y, label_encoder)

        model.save('final_voice_emotion_model.h5')
        joblib.dump(label_encoder, 'voice_emotion_label_encoder.pkl')

if __name__ == '__main__':
    # The dataset location is machine-specific: honour the
    # RAVDESS_DATASET_PATH environment variable when it is set and fall back
    # to the original hard-coded location otherwise.
    dataset_path = os.environ.get(
        'RAVDESS_DATASET_PATH',
        r'C:\Users\aksha\Downloads\RAVDESS\audio_speech_actors_01-24',
    )
    detector = AdvancedVoiceEmotionDetector(dataset_path)
    detector.run_pipeline()


Dataset Summary:
emotion
calm        192
happy       192
sad         192
angry       192
fear        192
disgust     192
surprise    192
neutral      96
Name: count, dtype: int64


