In [8]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import opensmile
import matplotlib.pyplot as plt
import xgboost as xgb
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import optuna

# Load Data for RAVDESS (recursive)
def load_audio_files_ravdess(dataset_path):
    audio_files = []
    labels = []
    for root, _, files in os.walk(dataset_path):
        for file in files:
            if file.endswith(".wav"):
                emotion_code = int(file.split("-")[2])  # Extract emotion from filename
                labels.append(emotion_code)
                audio_files.append(os.path.join(root, file))
    print(f"Found {len(audio_files)} audio files in total.")
    return audio_files, labels

# Emotion Mapping for RAVDESS
def map_emotions_ravdess(labels):
    emotion_dict = {
        1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
        5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'
    }
    return [emotion_dict[label] for label in labels]

# Feature Extraction using OpenSMILE
def extract_features(file_path):
    smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals
    )
    features = smile.process_file(file_path)
    return features.values.flatten()

# Feature Selection using XGBoost
def feature_selection(X, y):
    xgb_model = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss')
    xgb_model.fit(X, y)
    feature_importances = xgb_model.feature_importances_
    top_features = np.argsort(feature_importances)[-50:]
    return X[:, top_features]

# --------- Load and Process RAVDESS Dataset ---------
dataset_path = "C:/Users/samhi/OneDrive/문서/College/s6/Speech Processing/Endsem/Final codes/ravdees"  # Update if needed

audio_files, labels = load_audio_files_ravdess(dataset_path)
labels_mapped = map_emotions_ravdess(labels)

# Robust Feature Extraction
X = []
y_clean = []
failed_files = []

for file, label in tqdm(zip(audio_files, labels_mapped), total=len(audio_files)):
    try:
        features = extract_features(file)
        if features.size == 0:
            raise ValueError("Empty feature vector")
        X.append(features)
        y_clean.append(label)
    except Exception as e:
        failed_files.append((file, str(e)))

print(f"Extracted features from {len(X)} files.")
print(f"Skipped {len(failed_files)} files due to errors.")

# Convert and preprocess
X = np.array(X)
y = LabelEncoder().fit_transform(y_clean)
X = StandardScaler().fit_transform(X)
X_selected = feature_selection(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Optuna Objective Function
def objective(trial):
    filters_1 = trial.suggest_int('filters_1', 32, 128)
    filters_2 = trial.suggest_int('filters_2', 64, 256)
    kernel_size = trial.suggest_int('kernel_size', 3, 7)
    dropout_rate = trial.suggest_float('dropout_rate', 0.2, 0.5)
    batch_size = trial.suggest_int('batch_size', 16, 64)

    model = Sequential([
        Conv1D(filters_1, kernel_size=kernel_size, activation='relu', input_shape=(X_train.shape[1], 1)),
        MaxPooling1D(pool_size=2),
        Dropout(dropout_rate),
        Conv1D(filters_2, kernel_size=kernel_size, activation='relu'),
        MaxPooling1D(pool_size=2),
        Dropout(dropout_rate),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(dropout_rate),
        Dense(len(set(y)), activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    model.fit(
        X_train.reshape(-1, X_train.shape[1], 1), y_train,
        epochs=20, batch_size=batch_size,
        validation_data=(X_test.reshape(-1, X_test.shape[1], 1), y_test),
        callbacks=[early_stopping], verbose=0
    )

    _, accuracy = model.evaluate(X_test.reshape(-1, X_test.shape[1], 1), y_test, verbose=0)
    return accuracy

# Run Optuna Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

# Best Params
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Final Model Training
model = Sequential([
    Conv1D(best_params['filters_1'], kernel_size=best_params['kernel_size'], activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(best_params['dropout_rate']),
    Conv1D(best_params['filters_2'], kernel_size=best_params['kernel_size'], activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(best_params['dropout_rate']),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(best_params['dropout_rate']),
    Dense(len(set(y)), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(
    X_train.reshape(-1, X_train.shape[1], 1), y_train,
    epochs=50, batch_size=best_params['batch_size'],
    validation_data=(X_test.reshape(-1, X_test.shape[1], 1), y_test),
    callbacks=[early_stopping]
)

# Final Evaluation
eval_result = model.evaluate(X_test.reshape(-1, X_test.shape[1], 1), y_test)
print(f"Final Test Loss: {eval_result[0]:.4f}, Test Accuracy: {eval_result[1]*100:.2f}%")


Found 2880 audio files in total.


100%|██████████| 2880/2880 [16:01<00:00,  2.99it/s]


Extracted features from 2880 files.
Skipped 0 files due to errors.


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[I 2025-04-05 23:15:13,798] A new study created in memory with name: no-name-7e89f760-ef63-4560-b43a-9529ce6f5bb5
[I 2025-04-05 23:15:28,864] Trial 0 finished with value: 0.8368055820465088 and parameters: {'filters_1': 61, 'filters_2': 212, 'kernel_size': 7, 'dropout_rate': 0.2862364872139174, 'batch_size': 32}. Best is trial 0 with value: 0.8368055820465088.
[I 2025-04-05 23:15:50,255] Trial 1 finished with value: 0.6527777910232544 and parameters: {'filters_1': 43, 'filters_2': 156, 'kernel_size': 5, 'dropout_rate': 0.48214337229168935, 'batch_size': 21}. Best is trial 0 with value: 0.8368055820465088.
[I 2025-04-05 23:16:04,736] Trial 2 finished with value: 0.625 and parameters: {'filters_1': 57, 'filters_2': 147, 'kernel_size': 4, 'dropout_rate': 0.4711899526705762, 'batch_size': 53}. Best is trial 0 with value: 0.8368055820465088.
[I 2025-04-05 23:16:29,194] Trial 3 finished with value:

Best Hyperparameters: {'filters_1': 98, 'filters_2': 176, 'kernel_size': 4, 'dropout_rate': 0.23523625332403125, 'batch_size': 16}
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Final Test Loss: 0.3975, Test Accuracy: 91.15%


In [9]:
from sklearn.metrics import f1_score

# Predict class probabilities
y_pred_probs = model.predict(X_test.reshape(-1, X_test.shape[1], 1))

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_probs, axis=1)

# Compute F1 Score
f1 = f1_score(y_test, y_pred, average='macro')

# Print final evaluation results
print(f"Final Test Loss: {eval_result[0]:.4f}")
print(f"Test Accuracy: {eval_result[1]*100:.2f}%")
print(f"Macro F1 Score: {f1:.4f}")


Final Test Loss: 0.3975
Test Accuracy: 91.15%
Macro F1 Score: 0.9105
