In [4]:
import os
import numpy as np
import librosa
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf

# ------------------- Load SAVEE -------------------
def load_savee_audio(dataset_path):
    audio_files, labels = [], []
    label_map = {'a': 'angry', 'd': 'disgust', 'f': 'fear', 'h': 'happy', 'n': 'neutral', 'sa': 'sad', 'su': 'surprise'}
    for file in os.listdir(dataset_path):
        if file.endswith(".wav"):
            parts = file.split('_')
            emotion_code = parts[1][:2] if parts[1][:2] in label_map else parts[1][0]
            if emotion_code in label_map:
                audio_files.append(os.path.join(dataset_path, file))
                labels.append(label_map[emotion_code])
    return audio_files, labels

# ------------------- Feature Extraction -------------------
def extract_mfcc_features(file_path, max_len=100):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
    zcr = librosa.feature.zero_crossing_rate(y)
    rms = librosa.feature.rms(y=y)
    features = np.vstack([mfcc, zcr, rms])  # (43, time_steps)

    if features.shape[1] < max_len:
        pad_width = max_len - features.shape[1]
        features = np.pad(features, pad_width=((0,0),(0, pad_width)), mode='constant')
    else:
        features = features[:, :max_len]
    return features.T  # (time_steps, 43)

# ------------------- Load & Preprocess -------------------
dataset_path = "C:/Users/samhi/OneDrive/문서/College/s6/Speech Processing/Endsem/archive/ALL"
audio_files, labels = load_savee_audio(dataset_path)

X, y_clean = [], []
for file, label in tqdm(zip(audio_files, labels), total=len(audio_files)):
    try:
        feat = extract_mfcc_features(file)
        X.append(feat)
        y_clean.append(label)
    except Exception as e:
        print(f"Failed on {file}: {e}")

X = np.array(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y_clean)
num_classes = len(np.unique(y))

# ------------------- K-Fold CV Setup -------------------
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
acc_scores, f1_scores = [], []

for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y)):
    print(f"\n Fold {fold+1} ---------------------------")
    
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    tf.keras.backend.clear_session()  # Clean previous model from memory

    model = Sequential([
        LSTM(128, return_sequences=False, input_shape=(X.shape[1], X.shape[2])),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=40, batch_size=16, validation_data=(X_test, y_test),
              callbacks=[EarlyStopping(patience=5, restore_best_weights=True)], verbose=0)

    y_pred = np.argmax(model.predict(X_test), axis=1)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    
    acc_scores.append(acc)
    f1_scores.append(f1)

    print(f"Fold {fold+1} Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# ------------------- Final Results -------------------
print("\nFinal Cross-Validation Results:")
print(f"Average Accuracy: {np.mean(acc_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


100%|██████████| 480/480 [00:10<00:00, 44.05it/s]



🌀 Fold 1 ---------------------------
Fold 1 Accuracy: 0.4583, F1 Score: 0.3753
              precision    recall  f1-score   support

       angry       0.25      0.25      0.25        12
     disgust       0.43      0.25      0.32        12
        fear       0.50      0.25      0.33        12
       happy       0.36      0.33      0.35        12
     neutral       0.65      0.92      0.76        24
         sad       0.14      0.08      0.11        12
    surprise       0.42      0.67      0.52        12

    accuracy                           0.46        96
   macro avg       0.39      0.39      0.38        96
weighted avg       0.43      0.46      0.42        96


🌀 Fold 2 ---------------------------
Fold 2 Accuracy: 0.4062, F1 Score: 0.3017
              precision    recall  f1-score   support

       angry       0.33      0.25      0.29        12
     disgust       0.20      0.17      0.18        12
        fear       0.42      0.42      0.42        12
       happy       0.18   