<a href="https://colab.research.google.com/github/TanyaGupta37/music-emotion-recognition-using-emotify-/blob/main/emotify_reduced_5class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#IMPORTS
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [23]:
# LOAD FEATURES

BASE_DIR = "/content/drive/MyDrive/emotify_colab/data"
FEATURES_PATH = os.path.join(BASE_DIR, "emotify_features.csv")

assert os.path.exists(FEATURES_PATH), "❌ emotify_features.csv not found"

features_df = pd.read_csv(FEATURES_PATH)

print("Loaded emotify_features.csv")
print("Shape:", features_df.shape)

print("\nOriginal emotion distribution:")
print(features_df["emotion"].value_counts())

Loaded emotify_features.csv
Shape: (399, 150)

Original emotion distribution:
emotion
relaxing      69
anxious       54
amusing       49
happy         41
energizing    38
annoying      36
dreamy        35
sad           30
neutral       27
joyful        20
Name: count, dtype: int64


In [4]:
# 5-class reduction mapping (repo-faithful simplification)
emotion_map = {
    "happy": "happy",
    "joyful": "happy",

    "energizing": "energetic",
    "amusing": "energetic",

    "relaxing": "calm",
    "dreamy": "calm",

    "sad": "sad",
    "anxious": "sad",

    "neutral": "neutral"
}

features_df["emotion_5class"] = features_df["emotion"].map(emotion_map)

# Drop rows that don't fit reduced setup
features_5class = features_df.dropna(subset=["emotion_5class"]).copy()

print("\n5-class emotion distribution:")
print(features_5class["emotion_5class"].value_counts())


5-class emotion distribution:
emotion_5class
calm         104
energetic     87
sad           84
happy         61
neutral       27
Name: count, dtype: int64


In [24]:
X = features_5class.drop(columns=["emotion", "emotion_5class"]).values
y = features_5class["emotion_5class"].values

le = LabelEncoder()
y_enc = le.fit_transform(y)

print("\nFinal dataset:")
print("X shape:", X.shape)
print("y shape:", y_enc.shape)
print("Classes:", le.classes_)


Final dataset:
X shape: (363, 149)
y shape: (363,)
Classes: ['calm' 'energetic' 'happy' 'neutral' 'sad']


In [25]:
def train_and_evaluate_repo_style(X, y):

    models = [
        (
            "KNN",
            Pipeline([
                ("scaler", StandardScaler()),
                ("knn", KNeighborsClassifier())
            ]),
            {
                "knn__n_neighbors": [3, 5, 7, 9, 11, 15],
                "knn__weights": ["uniform", "distance"]
            }
        ),
        (
            "SVM",
            Pipeline([
                ("scaler", StandardScaler()),
                ("svm", SVC(kernel="rbf", probability=True, class_weight="balanced"))
            ]),
            {
                "svm__C": [0.1, 1, 10, 100],
                "svm__gamma": ["scale", "auto", 0.1, 0.01, 0.001]
            }
        )
    ]

    cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    results = {}

    for name, pipeline, param_grid in models:
        print("\n" + "="*60)
        print(f"TRAINING {name}")
        print("="*60)

        f1_scores = []
        auc_scores = []

        for train_idx, test_idx in cv_outer.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            grid = GridSearchCV(
                pipeline,
                param_grid,
                cv=5,
                scoring="f1_macro",
                n_jobs=-1
            )

            grid.fit(X_train, y_train)
            best_model = grid.best_estimator_

            y_pred = best_model.predict(X_test)
            y_proba = best_model.predict_proba(X_test)

            f1 = f1_score(y_test, y_pred, average="macro")
            auc = roc_auc_score(y_test, y_proba, multi_class="ovr", average="macro")

            f1_scores.append(f1)
            auc_scores.append(auc)

        print(f"{name} F1 (macro):  {np.mean(f1_scores):.4f} ± {np.std(f1_scores):.4f}")
        print(f"{name} ROC-AUC:     {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

        results[name] = {
            "f1_mean": np.mean(f1_scores),
            "f1_std": np.std(f1_scores),
            "auc_mean": np.mean(auc_scores),
            "auc_std": np.std(auc_scores)
        }

    return results

In [26]:
results = train_and_evaluate_repo_style(X, y_enc)

print("\nFINAL RESULTS (5-class, repo-style):")
for model, res in results.items():
    print(f"{model}: ROC-AUC = {res['auc_mean']:.4f} ± {res['auc_std']:.4f}")


TRAINING KNN
KNN F1 (macro):  0.2078 ± 0.0585
KNN ROC-AUC:     0.5384 ± 0.0751

TRAINING SVM
SVM F1 (macro):  0.2659 ± 0.0745
SVM ROC-AUC:     0.5449 ± 0.0672

FINAL RESULTS (5-class, repo-style):
KNN: ROC-AUC = 0.5384 ± 0.0751
SVM: ROC-AUC = 0.5449 ± 0.0672


In [27]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

print("Label mapping:")
for i, c in enumerate(le.classes_):
    print(i, "->", c)


Label mapping:
0 -> calm
1 -> energetic
2 -> happy
3 -> neutral
4 -> sad


In [28]:
# ============================================================
# MLP & DEEP MLP — Repo-style evaluation (5-class)
# ============================================================

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# ------------------------------------------------------------
# MODEL BUILDERS
# ------------------------------------------------------------

def build_mlp(input_dim, num_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


def build_deep_mlp(input_dim, num_classes):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),

        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),

        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),

        layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


# ------------------------------------------------------------
# EVALUATION FUNCTION (same protocol as repo)
# ------------------------------------------------------------

def evaluate_mlp(model_builder, X, y, name):
    print("\n" + "="*60)
    print(f"TRAINING {name}")
    print("="*60)

    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

    accs, f1s, aucs = [], [], []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Scale like repo
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        model = model_builder(X_train.shape[1], len(np.unique(y)))

        es = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=0
        )

        model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=150,
            batch_size=32,
            callbacks=[es],
            verbose=0
        )

        y_pred = model.predict(X_val, verbose=0)
        y_pred_cls = np.argmax(y_pred, axis=1)

        acc = accuracy_score(y_val, y_pred_cls)
        f1 = f1_score(y_val, y_pred_cls, average='macro')

        y_val_bin = label_binarize(y_val, classes=range(len(np.unique(y))))
        auc = roc_auc_score(
            y_val_bin,
            y_pred,
            average='macro',
            multi_class='ovr'
        )

        accs.append(acc)
        f1s.append(f1)
        aucs.append(auc)

        print(f"Fold {fold:2d} | Acc: {acc:.4f} | AUC: {auc:.4f} | F1: {f1:.4f}")

    print("\nFINAL RESULTS")
    print(f"{name} Accuracy: {np.mean(accs):.4f} ± {np.std(accs):.4f}")
    print(f"{name} ROC-AUC:  {np.mean(aucs):.4f} ± {np.std(aucs):.4f}")
    print(f"{name} F1:       {np.mean(f1s):.4f} ± {np.std(f1s):.4f}")

    return {
        "model": name,
        "acc_mean": np.mean(accs),
        "auc_mean": np.mean(aucs),
        "f1_mean": np.mean(f1s)
    }


# ------------------------------------------------------------
# RUN MODELS
# ------------------------------------------------------------

results_mlp = evaluate_mlp(build_mlp, X, y_encoded, "MLP")
results_deep_mlp = evaluate_mlp(build_deep_mlp, X, y_encoded, "Deep MLP")


TRAINING MLP
Fold  1 | Acc: 0.2432 | AUC: 0.4249 | F1: 0.2291




Fold  2 | Acc: 0.2703 | AUC: 0.4715 | F1: 0.2222
Fold  3 | Acc: 0.2703 | AUC: 0.6209 | F1: 0.1887
Fold  4 | Acc: 0.2500 | AUC: 0.5187 | F1: 0.2154
Fold  5 | Acc: 0.3333 | AUC: 0.6105 | F1: 0.2657
Fold  6 | Acc: 0.2222 | AUC: 0.4898 | F1: 0.1795
Fold  7 | Acc: 0.2500 | AUC: 0.4751 | F1: 0.2338
Fold  8 | Acc: 0.2778 | AUC: 0.5160 | F1: 0.1854
Fold  9 | Acc: 0.3611 | AUC: 0.6219 | F1: 0.2504
Fold 10 | Acc: 0.4167 | AUC: 0.6829 | F1: 0.3024

FINAL RESULTS
MLP Accuracy: 0.2895 ± 0.0582
MLP ROC-AUC:  0.5432 ± 0.0802
MLP F1:       0.2273 ± 0.0366

TRAINING Deep MLP
Fold  1 | Acc: 0.3784 | AUC: 0.6214 | F1: 0.2875
Fold  2 | Acc: 0.3514 | AUC: 0.5203 | F1: 0.2837
Fold  3 | Acc: 0.2162 | AUC: 0.4880 | F1: 0.1403
Fold  4 | Acc: 0.3611 | AUC: 0.6228 | F1: 0.3133
Fold  5 | Acc: 0.3333 | AUC: 0.6345 | F1: 0.2627
Fold  6 | Acc: 0.2500 | AUC: 0.4110 | F1: 0.1714
Fold  7 | Acc: 0.3333 | AUC: 0.5926 | F1: 0.2180
Fold  8 | Acc: 0.2500 | AUC: 0.5568 | F1: 0.1354
Fold  9 | Acc: 0.3611 | AUC: 0.6134 | F1: 0

Binary Labels

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, f1_score

In [9]:
features_df = pd.read_csv(
    '/content/drive/MyDrive/emotify_colab/data/emotify_features.csv'
)

print(features_df.shape)
features_df.head()

(399, 150)


Unnamed: 0,emotion,mfcc_1_mean,mfcc_1_std,mfcc_1_skew,mfcc_1_kurt,mfcc_2_mean,mfcc_2_std,mfcc_2_skew,mfcc_2_kurt,mfcc_3_mean,...,contrast_5_kurt,contrast_6_mean,contrast_6_std,contrast_6_skew,contrast_6_kurt,contrast_7_mean,contrast_7_std,contrast_7_skew,contrast_7_kurt,tempo
0,energizing,-176.934021,51.648315,-3.939842,24.410465,160.534607,23.455715,-2.78987,15.800953,-12.08693,...,0.695224,20.330136,3.117594,0.120977,-0.055072,54.3076,2.758529,-3.0972,29.179667,105.46875
1,happy,-142.582596,57.496311,-0.374013,1.064188,136.733994,30.178448,-1.169064,2.450932,-6.355316,...,-0.331663,18.218653,4.191549,1.125802,1.749784,56.314394,2.862256,-1.331646,21.253667,87.592691
2,neutral,-368.321991,91.605103,-0.268982,-0.866436,144.592834,40.722088,-0.651416,1.127332,11.761579,...,0.330838,20.32608,4.752616,0.783627,0.925233,51.838909,5.965837,-0.331636,-0.583815,132.512019
3,dreamy,-161.545044,73.039825,-1.825867,3.071242,159.977997,22.087666,-1.108009,2.239457,-6.941864,...,0.318027,19.293922,3.619988,0.500151,0.352205,53.372939,3.051601,-0.474054,12.74056,112.347147
4,happy,-162.883896,56.854992,-0.585945,0.861974,128.453094,31.562059,-0.817345,1.075834,-11.790378,...,0.182253,19.369635,3.913594,0.727919,0.821872,57.187478,2.84329,-1.968889,27.152725,117.453835


In [10]:
POSITIVE = ['happy', 'joyful', 'amusing', 'energizing']
NEGATIVE = ['sad', 'annoying', 'anxious']

df = features_df.copy()

df = df[df['emotion'].isin(POSITIVE + NEGATIVE)]

df['binary_emotion'] = df['emotion'].apply(
    lambda x: 1 if x in POSITIVE else 0
)

print(df['binary_emotion'].value_counts())

binary_emotion
1    148
0    120
Name: count, dtype: int64


In [11]:
X = df.drop(['emotion', 'binary_emotion'], axis=1).values
y = df['binary_emotion'].values

print(X.shape, y.shape)

(268, 149) (268,)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

def evaluate_binary(model):
    aucs = []
    for train, test in cv.split(X, y):
        Xtr, Xte = X[train], X[test]
        ytr, yte = y[train], y[test]

        model.fit(Xtr, ytr)
        probs = model.predict_proba(Xte)[:,1]
        aucs.append(roc_auc_score(yte, probs))
    return np.mean(aucs), np.std(aucs)

# KNN
knn = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5, weights='distance'))
])

# SVM
svm = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC(C=10, gamma='scale', probability=True))
])

print("KNN AUC:", evaluate_binary(knn))
print("SVM AUC:", evaluate_binary(svm))

KNN AUC: (np.float64(0.5330158730158729), np.float64(0.0825983841930494))
SVM AUC: (np.float64(0.5594841269841269), np.float64(0.128079286638827))


PAPER'S PIPELINE

In [13]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

import librosa

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score


In [14]:
POSITIVE = ['happy', 'joyful', 'amusing', 'energizing']
NEGATIVE = ['sad', 'annoying', 'anxious']

def map_binary_label(emotion):
    if emotion in POSITIVE:
        return 1
    elif emotion in NEGATIVE:
        return 0
    else:
        return None


In [15]:
SAMPLE_RATE = 22050
N_MFCC = 13
SEGMENT_SECONDS = 5

def extract_segment_features(audio_path):
    y, sr = librosa.load(audio_path, sr=SAMPLE_RATE)

    segment_len = SEGMENT_SECONDS * sr
    features = []

    for start in range(0, len(y) - segment_len, segment_len):
        segment = y[start:start + segment_len]

        mfcc = librosa.feature.mfcc(y=segment, sr=sr, n_mfcc=N_MFCC)

        feat = []
        for i in range(N_MFCC):
            feat.append(np.mean(mfcc[i]))
            feat.append(np.std(mfcc[i]))

        features.append(feat)

    return features


In [16]:
audio_base = '/content/drive/MyDrive/emotify_colab/data/audio/emotify'
labels_path = '/content/drive/MyDrive/emotify_colab/data/emotify_labels_matched.csv'

labels_df = pd.read_csv(labels_path)

X_segments = []
y_segments = []

print("Extracting segment-level features...")

for _, row in tqdm(labels_df.iterrows(), total=len(labels_df)):
    binary_label = map_binary_label(row['dominant_emotion'])

    if binary_label is None:
        continue

    audio_path = os.path.join(audio_base, row['filename'])
    if not os.path.exists(audio_path):
        continue

    seg_feats = extract_segment_features(audio_path)

    for f in seg_feats:
        X_segments.append(f)
        y_segments.append(binary_label)

X = np.array(X_segments, dtype=np.float32)
y = np.array(y_segments, dtype=np.int32)

print("Segment-level data:")
print("X shape:", X.shape)
print("y distribution:", np.bincount(y))

Extracting segment-level features...


100%|██████████| 399/399 [04:47<00:00,  1.39it/s]

Segment-level data:
X shape: (3198, 26)
y distribution: [1436 1762]





In [17]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

models = {
    "KNN": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier(n_neighbors=15, weights='distance'))
    ]),
    "SVM": Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC(kernel='rbf', C=10, gamma='scale', probability=True))
    ])
}

for name, model in models.items():
    aucs = []

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model.fit(X_train, y_train)
        y_prob = model.predict_proba(X_test)[:, 1]

        auc = roc_auc_score(y_test, y_prob)
        aucs.append(auc)

    print("="*50)
    print(f"{name} SEGMENT-LEVEL RESULTS")
    print(f"ROC-AUC: {np.mean(aucs):.4f} ± {np.std(aucs):.4f}")

KNN SEGMENT-LEVEL RESULTS
ROC-AUC: 0.8874 ± 0.0175
SVM SEGMENT-LEVEL RESULTS
ROC-AUC: 0.8968 ± 0.0203


In [19]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# =========================================================
# MLP MODELS
# =========================================================

def build_mlp(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss='binary_crossentropy'
    )
    return model


def build_deep_mlp(input_dim):
    model = models.Sequential([
        layers.Input(shape=(input_dim,)),

        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.4),

        layers.Dense(64, activation='relu'),
        layers.Dropout(0.3),

        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(5e-4),
        loss='binary_crossentropy'
    )
    return model


# =========================================================
# EVALUATION FUNCTION (ROC-AUC)
# =========================================================

def evaluate_model(model_builder, X, y, name):
    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    aucs = []

    for fold, (train_idx, test_idx) in enumerate(cv.split(X, y), 1):
        print(f"{name} | Fold {fold}/10")

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        model = model_builder(X_train.shape[1])

        es = callbacks.EarlyStopping(
            monitor='val_loss',
            patience=10,
            restore_best_weights=True,
            verbose=0
        )

        rl = callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=5,
            min_lr=1e-6,
            verbose=0
        )

        model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=100,
            batch_size=64,
            callbacks=[es, rl],
            verbose=0
        )

        y_prob = model.predict(X_test, verbose=0).ravel()
        auc = roc_auc_score(y_test, y_prob)
        aucs.append(auc)

        print(f"  Fold AUC: {auc:.4f}")

    print("="*60)
    print(f"{name} FINAL RESULTS")
    print(f"ROC-AUC: {np.mean(aucs):.4f} ± {np.std(aucs):.4f}")
    print("="*60)

    return np.mean(aucs), np.std(aucs)


# =========================================================
# RUN MODELS
# =========================================================

mlp_auc = evaluate_model(build_mlp, X, y, "MLP")
deep_mlp_auc = evaluate_model(build_deep_mlp, X, y, "Deep MLP")

MLP | Fold 1/10
  Fold AUC: 0.6859
MLP | Fold 2/10
  Fold AUC: 0.6370
MLP | Fold 3/10
  Fold AUC: 0.6091
MLP | Fold 4/10
  Fold AUC: 0.6428
MLP | Fold 5/10
  Fold AUC: 0.5839
MLP | Fold 6/10
  Fold AUC: 0.6866
MLP | Fold 7/10
  Fold AUC: 0.5458
MLP | Fold 8/10
  Fold AUC: 0.6168
MLP | Fold 9/10
  Fold AUC: 0.5508
MLP | Fold 10/10
  Fold AUC: 0.6209
MLP FINAL RESULTS
ROC-AUC: 0.6180 ± 0.0461
Deep MLP | Fold 1/10
  Fold AUC: 0.7092
Deep MLP | Fold 2/10
  Fold AUC: 0.6494
Deep MLP | Fold 3/10
  Fold AUC: 0.7500
Deep MLP | Fold 4/10
  Fold AUC: 0.6568
Deep MLP | Fold 5/10
  Fold AUC: 0.7694
Deep MLP | Fold 6/10
  Fold AUC: 0.6604
Deep MLP | Fold 7/10
  Fold AUC: 0.5136
Deep MLP | Fold 8/10
  Fold AUC: 0.6264
Deep MLP | Fold 9/10
  Fold AUC: 0.5434
Deep MLP | Fold 10/10
  Fold AUC: 0.6304
Deep MLP FINAL RESULTS
ROC-AUC: 0.6509 ± 0.0767
