In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

import librosa
import keras

import tensorflow as tf

import os
import multiprocessing
from concurrent.futures import ThreadPoolExecutor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

# Check GPU
tf.config.experimental.list_physical_devices("GPU")

2024-05-25 12:41:31.103061: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-25 12:41:31.206366: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-25 12:41:33.952871: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-25 12:41:34.024488: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devi

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [2]:
TEST_AUDIO = "data/PMEmo2019/chorus/1.mp3"

AUDIO_BASE_DIR = "data/PMEmo2019/chorus"
AUDIO_FILE_PATHS = [
    os.path.join(AUDIO_BASE_DIR, f)
    for f in os.listdir(AUDIO_BASE_DIR)
    if f.endswith(".mp3")
]
AUDIO_FILE_PATHS.sort()
AUDIO_FILE_PATHS = pd.Series(AUDIO_FILE_PATHS)

N_CHROMA = 12
N_MFCC = 20

ANNOTATIONS_PATH = "data/PMEmo2019/annotations/static_annotations.csv"

In [3]:
def get_features(audio_path: str):
    """Extract features from audio file path"""
    
    # y = audio time series, sr = sampling rate
    y, sr = librosa.load(audio_path)

    rms = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=N_CHROMA)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

    result = {
        "audio_path": audio_path.split('/')[-1].replace('.mp3', ''),
        "rms": np.mean(rms),
        "spectral_centroid": np.mean(spec_cent),
        "spectral_bandwidth": np.mean(spec_bw),
        "rolloff": np.mean(rolloff),
        "zero_crossing_rate": np.mean(zcr),
    }
    result.update({f"chroma_stft_{i}": np.mean(value) for i, value in enumerate(chroma_stft)})
    result.update({f"mfcc_{i}": np.mean(value) for i, value in enumerate(mfcc)})

    return result

In [4]:
def map_emotion(valence, arousal):
    """
    Maps a pair of valence and arousal values to their respective emotions 
    based on a simplified version of the James Russell Circumplex Model.

    Parameters:
    - valence: float, ranging from -1 (negative) to 1 (positive)
    - arousal: float, ranging from -1 (low) to 1 (high)

    Returns:
    - str: corresponding emotion
    """

    if valence > 0:
        if arousal > 0:
            return 'Excitement'
        elif arousal < 0:
            return 'Contentment'
        else:
            return 'Pleasure'
    elif valence < 0:
        if arousal > 0:
            return 'Anxiety'
        elif arousal < 0:
            return 'Depression'
        else:
            return 'Sadness'
    else:
        if arousal > 0:
            return 'Surprise'
        elif arousal < 0:
            return 'Relaxation'
        else:
            return 'Neutral'

In [5]:
# %%time
# def process_audio(audio_path):
#     features = get_features(audio_path)
#     return features

# executor = ThreadPoolExecutor(max_workers=8)
# results = list(executor.map(process_audio, AUDIO_FILE_PATHS))
# executor.shutdown()

# df = pd.DataFrame.from_records(results, index="audio_path")
# df.sort_index(inplace=True)
# print(df.shape)
# df.head()

In [6]:
# pd.DataFrame.from_records(results, index="audio_path").to_csv("processed_data/PMEMO_features.csv")

In [7]:
df = pd.read_csv("processed_data/PMEMO_features.csv", index_col="audio_path")
df.sort_index(inplace=True)
print(df.shape)
df.head()

(794, 37)


Unnamed: 0_level_0,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,chroma_stft_0,chroma_stft_1,chroma_stft_2,chroma_stft_3,chroma_stft_4,...,mfcc_10,mfcc_11,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19
audio_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.169082,2890.836966,2651.268143,5890.84462,0.130582,0.361955,0.361912,0.489964,0.378262,0.459774,...,-1.529542,13.145842,1.16366,1.01355,2.104428,3.090441,-2.927471,7.722697,-0.89362,3.353019
4,0.322358,1767.099571,2263.079152,3934.328334,0.035608,0.622393,0.631103,0.596248,0.498622,0.519809,...,-0.031715,7.690466,3.221007,7.677089,-1.522897,6.454397,0.659756,4.768893,4.191209,4.778189
5,0.267482,2220.16506,2445.775813,4816.822881,0.067702,0.597106,0.427622,0.350653,0.423177,0.62523,...,1.90438,2.807237,4.809193,7.284648,1.370242,5.980691,-3.340221,3.21849,1.698107,6.369461
6,0.272988,2528.188515,2522.269217,5239.265694,0.088099,0.393185,0.358507,0.429876,0.537144,0.812145,...,5.453836,6.326462,6.083507,4.645279,-2.078648,1.008032,1.28592,-1.203896,-3.845422,8.045063
7,0.264039,2733.668132,2698.892631,5965.531419,0.12945,0.36614,0.336904,0.53482,0.357994,0.385802,...,-1.243261,-4.94656,-4.711036,-2.958707,-3.341433,0.223408,-5.718211,2.31814,-7.992124,-1.408837


In [8]:
# Importing the annotations
annotations = pd.read_csv(ANNOTATIONS_PATH, index_col=0)
annotations.columns = ["valence", "arousal"]

# Scale values from -1 to 1
annotations = annotations.apply(lambda x: x*2-1, axis=0)
annotations.apply(lambda x: map_emotion(*x), axis=1)
annotations

# annotations = pd.DataFrame(annotations.apply(lambda x: map_emotion(*x), axis=1), columns = ["Emotion"])

Unnamed: 0_level_0,valence,arousal
musicId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.200,0.150
4,-0.475,-0.425
5,-0.700,-0.600
6,0.025,-0.300
7,0.400,0.450
...,...,...
993,0.725,0.525
996,0.750,0.125
997,0.425,0.325
999,0.750,0.550


In [9]:
dataset = df.join(annotations, how="inner")
dataset

Unnamed: 0,rms,spectral_centroid,spectral_bandwidth,rolloff,zero_crossing_rate,chroma_stft_0,chroma_stft_1,chroma_stft_2,chroma_stft_3,chroma_stft_4,...,mfcc_12,mfcc_13,mfcc_14,mfcc_15,mfcc_16,mfcc_17,mfcc_18,mfcc_19,valence,arousal
1,0.169082,2890.836966,2651.268143,5890.844620,0.130582,0.361955,0.361912,0.489964,0.378262,0.459774,...,1.163660,1.013550,2.104428,3.090441,-2.927471,7.722697,-0.893620,3.353019,-0.200,0.150
4,0.322358,1767.099571,2263.079152,3934.328334,0.035608,0.622393,0.631103,0.596248,0.498622,0.519809,...,3.221007,7.677089,-1.522897,6.454397,0.659756,4.768893,4.191209,4.778189,-0.475,-0.425
5,0.267482,2220.165060,2445.775813,4816.822881,0.067702,0.597106,0.427622,0.350653,0.423177,0.625230,...,4.809193,7.284648,1.370242,5.980691,-3.340221,3.218490,1.698107,6.369461,-0.700,-0.600
6,0.272988,2528.188515,2522.269217,5239.265694,0.088099,0.393185,0.358507,0.429876,0.537144,0.812145,...,6.083507,4.645279,-2.078648,1.008032,1.285920,-1.203896,-3.845422,8.045063,0.025,-0.300
7,0.264039,2733.668132,2698.892631,5965.531419,0.129450,0.366140,0.336904,0.534820,0.357994,0.385802,...,-4.711036,-2.958707,-3.341433,0.223408,-5.718211,2.318140,-7.992124,-1.408837,0.400,0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,0.252154,2787.959363,2774.399349,5970.654376,0.136621,0.389290,0.274861,0.279484,0.395146,0.373042,...,-6.638035,-1.547041,-4.795077,5.085094,2.233570,4.489281,1.413055,2.623292,0.725,0.525
996,0.259318,3219.805720,2978.481811,7022.391025,0.157548,0.366884,0.524673,0.434003,0.401083,0.452278,...,-7.473578,-2.538193,-5.982745,0.483504,-1.839442,2.590527,-0.795077,1.818542,0.750,0.125
997,0.254020,3431.287860,3109.811852,7470.139482,0.153635,0.401828,0.351721,0.437601,0.582035,0.372254,...,-2.675682,2.640695,-0.311210,4.219510,0.099793,4.617830,-1.285356,1.154420,0.425,0.325
999,0.293819,3508.823826,3030.884489,7232.344508,0.152268,0.412378,0.408770,0.422136,0.351956,0.418580,...,2.641866,6.987669,5.044839,0.902298,-1.855185,1.242080,-3.471680,4.516253,0.750,0.550


In [10]:
# label_encoder = LabelEncoder()
# # df['color_encoded'] = label_encoder.fit_transform(df['color'])

# label_encoder.fit_transform(dataset['Emotion'])

In [11]:
predicted_features = ["valence", "arousal"]

X = dataset.drop(columns=predicted_features)
y = dataset[predicted_features]

# preprocessing
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

In [12]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(613, 37) (154, 37) (613, 2) (154, 2)


In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.backend as K

input_dim = X_train.shape[1]
output_dim = y_train.shape[1]

# Define a custom metric for R-squared
def r_squared(y_true, y_pred):
    ss_res = K.sum(K.square(y_true - y_pred))
    ss_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - ss_res / (ss_tot + K.epsilon())

# Define the model
model = Sequential()

# Input layer and first hidden layer
model.add(Dense(64, input_dim=input_dim, activation="relu"))

# Additional hidden layers
model.add(Dense(128, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(64, activation="relu"))
# model.add(Dropout(0.3))
model.add(Dense(32, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(16, activation="relu"))
model.add(Dense(4, activation="relu"))

# Output layer
model.add(Dense(output_dim, activation="tanh"))

# Compile the model
# model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mae"])
model.compile(
    optimizer="adam",
    loss="mean_squared_error",
    metrics=["mae", r_squared],
)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-05-25 12:41:34.194472: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-25 12:41:34.194579: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-25 12:41:34.194601: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-25 12:41:34.342433: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA suppo

In [14]:
model.fit(X_train, y_train, epochs=100, batch_size=64, validation_split=0.2)

Epoch 1/100


I0000 00:00:1716621095.927056   28636 service.cc:145] XLA service 0x7f0af8003fe0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1716621095.927102   28636 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2024-05-25 12:41:35.960036: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-25 12:41:36.145471: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m1/8[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m39s[0m 6s/step - loss: 0.1774 - mae: 0.3417 - r_squared: -0.6468

I0000 00:00:1716621100.246405   28636 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 1s/step - loss: 0.1643 - mae: 0.3265 - r_squared: -0.4511 - val_loss: 0.1305 - val_mae: 0.3095 - val_r_squared: -0.1067
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1304 - mae: 0.2965 - r_squared: -0.0901 - val_loss: 0.1114 - val_mae: 0.2837 - val_r_squared: 0.0553
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.1169 - mae: 0.2727 - r_squared: 0.0398 - val_loss: 0.0934 - val_mae: 0.2520 - val_r_squared: 0.2080
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.1092 - mae: 0.2650 - r_squared: 0.1285 - val_loss: 0.0820 - val_mae: 0.2312 - val_r_squared: 0.3040
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0876 - mae: 0.2390 - r_squared: 0.2543 - val_loss: 0.0800 - val_mae: 0.2290 - val_r_squared: 0.3211
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7f0bdc7d2330>

In [15]:
model.evaluate(X_test, y_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 302ms/step - loss: 0.0736 - mae: 0.2126 - r_squared: 0.4000


[0.07732784748077393, 0.22119566798210144, 0.36873260140419006]

In [16]:
model.save("models/audio_model.keras")

# Testing

In [31]:
AUDIO_BASE_DIR = "test_run"
AUDIO_FILE_PATHS = [
    os.path.join(AUDIO_BASE_DIR, f)
    for f in os.listdir(AUDIO_BASE_DIR)
    if f.endswith(".mp3")
]
AUDIO_FILE_PATHS.sort()
AUDIO_FILE_PATHS = pd.Series(AUDIO_FILE_PATHS)

In [36]:
%%time
def process_audio(audio_path):
    features = get_features(audio_path)
    return features

executor = ThreadPoolExecutor(max_workers=8)
results = list(executor.map(process_audio, AUDIO_FILE_PATHS))
executor.shutdown()

df_manual = pd.DataFrame.from_records(results, index="audio_path")
df_manual.sort_index(inplace=True)
print(df_manual.shape)
df_manual.head()

  y, sr = librosa.load(audio_path)


FileNotFoundError: [Errno 2] No such file or directory: 'test_run/nightchanges.mp3'

In [23]:
X = scaler.fit_transform(df)
predictions = model.predict(X)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step


In [29]:
pd.DataFrame(predictions, index=df.index, columns=["valence", "arousal"]).apply(lambda x: map_emotion(*x), axis=1)

audio_path
4kadam                Anxiety
breakup_song       Excitement
woke_up_in_love    Excitement
dtype: object