imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor

random seed

In [None]:
# shared random seed; used as the default `random_seed` of train_val_test_split
seed = 42

### Helper functions

In [None]:
# three-way split helper: train / validation / test
def train_val_test_split(X, Y, train_split=0.8, val_split=0.1, test_split=0.1, random_seed=seed):
    """Split X and Y into train, validation and test partitions.

    The test partition is carved off first using ``test_split``; the
    remainder is then divided into train and validation, with the
    validation share computed as ``val_split / (train_split + val_split)``.
    Both splits use the same ``random_seed`` as ``random_state``.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test (in that order).
    """
    # stage 1: hold out the test partition
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, Y, test_size=test_split, random_state=random_seed
    )

    # stage 2: split what is left into train and validation
    val_fraction = val_split / (train_split + val_split)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=random_seed
    )

    return X_train, X_val, X_test, y_train, y_val, y_test
# print regression metrics for the valence and arousal predictions
def evaluation(y_pred, y_test):
    """Print MSE, RMSE, MAE and R^2 for valence (column 0) and arousal (column 1).

    Args:
        y_pred: predicted targets; converted with ``np.array`` and indexed
            as a 2-D array with valence in column 0 and arousal in column 1.
        y_test: true targets, laid out the same way.

    Returns:
        None. The eight metric values are written to stdout.
    """
    predicted = np.array(y_pred)
    actual = np.array(y_test)

    def per_target(metric):
        # apply `metric(true, predicted)` to the valence and arousal columns
        return tuple(metric(actual[:, col], predicted[:, col]) for col in (0, 1))

    mse = per_target(mean_squared_error)
    rmse = tuple(np.sqrt(value) for value in mse)  # RMSE is derived from the MSE
    mae = per_target(mean_absolute_error)
    r2 = per_target(r2_score)

    # (label, value) pairs in the order they are reported
    report = (
        ("Valence MSE:", mse[0]),
        ("Arousal MSE:", mse[1]),
        ("Valence RMSE:", rmse[0]),
        ("Arousal RMSE:", rmse[1]),
        ("Valence MAE:", mae[0]),
        ("Arousal MAE:", mae[1]),
        ("Valence R^2 Score:", r2[0]),
        ("Arousal R^2 Score:", r2[1]),
    )
    for label, value in report:
        print(label, value)

### Preprocessing

In [None]:
# load the processed table that holds the audio columns, lyric columns and targets
df_data = pd.read_csv("../data/processed_multi_modal.csv")

# audio feature subsets; the top-9 list is the top-4 list plus five more columns
audio_features_top4 = ["loudness", "instrumentalness", "time_signature", "energy"]
audio_features_top9 = audio_features_top4 + [
    "danceability", "tempo", "acousticness", "key", "speechiness",
]
audio_features_all = [
    "danceability", "energy", "key", "loudness", "mode", "speechiness",
    "acousticness", "instrumentalness", "liveness", "tempo", "time_signature",
]

# lyric features: compound/neg/pos/neu followed by pca_tfidf0 .. pca_tfidf49
lyric_features = ["compound", "neg", "pos", "neu"] + [
    f"pca_tfidf{i}" for i in range(50)
]

# regression targets
ys_features = ["valence", "arousal"]

# column views: audio only, lyrics only, audio + lyrics, and the targets
df_audio = df_data[audio_features_top9]
df_lyric = df_data[lyric_features]
df_multi = df_data[[*audio_features_top9, *lyric_features]]
df_ys = df_data[ys_features]

### Support Vector Regressor

define dataset

In [None]:
# inputs are the combined audio + lyric columns; targets are valence and arousal
X = df_multi
Y = df_ys

# default proportions of train_val_test_split: 0.8 / 0.1 / 0.1
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(X, Y)

optimizing parameters

In [None]:
# SVR wrapped in MultiOutputRegressor so both target columns go through one estimator
base_regressor = SVR()
svr_model = MultiOutputRegressor(base_regressor)

# candidate hyper-parameters; the "estimator__" prefix addresses the wrapped SVR
param_grid = {
    "estimator__kernel": ["linear", "rbf", "poly"],
    "estimator__C": [1, 5, 10],
}

# cross-validated grid search over every kernel/C combination, scored with R^2
# NOTE(review): the search is fitted on the validation split only, not on the
# training split - confirm this is the intended tuning protocol
grid_search = GridSearchCV(
    estimator=svr_model,
    param_grid=param_grid,
    scoring="r2",
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
grid_search.fit(X_val, y_val)

# parameter combination with the best cross-validated score
best_params = grid_search.best_params_

In [None]:
# pull the selected C and kernel out of the grid-search result
# (the "best_kernal" spelling is kept because the training cell uses this name)
best_C = best_params["estimator__C"]
best_kernal = best_params["estimator__kernel"]

training

In [None]:
# model with the best parameters
# rebuild the multi-output SVR with the hyper-parameters chosen by the grid search
svr_model_best = MultiOutputRegressor(
    SVR(kernel=best_kernal, C=best_C)
)

# fit on the training partition
svr_model_best.fit(X_train, y_train)

prediction

In [None]:
# predict the two targets (valence, arousal) for the test partition
y_pred = svr_model_best.predict(X_test)

evaluation

In [None]:
# print MSE, RMSE, MAE and R^2 for valence and arousal on the test partition
evaluation(y_pred, y_test)