In [3]:
import os
import pandas as pd
import numpy as np
import re
import string
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from skopt import BayesSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer


# Input locations inside the Kaggle dataset mount.
ANNOTATIONS_PATH = "/kaggle/input/datasets/zheskychel/pmemo2019/PMEmo2019/annotations/static_annotations.csv"
LYRICS_PATH = "/kaggle/input/datasets/zheskychel/pmemo2019/PMEmo2019/lyrics"

# Load the static annotations and order the rows by track id so that
# positional indices are stable from run to run.
annotations = (
    pd.read_csv(ANNOTATIONS_PATH)
    .sort_values("musicId")
    .reset_index(drop=True)
)


def load_lyrics(music_id, lyrics_dir=None):
    """Return the raw contents of a track's ``.lrc`` lyrics file.

    Parameters
    ----------
    music_id : int or str
        Track identifier; the file read is ``<lyrics_dir>/<music_id>.lrc``.
    lyrics_dir : str or os.PathLike, optional
        Directory holding the lyrics files. When omitted, the module-level
        ``LYRICS_PATH`` is used.

    Returns
    -------
    str
        The file text decoded as UTF-8, or ``""`` when the file is missing,
        cannot be read, or is not valid UTF-8.
    """
    if lyrics_dir is None:
        lyrics_dir = LYRICS_PATH
    path = os.path.join(lyrics_dir, f"{music_id}.lrc")
    try:
        with open(path, encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeDecodeError):
        # Best-effort load: a track without usable lyrics yields an empty
        # string. Only I/O and decoding failures are swallowed, so
        # KeyboardInterrupt and programming errors still propagate.
        return ""

# Attach the raw lyrics text of every track (load_lyrics is called once per
# musicId); any missing value in the result is replaced by an empty string.
annotations["lyrics"] = annotations["musicId"].apply(load_lyrics).fillna("")


#Text cleaning
def clean_text(text):
    """Normalise a lyrics string before it is embedded.

    Applied in order: lower-casing, removal of ``[...]`` bracketed spans
    (presumably the tags found in ``.lrc`` files), removal of URLs, removal
    of digit runs, removal of ASCII punctuation, and finally collapsing of
    whitespace runs to single spaces with the ends trimmed.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text.
    """
    punctuation_class = f"[{re.escape(string.punctuation)}]"
    # Patterns whose matches are deleted outright, in this exact order.
    removals = (
        r"\[.*?\]",
        r"https?://\S+|www\.\S+",
        r"\d+",
        punctuation_class,
    )

    cleaned = text.lower()
    for pattern in removals:
        cleaned = re.sub(pattern, "", cleaned)

    return re.sub(r"\s+", " ", cleaned).strip()

# Cleaned copy of the lyrics used as encoder input.
annotations["clean_lyrics"] = annotations["lyrics"].apply(clean_text)


#BERT embeddings
# One embedding row per annotation row, in the same order as `annotations`.
encoder = SentenceTransformer("all-MiniLM-L6-v2")
X_text = encoder.encode(annotations["clean_lyrics"].tolist(), show_progress_bar=True)

#Targets
y_val, y_aro = (
    annotations[column].values
    for column in ("Valence(mean)", "Arousal(mean)")
)


#Split
# Split row positions rather than the arrays themselves so that the features
# and both targets share exactly the same train/test partition.
idx = np.arange(len(annotations))
idx_train, idx_test = train_test_split(idx, test_size=0.2, random_state=42)

#TEXT only data
# Valence and arousal use the same text features; the separate names keep the
# downstream cells symmetric.
X_train_val, X_test_val = X_text[idx_train], X_text[idx_test]
X_train_aro, X_test_aro = X_text[idx_train], X_text[idx_test]

y_val_train, y_val_test = y_val[idx_train], y_val[idx_test]
y_aro_train, y_aro_test = y_aro[idx_train], y_aro[idx_test]

Batches:   0%|          | 0/24 [00:00<?, ?it/s]

In [4]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from skopt import BayesSearchCV

# Candidate regressors for the hyper-parameter search.
# Each entry maps a display name to (unfitted estimator, search space); the
# search-space keys are estimator parameter names and are prefixed with
# "model__" by the tuning loop before being passed to BayesSearchCV.
# Stochastic estimators are seeded with the same value (42) as the train/test
# split and the search itself so that a re-run reproduces the same scores.
models = {
    'Ridge': (
        Ridge(),
        {'alpha': (1e-3, 1e+3, 'log-uniform')}
    ),

    'SVR': (
        SVR(),
        {
            'C': (1e-3, 1e+3, 'log-uniform'),
            'gamma': (1e-4, 1e-1, 'log-uniform'),
            'kernel': ['rbf']
        }
    ),

    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {
            'n_estimators': (100, 1000),
            'max_depth': (5, 50),
            'min_samples_split': (2, 20),
            'min_samples_leaf': (1, 10)
        }
    ),

    'XGBoost': (
        # NOTE(review): the recorded run prints XGBoost warnings advising to
        # match the data's device to the booster's. The inputs are presumably
        # CPU NumPy arrays while the booster is on 'cuda' - consider
        # device='cpu' or GPU arrays if the warnings matter. TODO confirm.
        XGBRegressor(
            objective='reg:squarederror',
            tree_method='hist',
            device='cuda',
            random_state=42
        ),
        {
            'n_estimators': (100, 1000),
            'max_depth': (3, 15),
            'learning_rate': (1e-2, 3e-1, 'log-uniform'),
            'subsample': (0.5, 1.0),
            'colsample_bytree': (0.5, 1.0)
        }
    ),

    'k-Nearest Neighbors': (
        KNeighborsRegressor(),
        {
            'n_neighbors': (1, 30),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    )
}


def reg_metrics(y_true, y_pred):
    """Compute the regression scores reported throughout the notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``, where RMSE is the square root of the mean
        squared error.
    """
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mse),
        r2_score(y_true, y_pred),
    )

# Accumulators for the valence experiment: one metrics dict per model, and the
# fitted search object for each model keyed by its display name.
results_val = []
trained_models_val = {}

In [5]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every model's row to the results table.
results_val = []
trained_models_val = {}

# Tune every candidate model on the valence target, then score the refitted
# best configuration on the held-out test split.
for name, (model, param_grid) in tqdm(models.items()):
    print(f"\nTraining {name}...")

    # Single-step pipeline; search-space keys are therefore prefixed "model__".
    pipeline = Pipeline([('model', model)])

    opt = BayesSearchCV(
        estimator=pipeline,
        search_spaces={'model__' + k: v for k, v in param_grid.items()},
        n_iter=40,
        scoring='neg_root_mean_squared_error',  # optimise RMSE
        cv=3,
        n_jobs=-1,
        verbose=0,
        random_state=42,
        refit=True,
    )

    opt.fit(X_train_val, y_val_train)

    # With refit=True the search object predicts with the best configuration.
    y_pred = opt.predict(X_test_val)

    mae, rmse, r2 = reg_metrics(y_val_test, y_pred)

    print(f"Model: {name}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

    results_val.append({
        'Model': name,
        'Best Params': opt.best_params_,
        'MAE': float(mae),
        'RMSE': float(rmse),
        'R2': float(r2),
        # Negated RMSE as reported by the scorer (higher is better).
        'Best CV (neg_RMSE)': float(opt.best_score_),
    })

    trained_models_val[name] = opt

# Best (lowest) test RMSE first.
results_val_df = pd.DataFrame(results_val).sort_values(by='RMSE', ascending=True)
display(results_val_df)

  0%|          | 0/5 [00:00<?, ?it/s]


Training Ridge...


 20%|██        | 1/5 [00:30<02:03, 30.85s/it]

Model: Ridge, MAE: 0.1143, RMSE: 0.1452, R2: 0.0988

Training SVR...


 40%|████      | 2/5 [01:06<01:40, 33.57s/it]

Model: SVR, MAE: 0.1128, RMSE: 0.1427, R2: 0.1291

Training Random Forest...


 60%|██████    | 3/5 [31:55<28:45, 862.53s/it]

Model: Random Forest, MAE: 0.1100, RMSE: 0.1396, R2: 0.1665

Training XGBoost...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Model: XGBoost, MAE: 0.1114, RMSE: 0.1411, R2: 0.1494

Training k-Nearest Neighbors...


100%|██████████| 5/5 [44:00<00:00, 528.01s/it]

Model: k-Nearest Neighbors, MAE: 0.1163, RMSE: 0.1458, R2: 0.0914





Unnamed: 0,Model,Best Params,MAE,RMSE,R2,Best CV (neg_RMSE)
2,Random Forest,"{'model__max_depth': 47, 'model__min_samples_l...",0.109988,0.139645,0.166468,-0.151363
3,XGBoost,{'model__colsample_bytree': 0.6414003480362944...,0.111428,0.141069,0.149381,-0.150148
1,SVR,"{'model__C': 3.464647947339935, 'model__gamma'...",0.112765,0.142742,0.129086,-0.150806
0,Ridge,{'model__alpha': 2.149646021368094},0.114299,0.145206,0.098757,-0.149429
4,k-Nearest Neighbors,"{'model__n_neighbors': 24, 'model__p': 1, 'mod...",0.116309,0.145794,0.09144,-0.148892


In [6]:
# Persist the valence results table; index=False leaves the DataFrame index out of the file.
results_val_df.to_csv('valence_regression_text_only_results.csv', index=False)

# Arousal

In [9]:
import ast
from sklearn.pipeline import Pipeline

def parse_best_params(x):
    """Coerce a stored "Best Params" cell into a parameter dict.

    Parameters
    ----------
    x : dict, str or other
        A dict is returned unchanged. A string is stripped and, when
        non-empty, parsed with ``ast.literal_eval``. Anything else is
        treated as "no parameters".

    Returns
    -------
    dict
        The parameters, or ``{}`` for blank strings and unsupported types.
        For a string input the result is whatever literal the string encodes.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` for a malformed string.
    """
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        return {}

    stripped = x.strip()
    return ast.literal_eval(stripped) if stripped else {}

from sklearn.base import clone

#Model -> Best Params (FROM VALENCE TUNING)
# The arousal models are not tuned separately: each one reuses the
# hyper-parameters selected for the same model on the valence target.
best_params_map = {
    row["Model"]: parse_best_params(row["Best Params"])
    for _, row in results_val_df.iterrows()
}

results_aro = []
trained_models_aro = {}

for name, (model, param_grid) in tqdm(models.items()):
    print(f"\nRefit {name} with params from results_val_df...")

    if name not in best_params_map:
        print(f"[SKIP] {name} not found in results_val_df")
        continue

    best_params = best_params_map[name]
    # Work on an unfitted copy: set_params()/fit() would otherwise modify the
    # estimator instance stored in `models`, leaking state into any later
    # cell (or re-run) that uses the same dict.
    pipeline = Pipeline([('model', clone(model))])

    if best_params:
        # Keys already carry the "model__" prefix expected by the pipeline.
        pipeline.set_params(**best_params)

    pipeline.fit(X_train_aro, y_aro_train)
    y_pred = pipeline.predict(X_test_aro)

    mae, rmse, r2 = reg_metrics(y_aro_test, y_pred)

    print(f"Model: {name}, MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

    results_aro.append({
        "Model": name,
        "Best Params": best_params,
        "MAE": float(mae),
        "RMSE": float(rmse),
        "R2": float(r2),
    })

    trained_models_aro[name] = pipeline

# Best (lowest) test RMSE first.
results_aro_df = pd.DataFrame(results_aro).sort_values(by="RMSE", ascending=True).reset_index(drop=True)
display(results_aro_df)

 20%|██        | 1/5 [00:00<00:00,  6.82it/s]


Refit Ridge with params from results_val_df...
Model: Ridge, MAE: 0.1465, RMSE: 0.1803, R2: 0.0412

Refit SVR with params from results_val_df...
Model: SVR, MAE: 0.1477, RMSE: 0.1822, R2: 0.0208

Refit Random Forest with params from results_val_df...


 60%|██████    | 3/5 [00:12<00:09,  4.72s/it]

Model: Random Forest, MAE: 0.1469, RMSE: 0.1800, R2: 0.0438

Refit XGBoost with params from results_val_df...


100%|██████████| 5/5 [00:26<00:00,  5.21s/it]

Model: XGBoost, MAE: 0.1454, RMSE: 0.1810, R2: 0.0335

Refit k-Nearest Neighbors with params from results_val_df...
Model: k-Nearest Neighbors, MAE: 0.1501, RMSE: 0.1871, R2: -0.0330





Unnamed: 0,Model,Best Params,MAE,RMSE,R2
0,Random Forest,"{'model__max_depth': 47, 'model__min_samples_l...",0.146859,0.180035,0.043828
1,Ridge,{'model__alpha': 2.149646021368094},0.146493,0.180285,0.04118
2,XGBoost,{'model__colsample_bytree': 0.6414003480362944...,0.145381,0.181004,0.033509
3,SVR,"{'model__C': 3.464647947339935, 'model__gamma'...",0.147702,0.182188,0.020824
4,k-Nearest Neighbors,"{'model__n_neighbors': 24, 'model__p': 1, 'mod...",0.150113,0.187129,-0.032999


In [10]:
# Persist the arousal results table; index=False leaves the DataFrame index out of the file.
results_aro_df.to_csv('arousal_regression_text_only_results.csv', index=False)

In [11]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix

#Russell model of emotion
def assign_quadrant(valence, arousal, thr=0.5):
    """Map one (valence, arousal) pair to a quadrant label.

    Label coding (a value equal to ``thr`` counts as low):

    * 0 - high valence, high arousal
    * 1 - low valence, high arousal
    * 2 - low valence, low arousal
    * 3 - high valence, low arousal

    An input for which none of the comparisons hold (e.g. NaN) falls
    through to label 0.

    Parameters
    ----------
    valence, arousal : float
        Scores to classify.
    thr : float, default 0.5
        Threshold separating low from high on both axes.

    Returns
    -------
    int
        Quadrant label in ``{0, 1, 2, 3}``.
    """
    if arousal <= thr:
        if valence <= thr:
            return 2
        return 3 if valence > thr else 0
    if arousal > thr and valence <= thr:
        return 1
    return 0

def quadrant_vec(v, a, thr=0.5):
    """Label every (valence, arousal) pair with its quadrant.

    Vectorised counterpart of ``assign_quadrant`` using the same coding:
    0 = high/high, 1 = low valence/high arousal, 2 = low/low,
    3 = high valence/low arousal. Values equal to ``thr`` count as low, and
    pairs for which no comparison holds (e.g. NaN) get label 0.

    Parameters
    ----------
    v, a : array-like
        Valence and arousal values; each is flattened to 1-D. Both must
        contain the same number of elements.
    thr : float, default 0.5
        Threshold separating low from high on both axes.

    Returns
    -------
    numpy.ndarray
        1-D integer array of quadrant labels, one per input pair.

    Raises
    ------
    ValueError
        If ``v`` and ``a`` do not contain the same number of elements.
    """
    v = np.asarray(v).ravel()
    a = np.asarray(a).ravel()
    # Checked explicitly: a length-1 array would otherwise broadcast silently
    # and mask a mismatch between the two prediction vectors.
    if v.shape != a.shape:
        raise ValueError(
            f"v and a must have the same number of elements, got {v.size} and {a.size}"
        )

    low_v, high_v = v <= thr, v > thr
    low_a, high_a = a <= thr, a > thr
    labels = np.select(
        [low_v & low_a, high_v & low_a, low_v & high_a],
        [2, 3, 1],
        default=0,
    )
    return labels.astype(int)
    
# Ground-truth quadrant label of every test sample, derived from the
# annotated valence and arousal test targets.
y_true_quad = quadrant_vec(y_val_test, y_aro_test, thr=0.5)

In [12]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# NOTE(review): this re-declares a helper with the same name and body as the
# one defined earlier in the notebook; the later definition wins.
def reg_metrics(y_true, y_pred):
    """Return ``(mae, rmse, r2)`` for the given targets and predictions.

    RMSE is computed as the square root of the mean squared error.
    """
    scores = (
        mean_absolute_error(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        r2_score(y_true, y_pred),
    )
    return scores

# Rebind both registries to shallow copies before the baselines are added.
trained_models_val = {**trained_models_val}
trained_models_aro = {**trained_models_aro}

#Valence LinearRegression
# Untuned ordinary-least-squares baseline on the same text features.
lin_val = LinearRegression().fit(X_train_val, y_val_train)
v_pred = lin_val.predict(X_test_val)
mae_v, rmse_v, r2_v = reg_metrics(y_val_test, v_pred)
print(f"LinearReg (Valence): MAE={mae_v:.4f} RMSE={rmse_v:.4f} R2={r2_v:.4f}")
trained_models_val["Linear Regression"] = lin_val

#Arousal LinearRegression
lin_aro = LinearRegression().fit(X_train_aro, y_aro_train)
a_pred = lin_aro.predict(X_test_aro)
mae_a, rmse_a, r2_a = reg_metrics(y_aro_test, a_pred)
print(f"LinearReg (Arousal): MAE={mae_a:.4f} RMSE={rmse_a:.4f} R2={r2_a:.4f}")
trained_models_aro["Linear Regression"] = lin_aro

LinearReg (Valence): MAE=0.2078 RMSE=0.2688 R2=-2.0888
LinearReg (Arousal): MAE=0.2647 RMSE=0.3552 R2=-2.7229


In [13]:
# Quadrant classification derived from the two regressors: for each model
# family, predicted valence and arousal are thresholded into quadrant labels
# and compared with the ground-truth quadrants.
results_quad = []

# Only model names that have both a valence and an arousal model.
common_models = sorted(set(trained_models_val.keys()) & set(trained_models_aro.keys()))

for name in common_models:
    model_val = trained_models_val[name]
    model_aro = trained_models_aro[name]

    v_pred = model_val.predict(X_test_val)
    a_pred = model_aro.predict(X_test_aro)

    y_pred_quad = quadrant_vec(v_pred, a_pred, thr=0.5)

    acc = accuracy_score(y_true_quad, y_pred_quad)

    # weighted metrics
    # Precision, recall and F1 come from one call so that all three share the
    # same zero_division handling for quadrants that are never predicted.
    prec, rec, f1, _ = precision_recall_fscore_support(
        y_true_quad,
        y_pred_quad,
        average="weighted",
        zero_division=0
    )

    results_quad.append({
        "Model": name,
        "Accuracy": float(acc),
        "Precision": float(prec),
        "Recall": float(rec),
        "F1": float(f1),
    })

# Best (highest) weighted F1 first.
results_quad_df = (
    pd.DataFrame(results_quad)
    .sort_values("F1", ascending=False)
    .reset_index(drop=True)
)

display(results_quad_df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Ridge,0.642857,0.652909,0.642857,0.579672
1,SVR,0.649351,0.672873,0.649351,0.569161
2,XGBoost,0.642857,0.600383,0.642857,0.567705
3,Random Forest,0.668831,0.489508,0.668831,0.564706
4,Linear Regression,0.558442,0.559673,0.558442,0.556652
5,k-Nearest Neighbors,0.616883,0.667798,0.616883,0.551582


In [14]:
# Persist the quadrant-classification results table; index=False leaves the DataFrame index out of the file.
results_quad_df.to_csv('PMEmo_classification_text_only_results.csv', index=False)