In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression


ANNOTATIONS_PATH = "/kaggle/input/pmemo-2019/PMEmo2019/annotations/static_annotations.csv"
FEATURES_PATH = "/kaggle/input/pmemo-2019/PMEmo2019/features/static_features.csv"
annotations = pd.read_csv(ANNOTATIONS_PATH)
features = pd.read_csv(FEATURES_PATH)



features = features.sort_values(by="musicId").reset_index(drop=True)
annotations = annotations.sort_values(by="musicId").reset_index(drop=True)


merged = pd.merge(features, annotations, on='musicId', how='inner')


X = merged.drop(columns=['musicId', 'Arousal(mean)', 'Valence(mean)'])
y_valence = merged['Valence(mean)']
y_arousal = merged['Arousal(mean)']


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


#Valence Features
selector_valence = SelectKBest(score_func=f_regression, k=400)
X_valence = selector_valence.fit_transform(X_scaled, y_valence)


#Arousal Features
selector_arousal = SelectKBest(score_func=f_regression, k=400)
X_arousal = selector_arousal.fit_transform(X_scaled, y_arousal)

In [3]:
X_valence.shape

(767, 400)

In [18]:
import time
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from skopt import BayesSearchCV
from tqdm import tqdm
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor


X_train, X_test, y_train, y_test = train_test_split(
    X_valence, y_valence, test_size=0.2, random_state=42
)


models = {
        
    'XGBoost': (
    XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist'
    ),
        {
            'n_estimators': (100, 1400),
            'max_depth': (3, 25),
            'learning_rate': (0.01, 0.05),
            'subsample': (0.3, 1.0),
            'colsample_bytree': (0.3, 1.0)
        }
    ),
    
    
    'Ridge': (
        Ridge(),
        {'alpha': (1e-3, 1e+3, 'log-uniform')}
    ),
    
    'SVR': (
        SVR(),
        {
            'C': (1e-3, 1e+3, 'log-uniform'),
            'gamma': (1e-4, 1e-1, 'log-uniform'),
            'kernel': ['rbf']
        }
    ),
    
    'Random Forest': (
        RandomForestRegressor(),
        {
            'n_estimators': (100, 1400),
            'max_depth': (5, 70),
            'min_samples_split': (2, 40),
            'min_samples_leaf': (1, 20)
        }
    ),
    
    'kNN': (
        KNeighborsRegressor(),
        {
            'n_neighbors': (1, 35),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    )
}


results_valence = []

for name, (model, param_grid) in tqdm(models.items()):
    print(f"\nTraining {name}")
    
    pipeline = Pipeline([('model', model)])
    
    opt = BayesSearchCV(
        pipeline,
        {'model__' + k: v for k, v in param_grid.items()},
        n_iter=30,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
        random_state=42
    )
    

    start_train = time.time()
    opt.fit(X_train, y_train)
    end_train = time.time()
    train_time = end_train - start_train
    

    start_pred = time.time()
    y_pred = opt.predict(X_test)
    end_pred = time.time()
    pred_time = end_pred - start_pred
    

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Model: {name}, MAE: {mae}, RMSE: {rmse}')
    

    results_valence.append({
        'Model': name,
        'Best Params': opt.best_params_,
        'RMSE': rmse,
        'MAE': mae,
        'Train Time (s)': train_time,
        'Inference Time (s)': pred_time
    })

results_valence_df = pd.DataFrame(results_valence)
results_valence_df = results_valence_df.sort_values(by='RMSE').reset_index(drop=True)

print("Valence")
display(results_valence_df)

results_valence_df.to_csv('valence_regression_results.csv', index=False)

  0%|          | 0/6 [00:00<?, ?it/s]


Training XGBoost


 17%|█▋        | 1/6 [36:14<3:01:11, 2174.30s/it]

Model: XGBoost, MAE: 0.08768523454385006, RMSE: 0.11183627088677248

Training Linear Regression





ValueError: The search_spaces parameter should contain at least onenon-empty search space, got {}

In [20]:
models_nnn = {
    
    'Ridge': (
        Ridge(),
        {'alpha': (1e-3, 1e+3, 'log-uniform')}
    ),
    
    'SVR': (
        SVR(),
        {
            'C': (1e-3, 1e+3, 'log-uniform'),
            'gamma': (1e-4, 1e-1, 'log-uniform'),
            'kernel': ['rbf']
        }
    ),
    
    'Random Forest': (
        RandomForestRegressor(),
        {
            'n_estimators': (100, 1400),
            'max_depth': (5, 70),
            'min_samples_split': (2, 40),
            'min_samples_leaf': (1, 20)
        }
    ),
    
    'kNN': (
        KNeighborsRegressor(),
        {
            'n_neighbors': (1, 35),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    )
}

for name, (model, param_grid) in tqdm(models_nnn.items()):
    print(f"\nTraining {name}")
    
    pipeline = Pipeline([('model', model)])
    
    opt = BayesSearchCV(
        pipeline,
        {'model__' + k: v for k, v in param_grid.items()},
        n_iter=30,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
        random_state=42
    )
    

    start_train = time.time()
    opt.fit(X_train, y_train)
    end_train = time.time()
    train_time = end_train - start_train
    

    start_pred = time.time()
    y_pred = opt.predict(X_test)
    end_pred = time.time()
    pred_time = end_pred - start_pred
    

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Model: {name}, MAE: {mae}, RMSE: {rmse}')
    

    results_valence.append({
        'Model': name,
        'Best Params': opt.best_params_,
        'RMSE': rmse,
        'MAE': mae,
        'Train Time (s)': train_time,
        'Inference Time (s)': pred_time
    })

results_valence_df = pd.DataFrame(results_valence)
results_valence_df = results_valence_df.sort_values(by='RMSE').reset_index(drop=True)

print("Valence")
display(results_valence_df)

results_valence_df.to_csv('valence_regression_results.csv', index=False)

  0%|          | 0/4 [00:00<?, ?it/s]


Training Ridge


 25%|██▌       | 1/4 [00:17<00:51, 17.27s/it]

Model: Ridge, MAE: 0.08956046894279142, RMSE: 0.11498066749493462

Training SVR


 50%|█████     | 2/4 [00:35<00:35, 17.67s/it]

Model: SVR, MAE: 0.08996119857800695, RMSE: 0.1138006814503485

Training Random Forest


 75%|███████▌  | 3/4 [20:40<09:19, 559.78s/it]

Model: Random Forest, MAE: 0.08936826175554394, RMSE: 0.11312013027052063

Training kNN


100%|██████████| 4/4 [20:56<00:00, 314.01s/it]

Model: kNN, MAE: 0.0919506510873016, RMSE: 0.11516823442278097
Valence





Unnamed: 0,Model,Best Params,RMSE,MAE,Train Time (s),Inference Time (s)
0,XGBoost,{'model__colsample_bytree': 0.9400391622985333...,0.111836,0.087685,2174.280951,0.005671
1,Random Forest,"{'model__max_depth': 24, 'model__min_samples_l...",0.11312,0.089368,1204.842289,0.046436
2,SVR,"{'model__C': 1.7322493477589913, 'model__gamma...",0.113801,0.089961,17.918732,0.014295
3,Ridge,{'model__alpha': 299.7879984490512},0.114981,0.08956,17.265595,0.002172
4,kNN,"{'model__n_neighbors': 31, 'model__p': 2, 'mod...",0.115168,0.091951,15.874228,0.025952


In [22]:
models_nnn = {

    'XGBoost': (
    XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist'
    ),
        {
            'n_estimators': (100, 1400),
            'max_depth': (3, 25),
            'learning_rate': (0.01, 0.05),
            'subsample': (0.3, 1.0),
            'colsample_bytree': (0.3, 1.0)
        }
    ),
    
    'Ridge': (
        Ridge(),
        {'alpha': (1e-3, 1e+3, 'log-uniform')}
    ),
    
    'SVR': (
        SVR(),
        {
            'C': (1e-3, 1e+3, 'log-uniform'),
            'gamma': (1e-4, 1e-1, 'log-uniform'),
            'kernel': ['rbf']
        }
    ),
    
    'Random Forest': (
        RandomForestRegressor(),
        {
            'n_estimators': (100, 1400),
            'max_depth': (5, 70),
            'min_samples_split': (2, 40),
            'min_samples_leaf': (1, 20)
        }
    ),
    
    'kNN': (
        KNeighborsRegressor(),
        {
            'n_neighbors': (1, 35),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    )
}

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X_arousal, y_arousal, test_size=0.2, random_state=42
)

results_arousal = []

for name, (model, param_grid) in tqdm(models_nnn.items()):
    print(f"\n Training: {name}")
    
    pipeline = Pipeline([('model', model)])
    
    opt = BayesSearchCV(
        pipeline,
        {'model__' + k: v for k, v in param_grid.items()},
        n_iter=30,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
        random_state=42
    )
    

    start_train = time.time()
    opt.fit(X_train, y_train)
    end_train = time.time()
    train_time = end_train - start_train
    

    start_pred = time.time()
    y_pred = opt.predict(X_test)
    end_pred = time.time()
    pred_time = end_pred - start_pred
    

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Model: {name}, MAE: {mae}, RMSE: {rmse}')
    

    results_arousal.append({
        'Model': name,
        'Best Params': opt.best_params_,
        'RMSE': rmse,
        'MAE': mae,
        'Train Time (s)': train_time,
        'Inference Time (s)': pred_time
    })



results_arousal_df = pd.DataFrame(results_arousal)
results_arousal_df = results_arousal_df.sort_values(by='RMSE').reset_index(drop=True)

print("Arousal:")
display(results_arousal_df)

results_arousal_df.to_csv('arousal_regression_results.csv', index=False)

  0%|          | 0/5 [00:00<?, ?it/s]


 Training: XGBoost


 20%|██        | 1/5 [36:13<2:24:55, 2173.87s/it]

Model: XGBoost, MAE: 0.07280545247347336, RMSE: 0.09845101464890176

 Training: Ridge


 40%|████      | 2/5 [36:29<45:12, 904.12s/it]   

Model: Ridge, MAE: 0.0731724524807826, RMSE: 0.09506634981962096

 Training: SVR


 60%|██████    | 3/5 [36:47<16:39, 499.75s/it]

Model: SVR, MAE: 0.07295508784469505, RMSE: 0.09531350713696297

 Training: Random Forest


 80%|████████  | 4/5 [54:02<11:50, 710.87s/it]

Model: Random Forest, MAE: 0.07406369532795407, RMSE: 0.09979286340357696

 Training: kNN


100%|██████████| 5/5 [54:18<00:00, 651.73s/it]

Model: kNN, MAE: 0.07520338661056082, RMSE: 0.10087263455671898
Arousal:





Unnamed: 0,Model,Best Params,RMSE,MAE,Train Time (s),Inference Time (s)
0,Ridge,{'model__alpha': 569.0560928291766},0.095066,0.073172,15.283967,0.003298
1,SVR,"{'model__C': 1.7822044492040485, 'model__gamma...",0.095314,0.072955,18.520894,0.011618
2,XGBoost,"{'model__colsample_bytree': 0.708964133941113,...",0.098451,0.072805,2173.85864,0.003211
3,Random Forest,"{'model__max_depth': 70, 'model__min_samples_l...",0.099793,0.074064,1034.451573,0.04665
4,kNN,"{'model__n_neighbors': 16, 'model__p': 1, 'mod...",0.100873,0.075203,16.38631,0.023483


In [24]:
results_arousal_df

Unnamed: 0,Model,Best Params,RMSE,MAE,Train Time (s),Inference Time (s)
0,Ridge,{'model__alpha': 569.0560928291766},0.095066,0.073172,15.283967,0.003298
1,SVR,"{'model__C': 1.7822044492040485, 'model__gamma...",0.095314,0.072955,18.520894,0.011618
2,XGBoost,"{'model__colsample_bytree': 0.708964133941113,...",0.098451,0.072805,2173.85864,0.003211
3,Random Forest,"{'model__max_depth': 70, 'model__min_samples_l...",0.099793,0.074064,1034.451573,0.04665
4,kNN,"{'model__n_neighbors': 16, 'model__p': 1, 'mod...",0.100873,0.075203,16.38631,0.023483


In [None]:
results_arousal_df
results_valence_df

# Final Models Training

In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

idx = np.arange(len(X_arousal))

idx_train, idx_test = train_test_split(
    idx,
    test_size=0.1,
    random_state=42
)

X_train_aro = X_arousal[idx_train]
X_test_aro  = X_arousal[idx_test]

y_train_aro = y_arousal[idx_train]
y_test_aro = y_arousal[idx_test]



X_train_val = X_valence[idx_train]
X_test_val  = X_valence[idx_test]

y_train_val = y_valence[idx_train]
y_test_val = y_valence[idx_test]

In [35]:
import ast
import joblib
from sklearn.pipeline import Pipeline

def _ensure_dict(x):
    if isinstance(x, dict):
        return x
    if isinstance(x, str):
        x = x.strip()
        if not x:
            return {}
        try:
            return ast.literal_eval(x)
        except Exception:
            raise ValueError(f"Cannot parse Best Params string: {x[:200]}")
    return {}

def refit_best_models(results_df, models_dict, X_train, y_train, *,
                      save_dir=None, prefix=""):

    refit = {}

    for _, row in results_df.iterrows():
        name = row["Model"]
        best_params = _ensure_dict(row["Best Params"])

        if name not in models_dict:
            print(f"[WARN] '{name}' нет в models_dict — пропускаю")
            continue

        base_estimator = models_dict[name][0]
        pipe = Pipeline([("model", base_estimator)])


        if best_params:
            pipe.set_params(**best_params)

        pipe.fit(X_train, y_train)

        refit[name] = pipe

        if save_dir is not None:
            safe_name = name.replace(" ", "_").replace("/", "_")
            joblib.dump(pipe, f"{save_dir}/{prefix}{safe_name}.joblib")

    return refit


In [36]:
best_models_arousal = refit_best_models(
    results_df=results_arousal_df,
    models_dict=models_nnn,
    X_train=X_train_aro,
    y_train=y_train_aro,
    save_dir=None,
    prefix="arousal_"
)

In [37]:
best_models_valence = refit_best_models(
    results_df=results_valence_df,
    models_dict=models_nnn,
    X_train=X_train_val,
    y_train=y_train_val,
    save_dir=None,
    prefix="valence_"
)

In [38]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix

#Russell model of emotion
def assign_quadrant(valence, arousal, thr=0.5):
    if valence <= thr and arousal <= thr:
        return 2
    elif valence > thr and arousal <= thr:
        return 3
    elif valence <= thr and arousal > thr:
        return 1
    else:
        return 0

def quadrant_vec(v, a, thr=0.5):
    v = np.asarray(v).ravel()
    a = np.asarray(a).ravel()
    return np.array([assign_quadrant(v[i], a[i], thr=thr) for i in range(len(v))], dtype=int)


In [39]:
y_true_quad = quadrant_vec(y_test_val, y_test_aro, thr=0.5)

In [40]:
results_quad = []

common_models = sorted(set(best_models_valence.keys()) & set(best_models_arousal.keys()))

for name in common_models:
    model_val = best_models_valence[name]
    model_aro = best_models_arousal[name]

    v_pred = model_val.predict(X_test_val)
    a_pred = model_aro.predict(X_test_aro)

    y_pred_quad = quadrant_vec(v_pred, a_pred, thr=0.5)

    acc = accuracy_score(y_true_quad, y_pred_quad)

    # weighted metrics
    f1 = f1_score(y_true_quad, y_pred_quad, average="weighted")
    prec, rec, _, _ = precision_recall_fscore_support(
        y_true_quad,
        y_pred_quad,
        average="weighted",
        zero_division=0
    )

    results_quad.append({
        "Model": name,
        "Accuracy": float(acc),
        "Precision": float(prec),
        "Recall": float(rec),
        "F1": float(f1),
    })

results_quad_df = (
    pd.DataFrame(results_quad)
    .sort_values("F1", ascending=False)
    .reset_index(drop=True)
)

display(results_quad_df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,SVR,0.675325,0.70547,0.675325,0.657283
1,XGBoost,0.675325,0.670199,0.675325,0.650618
2,Ridge,0.649351,0.685862,0.649351,0.632791
3,Random Forest,0.662338,0.763943,0.662338,0.627722
4,kNN,0.649351,0.520276,0.649351,0.575765


In [41]:
results_quad_df.to_csv('PMEmo_classification_from_regression_weighted.csv', index=False)

# Linear regression

In [8]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support, confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

res_df = pd.read_csv('/kaggle/input/pmemo-cls-wighted-results/PMEmo_classification_from_regression_weighted.csv')

#Russell model of emotion
def assign_quadrant(valence, arousal, thr=0.5):
    if valence <= thr and arousal <= thr:
        return 2
    elif valence > thr and arousal <= thr:
        return 3
    elif valence <= thr and arousal > thr:
        return 1
    else:
        return 0

def quadrant_vec(v, a, thr=0.5):
    v = np.asarray(v).ravel()
    a = np.asarray(a).ravel()
    return np.array([assign_quadrant(v[i], a[i], thr=thr) for i in range(len(v))], dtype=int)
    
y_true_quad = quadrant_vec(y_test_val, y_test_aro, thr=0.5)

In [17]:
def reg_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2
    

#Valence LinearRegression
lin_val = LinearRegression()
lin_val.fit(X_train_val, y_train_val)
v_pred = lin_val.predict(X_test_val)
mae_v, rmse_v, r2_v = reg_metrics(y_test_val, v_pred)
print(f"LinearReg (Valence): MAE={mae_v:.4f} RMSE={rmse_v:.4f} R2={r2_v:.4f}")


#Arousal LinearRegression
lin_aro = LinearRegression()
lin_aro.fit(X_train_aro, y_train_aro)
a_pred = lin_aro.predict(X_test_aro)
mae_a, rmse_a, r2_a = reg_metrics(y_test_aro, a_pred)
print(f"LinearReg (Arousal): MAE={mae_a:.4f} RMSE={rmse_a:.4f} R2={r2_a:.4f}")


LinearReg (Valence): MAE=0.1365 RMSE=0.1694 R2=-0.0761
LinearReg (Arousal): MAE=0.1186 RMSE=0.1684 R2=0.1327


In [18]:
results_quad = []

# --- predictions ---
v_pred = lin_val.predict(X_test_val)
a_pred = lin_aro.predict(X_test_aro)

# --- quadrant conversion ---
y_pred_quad = quadrant_vec(v_pred, a_pred, thr=0.5)

# --- classification metrics ---
acc = accuracy_score(y_true_quad, y_pred_quad)

f1 = f1_score(y_true_quad, y_pred_quad, average="weighted")

prec, rec, _, _ = precision_recall_fscore_support(
    y_true_quad,
    y_pred_quad,
    average="weighted",
    zero_division=0
)

results_quad.append({
    "Model": "Linear Regression",
    "Accuracy": float(acc),
    "Precision": float(prec),
    "Recall": float(rec),
    "F1": float(f1),
})

results_quad_df = pd.DataFrame(results_quad)

display(results_quad_df)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Linear Regression,0.649351,0.740628,0.649351,0.681641


In [19]:
results = pd.concat([res_df, results_quad_df]).sort_values("Accuracy", ascending=False).reset_index(drop = True)
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,SVR,0.675325,0.70547,0.675325,0.657283
1,XGBoost,0.675325,0.670199,0.675325,0.650618
2,Random Forest,0.662338,0.763943,0.662338,0.627722
3,Ridge,0.649351,0.685862,0.649351,0.632791
4,kNN,0.649351,0.520276,0.649351,0.575765
5,Linear Regression,0.649351,0.740628,0.649351,0.681641


In [20]:
results.to_csv('PMEmo_audio.csv')