### Author:   __Snehit__
### E-mail:   *snehitc@gmail.com*

## Setup

In [1]:
import os
import numpy as np
import utils.utils as utils
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import gc
import json

from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from scipy.optimize import minimize

from datasets.fetch_data import get_dataset, get_infdataset
from evaluate import evaluate_all
from features.all_feature_dict import get_all_features_in_dict

In [2]:
import sys
# Add the local ./m2d/ and ./MGA-CLAP directories to the module search path
# (presumably the checkouts used by the M2D_Clap / MGA_Clap feature extractors -- confirm).
sys.path.append('./m2d/')
sys.path.append('./MGA-CLAP')

In [3]:
# Configuration file for this run; it is loaded with utils.load_config in the next cell.
config_filename = "config_submission4.json"

In [4]:
# Load the run configuration through the project helper.
cfg = utils.load_config(config_filename)
# Keep the config file name inside cfg: the "Save config file" cell builds the
# output sub-directory name ('version_<name>') from cfg['config_filename'].
cfg['config_filename'] = config_filename

In [5]:
# Training dataset: file list and wav sub-directory come from the config.
# org_min/org_max are passed as 0.0/10.0 (presumably the range of the raw
# scores before normalisation -- confirm in get_dataset).
train_ds = get_dataset(
                    cfg["train_list"],
                    os.path.join(cfg["wav_dir"], "train"),
                    max_sec=cfg["max_len"],
                    sr=cfg["sample_rate"],
                    org_max=10.0,
                    org_min=0.0
                    )
# Validation dataset, built with the same settings as the training one.
val_ds   = get_dataset(
                    cfg["validation_list"],
                    os.path.join(cfg["wav_dir"], "validation"),
                    max_sec=cfg["max_len"],
                    sr=cfg["sample_rate"],
                    org_max=10.0,
                    org_min=0.0
                    )
# shuffle=False keeps the sample order fixed. GetLabels() compares the label
# arrays of all feature extractors and recommends shuffle=False when they differ.
# drop_last=False so every training sample contributes a feature row.
train_loader = DataLoader(
                    train_ds,
                    batch_size=cfg["batch_size"],
                    shuffle=False,
                    num_workers=cfg["num_workers"],
                    collate_fn=train_ds.collate_fn,
                    drop_last=False
                    )
# Validation loader: fixed order as well, with its own batch size from the config.
val_loader = DataLoader(
                    val_ds, 
                    batch_size=cfg["validation_batch_size"], 
                    shuffle=False,
                    num_workers=cfg["num_workers"], 
                    collate_fn=val_ds.collate_fn
                    )

In [6]:
# Test dataset built with get_infdataset, which (unlike get_dataset above) is
# called without org_min/org_max -- presumably because the test split has no
# labels; confirm in datasets.fetch_data.
test_ds   = get_infdataset(
                    cfg["test_list"],
                    os.path.join(cfg["wav_dir"], "test"),
                    max_sec=cfg["max_len"],
                    sr=cfg["sample_rate"],
                    )
# Fixed order (shuffle=False), same as the train/validation loaders.
test_loader = DataLoader(
                    test_ds, 
                    batch_size=cfg["test_batch_size"], 
                    shuffle=False,
                    num_workers=cfg["num_workers"], 
                    collate_fn=test_ds.collate_fn
                    )

## Train

In [7]:
def GetLabels(label_dict):
    """Return the single label array shared by all entries of ``label_dict``.

    Every value of ``label_dict`` is expected to hold the same labels (one
    copy per feature extractor). Consecutive entries are compared with
    ``np.array_equal`` and the last value is returned.

    Args:
        label_dict: mapping whose values are label arrays.

    Returns:
        The last value stored in ``label_dict``.

    Raises:
        ValueError: if ``label_dict`` is empty, or if two consecutive entries
            do not contain identical labels.
    """
    label_arrays = list(label_dict.values())
    if not label_arrays:
        # Previously this case fell through to `return value` with `value`
        # unbound and surfaced as an opaque UnboundLocalError.
        raise ValueError("label_dict is empty: no labels to return")

    previous = label_arrays[0]
    for current in label_arrays[1:]:
        if not np.array_equal(current, previous):
            raise ValueError("different labels for same data point \nRecommended to set dataloaders with shuffle=False")
        previous = current

    return label_arrays[-1]



def Fit_SVR(Ensemble_set, X_comb, X_comb_val, y, y_val, SVR_input_param, cfg):
    """Fit the preprocessing chain and one SVR, report scores, and save everything.

    Pipeline: sklearn ``Normalizer`` -> ``StandardScaler`` (fitted on the
    training features only) -> ``SVR`` configured from ``SVR_input_param``.
    Predictions and targets are mapped with ``x*5 + 5`` before scoring
    (presumably the inverse of the label normalisation applied by the
    dataset -- confirm against get_dataset).

    Args:
        Ensemble_set: identifier of the ensemble member, forwarded to
            ``utils.Save_Preprocessor_and_SVR``.
        X_comb: training feature matrix.
        X_comb_val: validation feature matrix.
        y: training targets.
        y_val: validation targets.
        SVR_input_param: mapping with the keys 'C', 'kernel', 'epsilon', 'gamma'.
        cfg: run configuration, forwarded to the save helper.

    Returns:
        Tuple ``(train_predictions, val_predictions, n_params)`` where the
        predictions are on the ``x*5 + 5`` scale and ``n_params`` is the number
        of dual coefficients plus intercept terms of the fitted SVR.
    """
    print('Preprocessing...')
    # Normalizer keeps no fitted state; fit() is called only so the object
    # follows the usual fit/transform protocol before it is saved.
    row_normalizer = Normalizer().fit(X_comb)
    train_normed = row_normalizer.transform(X_comb)
    val_normed = row_normalizer.transform(X_comb_val)

    # Standardisation statistics come from the training split only.
    feature_scaler = StandardScaler()
    train_ready = feature_scaler.fit_transform(train_normed)
    val_ready = feature_scaler.transform(val_normed)

    print('Training SVR...')
    svr_kwargs = {name: SVR_input_param[name] for name in ('C', 'kernel', 'epsilon', 'gamma')}
    regressor = SVR(**svr_kwargs)
    regressor.fit(train_ready, y)

    # Score on the training split.
    print('Predicting on Train set...')
    y_pred_train_denorm = regressor.predict(train_ready) * 5 + 5
    train_scores = evaluate_all(y_pred_train_denorm, y * 5 + 5)
    print('\t', train_scores)

    # Score on the validation split.
    print('Predicting on Val set...')
    y_pred_val_denorm = regressor.predict(val_ready) * 5 + 5
    val_scores = evaluate_all(y_pred_val_denorm, y_val * 5 + 5)
    print('\t', val_scores)

    # Persist the fitted preprocessors and the model for the inference section.
    utils.Save_Preprocessor_and_SVR(row_normalizer, feature_scaler, regressor, Ensemble_set, cfg)

    n_params = regressor.dual_coef_.size + regressor.intercept_.size
    return y_pred_train_denorm, y_pred_val_denorm, n_params



def Fit_SVR_A(X_dict, X_dict_val, y_dict, y_dict_val, SVR_input_param, param, cfg):
    """Train ensemble member 'A' on the M2D_Clap, MS_Clap and MGA_Clap features.

    For each of the three encoders, six feature blocks are concatenated along
    axis 1 in a fixed order: AudioFeatures, TextFeatures, Cosine_Sim,
    Cosine_Ang, L2, L1.

    Args:
        X_dict: training features, indexed as ``X_dict[encoder][feature]``.
        X_dict_val: validation features with the same layout.
        y_dict: training labels, reduced with ``GetLabels``.
        y_dict_val: validation labels, reduced with ``GetLabels``.
        SVR_input_param: SVR hyper-parameters forwarded to ``Fit_SVR``.
        param: parameter-count mapping; the entry 'SVR_A' is written in place.
        cfg: run configuration forwarded to ``Fit_SVR``.

    Returns:
        Tuple ``(train_predictions, val_predictions, param)``.
    """
    Ensemble_set = 'A'
    print(f'--- SVR {Ensemble_set} ---')

    encoders = ('M2D_Clap', 'MS_Clap', 'MGA_Clap')
    feature_names = ('AudioFeatures', 'TextFeatures', 'Cosine_Sim', 'Cosine_Ang', 'L2', 'L1')

    def _stack_columns(features):
        # Encoder-major, feature-minor ordering; must match Predict_SVR_A.
        blocks = [features[enc][feat] for enc in encoders for feat in feature_names]
        return np.concatenate(blocks, axis=1)

    X_comb_A = _stack_columns(X_dict)
    X_comb_val_A = _stack_columns(X_dict_val)

    y = GetLabels(y_dict)
    y_val = GetLabels(y_dict_val)
    train_pred, val_pred, n_params = Fit_SVR(
        Ensemble_set, X_comb_A, X_comb_val_A, y, y_val, SVR_input_param, cfg
    )
    param['SVR_' + Ensemble_set] = n_params
    return train_pred, val_pred, param




def Fit_SVR_B(X_dict, X_dict_val, y_dict, y_dict_val, SVR_input_param, param, cfg):
    """Train ensemble member 'B' on the Laion_Clap and Whisper features.

    Columns are concatenated along axis 1 in this order: Laion_Clap
    AudioFeatures, TextFeatures, L1; then Whisper AudioFeatures, TextFeatures.

    Args:
        X_dict: training features, indexed as ``X_dict[encoder][feature]``.
        X_dict_val: validation features with the same layout.
        y_dict: training labels, reduced with ``GetLabels``.
        y_dict_val: validation labels, reduced with ``GetLabels``.
        SVR_input_param: SVR hyper-parameters forwarded to ``Fit_SVR``.
        param: parameter-count mapping; the entry 'SVR_B' is written in place.
        cfg: run configuration forwarded to ``Fit_SVR``.

    Returns:
        Tuple ``(train_predictions, val_predictions, param)``.
    """
    Ensemble_set = 'B'
    print(f'\n--- SVR {Ensemble_set} ---')

    # (encoder, feature) pairs in column order; must match Predict_SVR_B.
    column_spec = (
        ('Laion_Clap', 'AudioFeatures'),
        ('Laion_Clap', 'TextFeatures'),
        ('Laion_Clap', 'L1'),
        ('Whisper', 'AudioFeatures'),
        ('Whisper', 'TextFeatures'),
    )

    def _stack_columns(features):
        return np.concatenate([features[enc][feat] for enc, feat in column_spec], axis=1)

    X_comb_B = _stack_columns(X_dict)
    X_comb_val_B = _stack_columns(X_dict_val)

    y = GetLabels(y_dict)
    y_val = GetLabels(y_dict_val)
    train_pred, val_pred, n_params = Fit_SVR(
        Ensemble_set, X_comb_B, X_comb_val_B, y, y_val, SVR_input_param, cfg
    )
    param['SVR_' + Ensemble_set] = n_params
    return train_pred, val_pred, param



def Param_count(param):
    """Print the sum of all values stored in ``param``.

    Args:
        param: mapping from component name to its parameter count.
    """
    total = sum(param.values(), 0)
    print(f'\nTotal Parameter Count: {total}')


def RMSE(y, y_pred):
    """Root-mean-square error between targets ``y`` and predictions ``y_pred``.

    Both arguments must support element-wise subtraction (e.g. numpy arrays).
    """
    residual = y - y_pred
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)


In [8]:
print("------ Train set ------")
# Feature extraction is expensive (the recorded output shows ~27 min for the
# Whisper pass alone), so results already present in the kernel are reused.
# Extraction is skipped only when ALL three names exist. The previous guard
# chained the `not in globals()` tests with `and`, which skipped extraction as
# soon as any single name was defined and left the others undefined.
if not all(name in globals() for name in ('X_dict', 'y_dict', 'param')):
    X_dict, y_dict, param = get_all_features_in_dict(train_loader, cfg)


------ Train set ------
*** [1/5]Extracting features using M2D_Clap... ***




 using 166 parameters from m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth
 (included audio_proj params: ['audio_proj.sem_token', 'audio_proj.sem_blocks.0.norm1.weight', 'audio_proj.sem_blocks.0.norm1.bias', 'audio_proj.sem_blocks.0.attn.qkv.weight', 'audio_proj.sem_blocks.0.attn.qkv.bias']
 (included text_proj params: []
 (dropped: [] )
<All keys matched successfully>


  0%|          | 0/938 [00:00<?, ?it/s]

 using model.text_encoder from ./m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth


100%|██████████| 938/938 [03:37<00:00,  4.31it/s]


*** [1/5]Finished Extracting features using M2D_Clap ***

*** [2/5]Extracting features using MS_Clap... ***


100%|██████████| 938/938 [01:28<00:00, 10.55it/s]


*** [2/5]Finished Extracting features using MS_Clap ***

*** [3/5]Extracting features using Laion_Clap... ***


100%|██████████| 938/938 [02:43<00:00,  5.74it/s]


*** [3/5]Finished Extracting features using Laion_Clap ***

*** [4/5]Extracting features using MGA_Clap... ***


100%|██████████| 938/938 [00:38<00:00, 24.64it/s]


*** [4/5]Finished Extracting features using MGA_Clap ***

*** [5/5]Extracting features using Whisper... ***


100%|██████████| 938/938 [27:26<00:00,  1.75s/it]

*** [5/5]Finished Extracting features using Whisper ***






In [9]:
print("\n------ Val set ------")
# Reuse validation features already in the kernel, but only when BOTH names
# exist. The previous `and` chain skipped extraction if either was defined.
# NOTE(review): this call rebinds `param` with the value returned for the
# validation loader -- presumably identical to the training one; confirm.
if not all(name in globals() for name in ('X_dict_val', 'y_dict_val')):
    X_dict_val, y_dict_val, param = get_all_features_in_dict(val_loader, cfg)


------ Val set ------
*** [1/5]Extracting features using M2D_Clap... ***
 using 166 parameters from m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth
 (included audio_proj params: ['audio_proj.sem_token', 'audio_proj.sem_blocks.0.norm1.weight', 'audio_proj.sem_blocks.0.norm1.bias', 'audio_proj.sem_blocks.0.attn.qkv.weight', 'audio_proj.sem_blocks.0.attn.qkv.bias']
 (included text_proj params: []
 (dropped: [] )
<All keys matched successfully>


  0%|          | 0/375 [00:00<?, ?it/s]

 using model.text_encoder from ./m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth


100%|██████████| 375/375 [01:04<00:00,  5.78it/s]


*** [1/5]Finished Extracting features using M2D_Clap ***

*** [2/5]Extracting features using MS_Clap... ***


100%|██████████| 375/375 [00:26<00:00, 14.01it/s]


*** [2/5]Finished Extracting features using MS_Clap ***

*** [3/5]Extracting features using Laion_Clap... ***


100%|██████████| 375/375 [01:05<00:00,  5.72it/s]


*** [3/5]Finished Extracting features using Laion_Clap ***

*** [4/5]Extracting features using MGA_Clap... ***


100%|██████████| 375/375 [00:15<00:00, 23.82it/s]


*** [4/5]Finished Extracting features using MGA_Clap ***

*** [5/5]Extracting features using Whisper... ***


100%|██████████| 375/375 [10:59<00:00,  1.76s/it]

*** [5/5]Finished Extracting features using Whisper ***






In [10]:
# Clear GPU memory: run Python garbage collection first so unreferenced
# objects are dropped, then ask PyTorch to release its cached CUDA blocks.
gc.collect()
torch.cuda.empty_cache()

In [11]:
# Fit SVR A and B (each call also saves its preprocessors and model to disk
# and records its parameter count in `param`).
SVR_A_input_param = cfg['SVR']['A']['input_param']
y_pred_train_denorm_A, y_pred_val_denorm_A, param = Fit_SVR_A(X_dict, X_dict_val, y_dict, y_dict_val, SVR_A_input_param, param, cfg)

SVR_B_input_param = cfg['SVR']['B']['input_param']
y_pred_train_denorm_B, y_pred_val_denorm_B, param = Fit_SVR_B(X_dict, X_dict_val, y_dict, y_dict_val,  SVR_B_input_param, param, cfg)

# Optimize Weights for Ensemble
# Targets are mapped with x*5 + 5, the same mapping Fit_SVR applies to its predictions.
y = GetLabels(y_dict)
y_val = GetLabels(y_dict_val)
y_denorm = y*5 + 5
y_denorm_val = y_val*5 + 5
def min_func(K):
    """Training-set RMSE of the weighted sum K[0]*SVR_A + K[1]*SVR_B."""
    y_pred_train = K[0]*y_pred_train_denorm_A + K[1]*y_pred_train_denorm_B
    return RMSE(y_denorm, y_pred_train)
# NOTE(review): the weights are fitted on training-set predictions only and are
# unconstrained (no bounds, no sum-to-one requirement).
res = minimize(min_func, [1/2]*2, method='TNC', tol=1e-6)
if not res.success:
    # Do not silently store weights from a failed optimisation run.
    print(f'WARNING: ensemble weight optimisation did not converge: {res.message}')
W = res.x
# Stored as a plain list so cfg stays JSON-serialisable for the save cell.
cfg['W'] = W.tolist()

# Final Train and Validation Results
# Ensemble predictions are clipped to [0, 10] before scoring.
print('\nEnsemble: Prediction = W_A*SVR_A + W_B*SVR_B')
print('Predicting on Train set...')
y_pred_train = W[0]*y_pred_train_denorm_A + W[1]*y_pred_train_denorm_B
y_pred_train_clipped = np.clip(y_pred_train, 0, 10)
results = evaluate_all(y_pred_train_clipped, y_denorm)
print('\t', results)

print('\nPredicting on Val set...')
y_pred_val = W[0]*y_pred_val_denorm_A + W[1]*y_pred_val_denorm_B
y_pred_val_clipped = np.clip(y_pred_val, 0, 10)
results_val = evaluate_all(y_pred_val_clipped, y_denorm_val)
print('\t', results_val)

Param_count(param)


--- SVR A ---
Preprocessing...
Training SVR...
Predicting on Train set...
	 {'SRCC': 0.9275749329754529, 'LCC': 0.9295733184259705, 'KTAU': 0.7927382587553667, 'MSE': 0.8959294885956849, 'N': 7500}
Predicting on Val set...
	 {'SRCC': 0.6571085882541974, 'LCC': 0.6765082897111823, 'KTAU': 0.4754672570356058, 'MSE': 3.112910980514286, 'N': 3000}
Preprocessors and SVR model saved for ensemble set: ./outputs/version_config_submission4/A/ 


--- SVR B ---
Preprocessing...
Training SVR...
Predicting on Train set...
	 {'SRCC': 0.9364892531730183, 'LCC': 0.9207503439284456, 'KTAU': 0.8277503727054965, 'MSE': 1.0161497807248185, 'N': 7500}
Predicting on Val set...
	 {'SRCC': 0.5789392559840857, 'LCC': 0.5861761963179108, 'KTAU': 0.4134016401376451, 'MSE': 3.7540573304077114, 'N': 3000}
Preprocessors and SVR model saved for ensemble set: ./outputs/version_config_submission4/B/ 


Ensemble: Prediction = W_A*SVR_A + W_B*SVR_B
Predicting on Train set...
	 {'SRCC': 0.944291999198004, 'LCC': 0.938816

In [12]:
# Save config file for trained model
# Target: <output_dir>/version_<config name up to its first '.'>/config.json.
# cfg now also carries the ensemble weights under 'W'.
# NOTE(review): open(..., "w") requires the directory to exist already --
# presumably created by utils.Save_Preprocessor_and_SVR during training; confirm.
config_filepath_filename = os.path.join(cfg['output_dir'] + '/version_' + cfg['config_filename'].split('.')[0], 'config.json')
with open(config_filepath_filename, "w") as json_file:
    json.dump(cfg, json_file, indent=4)

## Inference

In [13]:
# Reload the configuration written by the training section, so inference can
# run from the saved artefacts. This rebinds `cfg`.
# NOTE(review): the sub-directory name and './outputs' are hard-coded here,
# whereas the save cell derives them from cfg['output_dir'] and
# cfg['config_filename'] -- keep the two in sync.
chkpt_subdir_name = 'version_config_submission4'
config_filepath_filename = os.path.join('./outputs', chkpt_subdir_name, 'config.json')
cfg = utils.load_config(config_filepath_filename)

In [14]:
def Predict_SVR(dataset_key, X_comb_test, Normalizer_, StandardScaler_, SVR_, cfg):
    """Run already-fitted preprocessors and an SVR on a feature matrix.

    Args:
        dataset_key: split name, used only in the progress message.
        X_comb_test: feature matrix to predict on.
        Normalizer_: fitted object exposing ``transform``; applied first.
        StandardScaler_: fitted object exposing ``transform``; applied second.
        SVR_: fitted model exposing ``predict``.
        cfg: run configuration (not read by this function).

    Returns:
        The model predictions mapped with ``x*5 + 5``.
    """
    print('Preprocessing...')
    prepared = StandardScaler_.transform(Normalizer_.transform(X_comb_test))

    print(f'Predicting on {dataset_key} set...')
    raw_prediction = SVR_.predict(prepared)
    return raw_prediction * 5 + 5


def Predict_SVR_A(dataset_key, X_dict_test, cfg):
    """Predict with ensemble member 'A' (M2D_Clap, MS_Clap, MGA_Clap features).

    The feature matrix is built with the same column order as in Fit_SVR_A.
    The saved Normalizer, StandardScaler and SVR for set 'A' are then loaded
    via ``utils.Load_Preprocessor_and_SVR`` and applied with ``Predict_SVR``.

    Args:
        dataset_key: split name, forwarded to ``Predict_SVR``.
        X_dict_test: features, indexed as ``X_dict_test[encoder][feature]``.
        cfg: configuration holding the saved artefact names under
            ``cfg['preprocessor']['A']`` and ``cfg['SVR']['A']['name']``.

    Returns:
        Predictions of SVR 'A' as returned by ``Predict_SVR``.
    """
    Ensemble_set = 'A'
    print(f'--- SVR {Ensemble_set} ---')

    encoders = ('M2D_Clap', 'MS_Clap', 'MGA_Clap')
    feature_names = ('AudioFeatures', 'TextFeatures', 'Cosine_Sim', 'Cosine_Ang', 'L2', 'L1')
    # Encoder-major, feature-minor ordering, identical to training.
    X_comb_test_A = np.concatenate(
        [X_dict_test[enc][feat] for enc in encoders for feat in feature_names],
        axis=1,
    )

    preprocessor_names = cfg['preprocessor']['A']
    normalizer_name = preprocessor_names['Normalizer']
    scaler_name = preprocessor_names['StandardScaler']
    svr_name = cfg['SVR']['A']['name']
    normalizer, scaler, svr = utils.Load_Preprocessor_and_SVR(
        normalizer_name, scaler_name, svr_name, Ensemble_set, cfg
    )
    return Predict_SVR(dataset_key, X_comb_test_A, normalizer, scaler, svr, cfg)


def Predict_SVR_B(dataset_key, X_dict_test, cfg):
    """Predict with ensemble member 'B' (Laion_Clap and Whisper features).

    The feature matrix is built with the same column order as in Fit_SVR_B.
    The saved Normalizer, StandardScaler and SVR for set 'B' are then loaded
    via ``utils.Load_Preprocessor_and_SVR`` and applied with ``Predict_SVR``.

    Args:
        dataset_key: split name, forwarded to ``Predict_SVR``.
        X_dict_test: features, indexed as ``X_dict_test[encoder][feature]``.
        cfg: configuration holding the saved artefact names under
            ``cfg['preprocessor']['B']`` and ``cfg['SVR']['B']['name']``.

    Returns:
        Predictions of SVR 'B' as returned by ``Predict_SVR``.
    """
    Ensemble_set = 'B'
    print(f'--- SVR {Ensemble_set} ---')

    # (encoder, feature) pairs in column order, identical to training.
    column_spec = (
        ('Laion_Clap', 'AudioFeatures'),
        ('Laion_Clap', 'TextFeatures'),
        ('Laion_Clap', 'L1'),
        ('Whisper', 'AudioFeatures'),
        ('Whisper', 'TextFeatures'),
    )
    X_comb_test_B = np.concatenate(
        [X_dict_test[enc][feat] for enc, feat in column_spec],
        axis=1,
    )

    preprocessor_names = cfg['preprocessor']['B']
    normalizer_name = preprocessor_names['Normalizer']
    scaler_name = preprocessor_names['StandardScaler']
    svr_name = cfg['SVR']['B']['name']
    normalizer, scaler, svr = utils.Load_Preprocessor_and_SVR(
        normalizer_name, scaler_name, svr_name, Ensemble_set, cfg
    )
    return Predict_SVR(dataset_key, X_comb_test_B, normalizer, scaler, svr, cfg)


In [18]:
print("\n------ Test set ------")
# Reuse test features already present in the kernel. Only X_dict_test is
# checked: the labels slot of the return value is discarded (`_`), so no
# `y_dict_test` is ever created by this notebook. The previous guard also
# tested that name, and would have skipped extraction (leaving X_dict_test
# undefined) if it happened to exist.
# NOTE(review): this call rebinds `param` with the value returned for the test loader.
if 'X_dict_test' not in globals():
    X_dict_test, _, param = get_all_features_in_dict(test_loader, cfg, test_data=True)

# Clear GPU memory
gc.collect()
torch.cuda.empty_cache()


------ Test set ------
*** [1/5]Extracting features using M2D_Clap... ***
 using 166 parameters from m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth
 (included audio_proj params: ['audio_proj.sem_token', 'audio_proj.sem_blocks.0.norm1.weight', 'audio_proj.sem_blocks.0.norm1.bias', 'audio_proj.sem_blocks.0.attn.qkv.weight', 'audio_proj.sem_blocks.0.attn.qkv.bias']
 (included text_proj params: []
 (dropped: [] )
<All keys matched successfully>


  0%|          | 0/375 [00:00<?, ?it/s]

 using model.text_encoder from ./m2d/m2d_clap_vit_base-80x1001p16x16p16kpBpTI-2025/checkpoint-30.pth


100%|██████████| 375/375 [01:29<00:00,  4.18it/s]


*** [1/5]Finished Extracting features using M2D_Clap ***

*** [2/5]Extracting features using MS_Clap... ***


100%|██████████| 375/375 [00:26<00:00, 14.38it/s]


*** [2/5]Finished Extracting features using MS_Clap ***

*** [3/5]Extracting features using Laion_Clap... ***


100%|██████████| 375/375 [01:04<00:00,  5.83it/s]


*** [3/5]Finished Extracting features using Laion_Clap ***

*** [4/5]Extracting features using MGA_Clap... ***


100%|██████████| 375/375 [00:14<00:00, 25.15it/s]


*** [4/5]Finished Extracting features using MGA_Clap ***

*** [5/5]Extracting features using Whisper... ***


100%|██████████| 375/375 [10:58<00:00,  1.76s/it]


*** [5/5]Finished Extracting features using Whisper ***



In [19]:
def Predict_Inference(dataset_key, loader, X_dict_inf, cfg):
    """Run the two-model ensemble on one split and save the predictions.

    The final score is ``cfg['W'][0] * SVR_A + cfg['W'][1] * SVR_B``, clipped
    to [0, 10], and handed to ``utils.Save_Inference``.

    Args:
        dataset_key: split name; used in messages and forwarded to the
            prediction and save helpers.
        loader: data loader of the split, forwarded to ``utils.Save_Inference``.
        X_dict_inf: extracted features of the split.
        cfg: configuration; must contain the ensemble weights under 'W'.
    """
    pred_A = Predict_SVR_A(dataset_key, X_dict_inf, cfg)
    pred_B = Predict_SVR_B(dataset_key, X_dict_inf, cfg)

    print('\nEnsemble: Prediction = W_A*SVR_A + W_B*SVR_B')
    print(f'Predicting on {dataset_key} set...\n')
    weight_A = cfg['W'][0]
    weight_B = cfg['W'][1]
    blended = weight_A * pred_A + weight_B * pred_B

    utils.Save_Inference(np.clip(blended, 0, 10), loader, cfg, dataset_key)


In [16]:
# Inference on Validation set
# Reuses X_dict_val from the Train section, so that section must have run in this kernel.
Predict_Inference('validation', val_loader, X_dict_val, cfg)

--- SVR A ---
Preprocessing...
Predicting on validation set...
--- SVR B ---
Preprocessing...
Predicting on validation set...

Ensemble: Prediction = W_A*SVR_A + W_B*SVR_B
Predicting on validation set...



100%|██████████| 375/375 [00:03<00:00, 117.66it/s]

csv file is saved 
Path: ./outputs/version_config_submission4 
filename: inference_result_for_validation.csv






In [17]:
# Save Evaluation Scores: SRCC, LCC, KTAU, MSE
# %run evaluate.py <inference_csv_path> <ground_truth_csv_path> <save_results_dir>
# NOTE(review): the paths below are hard-coded to 'version_config_submission4';
# keep them in sync with chkpt_subdir_name when the config changes.
%run evaluate.py outputs/version_config_submission4/inference_result_for_validation.csv datasets/XACLE_dataset/meta_data/validation_average.csv outputs/version_config_submission4/

The evaluation will be conducted based on the following two files:
	- /home/u5049807/Ensemble_SVR/outputs/version_config_submission4/inference_result_for_validation.csv
	- /home/u5049807/Ensemble_SVR/datasets/XACLE_dataset/meta_data/validation_average.csv
The result will be saved to: /home/u5049807/Ensemble_SVR/outputs/version_config_submission4/evaluation_result.csv


In [20]:
# Inference on Test set
# Uses X_dict_test produced by the test feature-extraction cell above.
Predict_Inference('test', test_loader, X_dict_test, cfg)

--- SVR A ---
Preprocessing...
Predicting on test set...
--- SVR B ---
Preprocessing...
Predicting on test set...

Ensemble: Prediction = W_A*SVR_A + W_B*SVR_B
Predicting on test set...



100%|██████████| 375/375 [00:02<00:00, 163.18it/s]

csv file is saved 
Path: ./outputs/version_config_submission4 
filename: inference_result_for_test.csv




