In [1]:
import os
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import whisper
import wandb
from transformers import AutoTokenizer, AutoModel
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

In [2]:
# --- Device Setup ---
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda:1


In [3]:
# --- Load and Unfreeze Whisper‑base ---
whisper_model = whisper.load_model("base").to(device)
for param in whisper_model.parameters():
    param.requires_grad = True
whisper_model.train()

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-5): 6 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=512, out_features=512, bias=True)
          (key): Linear(in_features=512, out_features=512, bias=False)
          (value): Linear(in_features=512, out_features=512, bias=True)
          (out): Linear(in_features=512, out_features=512, bias=True)
        )
        (attn_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=512, out_features=2048, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (mlp_ln): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm((512,), eps=1e-05,

In [4]:
# --- Load Russian BERT ---
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
bert_model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased").to(device).eval()

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# --- Initialize wandb ---
wandb.init(project="somos-ensemble2-ssl-sbs", name="finetune-whisper_b+ruBERT+sbs")
!wandb online

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: rtfiof (rtfiof-hse-university). Use `wandb login --relogin` to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

W&B online. Running your script from this directory will now sync to the cloud.


In [6]:
# --- Russian SBS Dataset Class ---
class RussianSBSDataset(Dataset):
    def __init__(self, df, base_dir, subset=False):
        """
        df: A pandas DataFrame containing the data.
        base_dir: Base directory for audio files.
        subset: If True, only a fraction of the data is used.
        
        Assumes the DataFrame has the following columns:
         - wav_path: path to the high-quality audio (always better)
         - gen_wav_path: path to the generated or lower-quality audio
         - txt: text corresponding to the audio sample
        """
        self.df = df.copy()
        if subset:
            self.df = self.df.sample(frac=0.05, random_state=42).reset_index(drop=True)
        self.base_dir = base_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        # For each row, the first audio is the original (better) one and the second is the generated version.
        audio1_path = os.path.join(self.base_dir, row["wav_path"])
        audio2_path = os.path.join(self.base_dir, row["gen_wav_path"])
        
        # Use the provided text (strip extra spaces or newline characters)
        text = row["txt"].strip()
        
        # Since wav_path is always better, we assign fixed SBS scores:
        # For example: 1.0 for the original, 0.0 for the generated.
        sbs1 = 1.0
        sbs2 = 0.0
        
        return audio1_path, audio2_path, text, sbs1, sbs2

In [7]:
# --- Collate Function for SBS ---
def collate_fn_sbs(batch):
    audio1_paths, audio2_paths, texts, sbs1_list, sbs2_list = zip(*batch)
    
    # Process first audio of each pair.
    audios1 = [whisper.load_audio(path) for path in audio1_paths]
    audios1 = [whisper.pad_or_trim(audio) for audio in audios1]
    mels1 = [whisper.log_mel_spectrogram(audio).to(device) for audio in audios1]
    mels1 = torch.stack(mels1)
    # Get audio embeddings (mean-pooled over time).
    audio1_emb = whisper_model.encoder(mels1).mean(dim=1)
    
    # Process second audio of each pair.
    audios2 = [whisper.load_audio(path) for path in audio2_paths]
    audios2 = [whisper.pad_or_trim(audio) for audio in audios2]
    mels2 = [whisper.log_mel_spectrogram(audio).to(device) for audio in audios2]
    mels2 = torch.stack(mels2)
    audio2_emb = whisper_model.encoder(mels2).mean(dim=1)
    
    # Process the text once per pair.
    inputs = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        text_emb = bert_model(**inputs).last_hidden_state[:, 0, :]
        
    sbs1_tensor = torch.tensor(sbs1_list, dtype=torch.float).to(device)
    sbs2_tensor = torch.tensor(sbs2_list, dtype=torch.float).to(device)
    
    # Return the embeddings along with the original file paths.
    return audio1_emb, audio2_emb, text_emb, sbs1_tensor, sbs2_tensor, audio1_paths, audio2_paths


In [14]:
# --- Weak Learners (same as before) ---
class WeakLearners(nn.Module):
    def __init__(self, audio_dim, text_dim, device="cuda"):
        super(WeakLearners, self).__init__()
        self.audio_dim = audio_dim
        self.text_dim = text_dim
        self.device = device

        self.ridge_regressor = Ridge(alpha=1.0)
        self.svr = SVR()
        self.dtr = DecisionTreeRegressor()

        self.fitted = False

    def fit(self, train_loader):
        print("Fitting weak learners on SBS data...")
        all_audio_emb, all_text_emb, all_labels = [], [], []
        # For each pair in the batch, treat the first and second audio separately.
        for audio1_emb, audio2_emb, text_emb, sbs1, sbs2, _, _ in tqdm(train_loader, desc="Extracting embeddings", unit="batch"):
            audio1_np = audio1_emb.cpu().detach().numpy()
            audio2_np = audio2_emb.cpu().detach().numpy()
            text_np = text_emb.cpu().detach().numpy()
            sbs1_np = sbs1.cpu().detach().numpy()
            sbs2_np = sbs2.cpu().detach().numpy()
            
            # Append first audio example.
            all_audio_emb.append(audio1_np)
            all_text_emb.append(text_np)
            all_labels.append(sbs1_np)
            
            # Append second audio example.
            all_audio_emb.append(audio2_np)
            all_text_emb.append(text_np)
            all_labels.append(sbs2_np)
        
        all_audio_emb = np.vstack(all_audio_emb)
        all_text_emb = np.vstack(all_text_emb)
        all_labels = np.hstack(all_labels)
        
        # Combine audio and text embeddings.
        combined_embeddings = np.hstack((all_audio_emb, all_text_emb))
        
        # Train each weak learner.
        for model, name in zip([self.ridge_regressor, self.svr, self.dtr],
                               ["Ridge Regression", "SVR", "Decision Tree"]):
            print(f"Training {name}...")
            model.fit(combined_embeddings, all_labels)
        self.fitted = True
        print("Weak learners training completed.")

    def forward(self, audio_emb, text_emb):
        if not self.fitted:
            raise RuntimeError("Weak learners have not been fitted. Call 'fit()' before using the model.")
        # Concatenate audio and text embeddings.
        combined = torch.cat([audio_emb, text_emb], dim=1).cpu().detach().numpy()
        with torch.no_grad():
            ridge_pred = self.ridge_regressor.predict(combined)
            svr_pred = self.svr.predict(combined)
            dtr_pred = self.dtr.predict(combined)
        # Convert predictions to tensors.
        ridge_pred = torch.from_numpy(ridge_pred).float().to(self.device)
        svr_pred = torch.from_numpy(svr_pred).float().to(self.device)
        dtr_pred = torch.from_numpy(dtr_pred).float().to(self.device)
        return ridge_pred, svr_pred, dtr_pred 

In [9]:

# --- Stacking Meta-Learner ---
class StackingMetaLearner(nn.Module):
    def __init__(self, weak_output_dim=3, hidden_dim=256):
        super(StackingMetaLearner, self).__init__()
        self.fc1 = nn.Linear(weak_output_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, weak_outputs):
        x = F.relu(self.fc1(weak_outputs))
        return self.fc2(x)

In [10]:
# --- SSLEnsembleModel (Ensemble using weak learners and meta-learner) ---
class SSLEnsembleModel(nn.Module):
    def __init__(self, audio_dim, text_dim, hidden_dim, weak_learners):
        super(SSLEnsembleModel, self).__init__()
        if weak_learners is None:
            raise ValueError("Weak learners must be provided and fitted before initializing SSLEnsembleModel.")
        self.weak_learners = weak_learners
        self.stacking_meta_learner = StackingMetaLearner(weak_output_dim=3, hidden_dim=hidden_dim)

    def forward(self, audio_emb, text_emb):
        if not self.weak_learners.fitted:
            raise RuntimeError("Weak learners have not been fitted. Call 'fit()' before using the model.")
        # Get predictions from the weak learners.
        ridge_pred, svr_pred, dtr_pred = self.weak_learners(audio_emb, text_emb)
        # Stack the predictions into one tensor.
        weak_outputs = torch.stack([ridge_pred, svr_pred, dtr_pred], dim=1)
        # Meta-learner produces the final output.
        final_output = self.stacking_meta_learner(weak_outputs)
        return final_output

In [11]:
# --- Training Function with Intermediate Evaluation ---
def train_meta_learner(train_loader, test_loader, ensemble_model, optimizer, criterion, epochs=20, eval_interval=15000):
    ensemble_model.train()
    
    for epoch in range(epochs):
        total_loss = 0.0
        total_mse = 0.0
        total_mae = 0.0
        batch_count = 0

        # for batch_idx, (audio1_emb, audio2_emb, text_emb, sbs1, sbs2) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
        for batch_idx, (audio1_emb, audio2_emb, text_emb, sbs1, sbs2, _, _) in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):

            optimizer.zero_grad()

            # Forward pass for both audio inputs.
            pred1 = ensemble_model(audio1_emb, text_emb).squeeze()
            pred2 = ensemble_model(audio2_emb, text_emb).squeeze()

            # Compute loss
            loss1 = criterion(pred1, sbs1)
            loss2 = criterion(pred2, sbs2)
            loss = loss1 + loss2
            total_loss += loss.item()

            # Backpropagation
            loss.backward()
            optimizer.step()

            batch_count += 1

            # Convert to CPU for logging
            pred1_cpu = np.atleast_1d(pred1.detach().cpu().numpy())
            pred2_cpu = np.atleast_1d(pred2.detach().cpu().numpy())
            sbs1_cpu = np.atleast_1d(sbs1.cpu().numpy())
            sbs2_cpu = np.atleast_1d(sbs2.cpu().numpy())

            # Compute evaluation metrics
            total_mse += (mean_squared_error(sbs1_cpu, pred1_cpu) + mean_squared_error(sbs2_cpu, pred2_cpu))
            total_mae += (mean_absolute_error(sbs1_cpu, pred1_cpu) + mean_absolute_error(sbs2_cpu, pred2_cpu))

            if (batch_idx + 1) % eval_interval == 0:
                print(f"Evaluating at batch {batch_idx+1}...")
                evaluate(test_loader, ensemble_model, criterion)

        avg_loss = total_loss / (2 * batch_count)
        avg_mse = total_mse / (2 * batch_count)
        avg_mae = total_mae / (2 * batch_count)

        print(f"Epoch {epoch+1}: Loss={avg_loss:.4f}, MSE={avg_mse:.4f}, MAE={avg_mae:.4f}")
        wandb.log({"epoch_loss": avg_loss, "epoch_mse": avg_mse, "epoch_mae": avg_mae})
        evaluate(test_loader, ensemble_model, criterion)


In [12]:
# --- Evaluation Function ---
def evaluate(test_loader, ensemble_model, criterion):
    ensemble_model.eval()
    total_loss = 0.0
    total_mse = 0.0
    total_mae = 0.0
    correct_order = 0
    total_samples = 0
    batch_count = 0
    better_audio_paths = []  # to store the paths of the "better" audio

    with torch.no_grad():
        for audio1_emb, audio2_emb, text_emb, sbs1, sbs2, audio1_paths, audio2_paths in tqdm(test_loader, desc="Evaluating"):
            pred1 = ensemble_model(audio1_emb, text_emb).squeeze()
            pred2 = ensemble_model(audio2_emb, text_emb).squeeze()

            loss1 = criterion(pred1, sbs1)
            loss2 = criterion(pred2, sbs2)
            total_loss += (loss1.item() + loss2.item())
            batch_count += 1

            # Convert tensors to CPU numpy arrays for metric computation.
            pred1_cpu = pred1.cpu().numpy()
            pred2_cpu = pred2.cpu().numpy()
            sbs1_cpu = sbs1.cpu().numpy()
            sbs2_cpu = sbs2.cpu().numpy()

            total_mse += (mean_squared_error(sbs1_cpu, pred1_cpu) + mean_squared_error(sbs2_cpu, pred2_cpu))
            total_mae += (mean_absolute_error(sbs1_cpu, pred1_cpu) + mean_absolute_error(sbs2_cpu, pred2_cpu))

            # Ranking: since sbs1 should be greater than sbs2, count correct order predictions.
            correct_order += np.sum((sbs1_cpu > sbs2_cpu) == (pred1_cpu > pred2_cpu))
            total_samples += len(sbs1_cpu)

            # Determine which audio is considered "better" (i.e. higher predicted SBS) for each pair.
            for i in range(len(sbs1_cpu)):
                if pred1_cpu[i] > pred2_cpu[i]:
                    better_audio_paths.append(audio1_paths[i])
                else:
                    better_audio_paths.append(audio2_paths[i])

    avg_loss = total_loss / (2 * batch_count)
    avg_mse = total_mse / (2 * batch_count)
    avg_mae = total_mae / (2 * batch_count)
    accuracy = correct_order / total_samples if total_samples > 0 else 0

    print(f"Test Loss: {avg_loss:.4f}, Test MSE: {avg_mse:.4f}, Test MAE: {avg_mae:.4f}, Ranking Accuracy: {accuracy:.4f}")
    wandb.log({
        "test_loss": avg_loss, 
        "test_mse": avg_mse, 
        "test_mae": avg_mae, 
        "test_ranking_accuracy": accuracy
    })

    # Print 10 sample paths of the audio the model considers "better"
    print("\nSample audio paths considered 'better':")
    for path in better_audio_paths[:10]:
        print(path)
    
    return avg_loss, accuracy


In [16]:
# --- Main Script ---
if __name__ == "__main__":
    # Base directory for audio files (adjust as needed)
    base_audio_dir = "./"  # or a specific path

    # Load the single CSV file.
    csv_path = "buriy_audiobooks_2_val/df_gen_xtts_2.csv"
    df = pd.read_csv(csv_path)

    # Perform an 80/20 train-test split.
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

    # Create dataset objects.
    # subset = True  # set to False for full dataset
    subset = False
    
    train_dataset = RussianSBSDataset(train_df, base_dir=base_audio_dir, subset=subset)
    test_dataset = RussianSBSDataset(test_df, base_dir=base_audio_dir, subset=subset)
    
    # Create dataloaders.
    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn_sbs)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn_sbs)
    
    # Initialize and fit weak learners.
    weak_learners = WeakLearners(audio_dim=512, text_dim=768, device=device)
    weak_learners.fit(train_loader)
    
    # Initialize the ensemble model.
    ensemble_model = SSLEnsembleModel(audio_dim=512, text_dim=768, hidden_dim=256, weak_learners=weak_learners).to(device)
    
    # Train the stacking meta-learner.
    optimizer = torch.optim.Adam(ensemble_model.stacking_meta_learner.parameters(), lr=1e-5)
    criterion = nn.MSELoss()
    train_meta_learner(train_loader, test_loader, ensemble_model, optimizer, criterion, epochs=20)
    
    # Final evaluation on the test set.
    evaluate(test_loader, ensemble_model, criterion)

Fitting weak learners on SBS data...


Extracting embeddings: 100%|████████████████████████████████████████████████████| 1550/1550 [10:57<00:00,  2.36batch/s]


Training Ridge Regression...
Training SVR...
Training Decision Tree...
Weak learners training completed.


Epoch 1: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [13:00<00:00,  1.99it/s]


Epoch 1: Loss=0.1774, MSE=0.1774, MAE=0.3066


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:16<00:00,  1.98it/s]


Test Loss: 0.0693, Test MSE: 0.0693, Test MAE: 0.2060, Ranking Accuracy: 0.9884

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 2: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [13:01<00:00,  1.98it/s]


Epoch 2: Loss=0.0174, MSE=0.0174, MAE=0.1048


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:15<00:00,  1.99it/s]


Test Loss: 0.0666, Test MSE: 0.0666, Test MAE: 0.1664, Ranking Accuracy: 0.9787

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 3: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [13:01<00:00,  1.98it/s]


Epoch 3: Loss=0.0085, MSE=0.0085, MAE=0.0718


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:13<00:00,  2.00it/s]


Test Loss: 0.0792, Test MSE: 0.0792, Test MAE: 0.1648, Ranking Accuracy: 0.9748

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 4: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [13:00<00:00,  1.99it/s]


Epoch 4: Loss=0.0043, MSE=0.0043, MAE=0.0506


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:13<00:00,  2.00it/s]


Test Loss: 0.0966, Test MSE: 0.0966, Test MAE: 0.1623, Ranking Accuracy: 0.9748

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 5: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [12:57<00:00,  1.99it/s]


Epoch 5: Loss=0.0017, MSE=0.0017, MAE=0.0301


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:11<00:00,  2.02it/s]


Test Loss: 0.1170, Test MSE: 0.1170, Test MAE: 0.1617, Ranking Accuracy: 0.9748

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 6: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [12:51<00:00,  2.01it/s]


Epoch 6: Loss=0.0005, MSE=0.0005, MAE=0.0157


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:11<00:00,  2.03it/s]


Test Loss: 0.1340, Test MSE: 0.1340, Test MAE: 0.1630, Ranking Accuracy: 0.9729

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 7: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [12:54<00:00,  2.00it/s]


Epoch 7: Loss=0.0002, MSE=0.0002, MAE=0.0084


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:13<00:00,  2.01it/s]


Test Loss: 0.1441, Test MSE: 0.1441, Test MAE: 0.1650, Ranking Accuracy: 0.9703

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 8: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [13:02<00:00,  1.98it/s]


Epoch 8: Loss=0.0001, MSE=0.0001, MAE=0.0052


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:14<00:00,  2.00it/s]


Test Loss: 0.1489, Test MSE: 0.1489, Test MAE: 0.1656, Ranking Accuracy: 0.9697

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 9: 100%|█████████████████████████████████████████████████████████████████████| 1550/1550 [12:59<00:00,  1.99it/s]


Epoch 9: Loss=0.0000, MSE=0.0000, MAE=0.0038


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:14<00:00,  1.99it/s]


Test Loss: 0.1520, Test MSE: 0.1520, Test MAE: 0.1668, Ranking Accuracy: 0.9690

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 10: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [13:00<00:00,  1.99it/s]


Epoch 10: Loss=0.0000, MSE=0.0000, MAE=0.0030


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:13<00:00,  2.00it/s]


Test Loss: 0.1538, Test MSE: 0.1538, Test MAE: 0.1668, Ranking Accuracy: 0.9690

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 11: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [13:01<00:00,  1.98it/s]


Epoch 11: Loss=0.0000, MSE=0.0000, MAE=0.0024


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:13<00:00,  2.00it/s]


Test Loss: 0.1560, Test MSE: 0.1560, Test MAE: 0.1676, Ranking Accuracy: 0.9677

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 12: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:56<00:00,  2.00it/s]


Epoch 12: Loss=0.0000, MSE=0.0000, MAE=0.0019


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:12<00:00,  2.02it/s]


Test Loss: 0.1579, Test MSE: 0.1579, Test MAE: 0.1685, Ranking Accuracy: 0.9665

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 13: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:55<00:00,  2.00it/s]


Epoch 13: Loss=0.0000, MSE=0.0000, MAE=0.0017


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:11<00:00,  2.03it/s]


Test Loss: 0.1595, Test MSE: 0.1595, Test MAE: 0.1692, Ranking Accuracy: 0.9658

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 14: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:53<00:00,  2.00it/s]


Epoch 14: Loss=0.0000, MSE=0.0000, MAE=0.0015


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:14<00:00,  2.00it/s]


Test Loss: 0.1604, Test MSE: 0.1604, Test MAE: 0.1694, Ranking Accuracy: 0.9632

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 15: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:53<00:00,  2.00it/s]


Epoch 15: Loss=0.0000, MSE=0.0000, MAE=0.0013


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:11<00:00,  2.03it/s]


Test Loss: 0.1617, Test MSE: 0.1617, Test MAE: 0.1700, Ranking Accuracy: 0.9600

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 16: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:55<00:00,  2.00it/s]


Epoch 16: Loss=0.0000, MSE=0.0000, MAE=0.0012


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:10<00:00,  2.03it/s]


Test Loss: 0.1625, Test MSE: 0.1625, Test MAE: 0.1703, Ranking Accuracy: 0.9542

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 17: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:55<00:00,  2.00it/s]


Epoch 17: Loss=0.0000, MSE=0.0000, MAE=0.0011


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:14<00:00,  2.00it/s]


Test Loss: 0.1631, Test MSE: 0.1631, Test MAE: 0.1705, Ranking Accuracy: 0.9497

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 18: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:57<00:00,  1.99it/s]


Epoch 18: Loss=0.0000, MSE=0.0000, MAE=0.0010


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:10<00:00,  2.04it/s]


Test Loss: 0.1635, Test MSE: 0.1635, Test MAE: 0.1707, Ranking Accuracy: 0.9490

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 19: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:45<00:00,  2.03it/s]


Epoch 19: Loss=0.0000, MSE=0.0000, MAE=0.0009


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:09<00:00,  2.05it/s]


Test Loss: 0.1645, Test MSE: 0.1645, Test MAE: 0.1713, Ranking Accuracy: 0.9394

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Epoch 20: 100%|████████████████████████████████████████████████████████████████████| 1550/1550 [12:44<00:00,  2.03it/s]


Epoch 20: Loss=0.0000, MSE=0.0000, MAE=0.0009


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:09<00:00,  2.05it/s]


Test Loss: 0.1646, Test MSE: 0.1646, Test MAE: 0.1712, Ranking Accuracy: 0.9426

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav


Evaluating: 100%|████████████████████████████████████████████████████████████████████| 388/388 [03:09<00:00,  2.05it/s]


Test Loss: 0.1646, Test MSE: 0.1646, Test MAE: 0.1712, Ranking Accuracy: 0.9426

Sample audio paths considered 'better':
./buriy_audiobooks_2_val/3/c3/eaa7f742fe0f.wav
./buriy_audiobooks_2_val/b/e9/aa5506a728f1.wav
./buriy_audiobooks_2_val/a/09/9146542460f7.wav
./buriy_audiobooks_2_val/6/43/0a22b6d2ae6b.wav
./buriy_audiobooks_2_val/6/60/4372d2481049.wav
./buriy_audiobooks_2_val/1/29/1b138a36c183.wav
./buriy_audiobooks_2_val/c/b4/a9cf4621c185.wav
./buriy_audiobooks_2_val/2/fd/40e353ba9921.wav
./buriy_audiobooks_2_val/4/fe/2a8ca63e231f.wav
./buriy_audiobooks_2_val/5/83/7847bd102962.wav
