In [1]:
import os
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
import whisper
from transformers import BertTokenizer, BertModel
import numpy as np
from tqdm import tqdm
import wandb
from collections import Counter


In [2]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# --- Load and Unfreeze Whisper‑medium ---
whisper_model = whisper.load_model("base.en").to(device)
# Unfreeze all layers in Whisper
for param in whisper_model.parameters():
    param.requires_grad = True
whisper_model.train()  # Set to train mode so gradients are computed

# Load BERT tokenizer and model (BERT remains frozen here).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

# --- Initialize wandb ---
wandb.init(project="somos-ensemble", name="finetune-whisper-ensemble")
!wandb online

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: rtfiof (rtfiof-hse-university). Use `wandb login --relogin` to force relogin


W&B online. Running your script from this directory will now sync to the cloud.


In [3]:
device


device(type='cuda', index=1)

In [4]:

# --- Utility Functions ---

# Function to compute class weights
# Compute class weights for imbalanced dataset
def compute_class_weights(labels, num_classes):
    class_counts = Counter(labels)
    total_samples = sum(class_counts.values())
    weights = {cls: total_samples / (num_classes * count) for cls, count in class_counts.items()}
    return torch.tensor([weights[i] for i in range(num_classes)], dtype=torch.float).to(device)


# Function to compute sample weights for oversampling
def get_sample_weights(dataset, class_weights):
    sample_weights = []

    for _, _, label in dataset:
        sample_weights.append(class_weights[label].item())

    return torch.tensor(sample_weights, dtype=torch.float)

def load_json(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

def process_audio_path(clean_path, base_dir="data/somos/audios"):
    return os.path.join(base_dir, clean_path.replace("\\", "/"))

# Earth Mover’s Distance (EMD) Loss for ordinal MOS prediction.
def emd_loss(y_pred, y_true, num_classes):
    y_pred = F.softmax(y_pred, dim=-1)  # Convert logits to probability distribution
    y_true = F.one_hot(y_true, num_classes).float()  # Convert labels to one-hot

    cdf_pred = torch.cumsum(y_pred, dim=-1)  # Compute cumulative sum for predicted distribution
    cdf_true = torch.cumsum(y_true, dim=-1)  # Compute cumulative sum for true distribution

    loss = torch.mean((cdf_pred - cdf_true) ** 2)  # Use squared difference for smoother gradients
    return loss

def entropy_regularization(gate_weights, lambda_reg=0.01):
    # Compute entropy loss to encourage diverse gating weights
    eps = 1e-8
    entropy = -torch.sum(gate_weights * torch.log(gate_weights + eps), dim=1)
    return lambda_reg * torch.mean(entropy)

def save_model(model, epoch, best_acc, save_path="models"):
    os.makedirs(save_path, exist_ok=True)
    model_path = os.path.join(save_path, f"model_epoch_{epoch}.pth")
    torch.save(model.state_dict(), model_path)
    best_model_path = os.path.join(save_path, "best_model.pth")
    if best_acc:
        torch.save(model.state_dict(), best_model_path)

In [5]:
# --- Dataset Class ---
class SOMOSDataset(Dataset):
    def __init__(self, json_file, base_dir="data/somos/audios"):
        self.samples = load_json(json_file)
        self.base_dir = base_dir
        self.labels = [int(float(sample["mos"])) - 1 for sample in self.samples]  # Convert 1-5 to 0-4

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        text = sample["text"]
        label = torch.tensor(int(float(sample["mos"])) - 1, dtype=torch.long)
        audio_path = process_audio_path(sample["clean path"], self.base_dir)
        return audio_path, text, label

def collate_fn(batch):
    audio_paths, texts, labels = zip(*batch)
    audios = [whisper.load_audio(path) for path in audio_paths]
    audios = [whisper.pad_or_trim(audio) for audio in audios]
    mel_spectrograms = [whisper.log_mel_spectrogram(audio).to(device) for audio in audios]
    mel_spectrograms = torch.stack(mel_spectrograms)

    # Compute audio embeddings with gradients enabled
    audio_embeddings = whisper_model.encoder(mel_spectrograms).mean(dim=1)

    # Process texts using BERT (BERT remains frozen)
    inputs = tokenizer(list(texts), return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        text_embeddings = bert_model(**inputs).last_hidden_state[:, 0, :]

    labels = torch.stack(labels).to(device)
    return audio_embeddings, text_embeddings, labels


In [6]:
# --- Model Definitions ---
class ComplexFusionSubModel(nn.Module):
    def __init__(self, audio_dim, text_dim, hidden_dim, num_classes, dropout_rate=0.05):
        super(ComplexFusionSubModel, self).__init__()
        self.audio_fc = nn.Sequential(
            nn.Linear(audio_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        )
        self.text_fc = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        )
        self.attention = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.Tanh(),
            nn.Linear(hidden_dim // 2, 1),
            nn.Softmax(dim=1)
        )
        self.fusion_fc = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_dim // 2, num_classes)
        )

    def forward(self, audio_emb, text_emb):
        audio_feat = self.audio_fc(audio_emb)
        text_feat = self.text_fc(text_emb)
        fusion = torch.cat([audio_feat, text_feat], dim=1)
        attn_weights = self.attention(fusion)
        fusion = fusion * attn_weights
        return self.fusion_fc(fusion)

class EnsembleFusionClassifier(nn.Module):
    def __init__(self, audio_dim, text_dim, hidden_dim, num_classes, dropout_rate=0.05, num_models=3):
        super(EnsembleFusionClassifier, self).__init__()
        self.num_models = num_models
        self.sub_models = nn.ModuleList([
            ComplexFusionSubModel(audio_dim, text_dim, hidden_dim, num_classes, dropout_rate)
            for _ in range(num_models)
        ])
        self.gate = nn.Sequential(
            nn.Linear(audio_dim + text_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_models),
            nn.Softmax(dim=1)
        )
        self.residual = nn.Sequential(
            nn.Linear(num_classes, num_classes),
            nn.BatchNorm1d(num_classes),
            nn.ReLU()
        )

    def forward(self, audio_emb, text_emb, return_gate=False):
        gate_input = torch.cat([audio_emb, text_emb], dim=1)
        gate_weights = self.gate(gate_input)  # (batch_size, num_models)
        outputs = [model(audio_emb, text_emb) for model in self.sub_models]
        outputs = torch.stack(outputs, dim=1)  # (batch_size, num_models, num_classes)
        gate_weights_unsq = gate_weights.unsqueeze(2)  # (batch_size, num_models, 1)
        ensemble_output = (gate_weights_unsq * outputs).sum(dim=1)
        final_output = ensemble_output + self.residual(ensemble_output)
        if return_gate:
            return final_output, gate_weights
        return final_output

In [7]:
# --- Main Training Function ---
def main():
    train_json = "data/somos/audios/train.json"
    test_json = "data/somos/audios/test.json"

    train_dataset = SOMOSDataset(train_json)
    test_dataset = SOMOSDataset(test_json)

    # Compute class weights and create weighted sampler
    class_weights = compute_class_weights(train_dataset.labels, num_classes=5)
    sample_weights = [class_weights[label].item() for label in train_dataset.labels]
    sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)
    
    train_loader = DataLoader(train_dataset, batch_size=4, sampler=sampler, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=collate_fn)
    
    dummy_audio, dummy_text, _ = next(iter(train_loader))
    audio_dim, text_dim = dummy_audio.shape[1], dummy_text.shape[1]
    num_classes = 5

    # Instantiate the ensemble classifier.
    model = EnsembleFusionClassifier(audio_dim, text_dim, hidden_dim=256, num_classes=num_classes, dropout_rate=0.05, num_models=3).to(device)
    
    # Let wandb watch the model (logs gradients and parameter histograms).
    wandb.watch(model, log="all", log_freq=100)
    
    scaler = torch.cuda.amp.GradScaler()
    # Use EMD loss as criterion.
    criterion = lambda y_pred, y_true: emd_loss(y_pred, y_true, num_classes=5)
    optimizer = optim.Adam(model.parameters(), lr=1e-6)
    # optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)

    num_epochs = 100
    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        running_loss, correct_preds, total_samples = 0.0, 0, 0

        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1} Training", leave=False)
        for audio_emb, text_emb, labels in train_pbar:
            optimizer.zero_grad()
            # Get model output along with gating weights for logging.
            with torch.cuda.amp.autocast():
                outputs, gate_weights = model(audio_emb, text_emb, return_gate=True)
                loss = criterion(outputs, labels)
                # Regularize gating network: add entropy loss on the gate weights.
                loss += entropy_regularization(gate_weights, lambda_reg=0.01)

            scaler.scale(loss).backward()

            # Clip gradients to prevent exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item() * audio_emb.size(0)
            preds = torch.argmax(outputs, dim=1)
            correct_preds += (preds == labels).sum().item()
            total_samples += labels.size(0)

            wandb.log({
                "train_loss": loss.item(),
                "gate_weights_mean": gate_weights.mean().item(),
                "gate_weights_std": gate_weights.std().item(),
            })
            train_pbar.set_postfix(loss=loss.item())

        train_acc = 100 * correct_preds / total_samples
        wandb.log({"train_acc": train_acc})
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {running_loss/total_samples:.4f} | Train Acc: {train_acc:.2f}%")

        # Evaluation phase.
        model.eval()
        test_loss, correct_preds, total_samples = 0.0, 0, 0
        test_predictions = []
        
        with torch.no_grad():
            test_pbar = tqdm(test_loader, desc=f"Epoch {epoch+1} Validation", leave=False)
            for audio_emb, text_emb, labels in test_pbar:
                audio_emb = audio_emb.to(device)
                text_emb = text_emb.to(device)
                labels = labels.to(device)

                outputs = model(audio_emb, text_emb)
                loss = criterion(outputs, labels)

                test_loss += loss.item() * audio_emb.size(0)
                preds = torch.argmax(outputs, dim=1)
                correct_preds += (preds == labels).sum().item()
                total_samples += labels.size(0)
                test_predictions.extend(zip(labels.cpu().tolist(), preds.cpu().tolist()))
                test_pbar.set_postfix(loss=loss.item())

        test_acc = 100 * correct_preds / total_samples
        avg_test_loss = test_loss / total_samples
        wandb.log({"val_loss": avg_test_loss, "val_acc": test_acc})
        print(f"Epoch {epoch+1}/{num_epochs} - Val Loss: {avg_test_loss:.4f} | Val Acc: {test_acc:.2f}%")

        print("\nSample Predictions (Real MOS vs Predicted MOS):")
        for i, (real_mos, pred_mos) in enumerate(test_predictions[:5]):
            print(f"Example {i+1}: Real MOS = {real_mos + 1}, Predicted MOS = {pred_mos + 1}")
            wandb.log({f"sample_{i}_real_vs_pred": f"{real_mos+1} vs {pred_mos+1}"})
        
        save_model(model, epoch + 1, test_acc > best_acc)

        if test_acc > best_acc:
            best_acc = test_acc

    print("Training complete! Best validation accuracy:", best_acc)

In [8]:
main()


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
                                                                                                                       

Epoch 1/100 - Train Loss: 0.1855 | Train Acc: 19.81%


                                                                                                                       

Epoch 1/100 - Val Loss: 0.1626 | Val Acc: 16.07%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 4
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 2/100 - Train Loss: 0.1843 | Train Acc: 19.80%


                                                                                                                       

Epoch 2/100 - Val Loss: 0.1611 | Val Acc: 17.37%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 4
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 3/100 - Train Loss: 0.1830 | Train Acc: 20.65%


                                                                                                                       

Epoch 3/100 - Val Loss: 0.1554 | Val Acc: 20.87%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 4/100 - Train Loss: 0.1827 | Train Acc: 20.23%


                                                                                                                       

Epoch 4/100 - Val Loss: 0.1599 | Val Acc: 17.07%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 5/100 - Train Loss: 0.1807 | Train Acc: 20.02%


                                                                                                                       

Epoch 5/100 - Val Loss: 0.1604 | Val Acc: 18.13%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 6/100 - Train Loss: 0.1775 | Train Acc: 20.74%


                                                                                                                       

Epoch 6/100 - Val Loss: 0.1644 | Val Acc: 14.50%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 4
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 7/100 - Train Loss: 0.1763 | Train Acc: 20.12%


                                                                                                                       

Epoch 7/100 - Val Loss: 0.1608 | Val Acc: 16.00%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 4
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 8/100 - Train Loss: 0.1739 | Train Acc: 19.91%


                                                                                                                       

Epoch 8/100 - Val Loss: 0.1597 | Val Acc: 16.40%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 9/100 - Train Loss: 0.1730 | Train Acc: 20.15%


                                                                                                                       

Epoch 9/100 - Val Loss: 0.1597 | Val Acc: 15.40%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 10/100 - Train Loss: 0.1713 | Train Acc: 19.71%


                                                                                                                       

Epoch 10/100 - Val Loss: 0.1659 | Val Acc: 14.03%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 11/100 - Train Loss: 0.1705 | Train Acc: 20.77%


                                                                                                                       

Epoch 11/100 - Val Loss: 0.1624 | Val Acc: 14.70%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 12/100 - Train Loss: 0.1712 | Train Acc: 20.44%


                                                                                                                       

Epoch 12/100 - Val Loss: 0.1603 | Val Acc: 15.50%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 13/100 - Train Loss: 0.1702 | Train Acc: 20.65%


                                                                                                                       

Epoch 13/100 - Val Loss: 0.1547 | Val Acc: 17.07%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 4
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 14/100 - Train Loss: 0.1688 | Train Acc: 20.35%


                                                                                                                       

Epoch 14/100 - Val Loss: 0.1536 | Val Acc: 16.47%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 1
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 15/100 - Train Loss: 0.1692 | Train Acc: 20.05%


                                                                                                                       

Epoch 15/100 - Val Loss: 0.1597 | Val Acc: 15.23%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 16/100 - Train Loss: 0.1698 | Train Acc: 20.40%


                                                                                                                       

Epoch 16/100 - Val Loss: 0.1592 | Val Acc: 16.30%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 17/100 - Train Loss: 0.1689 | Train Acc: 20.74%


                                                                                                                       

Epoch 17/100 - Val Loss: 0.1565 | Val Acc: 16.17%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 18/100 - Train Loss: 0.1696 | Train Acc: 20.05%


                                                                                                                       

Epoch 18/100 - Val Loss: 0.1574 | Val Acc: 15.43%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 1
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 19/100 - Train Loss: 0.1690 | Train Acc: 20.74%


                                                                                                                       

Epoch 19/100 - Val Loss: 0.1531 | Val Acc: 17.10%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 20/100 - Train Loss: 0.1687 | Train Acc: 20.79%


                                                                                                                       

Epoch 20/100 - Val Loss: 0.1531 | Val Acc: 16.23%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 21/100 - Train Loss: 0.1681 | Train Acc: 20.70%


                                                                                                                       

Epoch 21/100 - Val Loss: 0.1461 | Val Acc: 19.83%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 22/100 - Train Loss: 0.1680 | Train Acc: 20.50%


                                                                                                                       

Epoch 22/100 - Val Loss: 0.1463 | Val Acc: 21.00%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 5
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 23/100 - Train Loss: 0.1691 | Train Acc: 20.31%


                                                                                                                       

Epoch 23/100 - Val Loss: 0.1485 | Val Acc: 18.63%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 24/100 - Train Loss: 0.1684 | Train Acc: 20.94%


                                                                                                                       

Epoch 24/100 - Val Loss: 0.1550 | Val Acc: 16.53%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 1
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 25/100 - Train Loss: 0.1682 | Train Acc: 20.77%


                                                                                                                       

Epoch 25/100 - Val Loss: 0.1532 | Val Acc: 18.23%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 1
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 26/100 - Train Loss: 0.1695 | Train Acc: 19.69%


                                                                                                                       

Epoch 26/100 - Val Loss: 0.1447 | Val Acc: 20.63%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 5
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 27/100 - Train Loss: 0.1688 | Train Acc: 21.22%


                                                                                                                       

Epoch 27/100 - Val Loss: 0.1485 | Val Acc: 19.60%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 28/100 - Train Loss: 0.1683 | Train Acc: 20.74%


                                                                                                                       

Epoch 28/100 - Val Loss: 0.1508 | Val Acc: 19.07%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 29/100 - Train Loss: 0.1680 | Train Acc: 20.16%


                                                                                                                       

Epoch 29/100 - Val Loss: 0.1505 | Val Acc: 17.43%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 30/100 - Train Loss: 0.1679 | Train Acc: 20.51%


                                                                                                                       

Epoch 30/100 - Val Loss: 0.1541 | Val Acc: 16.93%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 5
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 31/100 - Train Loss: 0.1678 | Train Acc: 20.01%


                                                                                                                       

Epoch 31/100 - Val Loss: 0.1620 | Val Acc: 15.73%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 32/100 - Train Loss: 0.1675 | Train Acc: 20.56%


                                                                                                                       

Epoch 32/100 - Val Loss: 0.1539 | Val Acc: 16.27%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 33/100 - Train Loss: 0.1673 | Train Acc: 20.39%


                                                                                                                       

Epoch 33/100 - Val Loss: 0.1498 | Val Acc: 16.50%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 34/100 - Train Loss: 0.1663 | Train Acc: 20.73%


                                                                                                                       

Epoch 34/100 - Val Loss: 0.1537 | Val Acc: 16.50%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 35/100 - Train Loss: 0.1689 | Train Acc: 19.76%


                                                                                                                       

Epoch 35/100 - Val Loss: 0.1514 | Val Acc: 17.43%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 36/100 - Train Loss: 0.1677 | Train Acc: 20.37%


                                                                                                                       

Epoch 36/100 - Val Loss: 0.1542 | Val Acc: 17.83%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 37/100 - Train Loss: 0.1668 | Train Acc: 20.88%


                                                                                                                       

Epoch 37/100 - Val Loss: 0.1466 | Val Acc: 18.83%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 5
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 5
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

Epoch 38/100 - Train Loss: 0.1667 | Train Acc: 20.63%


                                                                                                                       

Epoch 38/100 - Val Loss: 0.1488 | Val Acc: 18.03%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 3
Example 2: Real MOS = 3, Predicted MOS = 3
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 3
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 39/100 - Train Loss: 0.1662 | Train Acc: 20.95%


                                                                                                                       

Epoch 39/100 - Val Loss: 0.1512 | Val Acc: 16.93%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 1
Example 2: Real MOS = 3, Predicted MOS = 1
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 1


                                                                                                                       

Epoch 40/100 - Train Loss: 0.1678 | Train Acc: 19.92%


                                                                                                                       

Epoch 40/100 - Val Loss: 0.1494 | Val Acc: 18.87%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 5
Example 5: Real MOS = 3, Predicted MOS = 3


                                                                                                                       

Epoch 41/100 - Train Loss: 0.1669 | Train Acc: 20.77%


                                                                                                                       

Epoch 41/100 - Val Loss: 0.1485 | Val Acc: 19.23%

Sample Predictions (Real MOS vs Predicted MOS):
Example 1: Real MOS = 4, Predicted MOS = 4
Example 2: Real MOS = 3, Predicted MOS = 4
Example 3: Real MOS = 5, Predicted MOS = 2
Example 4: Real MOS = 2, Predicted MOS = 1
Example 5: Real MOS = 3, Predicted MOS = 4


                                                                                                                       

KeyboardInterrupt: 