In [None]:
# Ignore warnings for cleaner logs
import warnings
warnings.simplefilter("ignore")

In [None]:
# Installing pyloudnorm for LUFS Normalization
!pip install pyloudnorm

Collecting pyloudnorm
  Downloading pyloudnorm-0.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading pyloudnorm-0.1.1-py3-none-any.whl (9.6 kB)
Installing collected packages: pyloudnorm
Successfully installed pyloudnorm-0.1.1


In [None]:
# Making Necessary Imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import librosa
import os
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from tqdm.auto import tqdm
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig
import pyloudnorm as pyln
import math

2025-12-17 22:43:42.375383: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766011422.562939      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766011422.617553      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766011423.068948      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766011423.068998      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766011423.069000      23 computation_placer.cc:177] computation placer alr

In [None]:
# Reading CSVs
train_df = pd.read_csv('/kaggle/input/shl-assignment/train_updated.csv')
test_df = pd.read_csv('/kaggle/input/shl-assignment/test_updated.csv')

In [5]:
train_df.head()

Unnamed: 0,filename,label,path,text
0,audio_173,3.0,/kaggle/input/shl-intern-hiring-assessment-202...,My favorite place to visit would be Japan bec...
1,audio_138,3.0,/kaggle/input/shl-intern-hiring-assessment-202...,I loved reading and my hobbies as reading. Em...
2,audio_127,2.0,/kaggle/input/shl-intern-hiring-assessment-202...,"Yeah, my favorite place to visit is Thirmala...."
3,audio_95,2.0,/kaggle/input/shl-intern-hiring-assessment-202...,I am going to tell about my hobby. And my hob...
4,audio_73,3.5,/kaggle/input/shl-intern-hiring-assessment-202...,hmm this is a tough one so my best day of my ...


In [6]:
test_df.head()

Unnamed: 0,filename,path,text
0,audio_141,/kaggle/input/shl-intern-hiring-assessment-202...,I love I love when I love stories Sherlock Ho...
1,audio_114,/kaggle/input/shl-intern-hiring-assessment-202...,"I have a lot of favorite days, but one of the..."
2,audio_17,/kaggle/input/shl-intern-hiring-assessment-202...,My topic is describe the scene of a hospital....
3,audio_76,/kaggle/input/shl-intern-hiring-assessment-202...,A playground has a lot of equipment. It typic...
4,audio_156,/kaggle/input/shl-intern-hiring-assessment-202...,the best day of my life are the most days whe...


In [None]:
# Analysing Distirbution of Label
train_df['label'].value_counts()

label
3.0    154
2.0     90
2.5     72
3.5     46
5.0     20
4.0     15
4.5     10
1.0      1
1.5      1
Name: count, dtype: int64

In [None]:
# Forming bins for stratified split 
train_df['bin'] = train_df['label'].apply(int)

In [None]:
# Analysing bin distribution
train_df['bin'].value_counts()

bin
3    200
2    162
4     25
5     20
1      2
Name: count, dtype: int64

In [None]:
# Config Class containing essential hyperparameters
class CONFIG():
    def __init__(self):
        self.random_seed = 42
        self.sample_rate = 16000
        self.batch_size = 4
        self.text_encoder = "yiiino/deberta-v3-large-cola"
        self.wav_encoder = "openai/whisper-large-v3-turbo"
        self.epochs = 150
        self.lr = 1e-4
        self.es_patience = 5

cfg = CONFIG()

In [None]:
# Setting Device to CUDA
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Setting Torch Random Seed
torch.manual_seed(cfg.random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(cfg.random_seed)
    torch.cuda.manual_seed_all(cfg.random_seed)

In [None]:
# Defining the PyTorch Dataset Class for Training
class FusionMOSDataset(Dataset):
    def __init__(self, df, sample_rate=16000, log_target=True):
        self.df = df
        self.sample_rate = sample_rate
        self.log_target = log_target

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Loading the transcript of WAV files
        text = row["text"]

        # Loading the audio array of WAV files
        audio, _ = librosa.load(row["path"], sr=self.sample_rate, mono=True)
        
        # Performing LUFS Normalization as per EBU R128 meter with target loudness: -23 LUFS (speech standard)
        meter = pyln.Meter(self.sample_rate)  
        loudness = meter.integrated_loudness(audio)
        audio = pyln.normalize.loudness(audio, loudness, -23.0)
        audio = torch.tensor(audio, dtype=torch.float32)

        # Loading the MOS Likert Grammar Scores for each audio instance
        y = torch.tensor(float(row["label"]), dtype=torch.float32)
        # Performing logarithmic scaling since log(MOS_Score) follows Normal Distribution more closely
        if self.log_target:
            y = torch.log1p(y)

        return text, audio, y

# Collate function for batching
def collate_fn(batch):
    texts, audios, labels = zip(*batch)
    return list(texts), list(audios), torch.stack(labels)

In [None]:
# Defining the Dataset Object
dataset = FusionMOSDataset(
    train_df,
    cfg.sample_rate,
    log_target = True
)

# Performing a 80%:20% stratified training - validation split on the bins
train_idx, valid_idx = train_test_split(
    np.arange(len(train_df)),
    test_size=0.2,            
    shuffle=True,             
    stratify=train_df['bin'],         
    random_state=42
)

train_ds = Subset(dataset, train_idx)
valid_ds = Subset(dataset, valid_idx)

# Defining the Dataloaders for Training
train_loader = DataLoader(
    train_ds,
    batch_size=cfg.batch_size,
    shuffle=True,
    collate_fn = collate_fn
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=cfg.batch_size,
    shuffle=False,
    collate_fn = collate_fn
)

In [None]:
# Defining the Text Encoder Model Class
class TextEncoder(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Freezing the paramters of text encoder
        for param in self.model.parameters():
            param.requires_grad = False

    def forward(self, texts):
        # Transforming the input text into tokens and placing them onto the GPU
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )

        inputs = {k: v.to(next(self.model.parameters()).device)
                  for k, v in inputs.items()}

        # Retrieve Token Embeddings
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Mean pooling of token embeddings for a singular transcript embedding
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1)
        emb = (hidden * mask).sum(dim=1) / mask.sum(dim=1)
        return emb

In [None]:
# Defining the WAV Encoder Model Class
class WAVEncoder(nn.Module):
    def __init__(self, model_name, sampling_rate=16000):
        super().__init__()
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.sampling_rate = sampling_rate

        # Freezing the paramters of WAV encoder
        for p in self.model.parameters():
            p.requires_grad = False

    def forward(self, audio_list):
        # Creating list of audio arrays on CPU
        audio_list = [a.cpu().numpy() for a in audio_list]

        # Processing the audio array
        inputs = self.processor(
            audio_list,
            sampling_rate=self.sampling_rate,
            return_tensors="pt"
        )

        # Retrieve WAV Embeddings
        with torch.no_grad():
            outputs = self.model.encoder(
                inputs.input_features.to(next(self.model.parameters()).device)
            )
        
        # Mean pooling for a singlular WAV embedding
        hidden = outputs.last_hidden_state
        pooled = hidden.mean(dim=1)

        return pooled

In [None]:
# Defining the MLP Regressor Head for MOS Prediction
class FusionMLP(nn.Module):
    def __init__(self, input_dim):
        # High Dropout to prevent overfitting
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.net(x).squeeze(1)

In [None]:
# Defning the Combined Model Class
class FusionMOSModel(nn.Module):
    def __init__(self, text_encoder, wav_encoder, sampling_rate):
        super().__init__()
        self.text_encoder = TextEncoder(text_encoder)
        self.audio_encoder = WAVEncoder(wav_encoder, sampling_rate)

        # Retrieving the embedding size of text embeddings and WAV embeddings
        self.text_dim = AutoConfig.from_pretrained(text_encoder).hidden_size
        self.wav_dim = AutoConfig.from_pretrained(wav_encoder).hidden_size

        # Fusing the embeddings to incorporate both text and audio context for prediction
        fusion_dim =  self.text_dim + self.wav_dim
        self.regressor = FusionMLP(fusion_dim)

    def forward(self, texts, audios):
        text_emb = self.text_encoder(texts)
        audio_emb = self.audio_encoder(audios)

        # Concatenting the mebeddings to incorporate both text and audio context for prediction
        fused = torch.cat([text_emb, audio_emb], dim=1)
        return self.regressor(fused)

In [None]:
# Defining the model and placing it on the GPU
model = FusionMOSModel(
    cfg.text_encoder,
    cfg.wav_encoder,
    cfg.sample_rate
).to(DEVICE)

tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

In [None]:
# Using MSE Loss since log of MOS Score resembles Normal Distribution
criterion = nn.MSELoss()

# Using AdamW Optimizer
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr=cfg.lr
)

# Using Reduce LR on Plateau Scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=1
)

In [None]:
# Computing Grad Norm for analysing training
def compute_grad_norm(model):
    total_norm = 0.0
    for p in model.parameters():
        if p.grad is not None:
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item() ** 2
    return total_norm ** 0.5

In [None]:
# Defining the training function
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0

    for texts, audios, y in tqdm(dataloader, total=len(dataloader), desc="Training: "):
        # Placing the labels on GPU
        y = y.to(DEVICE)

        # Computing training loss
        optimizer.zero_grad()
        pred_log = model(texts, audios)
        loss = criterion(pred_log, y)

        # Performing Backprop
        loss.backward()
        # Computing gradients magnitude for training analysis
        grad_norm = compute_grad_norm(model)
        # Updating weights
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader), grad_norm

In [None]:
# Defining the Validation Function
def evaluate(model, dataloader):
    model.eval()
    preds, gts = [], []

    # Obtaining predictions
    with torch.no_grad():
        for texts, audios, y in tqdm(dataloader, total=len(dataloader), desc="Validation: "):
            # Obtaing the log of MOS Score
            pred_log = model(texts, audios)
            # Antilog of predictions to get MOS score and clamping it between 0 to 5
            pred = torch.expm1(pred_log).clamp(0, 5)
            gt = torch.expm1(y)

            preds.append(pred.cpu())
            gts.append(gt.cpu())

    preds = torch.cat(preds).numpy()
    gts = torch.cat(gts).numpy()

    # Computing RMSE and Pearson Correlation Coefficient
    rmse = math.sqrt(mean_squared_error(gts, preds))
    cc = pearsonr(gts, preds)[0]
    return rmse, cc

In [None]:
# Training Loop
best_rmse = float("inf")
early_stop_counter = 0

for epoch in range(cfg.epochs):
    # Computing average training loss and grad norm
    train_loss, grad_norm = train_epoch(
        model, train_loader, optimizer, criterion
    )

    # Computing RMSE and Pearson Correlation Coefficient on the validation set
    rmse, cc = evaluate(model, valid_loader)
    # Obtaining the current LR
    current_lr = optimizer.param_groups[0]["lr"]
    # Triggering the scheduler for LR Decay
    scheduler.step(rmse)
    
    # Printing the epochs stats
    print(
        f"Epoch {epoch+1:02d} | "
        f"Training Loss: {train_loss:.4f} | "
        f"Grad Norm: {grad_norm:.4f} | "
        f"Val RMSE: {rmse:.3f} | "
        f"Val CC: {cc:.3f} | "
        f"LR: {current_lr:.2e}"
    )

    # Checking for Early Stopping and saving the best model
    if rmse < best_rmse:
        best_rmse = rmse
        early_stop_counter = 0
        torch.save(model.state_dict(), "whisper_deberta_ft_fusion_lufs.pt")
    else:
        early_stop_counter += 1

    if early_stop_counter >= cfg.es_patience:
        print("Early stopping triggered.")
        break

Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 01 | Training Loss: 0.2347 | Grad Norm: 6.1088 | Val RMSE: 0.760 | Val CC: 0.431 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 02 | Training Loss: 0.0650 | Grad Norm: 2.3501 | Val RMSE: 0.732 | Val CC: 0.591 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 03 | Training Loss: 0.0643 | Grad Norm: 3.3721 | Val RMSE: 0.726 | Val CC: 0.590 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 04 | Training Loss: 0.0512 | Grad Norm: 1.8193 | Val RMSE: 0.621 | Val CC: 0.611 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 05 | Training Loss: 0.0603 | Grad Norm: 2.2328 | Val RMSE: 0.594 | Val CC: 0.667 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 06 | Training Loss: 0.0556 | Grad Norm: 1.4640 | Val RMSE: 0.581 | Val CC: 0.701 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 07 | Training Loss: 0.0534 | Grad Norm: 2.7371 | Val RMSE: 0.609 | Val CC: 0.666 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 08 | Training Loss: 0.0475 | Grad Norm: 3.5808 | Val RMSE: 0.587 | Val CC: 0.725 | LR: 1.00e-04


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 09 | Training Loss: 0.0458 | Grad Norm: 1.1753 | Val RMSE: 0.558 | Val CC: 0.709 | LR: 5.00e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 10 | Training Loss: 0.0428 | Grad Norm: 2.9514 | Val RMSE: 0.614 | Val CC: 0.696 | LR: 5.00e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 11 | Training Loss: 0.0388 | Grad Norm: 1.3362 | Val RMSE: 0.586 | Val CC: 0.710 | LR: 5.00e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 12 | Training Loss: 0.0441 | Grad Norm: 2.2585 | Val RMSE: 0.582 | Val CC: 0.718 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 13 | Training Loss: 0.0377 | Grad Norm: 2.3042 | Val RMSE: 0.548 | Val CC: 0.735 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 14 | Training Loss: 0.0468 | Grad Norm: 1.5356 | Val RMSE: 0.532 | Val CC: 0.739 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 15 | Training Loss: 0.0369 | Grad Norm: 2.0733 | Val RMSE: 0.515 | Val CC: 0.739 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 16 | Training Loss: 0.0394 | Grad Norm: 0.9878 | Val RMSE: 0.613 | Val CC: 0.733 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 17 | Training Loss: 0.0390 | Grad Norm: 2.7189 | Val RMSE: 0.538 | Val CC: 0.734 | LR: 2.50e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 18 | Training Loss: 0.0327 | Grad Norm: 2.5314 | Val RMSE: 0.518 | Val CC: 0.737 | LR: 1.25e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 19 | Training Loss: 0.0383 | Grad Norm: 1.4724 | Val RMSE: 0.517 | Val CC: 0.761 | LR: 1.25e-05


Training:   0%|          | 0/82 [00:00<?, ?it/s]

Validation:   0%|          | 0/21 [00:00<?, ?it/s]

Epoch 20 | Training Loss: 0.0329 | Grad Norm: 2.5749 | Val RMSE: 0.520 | Val CC: 0.762 | LR: 6.25e-06
Early stopping triggered.
