In [None]:
# Ignore warnings for cleaner logs
import warnings
warnings.simplefilter("ignore")

In [None]:
# Installing pyloudnorm for LUFS Normalization
!pip install pyloudnorm

Collecting pyloudnorm
  Downloading pyloudnorm-0.1.1-py3-none-any.whl.metadata (5.6 kB)
Downloading pyloudnorm-0.1.1-py3-none-any.whl (9.6 kB)
Installing collected packages: pyloudnorm
Successfully installed pyloudnorm-0.1.1


In [None]:
# Making Necessary Imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import librosa
import os
from torch.utils.data import Dataset, DataLoader, Subset
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr
from tqdm import tqdm
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig
import pyloudnorm as pyln
import math

2025-12-18 06:11:43.751610: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766038303.913667      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766038303.961677      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766038304.350306      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766038304.350342      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766038304.350345      24 computation_placer.cc:177] computation placer alr

In [None]:
# Load the train and test metadata tables (train additionally carries the label column)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/shl-ft-model/train_updated.csv',
        '/kaggle/input/shl-ft-model/test_updated.csv',
    )
)

In [5]:
# Preview the first five rows of the training table
train_df.head(n=5)

Unnamed: 0,filename,label,path,text
0,audio_173,3.0,/kaggle/input/shl-intern-hiring-assessment-202...,My favorite place to visit would be Japan bec...
1,audio_138,3.0,/kaggle/input/shl-intern-hiring-assessment-202...,I loved reading and my hobbies as reading. Em...
2,audio_127,2.0,/kaggle/input/shl-intern-hiring-assessment-202...,"Yeah, my favorite place to visit is Thirmala...."
3,audio_95,2.0,/kaggle/input/shl-intern-hiring-assessment-202...,I am going to tell about my hobby. And my hob...
4,audio_73,3.5,/kaggle/input/shl-intern-hiring-assessment-202...,hmm this is a tough one so my best day of my ...


In [6]:
# Preview the first five rows of the test table (no label column here)
test_df.head(n=5)

Unnamed: 0,filename,path,text
0,audio_141,/kaggle/input/shl-intern-hiring-assessment-202...,I love I love when I love stories Sherlock Ho...
1,audio_114,/kaggle/input/shl-intern-hiring-assessment-202...,"I have a lot of favorite days, but one of the..."
2,audio_17,/kaggle/input/shl-intern-hiring-assessment-202...,My topic is describe the scene of a hospital....
3,audio_76,/kaggle/input/shl-intern-hiring-assessment-202...,A playground has a lot of equipment. It typic...
4,audio_156,/kaggle/input/shl-intern-hiring-assessment-202...,the best day of my life are the most days whe...


In [None]:
# Analysing the distribution of the label: how many clips received each score
train_df.loc[:, 'label'].value_counts()

label
3.0    154
2.0     90
2.5     72
3.5     46
5.0     20
4.0     15
4.5     10
1.0      1
1.5      1
Name: count, dtype: int64

In [None]:
# Forming bins for stratified split: each label is truncated to its integer part
# (e.g. 2.5 -> 2), which merges the half-point scores into the bin below them
train_df['bin'] = [int(score) for score in train_df['label']]

In [None]:
# Analysing bin distribution: number of training clips per integer bin
train_df.loc[:, 'bin'].value_counts()

bin
3    200
2    162
4     25
5     20
1      2
Name: count, dtype: int64

In [None]:
# Config Class containing essential hyperparameters
class CONFIG:
    """Run settings, exposed as plain instance attributes on ``cfg``."""

    def __init__(self):
        settings = dict(
            # Seed handed to the torch RNGs
            random_seed=42,
            # Audio is loaded at this rate (Hz); batches hold this many clips
            sample_rate=16000,
            batch_size=4,
            # Model identifiers passed to the encoders' from_pretrained calls
            text_encoder="yiiino/deberta-v3-large-cola",
            wav_encoder="openai/whisper-large-v3-turbo",
            # Training settings; nothing in the visible notebook reads these
            epochs=150,
            lr=1e-4,
            es_patience=5,
        )
        for name, value in settings.items():
            setattr(self, name, value)


cfg = CONFIG()

In [None]:
# Setting Device: use CUDA when PyTorch can see a GPU, otherwise fall back to CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
# Setting the random seeds so reruns draw the same numbers.
# Torch's generator is seeded first, then every visible GPU, then NumPy's
# global RNG (previously left unseeded although NumPy-backed libraries are imported).
torch.manual_seed(cfg.random_seed)
if torch.cuda.is_available():
    # manual_seed_all covers all GPUs, including the current one
    torch.cuda.manual_seed_all(cfg.random_seed)
np.random.seed(cfg.random_seed)

In [None]:
# Defining the PyTorch Dataset Class for Train and Test Datasets
class FusionMOSDataset(Dataset):
    """Yields ``(text, audio)`` or ``(text, audio, target)`` for one dataframe row.

    Args:
        df: DataFrame with ``text`` and ``path`` columns, plus ``label`` when
            ``split == 'train'``.
        split: ``'train'`` makes ``__getitem__`` also return the target; any
            other value returns only ``(text, audio)``.
        sample_rate: Rate (Hz) the audio is resampled to when loaded.
        log_target: When true, the training target is ``log1p(label)``.
    """

    # Target loudness: -23 LUFS (speech standard)
    TARGET_LUFS = -23.0
    # Block length (seconds) given to the loudness meter; a clip shorter than
    # one block cannot be measured.
    BLOCK_SECONDS = 0.400

    def __init__(self, df, split, sample_rate=16000, log_target=True):
        self.df = df
        self.split = split
        self.sample_rate = sample_rate
        self.log_target = log_target
        # EBU R128 meter. It depends only on the sample rate, so it is built
        # once here instead of once per item.
        self.meter = pyln.Meter(self.sample_rate, block_size=self.BLOCK_SECONDS)

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def _normalize_loudness(self, audio):
        """Scale ``audio`` to ``TARGET_LUFS``; return it unchanged if loudness is undefined."""
        # Too short for a single meter block: measuring would raise, so skip it
        if audio.shape[0] < self.BLOCK_SECONDS * self.sample_rate:
            return audio

        loudness = self.meter.integrated_loudness(audio)
        # A silent clip has no finite loudness; applying the resulting gain
        # would fill the waveform with NaN/inf, so leave it as loaded.
        if not np.isfinite(loudness):
            return audio

        return pyln.normalize.loudness(audio, loudness, self.TARGET_LUFS)

    def __getitem__(self, idx):
        """Load, loudness-normalise and tensorise the clip at position ``idx``."""
        row = self.df.iloc[idx]

        # text
        text = row["text"]

        # audio
        audio, _ = librosa.load(row["path"], sr=self.sample_rate, mono=True)
        audio = torch.tensor(self._normalize_loudness(audio), dtype=torch.float32)

        # label
        if self.split == 'train':
            y = torch.tensor(float(row["label"]), dtype=torch.float32)
            if self.log_target:
                y = torch.log1p(y)

            return text, audio, y
        else:
            return text, audio

# Collate function for batching where labels are present (train dataset)
def collate_fn(batch):
    """Turn a list of ``(text, audio, label)`` samples into batch-level containers.

    Texts and audio tensors stay as Python lists (the audio tensors are not
    stacked, so clips may differ in length); labels are stacked into one tensor.
    """
    text_col, audio_col, label_col = tuple(zip(*batch))
    return [*text_col], [*audio_col], torch.stack(label_col)

# Collate function for batching where labels are absent (test dataset)
def collate_fn_test(batch):
    """Turn a list of ``(text, audio)`` samples into a list of texts and a list of audio tensors."""
    text_col, audio_col = tuple(zip(*batch))
    return [*text_col], [*audio_col]

In [None]:
# Defining the training and test datasets; only the 'train' split returns targets
train_ds = FusionMOSDataset(
    df=train_df,
    split='train',
    sample_rate=cfg.sample_rate,
    log_target=True,
)
test_ds = FusionMOSDataset(
    df=test_df,
    split='test',
    sample_rate=cfg.sample_rate,
    log_target=True,
)

# Defining the training and test Dataloaders.
# Neither loader shuffles, so batches follow dataframe row order.
train_loader = DataLoader(
    dataset=train_ds,
    collate_fn=collate_fn,
    batch_size=cfg.batch_size,
    shuffle=False,
)
test_loader = DataLoader(
    dataset=test_ds,
    collate_fn=collate_fn_test,
    batch_size=cfg.batch_size,
    shuffle=False,
)

In [None]:
# Defining the Text Encoder Model Class
class TextEncoder(nn.Module):
    """Frozen pretrained text model that maps transcripts to mean-pooled embeddings."""

    def __init__(self, model_name):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

        # Freezing the parameters of the text encoder
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, texts):
        """Return one embedding per transcript in ``texts``."""
        # The encoder's own parameters say which device the inputs must live on
        device = next(self.model.parameters()).device

        # Tokenizing the transcripts (padded per batch, truncated to 256 tokens)
        encoded = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )
        batch = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Retrieving token embeddings without tracking gradients
        with torch.no_grad():
            token_states = self.model(**batch).last_hidden_state

        # Masked mean pooling: padding positions contribute neither to the sum
        # nor to the token count, giving a single embedding per transcript
        weights = batch["attention_mask"].unsqueeze(-1)
        summed = (token_states * weights).sum(dim=1)
        counts = weights.sum(dim=1)
        return summed / counts

In [None]:
# Defining the WAV Encoder Model Class
class WAVEncoder(nn.Module):
    """Frozen pretrained audio model whose encoder output is mean-pooled per clip."""

    def __init__(self, model_name, sampling_rate=16000):
        super().__init__()
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        self.sampling_rate = sampling_rate

        # Freezing the parameters of the WAV encoder
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, audio_list):
        """Return one embedding per waveform tensor in ``audio_list``."""
        # The processor is given NumPy arrays, so bring each tensor to the CPU first
        waveforms = [clip.cpu().numpy() for clip in audio_list]

        # Processing the audio arrays into the model's input features
        features = self.processor(
            waveforms,
            sampling_rate=self.sampling_rate,
            return_tensors="pt"
        ).input_features

        # Retrieving encoder states without tracking gradients; only the
        # encoder sub-module of the loaded model is run
        device = next(self.model.parameters()).device
        with torch.no_grad():
            frame_states = self.model.encoder(features.to(device)).last_hidden_state

        # Mean pooling over dim 1 for a single WAV embedding per clip
        return frame_states.mean(dim=1)

In [None]:
# Defining the MLP Regressor Head for MOS Prediction
class FusionMLP(nn.Module):
    """Three-layer MLP mapping a fused embedding to a single scalar per sample."""

    def __init__(self, input_dim):
        super().__init__()
        # High dropout to prevent overfitting. The layer order (and therefore
        # the Sequential indices) must stay fixed so saved weights still match.
        layers = [
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Map ``x`` of shape (batch, input_dim) to predictions of shape (batch,)."""
        scores = self.net(x)
        # Drop the trailing size-1 feature axis
        return scores.squeeze(1)

In [None]:
# Defining the Combined Model Class
class FusionMOSModel(nn.Module):
    """Text encoder + audio encoder whose concatenated embeddings feed an MLP regressor."""

    def __init__(self, text_encoder, wav_encoder, sampling_rate):
        super().__init__()
        self.text_encoder = TextEncoder(text_encoder)
        self.audio_encoder = WAVEncoder(wav_encoder, sampling_rate)

        # Retrieving the embedding sizes of the text and WAV encoders from their configs
        self.text_dim, self.wav_dim = (
            AutoConfig.from_pretrained(encoder_name).hidden_size
            for encoder_name in (text_encoder, wav_encoder)
        )

        # The regressor consumes both embeddings side by side
        self.regressor = FusionMLP(self.text_dim + self.wav_dim)

    def forward(self, texts, audios):
        """Return the regressor output for a batch of transcripts and waveforms."""
        # Concatenating the embeddings to incorporate both text and audio context for prediction
        parts = (self.text_encoder(texts), self.audio_encoder(audios))
        return self.regressor(torch.cat(parts, dim=1))

In [None]:
# Defining the model (it is built on the CPU; it is moved to DEVICE below)
model = FusionMOSModel(
    cfg.text_encoder,
    cfg.wav_encoder,
    cfg.sample_rate
)

# Loading the model checkpoint.
# map_location="cpu" lets a checkpoint that was saved from a GPU session be
# deserialized on a CPU-only machine as well, matching DEVICE's CPU fallback.
# The loaded dict is passed straight through so no second copy of the weights
# stays referenced after this statement.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(
    torch.load(
        '/kaggle/input/shl-ft-model/whisper_deberta_ft_fusion_lufs.pt',
        map_location="cpu",
    )
)

# Placing the model on the selected device (the trailing ';' hides the module repr)
model.to(DEVICE);

tokenizer_config.json:   0%|          | 0.00/400 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

In [None]:
# Defining the Validation Function
def evaluate(model, dataloader):
    """Score ``model`` on a labelled dataloader.

    The model output and the targets are both passed through ``expm1`` before
    scoring, i.e. the loader is expected to yield log1p-transformed targets.

    Returns:
        ``(rmse, cc)``: root-mean-squared error and Pearson correlation
        coefficient between targets and predictions.
    """
    model.eval()
    pred_batches, target_batches = [], []

    # Obtaining predictions without tracking gradients
    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader), desc="Validation: ")
        for texts, audios, y in progress:
            # Antilog of the predicted log-score, clamped to the 0-5 range
            mos_pred = torch.expm1(model(texts, audios)).clamp(0, 5)
            pred_batches.append(mos_pred.cpu())
            # Antilog of the target to put it on the same scale as the prediction
            target_batches.append(torch.expm1(y).cpu())

    preds = torch.cat(pred_batches).numpy()
    gts = torch.cat(target_batches).numpy()

    # Computing RMSE and Pearson Correlation Coefficient
    rmse = math.sqrt(mean_squared_error(gts, preds))
    cc = pearsonr(gts, preds)[0]
    return rmse, cc

In [None]:
# Obtaining RMSE and Pearson Correlation Coefficient on the Training Dataset.
# The metrics are joined with " | " so the printed line no longer ends in a
# dangling separator.
rmse_train, cc_train = evaluate(model, train_loader)
print(" | ".join([
    f"Train RMSE: {rmse_train:.3f}",
    f"Train CC: {cc_train:.3f}",
]))

Validation: 100%|██████████| 103/103 [04:02<00:00,  2.36s/it]

Train RMSE: 0.461 | Train CC: 0.802 | 





In [None]:
# Prediction Function
def predict(model, dataloader):
    """Run ``model`` over an unlabelled dataloader and return all predictions.

    Each batch output is passed through ``expm1`` (the model is treated as
    predicting a log1p-score) and clamped to the 0-5 range.

    Returns:
        1-D NumPy array of predictions in dataloader order.
    """
    model.eval()

    # Obtaining predictions for every batch without tracking gradients
    with torch.no_grad():
        batch_preds = [
            torch.expm1(model(texts, audios)).clamp(0, 5).cpu()
            for texts, audios in tqdm(dataloader, total=len(dataloader), desc="Prediction: ")
        ]

    return torch.cat(batch_preds).numpy()

In [None]:
# Finding the MOS Score on test data and rounding it off to 3 decimal digits.
# The test loader does not shuffle, so predictions are in test_df row order.
test_preds = predict(model=model, dataloader=test_loader)
test_df['label'] = np.round(test_preds, decimals=3)

Prediction: 100%|██████████| 50/50 [01:42<00:00,  2.04s/it]


In [None]:
# Output Format Sanity Check: show only the two columns that go into the submission
test_df.loc[:, ['filename', 'label']]

Unnamed: 0,filename,label
0,audio_141,2.562
1,audio_114,4.127
2,audio_17,3.257
3,audio_76,4.451
4,audio_156,3.114
...,...,...
192,audio_107,3.440
193,audio_15,3.251
194,audio_93,4.110
195,audio_31_1,2.488


In [None]:
# Forming the submission.csv file (filename + predicted label, no index column)
test_df.loc[:, ['filename', 'label']].to_csv(path_or_buf='submission.csv', index=False)