# TitaNet

In [1]:
%pip install tqdm
%pip install torch==1.13.0
%pip install torchaudio==0.13.0
%pip install pandas==1.3.3
%pip install librosa==0.8.1
%pip install transformers
%pip install wandb
%pip install pathlib
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.
Collecting torch==1.13.0
  Downloading torch-1.13.0-cp310-cp310-manylinux1_x86_64.whl.metadata (23 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==1.13.0)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==1.13.0)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==1.13.0)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==1.13.0)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Downloading torch-1.13.0-cp310-cp310-manylinux1_x86_64.whl (890.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m890.1/890.1 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:

# Libraries

In [3]:
import yaml
import pandas as pd
from glob import glob
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader
import pandas as pd
import torchaudio
from scipy.stats import pearsonr,spearmanr
from sklearn.metrics import mean_squared_error
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import transformers
import logging
import wandb
from pathlib import Path
import os
import math
import titanet_modules as modules

logger = logging.getLogger(__name__)


In [4]:
# Locale and cache-directory environment variables for this kernel session.
# NOTE(review): this cell sits after the import cell; libraries that read these
# variables at import time may not pick them up - confirm, or move this cell
# above the imports.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
%env TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
%env HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is typically a debugging aid and presumably
# slows GPU training - confirm it is still wanted for real runs.
%env CUDA_LAUNCH_BLOCKING=1

env: LC_ALL=C.UTF-8
env: LANG=C.UTF-8
env: TRANSFORMERS_CACHE=/home/ec2-user/SageMaker/cache
env: HF_DATASETS_CACHE=/home/ec2-user/SageMaker/cache
env: CUDA_LAUNCH_BLOCKING=1


# Data Loader

In [5]:
class NISQADataset(object):
    """Map-style dataset yielding ``(mel_spectrogram, mos)`` pairs for one data split.

    The split's metadata is read from ``<args.titanet_csv_file>/<d_type>.csv`` and
    each audio file is read from ``<args.data_dir>/<d_type>/<row[args.csv_deg]>``.
    The regression target is taken from the ``args.csv_mos_train`` column.

    Args:
        args: configuration object; the attributes read here are
            ``titanet_csv_file``, ``data_dir``, ``csv_deg``, ``csv_mos_train``,
            ``sample_rate``, ``n_fft``, ``win_length``, ``hop_length`` and ``n_mels``.
        d_type: split name (used both as the CSV file stem and the audio sub-folder).
    """

    def __init__(self, args, d_type):
        self.args = args
        self.d_type = d_type
        # Created lazily on first use and then reused for every sample
        # (the transform only depends on ``args``, not on the audio).
        self._mel_transform = None
        self.dfdata = self.load_CSV_data()

    def __len__(self):
        """Return the number of rows (utterances) in the split's CSV."""
        return len(self.dfdata)

    def __getitem__(self, idx):
        """Return the ``(spectrogram, mos)`` pair for the row at position ``idx``."""
        row_data = self.dfdata.iloc[idx]
        spectrogram, mos = self.generate_waveform_spectrogram_mos_pair(row_data)
        return spectrogram, mos

    def generate_spectrograms(self, waveform):
        """Apply the mel-spectrogram transform configured by ``args`` to ``waveform``."""
        mel_transform = getattr(self, "_mel_transform", None)
        if mel_transform is None:
            mel_transform = torchaudio.transforms.MelSpectrogram(
                sample_rate=self.args.sample_rate,
                n_fft=self.args.n_fft,
                win_length=self.args.win_length,
                hop_length=self.args.hop_length,
                n_mels=self.args.n_mels,
            )
            self._mel_transform = mel_transform
        return mel_transform(waveform)

    def generate_waveform_spectrogram_mos_pair(self, row_data):
        """Load one utterance and return ``(spectrogram, mos_rating)``.

        The waveform is mixed down to a single channel and resampled to
        ``args.sample_rate`` (the rate the mel transform is configured for)
        before the spectrogram is computed; the leading channel axis is then
        squeezed away.
        """
        audio_path = self.args.data_dir + '/' + self.d_type + '/' + row_data[self.args.csv_deg]
        mos_rating = torch.tensor(row_data[self.args.csv_mos_train])

        waveform, sample_rate = torchaudio.load(audio_path)

        # Mix multi-channel audio down to mono; otherwise squeeze(0) below would
        # leave the channel axis in place and batches could not be fed to the
        # 1-D convolutional encoder.
        # Assumes the channels-first layout torchaudio.load returns by default.
        if waveform.dim() > 1 and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to the rate the mel transform expects instead of a
        # hard-coded 16 kHz, so the two can never disagree.
        target_rate = self.args.sample_rate
        if sample_rate != target_rate:
            resample_transform = torchaudio.transforms.Resample(
                orig_freq=sample_rate, new_freq=target_rate
            )
            waveform = resample_transform(waveform)

        spectrogram = self.generate_spectrograms(waveform).squeeze(0)

        return spectrogram, mos_rating

    def load_CSV_data(self):
        """Read ``<args.titanet_csv_file>/<d_type>.csv`` into a DataFrame."""
        data = pd.read_csv(self.args.titanet_csv_file + '/' + self.d_type + '.csv')
        return data

In [6]:
def custom_collate_fn(batch):
    """Collate ``(spectrogram, mos)`` samples into a pair of batched tensors.

    Every spectrogram is right-padded with zeros along its last axis up to
    the longest sample in the batch, then the spectrograms and the MOS labels
    are each stacked along a new leading batch axis.
    """
    spectrograms, mos = [], []
    for sample in batch:
        spectrograms.append(sample[0])
        mos.append(sample[1])

    longest = max(spec.shape[-1] for spec in spectrograms)

    padded = []
    for spec in spectrograms:
        missing = longest - spec.shape[-1]
        padded.append(F.pad(spec, (0, missing)))

    return torch.stack(padded), torch.stack(mos)

In [7]:
def listnet_loss(y_i, z_i):
    """ListNet-style listwise loss: cross-entropy between two softmax distributions.

    Args:
        y_i: target scores; softmax over dim 0 gives the target distribution.
        z_i: predicted scores; softmax over dim 0 gives the predicted distribution.

    Returns:
        Scalar tensor ``-sum(softmax(y_i) * log_softmax(z_i))``.
    """
    target_probs = F.softmax(y_i.float(), dim=0)
    # log_softmax is computed in log-space, so a probability that underflows to 0
    # yields a large finite negative value instead of log(0) = -inf (which turned
    # the product 0 * -inf into NaN).
    pred_log_probs = F.log_softmax(z_i.float(), dim=0)
    return -torch.sum(target_probs * pred_log_probs)

# Model Architecture

In [8]:
class TitaNetEncoder(nn.Module):
    """Convolutional encoder: prolog -> mega blocks -> epilog -> attentive pooling -> linear head.

    Shape legend used in the comments below:
        B: batch size, M: number of mel bins, T: time frames,
        H: hidden size, DE: encoder output size, E: embedding size.
    """

    def __init__(
        self,
        n_mels,
        n_mega_blocks,
        n_sub_blocks,
        hidden_size,
        output_size,
        mega_block_kernel_size,
        attention_hidden_size,
        embedding_size,
        prolog_kernel_size=3,
        epilog_kernel_size=1,
        se_reduction=16,
        dropout=0.5,
    ):
        super().__init__()

        # Input convolution mapping mel bins to the hidden width
        self.prolog = modules.ConvBlock1d(n_mels, hidden_size, prolog_kernel_size)

        # Stack of identically configured mega blocks, all at the hidden width
        stacked_blocks = []
        for _ in range(n_mega_blocks):
            stacked_blocks.append(
                MegaBlock(
                    hidden_size,
                    hidden_size,
                    mega_block_kernel_size,
                    n_sub_blocks,
                    se_reduction=se_reduction,
                    dropout=dropout,
                )
            )
        self.mega_blocks = nn.Sequential(*stacked_blocks)

        # Output convolution mapping the hidden width to the encoding width
        self.epilog = modules.ConvBlock1d(hidden_size, output_size, epilog_kernel_size)

        # Pooling is kept inside a Sequential container (no normalisation layer follows it)
        self.pool = nn.Sequential(
            AttentiveStatsPooling(output_size, attention_hidden_size),
        )

        # Final projection from the concatenated mean/std statistics
        self.linear = nn.Linear(output_size * 2, embedding_size)

    def forward(self, spectrograms):
        """Encode a batch of spectrograms ``[B, M, T]`` into outputs ``[B, E]`` (unclamped)."""
        # [B, M, T] -> [B, H, T]
        features = self.prolog(spectrograms)
        # [B, H, T] -> [B, H, T]
        features = self.mega_blocks(features)
        # [B, H, T] -> [B, DE, T]
        features = self.epilog(features)
        # [B, DE, T] -> [B, DE * 2] -> [B, E]
        return self.linear(self.pool(features))


class MegaBlock(nn.Module):
    """Residual block: a chain of depthwise conv sub-blocks plus squeeze-excitation,
    added to a 1x1-conv skip path, followed by ReLU and dropout."""

    def __init__(
        self,
        input_size,
        output_size,
        kernel_size,
        n_sub_blocks,
        se_reduction=16,
        dropout=0.5,
    ):
        super().__init__()

        # Dropout probability reused in forward()
        self.dropout = dropout

        # The first sub-block maps input_size -> output_size; every later one
        # keeps the width at output_size. Squeeze-excitation closes the chain.
        layers = []
        in_channels = input_size
        for _ in range(n_sub_blocks):
            layers.append(
                modules.ConvBlock1d(
                    in_channels,
                    output_size,
                    kernel_size,
                    activation="relu",
                    dropout=dropout,
                    depthwise=True,
                )
            )
            in_channels = output_size
        layers.append(modules.SqueezeExcitation(output_size, reduction=se_reduction))
        self.sub_blocks = nn.Sequential(*layers)

        # Skip path: pointwise convolution followed by batch normalisation
        self.skip_connection = nn.Sequential(
            nn.Conv1d(input_size, output_size, kernel_size=1),
            nn.BatchNorm1d(output_size),
        )

    def forward(self, prolog_outputs):
        """
        Map features of shape [B, H, T] to features of shape [B, H, T].

        B: batch size
        H: hidden size
        T: maximum number of time steps (frames)
        """
        residual = self.skip_connection(prolog_outputs)
        transformed = self.sub_blocks(prolog_outputs)
        activated = F.relu(residual + transformed)
        return F.dropout(activated, p=self.dropout, training=self.training)

class AttentiveStatsPooling(nn.Module):
    """Attention-weighted mean and standard deviation pooling over the time axis."""

    def __init__(self, input_size, hidden_size, eps=1e-6):
        super().__init__()

        # Lower bound applied to the variance before taking the square root
        self.eps = eps

        # Two-layer attention scorer applied to each frame
        self.in_linear = nn.Linear(input_size, hidden_size)
        self.out_linear = nn.Linear(hidden_size, input_size)

    def forward(self, encodings):
        """
        Pool encoder outputs of shape [B, DE, T] into
        utterance-level statistics of shape [B, DE * 2].

        B: batch size
        T: maximum number of time steps (frames)
        DE: encoding output size
        """
        # Score every frame, one score per channel: [B, DE, T] -> [B, DE, T]
        frames_last = encodings.transpose(1, 2)
        hidden = torch.tanh(self.in_linear(frames_last))
        energies = self.out_linear(hidden).transpose(1, 2)

        # Attention weights sum to one along the time axis
        alphas = energies.softmax(dim=2)

        # Weighted first moment: [B, DE, T] -> [B, DE]
        means = (alphas * encodings).sum(dim=2)

        # Weighted second moment minus squared mean gives the variance,
        # floored at eps so the square root stays well defined
        second_moment = (alphas * encodings ** 2).sum(dim=2)
        stds = (second_moment - means ** 2).clamp(min=self.eps).sqrt()

        # [[B, DE]; [B, DE]] -> [B, DE * 2]
        return torch.cat((means, stds), dim=1)


In [9]:
class Config:
    """Attribute-style container: every keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        vars(self).update(entries)

# Create Dataloader

In [11]:
# NOTE(review): absolute, machine-specific path - consider making it configurable.
config_file = '/home/ec2-user/SageMaker/Noise_modelling/config.yaml'

# Read with an explicit encoding so parsing does not depend on the kernel's locale.
# NOTE(review): yaml.load with FullLoader is fine for a trusted local file;
# prefer yaml.safe_load if this config could ever come from an untrusted source.
with open(config_file, "r", encoding="utf-8") as ymlfile:
    config_dict = yaml.load(ymlfile, Loader=yaml.FullLoader)

# Expose the YAML keys as attributes (args.batch_size, args.output_dir, ...)
args = Config(**config_dict)

# mkdir(parents=True, exist_ok=True) is already a no-op when the directory
# exists, so no separate existence check is needed.
Path(args.output_dir).mkdir(parents=True, exist_ok=True)

In [12]:
dataset_train = NISQADataset(args, 'train')
dataset_val = NISQADataset(args, 'val')
dataset_test = NISQADataset(args, 'test')


def make_reader(dataset, training):
    """Build a DataLoader over `dataset` with the notebook's shared settings.

    Args:
        dataset: the dataset to iterate over.
        training: when True the loader shuffles and drops the last partial
            batch; when False it keeps the original order and yields every
            sample, so evaluation metrics cover the whole split and are
            reproducible from run to run.
    """
    return DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=training,
        num_workers=4,
        pin_memory=True,
        persistent_workers=True,
        collate_fn=custom_collate_fn,
        drop_last=training,
    )


train_reader = make_reader(dataset_train, training=True)
# Evaluation loaders must not shuffle or drop samples: doing so scored a
# different random subset of the split on every evaluation pass.
val_reader = make_reader(dataset_val, training=False)
test_reader = make_reader(dataset_test, training=False)

In [14]:
# Number of utterances in the test split
len(dataset_test)

712

In [15]:
# Instantiate the encoder from the loaded configuration; keyword arguments make
# the mapping from config keys to constructor parameters explicit.
model = TitaNetEncoder(
    n_mels=args.n_mels,
    n_mega_blocks=args.n_mega_blocks,
    n_sub_blocks=args.n_sub_blocks,
    hidden_size=args.encoder_hidden_size,
    output_size=args.encoder_output_size,
    mega_block_kernel_size=args.mega_block_kernel_size,
    attention_hidden_size=args.attention_hidden_size,
    embedding_size=args.embedding_size,
    prolog_kernel_size=args.prolog_kernel_size,
    epilog_kernel_size=args.epilog_kernel_size,
    se_reduction=args.se_reduction,
    dropout=args.dropout,
)

In [16]:
# Release cached GPU memory left over from earlier cells
torch.cuda.empty_cache()

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = model.to(device)

In [17]:
# Report the number of trainable parameters, in millions.
div = 1e6
# Tensor.numel() gives the element count directly, without a NumPy round-trip
# over torch.Size.
n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / div
print(f"This model has {n_params:.2f}M parameters")

This model has 0.89M parameters


In [18]:
# model

# Eval Results

In [19]:
def eval_result(predicts, labels):
    """Summarise predictions against labels.

    Returns a dict with the Pearson correlation, the Spearman rank correlation
    and the root-mean-squared error (stored under "Eval Loss").
    The "Prearson Corr" key is misspelled, but it is kept as-is because it is
    the name under which the metric is reported downstream.
    """
    spearman_corr = spearmanr(predicts, labels)[0]
    corr = pearsonr(predicts, labels)[0]
    rmse = np.sqrt(mean_squared_error(predicts, labels))
    return {"Prearson Corr": corr, "Spearman Corr": spearman_corr, "Eval Loss": rmse}

def eval_model(model, validset_reader):
    """Run `model` over every batch of `validset_reader` and score the predictions.

    The model is switched to eval mode (and left there), gradients are
    disabled, and the flattened predictions and labels of all batches are
    passed to `eval_result`. Tensors are moved to the module-level `device`.
    """
    model.eval()
    predicts, labels = [], []
    with torch.no_grad():
        for spectrograms, mos in tqdm(validset_reader):
            spectrograms = spectrograms.to(device)
            targets = mos.to(device).to(torch.float)

            outputs = model(spectrograms)

            # Match the label dtype, then flatten both to plain Python lists
            predicts.extend(outputs.type_as(targets).view(-1).tolist())
            labels.extend(targets.view(-1).tolist())

        results = eval_result(predicts, labels)

    return results

In [26]:
def train_model(model, args, trainset_reader, validset_reader,testset_reader):
    """Train `model` with an MSE loss, then evaluate, log and checkpoint after every epoch.

    Args:
        model: the network to optimise; called as ``model(spectrograms)``.
        args: configuration object; the attributes read here are ``output_dir``,
            ``gradient_accumulation_steps``, ``num_train_epochs``,
            ``learning_rate`` and ``max_model_save``.
        trainset_reader: sized iterable of ``(spectrograms, mos)`` training batches.
        validset_reader: batches passed to ``eval_model`` for validation metrics.
        testset_reader: batches passed to ``eval_model`` for test metrics.

    Side effects:
        Logs per-epoch metrics to wandb, writes ``model_<epoch>_best.pt`` into
        ``args.output_dir`` after each epoch and deletes the oldest checkpoint
        once more than ``args.max_model_save`` exist. Tensors are moved to the
        module-level ``device``.
    """
    saved_checkpoints = []
    save_path = args.output_dir

    # A value of 0 means "no accumulation"; clamping to 1 keeps the schedule
    # division and the modulo below from raising ZeroDivisionError.
    accumulation_steps = max(1, int(args.gradient_accumulation_steps))
    num_epochs = int(args.num_train_epochs)

    # len(loader) is the real number of batches per epoch (it already accounts
    # for the batch size and for a dropped last batch).
    steps_per_epoch = len(trainset_reader)
    t_total = (steps_per_epoch * num_epochs) // accumulation_steps

    param_optimizer = list(model.named_parameters())
    # NOTE(review): the LayerNorm entries match no parameter of the model built
    # in this notebook (it uses BatchNorm) - confirm the intended no-decay set.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    # transformers.AdamW is deprecated in favour of the PyTorch implementation.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=1e-8)
    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=t_total
    )

    global_step = 0
    for epoch in range(num_epochs):
        # eval_model leaves the model in eval mode, so re-enable training
        # behaviour (dropout, batch-norm updates) at the start of every epoch.
        model.train()
        optimizer.zero_grad()

        # Loss statistics are reset per epoch so the reported value describes
        # this epoch only, not a running average since the start of training.
        epoch_loss = 0.0
        epoch_steps = 0

        logger.info('Training Started !')
        for spectrograms,mos in tqdm(trainset_reader):
            spectrograms, mos = spectrograms.to(device), mos.to(device)

            pred_score = model(spectrograms).view(-1)
            target_score = mos.view(-1).to(torch.float)

            # Plain MSE regression loss (a ListNet ranking term via
            # listnet_loss was previously blended in here and is disabled).
            loss = F.mse_loss(pred_score, target_score)

            epoch_loss += loss.item()
            epoch_steps += 1

            # Scale so that the accumulated gradient is an average over the
            # micro-batches that make up one optimizer step.
            (loss / accumulation_steps).backward()
            global_step += 1

            if global_step % accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

        mean_loss = epoch_loss / max(1, epoch_steps)
        current_lr = scheduler.get_last_lr()[0]
        logger.info('Epoch: {}, Loss: {} ,LR : {}'.format(epoch, mean_loss, current_lr))

        train_res = {
            "Train Loss": mean_loss,
            "Learning Rate" : current_lr,
        }

        logger.info('Eval Started ! ')
        result_dict_val = eval_model(model, validset_reader)
        result_dict_test = eval_model(model, testset_reader)
        logger.info(result_dict_val)

        train_res.update({'val':result_dict_val})
        train_res.update({'test':result_dict_test})

        wandb.log(train_res)

        # A checkpoint is written after every epoch (the "_best" suffix is part
        # of the file name only; no metric comparison is made).
        check_point_path = save_path + f"/model_{epoch}_best.pt"

        torch.save({'epoch': epoch,
                    'model': model.state_dict()},check_point_path)

        saved_checkpoints.append(check_point_path)

        # Keep only the most recent `max_model_save` checkpoints on disk
        if len(saved_checkpoints) > args.max_model_save:
            old_checkpoint = saved_checkpoints.pop(0)
            if os.path.exists(old_checkpoint):
                os.remove(old_checkpoint)

In [27]:
wandb.login(key=args.wandb_key)

# Record the hyper-parameters with the run, but keep the API key out of the
# run config that wandb uploads and displays.
run_config = {key: value for key, value in vars(args).items() if key != "wandb_key"}
wandb.init(project=args.wandb_proj_name, config=run_config, name=args.wandb_run_name)

try:
    train_model(model, args, train_reader, val_reader, test_reader)
except BaseException:
    # Close the run as failed when training errors out or is interrupted
    # (e.g. KeyboardInterrupt), instead of leaving it open in the kernel.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

 19%|█▉        | 17/89 [00:11<00:49,  1.46it/s]
Exception ignored in: <function tqdm.__del__ at 0x7f4542321ea0>
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tqdm/std.py", line 1147, in __del__
    def __del__(self):
KeyboardInterrupt: 

KeyboardInterrupt

