# TrainRawNet2: On BD ASR Dataset

Necessary Imports:

In [1]:
import csv
import glob
import os
from pathlib import Path

import pandas as pd
import torch
import torchaudio
from torch.utils import data

Directories and locations:

In [115]:
# Directories are assumed to have a trailing '/' or '\\' in all the subsequent code
# (CURRENT_WORKING_DIRECTORY itself is only passed to os.chdir below).

# Project root; the relative paths below are resolved against it once it
# becomes the current working directory.
CURRENT_WORKING_DIRECTORY = "W:/SpeakerRecognitionResearch"

# Relative locations of the wav files and of the TSV index for the Bangla ASR data.
BANGLA_ASR_DATASET_DIRECTORY = "data/BanglaASR/WavFiles/"
BANGLA_ASR_TSV_LOCATION = "data/BanglaASR/utt_spk_text.tsv"

# To avoid file location related errors, we make sure "SpeakerRecognitionResearch" root folder is the current working directory.
os.chdir(CURRENT_WORKING_DIRECTORY)
# Last expression of the cell: displays the working directory now in effect.
os.getcwd()

'W:\\SpeakerRecognitionResearch'

Constants:

In [116]:
# Every example is a fixed-length window: NUMBER_OF_SAMPLES / SAMPLE_RATE
# = 32000 / 16000 = 2 seconds of audio per tensor.
SAMPLE_RATE = 16000
NUMBER_OF_SAMPLES = 2 * SAMPLE_RATE

# Bangla ASR recordings start with roughly half a second of silence.
# This many seconds are cut from the left of each audio clip.
TRIM_AMOUNT_TIME = 0.5

In [117]:
# Device: use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Using {}.".format(device))
if device == "cuda":
    # Report the name of GPU 0.
    print(torch.cuda.get_device_name(0))

Using cuda.
NVIDIA GeForce GTX 1050


## Custom dataset for Bangla ASR

This custom dataset is written with the assumption that the dataset has already been converted into wav format. Check the evaluate_asr_ds.ipynb notebook for the conversion method.

In [118]:
class BanglaAsrDataset(data.Dataset):
    """Bangla ASR wav files paired with the speaker ids listed in a TSV index.

    Indexing returns ``(signal, label)``. ``signal`` is a mono waveform of
    exactly ``target_num_samples`` samples at ``target_sample_rate``
    (resampled, trimmed / zero-padded and peak-normalised) on ``device``.
    ``label`` is the speaker id the TSV maps the wav file's stem to.

    Args:
        dataset_dir: Directory searched recursively for ``*.wav`` files.
        tsv_loc: Tab-separated file without header; its first two columns are
            used as (wav name, speaker id) and its last column is dropped.
        target_sample_rate: Sample rate every signal is resampled to.
        target_num_samples: Exact number of samples per returned signal.
        trim_amount_time: Seconds removed from the start of a signal when it
            is long enough to still fill ``target_num_samples`` afterwards.
        device: Device the loaded signal is moved to.
    """

    def __init__(self, dataset_dir, tsv_loc, target_sample_rate, target_num_samples, trim_amount_time, device):
        # dtype=str keeps ids such as '000020a912' as text; without it pandas
        # may infer a numeric type for id-like values and the lookup by file
        # stem in __getitem__ would miss.
        tsv_dataframe = pd.read_csv(tsv_loc, quoting=csv.QUOTE_NONE, sep='\t', header=None, dtype=str)

        # The TSV file contains speech annotations in the third column.
        # We don't need the annotations, so we drop the column
        tsv_dataframe = tsv_dataframe.iloc[:, :-1]

        self.dataset_dir = dataset_dir
        self.wav_to_spk_mapping = dict(sorted(tsv_dataframe.values.tolist()))
        self.wav_path_list = self._get_audio_path_list()
        self.target_sample_rate = target_sample_rate
        self.target_num_samples = target_num_samples
        self.trim_amount_time = trim_amount_time
        self.device = device

    def _get_audio_path_list(self):
        """Return the normalised paths of all wav files under ``dataset_dir``, sorted."""
        # os.path.join works with or without a trailing separator on dataset_dir.
        pattern = os.path.join(self.dataset_dir, '**', '*.wav')
        files = glob.glob(pattern, recursive=True)

        # Normalize the file paths to get '/' or '\\' consistently depending on OS.
        # glob returns files in arbitrary order; sorting makes indices reproducible.
        return sorted(os.path.normpath(path) for path in files)

    def _resample_to_target_sr(self, signal, sample_rate):
        """Resample ``signal`` to ``target_sample_rate`` if its rate differs."""
        if sample_rate != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            # The signal may already live on the GPU; keep the resampler on the same device.
            resampler = resampler.to(signal.device)
            signal = resampler(signal)
        return signal

    def _mix_down_to_mono(self, signal):
        """Average the channels (dim 0) into one, keeping the channel dimension."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _trim(self, signal):
        """Cut or zero-pad ``signal`` to exactly ``target_num_samples`` samples."""
        total_samples = signal.shape[-1]

        # We cut a fixed amount on the left side if the signal is big enough
        trim_samples_amount = int(self.target_sample_rate * self.trim_amount_time)
        if total_samples >= trim_samples_amount + self.target_num_samples:
            signal = signal[:, trim_samples_amount:]
            total_samples = signal.shape[-1]

        if total_samples > self.target_num_samples:
            # We cut from the right side if the signal is too big
            signal = signal[:, :self.target_num_samples]
        elif total_samples < self.target_num_samples:
            # We add zero padding on the right if signal is too small
            num_missing_samples = self.target_num_samples - total_samples
            signal = torch.nn.functional.pad(signal, (0, num_missing_samples))

        return signal

    def _normalize_like_sincnet(self, signal):
        """Scale ``signal`` so that its largest absolute value is 1."""
        peak = torch.max(torch.abs(signal))
        if peak == 0:
            # An all-zero (silent) signal would otherwise become NaN (0 / 0).
            return signal
        return signal / peak

    def __len__(self):
        # __getitem__ indexes wav_path_list, so the length must count files on
        # disk rather than rows of the TSV index.
        return len(self.wav_path_list)

    def __getitem__(self, index):
        wav_path = self.wav_path_list[index]
        wav_name = Path(wav_path).stem
        # Raises KeyError when the wav file has no entry in the TSV index.
        label = self.wav_to_spk_mapping[wav_name]

        signal, sample_rate = torchaudio.load(wav_path)
        signal = signal.to(self.device)

        signal = self._resample_to_target_sr(signal, sample_rate)
        signal = self._mix_down_to_mono(signal)

        signal = self._trim(signal)
        signal = self._normalize_like_sincnet(signal)

        return signal, label

In [119]:
# Build the dataset from the notebook-level path, audio and device settings.
bangla_asr_dataset = BanglaAsrDataset(
    dataset_dir=BANGLA_ASR_DATASET_DIRECTORY,
    tsv_loc=BANGLA_ASR_TSV_LOCATION,
    target_sample_rate=SAMPLE_RATE,
    target_num_samples=NUMBER_OF_SAMPLES,
    trim_amount_time=TRIM_AMOUNT_TIME,
    device=device,
)

# Sanity check: a known wav name must map to its known speaker id.
assert bangla_asr_dataset.wav_to_spk_mapping['000020a912'] == '16cfb', "The dictionary returned wrong mapping"

In [120]:
# Every `bangla_asr_dataset[0]` access re-reads and re-processes the wav file,
# so fetch the first example once and reuse it for all three prints.
first_example = bangla_asr_dataset[0]
print(first_example)            # (signal, label) pair
print(first_example[0])         # the signal tensor alone
print(first_example[0].shape)   # shape of the signal tensor

(tensor([[-0.0076, -0.0041,  0.0149,  ..., -0.1191, -0.1366, -0.1302]],
       device='cuda:0'), '16cfb')
tensor([[-0.0076, -0.0041,  0.0149,  ..., -0.1191, -0.1366, -0.1302]],
       device='cuda:0')
torch.Size([1, 32000])


# Model !

In [135]:
from torch import nn
from torchsummary import summary

import numpy as np
import math

import torch.nn.functional as F
from tqdm import tqdm

## FRM

In [122]:
class FRM(nn.Module):
    """Channel-wise re-scaling of a 3-D feature map.

    The input is average-pooled over its last dimension, passed through a
    linear layer and a sigmoid, and the resulting per-channel value is
    multiplied into and/or added to the input.

    Args:
        nb_dim: Number of channels (size of dim 1 of the input).
        do_add: Add the per-channel value to the input.
        do_mul: Multiply the input by the per-channel value (done before the add).
    """

    def __init__(self, nb_dim, do_add = True, do_mul = True):
        super().__init__()
        self.fc = nn.Linear(nb_dim, nb_dim)
        self.sig = nn.Sigmoid()
        self.do_add = do_add
        self.do_mul = do_mul

    def forward(self, x):
        batch, channels = x.size(0), x.size(1)

        # One averaged value per (sample, channel).
        pooled = F.adaptive_avg_pool1d(x, 1).view(batch, -1)

        # Sigmoid output reshaped so it broadcasts over the last dimension of x.
        scale = self.sig(self.fc(pooled)).view(batch, channels, -1)

        if self.do_mul:
            x = x * scale
        if self.do_add:
            x = x + scale
        return x

## Residual Block wFRM

In [123]:
class Residual_block_wFRM(nn.Module):
    """Residual 1-D convolution block followed by max-pooling and an FRM gate.

    Args:
        nb_filts: Sequence whose first two entries are the input and output
            channel counts of the block.
        first: When true, the leading batch-norm + LeakyReLU is skipped and
            the input goes straight into the first convolution.
    """

    def __init__(self, nb_filts, first = False):
        super().__init__()
        in_channels, out_channels = nb_filts[0], nb_filts[1]

        self.first = first
        if not self.first:
            self.bn1 = nn.BatchNorm1d(num_features=in_channels)
        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope=0.3)

        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                               kernel_size=3, padding=1, stride=1)
        self.bn2 = nn.BatchNorm1d(num_features=out_channels)
        self.conv2 = nn.Conv1d(in_channels=out_channels, out_channels=out_channels,
                               kernel_size=3, padding=1, stride=1)

        # A 1x1 convolution matches the skip path to the new channel count.
        self.downsample = in_channels != out_channels
        if self.downsample:
            self.conv_downsample = nn.Conv1d(in_channels=in_channels, out_channels=out_channels,
                                             kernel_size=1, padding=0, stride=1)

        self.mp = nn.MaxPool1d(3)
        self.frm = FRM(nb_dim=out_channels, do_add=True, do_mul=True)

    def forward(self, x):
        shortcut = x

        # Pre-activation, skipped for the first block of the network.
        out = x if self.first else self.lrelu_keras(self.bn1(x))

        out = self.conv1(out)
        out = self.lrelu_keras(self.bn2(out))
        out = self.conv2(out)

        if self.downsample:
            shortcut = self.conv_downsample(shortcut)

        out += shortcut
        return self.frm(self.mp(out))

## LayerNorm

In [124]:
class LayerNorm(nn.Module):
    """Normalise the last dimension to zero mean and unit standard deviation.

    The normalised values are scaled by the learnable ``gamma`` and shifted by
    the learnable ``beta``; ``eps`` is added to the standard deviation.

    Args:
        features: Size of the last dimension of the input.
        eps: Constant added to the standard deviation before dividing.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        centered = x - x.mean(-1, keepdim=True)
        spread = x.std(-1, keepdim=True) + self.eps
        return self.gamma * centered / spread + self.beta

## SincConv Fast

In [125]:
class SincConv_fast(nn.Module):
    """Sinc-based convolution
    Parameters
    ----------
    in_channels : `int`
        Number of input channels. Must be 1.
    out_channels : `int`
        Number of filters.
    kernel_size : `int`
        Filter length. An even value is increased by one.
    sample_rate : `int`, optional
        Sample rate. Defaults to 16000.
    Usage
    -----
    See `torch.nn.Conv1d`
    Reference
    ---------
    Mirco Ravanelli, Yoshua Bengio,
    "Speaker Recognition from raw waveform with SincNet".
    https://arxiv.org/abs/1808.00158
    """

    @staticmethod
    def to_mel(hz):
        """Convert a frequency in Hz to the Mel scale."""
        return 2595 * np.log10(1 + hz / 700)

    @staticmethod
    def to_hz(mel):
        """Convert a Mel-scale value back to Hz."""
        return 700 * (10 ** (mel / 2595) - 1)

    def __init__(self, out_channels, kernel_size, sample_rate=16000, in_channels=1,
                 stride=1, padding=0, dilation=1, bias=False, groups=1, min_low_hz=50, min_band_hz=50):

        super().__init__()

        # Reject every configuration the layer does not implement.
        if in_channels != 1:
            msg = "SincConv only support one input channel (here, in_channels = {%i})" % (in_channels)
            raise ValueError(msg)
        if bias:
            raise ValueError('SincConv does not support bias.')
        if groups > 1:
            raise ValueError('SincConv does not support groups.')

        self.out_channels = out_channels
        # Filters are forced to an odd length so they are perfectly symmetric.
        self.kernel_size = kernel_size + 1 if kernel_size % 2 == 0 else kernel_size

        self.stride = stride
        self.padding = padding
        self.dilation = dilation

        self.sample_rate = sample_rate
        self.min_low_hz = min_low_hz
        self.min_band_hz = min_band_hz

        # Initial band edges are equally spaced on the Mel scale.
        low_hz = 30
        high_hz = self.sample_rate / 2 - (self.min_low_hz + self.min_band_hz)
        mel_edges = np.linspace(self.to_mel(low_hz),
                                self.to_mel(high_hz),
                                self.out_channels + 1)
        hz_edges = self.to_hz(mel_edges)

        # Learnable lower cut-off of each filter, shape (out_channels, 1).
        self.low_hz_ = nn.Parameter(torch.Tensor(hz_edges[:-1]).view(-1, 1))

        # Learnable band width of each filter, shape (out_channels, 1).
        self.band_hz_ = nn.Parameter(torch.Tensor(np.diff(hz_edges)).view(-1, 1))

        # Left half of a Hamming window; the right half follows by symmetry.
        half_kernel = self.kernel_size / 2
        n_lin = torch.linspace(0, half_kernel - 1, steps=int(half_kernel))
        self.window_ = 0.54 - 0.46 * torch.cos(2 * math.pi * n_lin / self.kernel_size)

        # Left half of the time axis scaled by 2*pi / sample_rate, shape (1, kernel_size // 2).
        n = (self.kernel_size - 1) / 2.0
        self.n_ = 2 * math.pi * torch.arange(-n, 0).view(1, -1) / self.sample_rate

    def forward(self, waveforms):
        """
        Parameters
        ----------
        waveforms : `torch.Tensor` (batch_size, 1, n_samples)
            Batch of waveforms.
        Returns
        -------
        features : `torch.Tensor` (batch_size, out_channels, n_samples_out)
            Batch of sinc filters activations.
        """

        # The time axis and window are plain tensors, so they are moved to the
        # input's device here.
        self.n_ = self.n_.to(waveforms.device)
        self.window_ = self.window_.to(waveforms.device)

        # Cut-off frequencies, kept above min_low_hz and at most sample_rate / 2.
        low = self.min_low_hz + torch.abs(self.low_hz_)
        high = torch.clamp(low + self.min_band_hz + torch.abs(self.band_hz_),
                           self.min_low_hz, self.sample_rate / 2)
        band = (high - low)[:, 0]

        f_times_t_low = torch.matmul(low, self.n_)
        f_times_t_high = torch.matmul(high, self.n_)

        # Expanded and simplified form of Eq. 4 of the SincNet paper: only the
        # left half is computed, then mirrored around the centre tap.
        sine_difference = torch.sin(f_times_t_high) - torch.sin(f_times_t_low)
        band_pass_left = (sine_difference / (self.n_ / 2)) * self.window_
        band_pass_center = 2 * band.view(-1, 1)
        band_pass_right = torch.flip(band_pass_left, dims=[1])

        band_pass = torch.cat([band_pass_left, band_pass_center, band_pass_right], dim=1)

        # Normalise so that the centre tap of every filter equals 1.
        band_pass = band_pass / (2 * band[:, None])

        self.filters = band_pass.view(self.out_channels, 1, self.kernel_size)

        return F.conv1d(waveforms, self.filters, stride=self.stride,
                        padding=self.padding, dilation=self.dilation,
                        bias=None, groups=1)

## RawNet2

In [126]:
class RawNet2(nn.Module):
    """RawNet2 speaker classifier operating on raw waveforms.

    Pipeline: layer-norm over the samples, sinc convolution, six residual
    blocks with FRM, a GRU whose last time step feeds a fully connected
    embedding layer, and a final classification layer.

    Args:
        d_args: Configuration mapping. The keys read here are ``nb_samp``,
            ``in_channels``, ``first_conv``, ``filts``, ``gru_node``,
            ``nb_gru_layer``, ``nb_fc_node`` and ``nb_classes``.
            ``filts[0]`` is the number of sinc filters, ``filts[1]`` the
            ``[in, out]`` channels of blocks 0-1 and ``filts[2]`` the
            ``[in, out]`` channels of block 2; blocks 3-5 use ``out`` for
            both. The mapping is not modified.
    """

    def __init__(self, d_args):
        super(RawNet2, self).__init__()

        filts = d_args['filts']
        # Work on copies: writing into d_args['filts'][2] would change the
        # caller's configuration and break a second model built from it.
        widening_filts = list(filts[2])
        wide_filts = list(widening_filts)
        wide_filts[0] = wide_filts[1]

        self.ln = LayerNorm(d_args['nb_samp'])
        self.first_conv = SincConv_fast(in_channels = d_args['in_channels'],
            out_channels = filts[0],
            kernel_size = d_args['first_conv']
            )

        self.first_bn = nn.BatchNorm1d(num_features = filts[0])
        self.lrelu = nn.LeakyReLU()
        self.lrelu_keras = nn.LeakyReLU(negative_slope = 0.3)

        self.block0 = nn.Sequential(Residual_block_wFRM(nb_filts = filts[1], first = True))
        self.block1 = nn.Sequential(Residual_block_wFRM(nb_filts = filts[1]))

        # block2 changes the channel count; blocks 3-5 keep it.
        self.block2 = nn.Sequential(Residual_block_wFRM(nb_filts = widening_filts))
        self.block3 = nn.Sequential(Residual_block_wFRM(nb_filts = wide_filts))
        self.block4 = nn.Sequential(Residual_block_wFRM(nb_filts = wide_filts))
        self.block5 = nn.Sequential(Residual_block_wFRM(nb_filts = wide_filts))
        self.avgpool = nn.AdaptiveAvgPool1d(1)

        self.bn_before_gru = nn.BatchNorm1d(num_features = wide_filts[-1])
        self.gru = nn.GRU(input_size = wide_filts[-1],
            hidden_size = d_args['gru_node'],
            num_layers = d_args['nb_gru_layer'],
            batch_first = True)

        self.fc1_gru = nn.Linear(in_features = d_args['gru_node'],
            out_features = d_args['nb_fc_node'])
        self.fc2_gru = nn.Linear(in_features = d_args['nb_fc_node'],
            out_features = d_args['nb_classes'],
            bias = True)

        self.sig = nn.Sigmoid()

    def forward(self, x, y = 0, is_test=False):
        """Classify a batch of waveforms.

        Args:
            x: Waveforms shaped ``(batch, samples)`` or ``(batch, 1, samples)``.
            y: Not used by this implementation; kept so existing callers that
                pass labels keep working.
            is_test: When true, return the embedding produced by ``fc1_gru``
                instead of class scores.

        Returns:
            ``(batch, nb_fc_node)`` embeddings when ``is_test`` is true,
            otherwise ``(batch, nb_classes)`` class scores.
        """
        #follow sincNet recipe
        nb_samp = x.shape[0]
        # The samples are on the last axis for both supported layouts;
        # x.shape[1] would be 1 for (batch, 1, samples) and break the view below.
        len_seq = x.shape[-1]
        x = self.ln(x)
        x = x.view(nb_samp, 1, len_seq)
        x = F.max_pool1d(torch.abs(self.first_conv(x)), 3)
        x = self.first_bn(x)
        x = self.lrelu_keras(x)

        x = self.block0(x)
        x = self.block1(x)

        x = self.block2(x)
        x = self.block3(x)
        x = self.block4(x)
        x = self.block5(x)

        x = self.bn_before_gru(x)
        x = self.lrelu_keras(x)
        x = x.permute(0, 2, 1)  #(batch, filt, time) >> (batch, time, filt)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        # Keep only the GRU output of the last time step.
        x = x[:,-1,:]
        code = self.fc1_gru(x)
        if is_test: return code

        # Rescale every embedding to an L2 norm of 10 before classification.
        code_norm = code.norm(p=2,dim=1, keepdim=True) / 10.
        code = torch.div(code, code_norm)
        out = self.fc2_gru(code)
        return out

## Training

In [127]:
# Hyper-parameters handed to RawNet2. 'm_blocks' and 'nb_fc_att_node' are not
# read by the RawNet2 class defined in this notebook.
model_dict = {
    'nb_classes': 508,
    'first_conv': 251,
    'in_channels': 1,
    'filts': [128, [128, 128], [128, 256], [256, 256]],
    'm_blocks': [2, 4],
    'nb_fc_att_node': [1],
    'nb_fc_node': 1024,
    'gru_node': 1024,
    'nb_gru_layer': 1,
    'nb_samp': NUMBER_OF_SAMPLES,
}

In [128]:
# Optimizer settings.
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
AMSGRAD = True

# Training schedule.
EPOCHS = 5
BATCH_SIZE = 2

# Number of DataLoader worker processes.
# Higher number may cause errors in notebook
NUMBER_OF_WORKERS = 0

In [129]:
# Batches are drawn in dataset order (no shuffling) and the final, possibly
# smaller, batch is kept.
bangla_asr_data_loader = data.DataLoader(
    bangla_asr_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    drop_last=False,
    num_workers=NUMBER_OF_WORKERS,
)

### Batch explained:

If batch size = 4

One batch = [ tensor([[[x,x,x]], [[x,x,x]], [[x,x,x]], [[x,x,x]]]), (label1, label2, label3, label4) ]

In [130]:
# Build RawNet2 from the hyper-parameter dictionary.
model = RawNet2(model_dict)

# Move the model to the selected device; as the last expression of the cell
# this also displays the module tree.
model.to(device)

RawNet2(
  (ln): LayerNorm()
  (first_conv): SincConv_fast()
  (first_bn): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lrelu): LeakyReLU(negative_slope=0.01)
  (lrelu_keras): LeakyReLU(negative_slope=0.3)
  (block0): Sequential(
    (0): Residual_block_wFRM(
      (lrelu): LeakyReLU(negative_slope=0.01)
      (lrelu_keras): LeakyReLU(negative_slope=0.3)
      (conv1): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
      (mp): MaxPool1d(kernel_size=3, stride=3, padding=0, dilation=1, ceil_mode=False)
      (frm): FRM(
        (fc): Linear(in_features=128, out_features=128, bias=True)
        (sig): Sigmoid()
      )
    )
  )
  (block1): Sequential(
    (0): Residual_block_wFRM(
      (bn1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats

In [131]:
# Smoke test: a single dataset signal of shape (1, NUMBER_OF_SAMPLES) is
# passed as a batch of one; the result has one score per class.
model(bangla_asr_dataset[0][0]).shape

torch.Size([1, 508])

In [132]:
# Two optimizer groups: parameters whose name contains 'bn' are excluded from
# weight decay; all other parameters use the optimizer-wide weight decay.
params = [
    {'params': [p for n, p in model.named_parameters() if 'bn' not in n]},
    {'params': [p for n, p in model.named_parameters() if 'bn' in n],
     'weight_decay': 0},
]

# Losses are looked up by name; only categorical cross-entropy is used.
criterion = {'cce': nn.CrossEntropyLoss()}

optimizer = torch.optim.Adam(
    params,
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
    amsgrad=AMSGRAD,
)

In [None]:
def train_model(model, db_gen, optimizer, epoch, args, device, lr_scheduler, criterion):
    """Train ``model`` for one pass over ``db_gen``.

    Args:
        model: Network to train; called as ``model(batch, labels)``.
        db_gen: Iterable with a length, yielding ``(batch, labels)`` pairs.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, shown in the progress bar only.
        args: Object with ``do_lr_decay`` and ``lr_decay`` attributes.
        device: Device the batch and the labels are moved to.
        lr_scheduler: Stepped after every batch when ``args.do_lr_decay`` is
            true and ``args.lr_decay == 'keras'``.
        criterion: Mapping whose ``'cce'`` entry is the loss function.
    """
    model.train()
    with tqdm(total = len(db_gen), ncols = 70) as pbar:
        for m_batch, m_label in db_gen:
            # NOTE(review): `.to(device)` needs tensor labels; BanglaAsrDataset
            # in this notebook returns speaker ids as strings — confirm labels
            # are integer-encoded before they reach this function.
            m_batch = m_batch.to(device)
            m_label = m_label.to(device)

            output = model(m_batch, m_label)
            cce_loss = criterion['cce'](output, m_label)

            optimizer.zero_grad()
            cce_loss.backward()
            optimizer.step()

            pbar.set_description('epoch: %d, cce:%.3f'%(epoch, cce_loss))
            pbar.update(1)

            if args.do_lr_decay and args.lr_decay == 'keras':
                lr_scheduler.step()

In [134]:
# Epoch loop: train for one epoch, run the time-augmented evaluation, log its
# EER to `f_eer`, and remember the best (lowest) EER seen so far.
# NOTE(review): devset_gen, args, lr_scheduler, time_augmented_evaluate_model,
# TA_evalset_gen, l_eval, save_dir, l_eval_trial, f_eer, model_1gpu and
# best_TA_eval_eer are not defined in the visible part of this notebook —
# confirm they exist before running this cell.
for epoch in tqdm(range(EPOCHS)):
    train_model(model = model,
        db_gen = devset_gen,
        args = args,
        optimizer = optimizer,
        lr_scheduler = lr_scheduler,
        criterion = criterion,
        device = device,
        epoch = epoch)
        
    TA_eval_eer = time_augmented_evaluate_model(mode = 'eval',
        model = model,
        db_gen = TA_evalset_gen, 
        l_utt = l_eval,
        save_dir = save_dir,
        epoch = epoch,
        device = device,
        l_trial = l_eval_trial,
        args = args)
    f_eer.write('epoch:%d, TA_eval_eer:%.4f\n'%(epoch, TA_eval_eer))
        
    # NOTE(review): save_model_dict is assigned but not used in the visible lines.
    save_model_dict = model_1gpu.state_dict() if args.mg else model.state_dict()
    if float(TA_eval_eer) < best_TA_eval_eer:
        print('New best TA_EER: %f'%float(TA_eval_eer))
        best_TA_eval_eer = float(TA_eval_eer)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 31)