In [1]:
!git clone https://github.com/Liu-Tianchi/MFA-TDNN.git 
!pip install speechbrain "typeguard<3"

Cloning into 'MFA-TDNN'...
remote: Enumerating objects: 51, done.[K
remote: Counting objects: 100% (51/51), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 51 (delta 22), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (51/51), 18.93 KiB | 4.73 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting typeguard<3
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9->speechbrain)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9->speechbrain)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 

In [2]:
import sys
sys.path.append("/kaggle/working/MFA-TDNN")

In [3]:
import torch
from ECAPA_tc_0813 import ECAPA_tc_0813, Classifier 
model = ECAPA_tc_0813(input_size=192)
x = torch.rand(2, 200, 80)
print(model(x).squeeze(1).shape)

160 640 4
1 640 512
2 512 512
3 512 512
torch.Size([2, 192])


## ECAPA Architecture

In [4]:
'''
This is the ECAPA-TDNN model.
This model is modified and combined based on the following three projects:
  1. https://github.com/clovaai/voxceleb_trainer/issues/86
  2. https://github.com/lawlict/ECAPA-TDNN/blob/master/ecapa_tdnn.py
  3. https://github.com/speechbrain/speechbrain/blob/96077e9a1afff89d3f5ff47cab4bca0202770e4f/speechbrain/lobes/models/ECAPA_TDNN.py

'''

import math, torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F


class SEModule(nn.Module):
    def __init__(self, channels, bottleneck=128):
        super(SEModule, self).__init__()
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(channels, bottleneck, kernel_size=1, padding=0),
            nn.ReLU(),
            # nn.BatchNorm1d(bottleneck), # I remove this layer
            nn.Conv1d(bottleneck, channels, kernel_size=1, padding=0),
            nn.Sigmoid(),
            )

    def forward(self, input):
        x = self.se(input)
        return input * x

class Bottle2neck(nn.Module):

    def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale = 8):
        super(Bottle2neck, self).__init__()
        width       = int(math.floor(planes / scale))
        self.conv1  = nn.Conv1d(inplanes, width*scale, kernel_size=1)
        self.bn1    = nn.BatchNorm1d(width*scale)
        self.nums   = scale -1
        convs       = []
        bns         = []
        num_pad = math.floor(kernel_size/2)*dilation
        for i in range(self.nums):
            convs.append(nn.Conv1d(width, width, kernel_size=kernel_size, dilation=dilation, padding=num_pad))
            bns.append(nn.BatchNorm1d(width))
        self.convs  = nn.ModuleList(convs)
        self.bns    = nn.ModuleList(bns)
        self.conv3  = nn.Conv1d(width*scale, planes, kernel_size=1)
        self.bn3    = nn.BatchNorm1d(planes)
        self.relu   = nn.ReLU()
        self.width  = width
        self.se     = SEModule(planes)

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.relu(out)
        out = self.bn1(out)

        spx = torch.split(out, self.width, 1)
        for i in range(self.nums):
          if i==0:
            sp = spx[i]
          else:
            sp = sp + spx[i]
          sp = self.convs[i](sp)
          sp = self.relu(sp)
          sp = self.bns[i](sp)
          if i==0:
            out = sp
          else:
            out = torch.cat((out, sp), 1)
        out = torch.cat((out, spx[self.nums]),1)

        out = self.conv3(out)
        out = self.relu(out)
        out = self.bn3(out)
        
        out = self.se(out)
        out += residual
        return out 

class PreEmphasis(torch.nn.Module):

    def __init__(self, coef: float = 0.97):
        super().__init__()
        self.coef = coef
        self.register_buffer(
            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
        )

    def forward(self, input: torch.tensor) -> torch.tensor:
        input = input.unsqueeze(1)
        input = F.pad(input, (1, 0), 'reflect')
        return F.conv1d(input, self.flipped_filter).squeeze(1)

class FbankAug(nn.Module):

    def __init__(self, freq_mask_width = (0, 8), time_mask_width = (0, 10)):
        self.time_mask_width = time_mask_width
        self.freq_mask_width = freq_mask_width
        super().__init__()

    def mask_along_axis(self, x, dim):
        original_size = x.shape
        batch, fea, time = x.shape
        if dim == 1:
            D = fea
            width_range = self.freq_mask_width
        else:
            D = time
            width_range = self.time_mask_width

        mask_len = torch.randint(width_range[0], width_range[1], (batch, 1), device=x.device).unsqueeze(2)
        mask_pos = torch.randint(0, max(1, D - mask_len.max()), (batch, 1), device=x.device).unsqueeze(2)
        arange = torch.arange(D, device=x.device).view(1, 1, -1)
        mask = (mask_pos <= arange) * (arange < (mask_pos + mask_len))
        mask = mask.any(dim=1)

        if dim == 1:
            mask = mask.unsqueeze(2)
        else:
            mask = mask.unsqueeze(1)
            
        x = x.masked_fill_(mask, 0.0)
        return x.view(*original_size)

    def forward(self, x):    
        x = self.mask_along_axis(x, dim=2)
        x = self.mask_along_axis(x, dim=1)
        return x

class ECAPA_TDNN(nn.Module):

    def __init__(self, C=512):

        super(ECAPA_TDNN, self).__init__()

        self.torchfbank = torch.nn.Sequential(
            PreEmphasis(),            
            torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
                                                 f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
            )

        self.specaug = FbankAug() # Spec augmentation

        self.conv1  = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
        self.relu   = nn.ReLU()
        self.bn1    = nn.BatchNorm1d(C)
        self.layer1 = Bottle2neck(C, C, kernel_size=3, dilation=1, scale=8)
        self.extra_conv1 = nn.Conv1d(C, int(C/4), kernel_size=1)
        self.layer2 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
        self.extra_conv2 = nn.Conv1d(C, int(C/4), kernel_size=1)
        self.layer3 = Bottle2neck(C, C, kernel_size=3, dilation=2, scale=8)
        self.extra_conv3 = nn.Conv1d(C, int(C/2), kernel_size=1)
        self.extra_layer1 = Bottle2neck(C, C, kernel_size=3, dilation=3, scale=8)
        self.extra_layer2 = Bottle2neck(C, C, kernel_size=3, dilation=4, scale=8)
        # I fixed the shape of the output from MFA layer, that is close to the setting from ECAPA paper.
        self.layer4 = nn.Conv1d(3*C, 1536, kernel_size=1)
        self.attention = nn.Sequential(
            nn.Conv1d(4608, 256, kernel_size=1),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Tanh(), # I add this layer
            nn.Conv1d(256, 1536, kernel_size=1),
            nn.Softmax(dim=2),
            )
        self.bn5 = nn.BatchNorm1d(3072)
        self.fc6 = nn.Linear(3072, 192)
        self.bn6 = nn.BatchNorm1d(192)


    def forward(self, x, aug):
        with torch.no_grad():
            x = self.torchfbank(x)+1e-6
            x = x.log()   
            x = x - torch.mean(x, dim=-1, keepdim=True)
            if aug == True:
                x = self.specaug(x)

        x = self.conv1(x)
        x = self.relu(x)
        x = self.bn1(x)

        x1 = self.layer1(x)
        x2 = self.layer2(x+x1)
        x3 = self.layer3(x+x1+x2)
        x4 = self.extra_layer1(x+x1+x2+x3)
        x5 = self.extra_layer2(x+x1+x2+x3+x4)

        x = self.layer4(torch.cat((self.extra_conv1(x1),self.extra_conv2(x2),self.extra_conv3(x3),x4,x5),dim=1))
        x = self.relu(x)

        t = x.size()[-1]

        global_x = torch.cat((x,torch.mean(x,dim=2,keepdim=True).repeat(1,1,t), torch.sqrt(torch.var(x,dim=2,keepdim=True).clamp(min=1e-4)).repeat(1,1,t)), dim=1)
        
        w = self.attention(global_x)

        mu = torch.sum(x * w, dim=2)
        sg = torch.sqrt( ( torch.sum((x**2) * w, dim=2) - mu**2 ).clamp(min=1e-4) )

        x = torch.cat((mu,sg),1)
        x = self.bn5(x)
        x = self.fc6(x)
        x = self.bn6(x)

        return x

## Tools

In [5]:
'''
Some utilized functions
These functions are all copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/tuneThreshold.py
'''

import os, numpy, torch
from sklearn import metrics
from operator import itemgetter
import torch.nn.functional as F

def init_args(args):
	args.score_save_path    = os.path.join(args.save_path, 'score.txt')
	args.model_save_path    = os.path.join(args.save_path, 'model')
	os.makedirs(args.model_save_path, exist_ok = True)
	return args

def tuneThresholdfromScore(scores, labels, target_fa, target_fr = None):
	
	fpr, tpr, thresholds = metrics.roc_curve(labels, scores, pos_label=1)
	fnr = 1 - tpr
	tunedThreshold = [];
	if target_fr:
		for tfr in target_fr:
			idx = numpy.nanargmin(numpy.absolute((tfr - fnr)))
			tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
	for tfa in target_fa:
		idx = numpy.nanargmin(numpy.absolute((tfa - fpr))) # numpy.where(fpr<=tfa)[0][-1]
		tunedThreshold.append([thresholds[idx], fpr[idx], fnr[idx]])
	idxE = numpy.nanargmin(numpy.absolute((fnr - fpr)))
	eer  = max(fpr[idxE],fnr[idxE])*100
	
	return tunedThreshold, eer, fpr, fnr

# Creates a list of false-negative rates, a list of false-positive rates
# and a list of decision thresholds that give those error-rates.
def ComputeErrorRates(scores, labels):

      # Sort the scores from smallest to largest, and also get the corresponding
      # indexes of the sorted scores.  We will treat the sorted scores as the
      # thresholds at which the the error-rates are evaluated.
      sorted_indexes, thresholds = zip(*sorted(
          [(index, threshold) for index, threshold in enumerate(scores)],
          key=itemgetter(1)))
      sorted_labels = []
      labels = [labels[i] for i in sorted_indexes]
      fnrs = []
      fprs = []

      # At the end of this loop, fnrs[i] is the number of errors made by
      # incorrectly rejecting scores less than thresholds[i]. And, fprs[i]
      # is the total number of times that we have correctly accepted scores
      # greater than thresholds[i].
      for i in range(0, len(labels)):
          if i == 0:
              fnrs.append(labels[i])
              fprs.append(1 - labels[i])
          else:
              fnrs.append(fnrs[i-1] + labels[i])
              fprs.append(fprs[i-1] + 1 - labels[i])
      fnrs_norm = sum(labels)
      fprs_norm = len(labels) - fnrs_norm

      # Now divide by the total number of false negative errors to
      # obtain the false positive rates across all thresholds
      fnrs = [x / float(fnrs_norm) for x in fnrs]

      # Divide by the total number of corret positives to get the
      # true positive rate.  Subtract these quantities from 1 to
      # get the false positive rates.
      fprs = [1 - x / float(fprs_norm) for x in fprs]
      return fnrs, fprs, thresholds

# Computes the minimum of the detection cost function.  The comments refer to
# equations in Section 3 of the NIST 2016 Speaker Recognition Evaluation Plan.
def ComputeMinDcf(fnrs, fprs, thresholds, p_target, c_miss, c_fa):
    min_c_det = float("inf")
    min_c_det_threshold = thresholds[0]
    for i in range(0, len(fnrs)):
        # See Equation (2).  it is a weighted sum of false negative
        # and false positive errors.
        c_det = c_miss * fnrs[i] * p_target + c_fa * fprs[i] * (1 - p_target)
        if c_det < min_c_det:
            min_c_det = c_det
            min_c_det_threshold = thresholds[i]
    # See Equations (3) and (4).  Now we normalize the cost.
    c_def = min(c_miss * p_target, c_fa * (1 - p_target))
    min_dcf = min_c_det / c_def
    return min_dcf, min_c_det_threshold

def accuracy(output, target, topk=(1,)):

	maxk = max(topk)
	batch_size = target.size(0)
	_, pred = output.topk(maxk, 1, True, True)
	pred = pred.t()
	correct = pred.eq(target.view(1, -1).expand_as(pred))
	res = []
	for k in topk:
		correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
		res.append(correct_k.mul_(100.0 / batch_size))
	
	return res

## AAM Softmax Loss

In [6]:
'''
AAMsoftmax loss function copied from voxceleb_trainer: https://github.com/clovaai/voxceleb_trainer/blob/master/loss/aamsoftmax.py
'''

import torch, math
import torch.nn as nn
import torch.nn.functional as F

class AAMsoftmax(nn.Module):
    def __init__(self, n_class, m, s):
        
        super(AAMsoftmax, self).__init__()
        self.m = m
        self.s = s
        self.weight = torch.nn.Parameter(torch.FloatTensor(n_class, 192), requires_grad=True)
        self.ce = nn.CrossEntropyLoss()
        nn.init.xavier_normal_(self.weight, gain=1)
        self.cos_m = math.cos(self.m)
        self.sin_m = math.sin(self.m)
        self.th = math.cos(math.pi - self.m)
        self.mm = math.sin(math.pi - self.m) * self.m

    def forward(self, x, label=None):
        
        cosine = F.linear(F.normalize(x), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.mul(cosine, cosine)).clamp(0, 1))
        phi = cosine * self.cos_m - sine * self.sin_m
        phi = torch.where((cosine - self.th) > 0, phi, cosine - self.mm)
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1), 1)
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.s
        
        loss = self.ce(output, label)
        prec1 = accuracy(output.detach(), label.detach(), topk=(1,))[0]

        return loss, prec1

## ECAPA Model

In [19]:
'''
This part is used to train the speaker model and evaluate the performances
'''

import torch, sys, os, tqdm, numpy, soundfile, time, pickle, random
import torch.nn as nn

class ECAPAModel(nn.Module):
    def __init__(self, lr, lr_decay, C, n_class, m, s, test_step, device='cuda', **kwargs):
        super(ECAPAModel, self).__init__()
        
        # Store device
        self.device = torch.device(device)

        self.torchfbank = torch.nn.Sequential(
            PreEmphasis(),            
            torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, 
                                                 f_min=20, f_max=7600, window_fn=torch.hamming_window, n_mels=80),
        )

        self.specaug = FbankAug()  # Spec augmentation

        # ECAPA-TDNN - Don't force .cuda() here, let it be moved explicitly
        self.speaker_encoder = ECAPA_tc_0813(input_size=192)
        # Classifier
        self.speaker_loss = AAMsoftmax(n_class=n_class, m=m, s=s)
        
        # Move model to device after creation
        self.to(self.device)
        
        self.optim = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=2e-5)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optim, step_size=test_step, gamma=lr_decay)
        
        print(time.strftime("%m-%d %H:%M:%S") + " Model para number = %.2f" %
              (sum(param.numel() for param in self.speaker_encoder.parameters()) / 1024 / 1024))

    def transform(self, x, aug):
        with torch.no_grad():
            x = self.torchfbank(x) + 1e-6
            x = x.log()
            x = x - torch.mean(x, dim=-1, keepdim=True)
            if aug == True:
                x = self.specaug(x)
        return x.transpose(1, 2)

    def train_network(self, epoch, loader):
        self.train()
        
        # Update the learning rate based on the current epoch
        self.scheduler.step(epoch - 1)
        
        index, top1, loss = 0, 0, 0
        lr = self.optim.param_groups[0]['lr']
        
        for num, (data, labels) in enumerate(loader, start=1):
            # Move data to device efficiently
            if isinstance(data, torch.Tensor):
                data = data.to(self.device, non_blocking=True)
            else:
                data = torch.FloatTensor(data).to(self.device, non_blocking=True)
            
            if isinstance(labels, torch.Tensor):
                labels = labels.to(self.device, non_blocking=True)
            else:
                labels = torch.LongTensor(labels).to(self.device, non_blocking=True)
            
            # Forward pass
            self.zero_grad()
            speaker_embedding = self.speaker_encoder.forward(self.transform(data, aug=True)).squeeze(1)
            nloss, prec = self.speaker_loss.forward(speaker_embedding, labels)
            
            # Backward pass
            nloss.backward()
            self.optim.step()
            
            # Statistics
            batch_size = len(labels)
            index += batch_size
            top1 += prec
            loss += nloss.detach().cpu().item()
            
            # Progress reporting
            sys.stderr.write(time.strftime("%m-%d %H:%M:%S") +
                             " [%2d] Lr: %5f, Training: %.2f%%, " % (epoch, lr, 100 * (num / len(loader))) +
                             " Loss: %.5f, ACC: %2.2f%% \r" % (loss/num, top1/index*100))
            sys.stderr.flush()
        
        sys.stdout.write("\n")
        return loss/num, lr, top1/index*100

    def eval_network(self, eval_list, eval_path):
        self.eval()
        files = []
        embeddings = {}
        lines = open(eval_list).read().splitlines()
        new_lines = []
        for line in lines:
            line = line.strip('"')
            if os.path.exists(os.path.join(eval_path, line.split()[1])) and os.path.exists(os.path.join(eval_path, line.split()[2])):
                files.append(line.split()[1])
                files.append(line.split()[2])
                new_lines.append(line)
        lines = new_lines
        setfiles = list(set(files))
        setfiles.sort()

        for idx, file in tqdm.tqdm(enumerate(setfiles), total=len(setfiles)):
            audio, _ = soundfile.read(os.path.join(eval_path, file))
            data_1 = torch.FloatTensor(numpy.stack([audio], axis=0)).cuda()

            max_audio = 300 * 160 + 240
            if audio.shape[0] <= max_audio:
                shortage = max_audio - audio.shape[0]
                audio = numpy.pad(audio, (0, shortage), 'wrap')
            feats = []
            startframe = numpy.linspace(0, audio.shape[0]-max_audio, num=5)
            for asf in startframe:
                feats.append(audio[int(asf):int(asf)+max_audio])
            feats = numpy.stack(feats, axis=0).astype(float)
            data_2 = torch.FloatTensor(feats).cuda()
            with torch.no_grad():
                embedding_1 = self.speaker_encoder.forward(self.transform(data_1, aug=False)).squeeze(1)
                embedding_1 = F.normalize(embedding_1, p=2, dim=1)
                embedding_2 = self.speaker_encoder.forward(self.transform(data_2, aug=False)).squeeze(1)
                embedding_2 = F.normalize(embedding_2, p=2, dim=1)
            embeddings[file] = [embedding_1, embedding_2]

        scores, labels = [], []
        for line in lines:
            embedding_11, embedding_12 = embeddings[line.split()[1]]
            embedding_21, embedding_22 = embeddings[line.split()[2]]
            score_1 = torch.mean(torch.matmul(embedding_11, embedding_21.T))
            score_2 = torch.mean(torch.matmul(embedding_12, embedding_22.T))
            score = (score_1 + score_2) / 2
            score = score.detach().cpu().numpy()
            scores.append(score)
            labels.append(int(line.split()[0]))

        EER = tuneThresholdfromScore(scores, labels, [1, 0.1])[1]
        fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)
        minDCF, _ = ComputeMinDcf(fnrs, fprs, thresholds, 0.05, 1, 1)
        
        return EER, minDCF

    def test_network(self, test_list, test_path):
        self.eval()
        files = []
        scores = []
        embeddings = {}
        lines = open(test_list).read().splitlines()
        for line in lines:
            files.append(line.split()[0])
            files.append(line.split()[1])
        setfiles = list(set(files))
        setfiles.sort()

        for idx, file in tqdm.tqdm(enumerate(setfiles), total=len(setfiles)):
            audio, _ = soundfile.read(os.path.join(test_path, file))
            data_1 = torch.FloatTensor(numpy.stack([audio], axis=0)).cuda()

            max_audio = 300 * 160 + 240
            if audio.shape[0] <= max_audio:
                shortage = max_audio - audio.shape[0]
                audio = numpy.pad(audio, (0, shortage), 'wrap')
            feats = []
            startframe = numpy.linspace(0, audio.shape[0]-max_audio, num=5)
            for asf in startframe:
                feats.append(audio[int(asf):int(asf)+max_audio])
            feats = numpy.stack(feats, axis=0).astype(float)
            data_2 = torch.FloatTensor(feats).cuda()
            with torch.no_grad():
                embedding_1 = self.speaker_encoder.forward(self.transform(data_1, aug=False)).squeeze(1)
                embedding_1 = F.normalize(embedding_1, p=2, dim=1)
                embedding_2 = self.speaker_encoder.forward(self.transform(data_2, aug=False)).squeeze(1)
                embedding_2 = F.normalize(embedding_2, p=2, dim=1)
            embeddings[file] = [embedding_1, embedding_2]

        with open("predictions.txt", "w") as f:
            for line in lines:
                embedding_11, embedding_12 = embeddings[line.split()[0]]
                embedding_21, embedding_22 = embeddings[line.split()[1]]
                score_1 = torch.mean(torch.matmul(embedding_11, embedding_21.T))
                score_2 = torch.mean(torch.matmul(embedding_12, embedding_22.T))
                score = (score_1 + score_2) / 2
                score = score.detach().cpu().numpy()
                scores.append(score)
                f.write(f"{score:.4f}\n")
        return scores

    def forward(self, file_path):
        audio, _ = soundfile.read(file_path)
        data = torch.FloatTensor(numpy.stack([audio], axis=0)).cuda()
        embedding = self.speaker_encoder.forward(self.transform(data, aug=False)).squeeze(1)
        embedding = F.normalize(embedding, p=2, dim=1)
        return embedding

    def save_parameters(self, path):
        torch.save(self.state_dict(), path)

    def load_parameters(self, path):
        self_state = self.state_dict()
        loaded_state = torch.load(path)
        for name, param in loaded_state.items():
            origname = name
            if name not in self_state:
                name = name.replace("module.", "")
                if name not in self_state:
                    print("%s is not in the model." % origname)
                    continue
            if self_state[name].size() != loaded_state[origname].size():
                print("Wrong parameter length: %s, model: %s, loaded: %s" % (
                    origname, self_state[name].size(), loaded_state[origname].size()))
                continue
            self_state[name].copy_(param)


## Datasets & Data Collator

### MFCC

In [8]:
import librosa
import torch
import torch.nn as nn
import torch.nn.functional as F


class Mel_Spectrogram(nn.Module):
    def __init__(self, sample_rate=16000, n_fft=512, win_length=400, hop=160, n_mels=80, coef=0.97, requires_grad=False):
        super(Mel_Spectrogram, self).__init__()
        self.n_fft = n_fft
        self.n_mels = n_mels
        self.win_length = win_length
        self.hop = hop

        self.pre_emphasis = PreEmphasis(coef)
        mel_basis = librosa.filters.mel(
            sr=sample_rate, n_fft=n_fft, n_mels=n_mels)
        self.mel_basis = nn.Parameter(
            torch.FloatTensor(mel_basis), requires_grad=requires_grad)
        self.instance_norm = nn.InstanceNorm1d(num_features=n_mels)
        window = torch.hamming_window(self.win_length)
        self.window = nn.Parameter(
            torch.FloatTensor(window), requires_grad=False)

    def forward(self, x):
        x = self.pre_emphasis(x)
        x = torch.stft(x, n_fft=self.n_fft, hop_length=self.hop,
                       window=self.window, win_length=self.win_length, return_complex=True)
        x = torch.abs(x)
        x += 1e-9
        x = torch.log(x)
        x = torch.matmul(self.mel_basis, x)
        x = self.instance_norm(x)
        x = x.unsqueeze(1)
        return x

In [9]:
import pandas as pd
import random


def generate_speaker_pairs(csv_file, output_file = "eval_pairs.txt", num_pairs=10000):
    # Load the dataset
    data = pd.read_csv(csv_file)
    
    # Group audio files by speaker ID
    speakers = data.groupby('speaker')['audio_name'].apply(list).to_dict()
    all_speakers = list(speakers.keys())
    pairs = []
    
    # Generate positive pairs (same speaker)
    while len(pairs) < num_pairs // 2:
        speaker = random.choice(all_speakers)
        if len(speakers[speaker]) < 2:
            continue
        file1, file2 = random.sample(speakers[speaker], 2)
        pairs.append([1, file1, file2])
    
    # Generate negative pairs (different speakers)
    while len(pairs) < num_pairs:
        sp1, sp2 = random.sample(all_speakers, 2)
        file1 = random.choice(speakers[sp1])
        file2 = random.choice(speakers[sp2])
        pairs.append([0, file1, file2])
    
    with open(output_file, "w") as fout:
        for pair in pairs:
            fout.write(f"{pair[0]} {pair[1]} {pair[2]}\n")
test_path = "/kaggle/input/vispeech/metadata/noisy_testset.csv"
generate_speaker_pairs(test_path)


In [21]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd

from scipy.io import wavfile
from scipy import signal
import soundfile

def compute_dB(waveform):
    """
    Args:
        x (numpy.array): Input waveform (#length).
    Returns:
        numpy.array: Output array (#length).
    """
    val = max(0.0, np.mean(np.power(waveform, 2)))
    dB = 10*np.log10(val+1e-4)
    return dB

class WavAugment(object):
    def __init__(self, noise_csv_path="/kaggle/input/csv-files/musan_filelist.csv", rir_csv_path="/kaggle/input/csv-files/rir_data.csv"):
        noise_csv = pd.read_csv("/kaggle/input/csv-files/musan_filelist.csv")
        self.noise_paths = noise_csv["utt_paths"].values
        self.noise_names = noise_csv["speaker_name"].values
        self.rir_paths = pd.read_csv(rir_csv_path)["utt_paths"].values

    def augment(self, waveform):
        if np.random.rand() < 0.4:
            idx = np.random.randint(0, 8)
            if idx == 0:
                waveform = self.add_gaussian_noise(waveform)
                waveform = self.add_real_noise(waveform)
    
            if idx == 1 or idx == 2 or idx == 3:
                waveform = self.add_real_noise(waveform)
    
            if idx == 4 or idx == 5 or idx == 6:
                waveform = self.reverberate(waveform)
    
            if idx == 7:
                waveform = self.change_volum(waveform)
                waveform = self.reverberate(waveform)
    
            if idx == 6:
                waveform = self.change_volum(waveform)
                waveform = self.add_real_noise(waveform)
    
            if idx == 8:
                waveform = self.add_gaussian_noise(waveform)
                waveform = self.reverberate(waveform)

        return waveform

    def add_gaussian_noise(self, waveform):
        """
        Args:
            x (numpy.array): Input waveform array (#length).
        Returns:
            numpy.array: Output waveform array (#length).
        """
        snr = np.random.uniform(low=10, high=25)
        clean_dB = compute_dB(waveform)
        noise = np.random.randn(len(waveform))
        noise_dB = compute_dB(noise)
        noise = np.sqrt(10 ** ((clean_dB - noise_dB - snr) / 10)) * noise
        waveform = (waveform + noise)
        return waveform

    def change_volum(self, waveform):
        """
        Args:
            x (numpy.array): Input waveform array (#length).
        Returns:
            numpy.array: Output waveform array (#length).
        """
        volum = np.random.uniform(low=0.8, high=1.0005)
        waveform = waveform * volum
        return waveform

    def add_real_noise(self, waveform):
        """
        Args:
            x (numpy.array): Input length (#length).
        Returns:
            numpy.array: Output waveform array (#length).
        """
        clean_dB = compute_dB(waveform)

        idx = np.random.randint(0, len(self.noise_paths))
        sample_rate, noise = wavfile.read(self.noise_paths[idx])
        noise = noise.astype(np.float64)

        snr = np.random.uniform(15, 25)

        noise_length = len(noise)
        audio_length = len(waveform)

        if audio_length >= noise_length:
            shortage = audio_length - noise_length
            noise = np.pad(noise, (0, shortage), 'wrap')
        else:
            start = np.random.randint(0, (noise_length-audio_length))
            noise = noise[start:start+audio_length]

        noise_dB = compute_dB(noise)
        noise = np.sqrt(10 ** ((clean_dB - noise_dB - snr) / 10)) * noise
        waveform = (waveform + noise)
        return waveform

    def reverberate(self, waveform):
        """
        Args:
            x (numpy.array): Input length (#length).
        Returns:
            numpy.array: Output waveform array (#length).
        """
        audio_length = len(waveform)
        idx = np.random.randint(0, len(self.rir_paths))

        path = self.rir_paths[idx]
        rir, sample_rate = soundfile.read(path)
        rir = rir/np.sqrt(np.sum(rir**2))

        waveform = signal.convolve(waveform, rir, mode='full')
        return waveform[:audio_length]
import collections
import os
import random

import numpy as np
import pandas as pd
import torch
import librosa
from scipy import signal
from scipy.io import wavfile
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset

def load_audio(filename, num_frames):
    sample_rate, waveform = wavfile.read(filename)
    audio_length = waveform.shape[0]
    second = int(num_frames/100)

    if second <= 0:
        return waveform.astype(np.float64).copy()

    length = np.int64(sample_rate * second)

    if audio_length <= length:
        shortage = length - audio_length
        waveform = np.pad(waveform, (0, shortage), 'wrap')
        waveform = waveform.astype(np.float64)
    else:
        start = np.int64(random.random()*(audio_length-length))
        waveform =  waveform[start:start+length].astype(np.float64)
    return waveform.copy()

class Train_Dataset(Dataset):
    def __init__(self, train_csv_path, num_frames, pairs=False, **kwargs):
        self.pairs = pairs
        df = pd.read_csv(train_csv_path)
        self.labels = df["utt_spk_int_labels"].values
        self.paths = df["utt_paths"].values
        self.labels, self.paths = shuffle(self.labels, self.paths)
        self.num_frames = num_frames
        self.wav_aug = WavAugment()

        print("Train Dataset load {} speakers".format(len(set(self.labels))))
        print("Train Dataset load {} utterance".format(len(self.labels)))

    def __getitem__(self, index):
        waveform_1 = load_audio(self.paths[index], self.num_frames)

        waveform_1 = self.wav_aug.augment(waveform_1)
        if self.pairs == False:
            return torch.FloatTensor(waveform_1), self.labels[index]

        else:
            waveform_2 = load_audio(self.paths[index], self.num_frames)
            waveform_2 = self.wav_aug.augment(waveform_2)
            return torch.FloatTensor(waveform_1), torch.FloatTensor(waveform_2), self.labels[index]

    def __len__(self):
        return len(self.paths)
class Evaluation_Dataset(Dataset):
    def __init__(self, paths, second=-1, **kwargs):
        self.paths = paths
        self.second = second
        print("load {} utterance".format(len(self.paths)))

    def __getitem__(self, index):
        waveform = load_audio(self.paths[index], self.second)
        return torch.FloatTensor(waveform), self.paths[index]

    def __len__(self):
        return len(self.paths)

## Training Object

In [17]:
'''
This is the main code of the ECAPATDNN project, to define the parameters and build the construction
'''

import argparse, glob, os, torch, warnings, time

parser = argparse.ArgumentParser(description = "ECAPA_trainer")
## Training Settings
parser.add_argument('--num_frames', type=int,   default=400,     help='Duration of the input segments, eg: 200 for 2 second')
parser.add_argument('--max_epoch',  type=int,   default=40,      help='Maximum number of epochs')
parser.add_argument('--batch_size', type=int,   default=64,     help='Batch size')
parser.add_argument('--n_cpu',      type=int,   default=4,       help='Number of loader threads')
parser.add_argument('--test_step',  type=int,   default=1,       help='Test and save every [test_step] epochs')
parser.add_argument('--lr',         type=float, default=0.001,   help='Learning rate')
parser.add_argument("--second", type=int, default=4)
parser.add_argument("--lr_decay",   type=float, default=0.9,    help='Learning rate decay every [test_step] epochs')

## Training and evaluation path/lists, save path
parser.add_argument('--train_list', type=str,   default="/kaggle/input/vietnam-celeb-dataset/vietnam-celeb-t.txt",     help='The path of the training list, https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/train_list.txt')
parser.add_argument('--train_path', type=str,   default="/kaggle/input/vietnam-celeb-dataset/full-dataset/data",                    help='The path of the training data, eg:"/data08/VoxCeleb2/train/wav" in my case')
parser.add_argument('--train_csv_path', type=str, default="/kaggle/input/csv-files/processed_vietnam_celeb_dataset.csv")
parser.add_argument('--eval_list',  type=str,   default="/kaggle/input/vietnam-celeb-dataset/full-dataset/vietnam-celeb-h.txt",              help='The path of the evaluation list, veri_test2.txt comes from https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt')
parser.add_argument('--eval_path',  type=str,   default="/kaggle/input/vietnam-celeb-dataset/full-dataset/data",                    help='The path of the evaluation data, eg:"/data08/VoxCeleb1/test/wav" in my case')
parser.add_argument("--trial_path", type=str, default="/kaggle/working/vi_speech.csv")
parser.add_argument('--musan_path', type=str,   default="/kaggle/input/musan-dataset",                    help='The path to the MUSAN set, eg:"/data08/Others/musan_split" in my case')
parser.add_argument('--rir_path',   type=str,   default="/kaggle/input/rir-dataset/sim_rir_16k/simulated_rirs_16k",     help='The path to the RIR set, eg:"/data08/Others/RIRS_NOISES/simulated_rirs" in my case');
parser.add_argument('--save_path',  type=str,   default="exps/exp1",                                     help='Path to save the score.txt and models')
parser.add_argument('--initial_model',  type=str,   default="",                                          help='Path of the initial_model')
parser.add_argument('--aug', action='store_true')

## Model and Loss settings
parser.add_argument('--C',       type=int,   default=512,   help='Channel size for the speaker encoder')
parser.add_argument('--m',       type=float, default=0.2,    help='Loss margin in AAM softmax')
parser.add_argument('--s',       type=float, default=30,     help='Loss scale in AAM softmax')
parser.add_argument('--n_class', type=int,   default=880,   help='Number of speakers')

## Command
parser.add_argument('--eval',    dest='eval', action='store_true', help='Only do evaluation')

## Initialization
warnings.simplefilter("ignore")
torch.multiprocessing.set_sharing_strategy('file_system')
args = parser.parse_args([])
args = init_args(args)

In [None]:
## Define the data loader
train_dataset = Train_Dataset(**vars(args))
trainLoader = torch.utils.data.DataLoader(train_dataset, batch_size = args.batch_size, shuffle = True, num_workers = args.n_cpu, drop_last = True, pin_memory = True)

## Search for the exist models
modelfiles = glob.glob('%s/model_0*.model'%args.model_save_path)
modelfiles.sort()

## Only do evaluation, the initial_model is necessary
if args.eval == True:
	s = ECAPAModel(**vars(args))
	print("Model %s loaded from previous state!"%args.initial_model)
	s.load_parameters(args.initial_model)
	EER, minDCF = s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)
	print("EER %2.2f%%, minDCF %.4f%%"%(EER, minDCF))
	quit()

## If initial_model is exist, system will train from the initial_model
if args.initial_model != "":
	print("Model %s loaded from previous state!"%args.initial_model)
	s = ECAPAModel(**vars(args))
	s.load_parameters(args.initial_model)
	epoch = 1

## Otherwise, system will try to start from the saved model&epoch
elif len(modelfiles) >= 1:
	print("Model %s loaded from previous state!"%modelfiles[-1])
	epoch = int(os.path.splitext(os.path.basename(modelfiles[-1]))[0][6:]) + 1
	s = ECAPAModel(**vars(args))
	s.load_parameters(modelfiles[-1])
## Otherwise, system will train from scratch
else:
	epoch = 1
	s = ECAPAModel(**vars(args))

EERs = []
MinDCF = []
score_file = open(args.score_save_path, "a+")

print(f'Large Margin Finetuning with:\nLength = {args.num_frames / 100}\nLearning Rate = {args.lr}\nMargin = {args.m}\nEpochs = {args.max_epoch}')
while(1):
	## Training for one epoch
	loss, lr, acc = s.train_network(epoch = epoch, loader = trainLoader)
    
	## Evaluation every [test_step] epochs
	if epoch % args.test_step == 0:
		s.save_parameters(args.model_save_path + "/model_%04d.model"%epoch)
		eer, mindcf = s.eval_network(eval_list = args.eval_list, eval_path = args.eval_path)
		EERs.append(eer)
		MinDCF.append(mindcf)
		print(time.strftime("%Y-%m-%d %H:%M:%S"), "%d epoch, MinDCF %2.2f%%, EER %2.2f%%, bestEER %2.2f%%"%(epoch, mindcf, EERs[-1], min(EERs)))
		score_file.write("%d epoch, LR %f, LOSS %f, ACC %2.2f%%, MinDCF %2.2f%%, EER %2.2f%%, bestEER %2.2f%%\n"%(epoch, lr, loss, acc, mindcf, EERs[-1], min(EERs)))
		score_file.flush()

	if (epoch % 1 == 0) or args.max_epoch == epoch:    
		vispeech_eer, _ = s.eval_network(eval_list = "/kaggle/working/eval_pairs.txt", eval_path = "/kaggle/input/vispeech/noisy_testset")
		print(f"EER on ViSpeech : {vispeech_eer:.2f}%")      
	if (epoch % 3 == 0) or args.max_epoch == epoch:    
		vox_eer, _ = s.eval_network(eval_list = "/kaggle/input/voxvietnam/test_list_gt.csv", eval_path = "/kaggle/input/voxvietnam/wav")
		print(f"EER on Vox : {vox_eer:.2f}%")      
	if epoch >= args.max_epoch:
		break
	epoch += 1

Train Dataset load 880 speakers
Train Dataset load 59417 utterance
160 640 4
1 640 512
2 512 512
3 512 512
05-31 03:15:00 Model para number = 6.98
Large Margin Finetuning with:
Length = 4.0
Learning Rate = 0.001
Margin = 0.2
Epochs = 40


05-31 03:19:26 [ 1] Lr: 0.001000, Training: 68.21%,  Loss: 9.74188, ACC: 5.84%  

In [10]:
model_0 = ECAPAModel(**vars(args))
model_0.load_parameters("/kaggle/working/exps/exp1/model/model_0020.model")
vox_eer, _ = model_0.eval_network("/kaggle/input/voxvietnam/test_list_gt.csv", "/kaggle/input/voxvietnam/wav")
print(vox_eer)

05-25 10:37:42 Model para number = 6.65


100%|██████████| 17786/17786 [05:08<00:00, 57.60it/s]


5.168478990468519


In [14]:
print(train_dataset[1])

(tensor([ 0.3769,  0.3261,  0.2673,  ..., -0.4006, -0.4245, -0.4364]), tensor([-0.0462, -0.0335, -0.0223,  ..., -0.2159, -0.2824, -0.3720]), 261)


In [None]:
with open("/kaggle/input/voxvietnam/test_list_gt.csv", "r") as f:
    path = f.readlines()[0].strip('"')[:-2].split()[2]
    file_path = os.path.join("/kaggle/input/voxvietnam/wav", path)
    print(os.path.exists(file_path))

## AS Norm

### Utils

In [33]:
def read_scp(scp_file):
    """read scp file (also support PIPE format)

    Args:
        scp_file (str): path to the scp file

    Returns:
        list: key_value_list
    """
    key_value_list = []
    with open(scp_file, "r", encoding='utf8') as fin:
        for line in fin:
            tokens = line.strip().split()
            key = tokens[0]
            value = " ".join(tokens[1:])
            key_value_list.append((key, value))
    return key_value_list


def read_lists(list_file):
    """read list file with only 1 column

    Args:
        list_file (str): path to the list file

    Returns:
        list: lists
    """
    lists = []
    with open(list_file, 'r', encoding='utf8') as fin:
        for line in fin:
            lists.append(line.strip())
    return lists


def read_table(table_file):
    """read table file with any columns

    Args:
        table_file (str): path to the table file

    Returns:
        list: table_list
    """
    table_list = []
    with open(table_file, 'r', encoding='utf8') as fin:
        for line in fin:
            tokens = line.strip().split()
            table_list.append(tokens)
    return table_list

### Score Norm

In [34]:
pip install fire

Collecting fire
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=b5a79a11df056b20eda141374a20d0372405724533b696c6c6a57a46ac1fce07
  Stored in directory: /root/.cache/pip/wheels/46/54/24/1624fd5b8674eb1188623f7e8e17cdf7c0f6c24b609dfb8a89
Successfully built fire
Installing collected packages: fire
Successfully installed fire-0.7.0
Note: you may need to restart the kernel to use updated packages.


In [77]:
import logging
import os

import fire
import kaldiio
import numpy as np
from tqdm import tqdm
from scipy.special import expit



def get_mean_std(emb, cohort, top_n):
    emb, cohort = emb.squeeze(), cohort.squeeze()
    emb = emb / np.sqrt(np.sum(emb**2, axis=1, keepdims=True))
    cohort = cohort / np.sqrt(np.sum(cohort**2, axis=1, keepdims=True))
    emb_cohort_score = np.matmul(emb, cohort.T)
    emb_cohort_score = np.sort(emb_cohort_score, axis=1)[:, ::-1]
    emb_cohort_score_topn = emb_cohort_score[:, :top_n]

    emb_mean = np.mean(emb_cohort_score_topn, axis=1)
    emb_std = np.std(emb_cohort_score_topn, axis=1)

    return emb_mean, emb_std


def split_embedding(utt_list, emb_scp, mean_vec):
    embs = []
    utt2idx = {}
    utt2emb = {}
    for utt, emb in kaldiio.load_scp_sequential(emb_scp):
        emb = emb.squeeze()
        emb = emb - mean_vec
        utt2emb[utt] = emb

    for utt in utt_list:
        embs.append(utt2emb[utt])
        utt2idx[utt] = len(embs) - 1

    return np.array(embs), utt2idx


def as_norm(score_norm_method,
         top_n,
         trial_score_file,
         score_norm_file,
         cohort_emb_scp,
         eval_emb_scp,
         mean_vec_path=None):
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    # get embedding
    if not mean_vec_path:
        print("Do not do mean normalization for evaluation embeddings.")
        mean_vec = 0.0
    else:
        assert os.path.exists(
            mean_vec_path), "mean_vec file ({}) does not exist !!!".format(
                mean_vec_path)
        mean_vec = np.load(mean_vec_path)

    # get embedding
    logging.info('get embedding ...')

    enroll_list, test_list, _ = zip(*read_table(trial_score_file))
    enroll_list = sorted(list(set(enroll_list)))  # remove overlap and sort
    test_list = sorted(list(set(test_list)))
    enroll_emb, enroll_utt2idx = split_embedding(enroll_list, eval_emb_scp,
                                                 mean_vec)
    test_emb, test_utt2idx = split_embedding(test_list, eval_emb_scp, mean_vec)

    cohort_list, _ = zip(*read_table(cohort_emb_scp))
    cohort_emb, _ = split_embedding(cohort_list, cohort_emb_scp, mean_vec)

    logging.info("computing normed score ...")
    if score_norm_method == "asnorm":
        top_n = top_n
    elif score_norm_method == "snorm":
        top_n = cohort_emb.shape[0]
    else:
        raise ValueError(score_norm_method)
    enroll_mean, enroll_std = get_mean_std(enroll_emb, cohort_emb, top_n)
    test_mean, test_std = get_mean_std(test_emb, cohort_emb, top_n)

    # score norm
    with open(trial_score_file, 'r', encoding='utf-8') as fin:
        with open(score_norm_file, 'w', encoding='utf-8') as fout:
            lines = fin.readlines()
            for line in tqdm(lines):
                line = line.strip().split()
                enroll_idx = enroll_utt2idx[line[0]]
                test_idx = test_utt2idx[line[1]]
                score = float(line[2])
                normed_score = 0.5 * (
                    (score - enroll_mean[enroll_idx]) / enroll_std[enroll_idx]
                    + (score - test_mean[test_idx]) / test_std[test_idx])
                normed_score = expit(normed_score)
                # compute mag mean for score calibration
                enroll_mag = np.linalg.norm(enroll_emb[enroll_idx])
                test_mag = np.linalg.norm(test_emb[test_idx])
                """
                fout.write(
                    '{} {} {:.5f} {:.4f} {:.4f} {:.4f} {:.4f}\n'.format(
                        line[0], line[1], normed_score, enroll_mag,
                        test_mag, enroll_mean[enroll_idx],
                        test_mean[test_idx]))
                """
                fout.write(f"{normed_score}\n")
trial_score_file = "/kaggle/working/trial_test_list.txt"
score_norm_file = "predictions.txt"
cohort_emb_scp = "/kaggle/working/cohort_emb.scp"
eval_emb_scp = "/kaggle/working/test_emb.scp"
as_norm("asnorm", 250, trial_score_file, score_norm_file, cohort_emb_scp, eval_emb_scp)

Do not do mean normalization for evaluation embeddings.


100%|██████████| 15971/15971 [00:00<00:00, 69918.35it/s]


In [None]:
with open("predictions.txt", "r") as score_file:
    scores = []
    for line in score_file.readlines():
        score = float(line.split()[2])
        scores.append(score)
with open("/kaggle/input/voxvietnam/test_list_gt.csv", "r") as test_file:
    labels = []
    for line in test_file.readlines():
        label = line.strip('"').split()[0]
        labels.append(int(label))
eer = tuneThresholdfromScore(scores, labels, [1, 0.1])[1]
print(f"EER after AS Norm: {eer}")

In [31]:
with open("predictions.txt", "r") as score_file:
    scores = []
    for score in score_file.readlines():
        score = float(score)
        scores.append(score)
print(len(scores))

9895


## Cohort Sets

In [75]:
import tqdm
model_0 = ECAPAModel(**vars(args))
model_0.load_parameters("/kaggle/input/ecapa_sv/pytorch/default/7/model_finetune_ph2_ckp6.model")
def get_raw_scores(test_list, test_path, output_file = "/kaggle/working/trial_test_list.txt"):
    scores = model_0.test_network(test_list, test_path)
    with open(test_list, "r") as f:
        with open("trial_test_list.txt", "w") as out_f:
            for idx, line in enumerate(f.readlines()):
                test_id, enroll_id = line.split() 
                out_f.write(f"enroll_utt_{enroll_id} test_utt_{test_id} {scores[idx]}\n")
                if (idx + 1) % 1000 == 0:
                    print(f"Step : {idx + 1}")
get_raw_scores("/kaggle/input/privatesvtest/prompts_sv.csv", "/kaggle/input/privatesvtest/audio")

05-22 15:17:23 Model para number = 6.65


100%|██████████| 6438/6438 [02:07<00:00, 50.43it/s]


Step : 1000
Step : 2000
Step : 3000
Step : 4000
Step : 5000
Step : 6000
Step : 7000
Step : 8000
Step : 9000
Step : 10000
Step : 11000
Step : 12000
Step : 13000
Step : 14000
Step : 15000


In [11]:
pip install kaldiio

Collecting kaldiio
  Downloading kaldiio-2.18.1-py3-none-any.whl.metadata (13 kB)
Downloading kaldiio-2.18.1-py3-none-any.whl (29 kB)
Installing collected packages: kaldiio
Successfully installed kaldiio-2.18.1
Note: you may need to restart the kernel to use updated packages.


In [76]:
import kaldiio
def get_test_emb(test_list, test_path, ark_file = "test_emb.ark", scp_file = "test_emb.scp"):
    write_specifier = f"ark,scp:{ark_file},{scp_file}"
    with open(test_list, "r") as test_f:
        lines = test_f.readlines()
    with torch.inference_mode(), kaldiio.WriteHelper(write_specifier) as writer:
        idx = 0
        for line in lines:
            idx += 1
            test_id, enroll_id = line.split()
            test_file = os.path.join(test_path, test_id)
            enroll_file = os.path.join(test_path, enroll_id)
            
            test_embedding = model_0.forward(test_file)
            enroll_embedding = model_0.forward(enroll_file)

            test_key = f"test_utt_{test_id}"
            enroll_key = f"enroll_utt_{enroll_id}"
            writer[test_key] = test_embedding.cpu().numpy()
            writer[enroll_key] = enroll_embedding.cpu().numpy()
            if (idx + 1) % 1000 == 0:
                print(f"Step : {idx + 1}")
get_test_emb("/kaggle/input/privatesvtest/prompts_sv.csv", "/kaggle/input/privatesvtest/audio")

Step : 1000
Step : 2000
Step : 3000
Step : 4000
Step : 5000
Step : 6000
Step : 7000
Step : 8000
Step : 9000
Step : 10000
Step : 11000
Step : 12000
Step : 13000
Step : 14000
Step : 15000


In [61]:
import random
with open("/kaggle/input/vietnam-celeb-dataset/vietnam-celeb-t.txt", "r") as f:
    lines = f.readlines()
    sample_lines = random.sample(lines, 30000)

In [62]:
def get_cohort_emb(cohort_loader, ark_file = "cohort_emb.ark", scp_file = "cohort_emb.scp"):
    write_specifier = f"ark,scp:{ark_file},{scp_file}"
    with torch.inference_mode(), kaldiio.WriteHelper(write_specifier) as writer:
		for num, (data, labels) in enumerate(cohort_loader, start=1):
			# Move data to device efficiently
			# Assuming data is already a tensor from the optimized DataLoader
			if isinstance(data, torch.Tensor):
				data = data.to(self.device, non_blocking=True)
			else:
				data = torch.FloatTensor(data).to(self.device, non_blocking=True)
			if isinstance(labels, torch.Tensor):
				labels = labels.to(self.device, non_blocking=True)
			else:
				labels = torch.LongTensor(labels).to(self.device, non_blocking=True)

            embedding = self.speaker_encoder.forward(data, aug=True)

            embedding = embedding.cpu().numpy()

            key = f"utt_{labels}_{num}"
            writer[key] = embedding
            if (num+ 1) % 1000 == 0:
                print(f"Step : {num + 1}")
                
get_cohort_emb("/kaggle/input/csv-files/processed_vietnam_celeb_dataset.csv")

Step : 1000
Step : 3000
Step : 4000
Step : 5000
Step : 6000
Step : 7000
Step : 8000
Step : 9000
Step : 10000
Step : 12000
Step : 13000
Step : 14000
Step : 17000
Step : 18000
Step : 19000
Step : 20000
Step : 21000
Step : 22000
Step : 24000
Step : 25000
Step : 26000
Step : 27000
Step : 28000
Step : 30000


In [41]:
with open("cohort_emb.scp",  "r") as f:
    print(len(f.readlines()))

6438


## Calibration

### Wave to Duration

In [19]:
#!/usr/bin/env python3
# encoding: utf-8
import os
import sys

import torchaudio

def wav2dur(scp, dur_scp, file_path):
    torchaudio.set_audio_backend("sox_io")
    
    with open(scp, 'r') as f, open(dur_scp, 'w') as fout:
        cnt = 0
        total_duration = 0
        for l in sample_lines:
            items = l.strip().split()
            wav_id = items[0]
            fname = items[1]
            cnt += 1
            path = os.path.join(file_path, wav_id, fname)
            if not os.path.exists(path):
                continue
            waveform, rate = torchaudio.load(path)
            frames = len(waveform[0])
            duration = frames / float(rate)
            total_duration += duration
            fout.write('{} {}\n'.format(wav_id, duration))
        print('process {} utts'.format(cnt))
        print('total {} s'.format(total_duration))
wav2dur("/kaggle/input/vietnam-celeb-dataset/vietnam-celeb-t.txt", "vn_cl_dur.scp", "/kaggle/input/vietnam-celeb-dataset/full-dataset/data")

process 30000 utts
total 158434.16118750002 s


In [20]:
with open("/kaggle/working/vn_cl_dur", "r") as dur_f:
    print(len(dur_f.readlines()))

21926


### Generate Calibration Trials

In [21]:
pip install fire

Note: you may need to restart the kernel to use updated packages.


In [28]:
import fire
import logging
import random
from tqdm import tqdm


def generate_calibration_trials(utt2dur, trial_path, each_trial_num=10000):
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    logging.info('Generate calibration trial ...')
    short_spk2utt = {}
    long_spk2utt = {}

    with open(utt2dur, 'r') as f:
        for line in f.readlines():
            utt, dur = line.strip().split()
            dur = float(dur)
            spk = utt.split('/')[0]

            if 2 < dur < 6:
                if spk not in short_spk2utt:
                    short_spk2utt[spk] = []
                short_spk2utt[spk].append(utt)

            if dur > 6:
                if spk not in long_spk2utt:
                    long_spk2utt[spk] = []
                long_spk2utt[spk].append(utt)

    long_spks = list(long_spk2utt.keys())
    short_spks = list(short_spk2utt.keys())

    for spk in long_spks:
        if spk not in short_spks:
            long_spk2utt.pop(spk, None)
    long_spks = list(long_spk2utt.keys())
    for spk in short_spks:
        if spk not in long_spks:
            short_spk2utt.pop(spk, None)
    short_spks = list(short_spk2utt.keys())

    with open(trial_path, 'w') as f:
        for _ in tqdm(range(each_trial_num // 2)):
            enroll_spk = random.choice(short_spks)
            spk_index = short_spks.index(enroll_spk)
            nontarget_spk = random.choice(short_spks[:spk_index] +
                                          short_spks[spk_index + 1:])

            # short2short
            enroll_utt, test_utt = random.choices(short_spk2utt[enroll_spk],
                                                  k=2)
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'target'))
            test_utt = random.choice(short_spk2utt[nontarget_spk])
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'nontarget'))

            # short2long
            enroll_utt = random.choice(short_spk2utt[enroll_spk])
            test_utt = random.choice(long_spk2utt[enroll_spk])
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'target'))
            test_utt = random.choice(long_spk2utt[nontarget_spk])
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'nontarget'))

            # long2long
            enroll_utt, test_utt = random.choices(long_spk2utt[enroll_spk],
                                                  k=2)
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'target'))
            test_utt = random.choice(long_spk2utt[nontarget_spk])
            f.write("{} {} {}\n".format(enroll_utt, test_utt, 'nontarget'))
generate_calibration_trials("/kaggle/working/vn_cl_dur.scp", "vn_cl_trials")

100%|██████████| 5000/5000 [00:00<00:00, 59451.51it/s]


In [30]:
# Copyright (c) 2022 Chengdong Liang (liangchengdong@mail.nwpu.edu.cn)
#               2024 Zhengyang Chen (chenzhengyang117@gmail.com)
#               2024 Bing Han (hanbing97@sjtu.edu.cn)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import fire
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim



def gather_calibration_factors(wav_dur_scp, max_dur, score_norm_file,
                               calibration_factor_file, drop_duration=False):
    if not drop_duration:
        wav_idx, dur_list = zip(*read_table(wav_dur_scp))
        wavidx2dur = {
            idx: min(float(dur), max_dur)
            for idx, dur in zip(wav_idx, dur_list)
        }

    def reorder_values(value_1, value_2):
        max_value = max(value_1, value_2)
        min_value = min(value_1, value_2)
        return "{:.4f} {:.4f} {:.4f} {:.4f}".format(min_value, max_value,
                                                    max_value - min_value,
                                                    max_value / min_value)

    # read factor from asnorm results
    assert os.path.exists(
        score_norm_file), "score norm file ({}) does not exist !!!".format(
            score_norm_file)

    with open(score_norm_file, 'r', encoding='utf-8') as fin:
        with open(calibration_factor_file, 'w', encoding='utf-8') as fout:
            lines = fin.readlines()
            for line in tqdm(lines):
                line = line.strip().split()
                idx1, idx2 = line[0], line[1]
                if drop_duration:
                    dur_str = ""
                else:
                    dur_str = reorder_values(wavidx2dur[idx1], wavidx2dur[idx2])
                mag_str = reorder_values(float(line[3]), float(line[4]))
                cohort_mean_str = reorder_values(float(line[5]),
                                                 float(line[6]))
                fout.write('{} {} {} {} {} {}\n'.format(
                    line[0], line[1], line[2], dur_str, mag_str,
                    cohort_mean_str))


class LinearModel(nn.Module):

    def __init__(self, input_dim):
        super(LinearModel, self).__init__()
        self.linear = nn.Linear(input_dim, 1)
        nn.init.constant_(self.linear.weight, 1.0 / input_dim)
        nn.init.constant_(self.linear.bias, 0)

    def forward(self, x):
        out = self.linear(x)
        return out


def cllr(target_llrs, nontarget_llrs):
    """
    Calculate the CLLR of the scores
    """

    def negative_log_sigmoid(lodds):
        """-log(sigmoid(log_odds))"""
        return torch.log1p(torch.exp(-lodds))

    return 0.5 * (torch.mean(negative_log_sigmoid(target_llrs)) + torch.mean(
        negative_log_sigmoid(-nontarget_llrs))) / np.log(2)


def train_calibration_model(calibration_factor_file, save_model_path):
    max_epochs = 50
    target_llrs_list = []
    nontarget_llrs_list = []
    with open(calibration_factor_file, 'r', encoding='utf-8') as fin:
        lines = fin.readlines()
        for line in lines:
            line = line.strip().split()
            if line[2] == "tgt" or line[2] == "target":
                target_llrs_list.append([float(v) for v in line[3:]])
            else:
                nontarget_llrs_list.append([float(v) for v in line[3:]])

    # build training set
    target_llrs = torch.tensor(target_llrs_list, dtype=torch.float64)
    nontarget_llrs = torch.tensor(nontarget_llrs_list, dtype=torch.float64)
    start_cllr = cllr(target_llrs, nontarget_llrs)

    # create model
    model = LinearModel(target_llrs.shape[-1])
    model.double()
    criterion = cllr

    # build optimizer
    optimizer = optim.LBFGS(model.parameters(), lr=0.01)

    best_loss = 1000000.0
    for i in range(max_epochs):

        def closure():
            optimizer.zero_grad()
            new_nontarget_llrs = model(nontarget_llrs)
            new_target_llrs = model(target_llrs)
            loss = criterion(new_target_llrs, new_nontarget_llrs)
            loss.backward()
            return loss

        loss = optimizer.step(closure)
        if (best_loss - loss < 1e-4):
            break
        else:
            if loss < best_loss:
                best_loss = loss

    torch.save(model.state_dict(), save_model_path)


def infer_calibration(calibration_factor_file, save_model_path,
                      calibration_score_file):
    llrs_list = []
    with open(calibration_factor_file, 'r', encoding='utf-8') as fin:
        lines = fin.readlines()
        for line in lines:
            line = line.strip().split()
            llrs_list.append([float(v) for v in line[3:]])

    llrs = torch.tensor(llrs_list, dtype=torch.float64)

    model = LinearModel(llrs.shape[-1])
    model.load_state_dict(torch.load(save_model_path))
    model.eval()
    model.double()
    outputs = model(llrs)

    with open(calibration_score_file, "w", encoding='utf-8') as fout:
        for i, s in enumerate(lines):
            line = lines[i].strip().split()
            score = outputs[i].item()
            fout.write('{} {} {} {}\n'.format(line[0], line[1], score,
                                              line[2]))

gather_calibration_factors("/kaggle/working/vn_cl_dur.scp", 20, score_norm_file, "vn_cl_factors.calibration", False)
train_calibration_model("/kaggle/working/vn_cl_factors.calibration", "calibration_model.pt")
infer_calibration("/kaggle/working/vn_cl_factors", "/kaggle/working/calibration_model.pt", "predictions.txt")

  0%|          | 0/9895 [00:00<?, ?it/s]


KeyError: 'enroll_utt_audio00002.wav'

In [17]:
import pandas as pd
import random


def generate_speaker_pairs(csv_file, output_file = "eval_pairs.txt", num_pairs=10000):
    # Load the dataset
    data = pd.read_csv(csv_file)
    
    # Group audio files by speaker ID
    speakers = data.groupby('speaker')['audio_name'].apply(list).to_dict()
    all_speakers = list(speakers.keys())
    pairs = []
    
    # Generate positive pairs (same speaker)
    while len(pairs) < num_pairs // 2:
        speaker = random.choice(all_speakers)
        if len(speakers[speaker]) < 2:
            continue
        file1, file2 = random.sample(speakers[speaker], 2)
        pairs.append([1, file1, file2])
    
    # Generate negative pairs (different speakers)
    while len(pairs) < num_pairs:
        sp1, sp2 = random.sample(all_speakers, 2)
        file1 = random.choice(speakers[sp1])
        file2 = random.choice(speakers[sp2])
        pairs.append([0, file1, file2])
    
    with open(output_file, "w") as fout:
        for pair in pairs:
            fout.write(f"{pair[0]} {pair[1]} {pair[2]}\n")
test_path = "/kaggle/input/vispeech/metadata/noisy_testset.csv"
generate_speaker_pairs(test_path)


## Testing

### ViSpeech

In [6]:
import pandas as pd
import random


def generate_speaker_pairs(csv_file, output_file = "test_pairs", num_pairs=10000):
    # Load the dataset
    data = pd.read_csv(csv_file)
    
    # Group audio files by speaker ID
    speakers = data.groupby('speaker')['audio_name'].apply(list).to_dict()
    all_speakers = list(speakers.keys())
    pairs = []
    
    # Generate positive pairs (same speaker)
    while len(pairs) < num_pairs // 2:
        speaker = random.choice(all_speakers)
        if len(speakers[speaker]) < 2:
            continue
        file1, file2 = random.sample(speakers[speaker], 2)
        pairs.append([1, file1, file2])
    
    # Generate negative pairs (different speakers)
    while len(pairs) < num_pairs:
        sp1, sp2 = random.sample(all_speakers, 2)
        file1 = random.choice(speakers[sp1])
        file2 = random.choice(speakers[sp2])
        pairs.append([0, file1, file2])
    
    with open(output_file)
    print(f"Generated {num_pairs} pairs saved to {output_file}")
test_path = "/kaggle/input/vispeech/metadata/noisy_testset.csv"
generate_speaker_pairs(test_path)


Generated 10000 pairs saved to test_pairs


In [7]:
model_0 = ECAPAModel(**vars(args))
model_0.load_parameters("/kaggle/working/exps/exp1/model/model_0002.model")
model_0.test_network("/kaggle/input/privatesvtest/prompts_sv.csv", "/kaggle/input/privatesvtest/audio")

05-21 08:35:36 Model para number = 6.65


100%|██████████| 6438/6438 [04:00<00:00, 26.82it/s]


In [10]:
model_0 = ECAPAModel(**vars(args))
model_0.load_parameters("/kaggle/input/ecapa_sv/pytorch/default/4/model_20_05_ckp20.model")
model_0.eval_network(eval_list = "/kaggle/input/voxvietnam/test_list_gt.csv", eval_path = "/kaggle/input/voxvietnam/wav")

05-21 02:25:55 Model para number = 6.65


100%|██████████| 17786/17786 [05:16<00:00, 56.12it/s]


(3.5575244999328772, 0.5105657143967374)

In [9]:
with open("predictions.txt", "r") as f:
    print(len(f.readlines()))

15971


In [None]:
from torch.utils.data import Dataset
class PrivateSVDataset(Dataset):
    def __init__(self, val_path, root_dir, sr=16000):
        """
        Dataset for speaker verification validation using pre-defined utterance pairs
        
        Args:
            val_path: Path to validation file with pre-defined pairs
            root_dir: Root directory containing the audio files
            sr: Sample rate
            duration: Max duration in seconds
        """
        self.root_dir = root_dir
        self.sr = sr
        # Store pairs and labels
        self.pairs = []
        # Read validation file with pairs
        with open(val_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) == 2:
                    utt_path1, utt_path2 = parts
                    
                    # Create full paths
                    audio_path1 = os.path.join(self.root_dir, utt_path1)
                    audio_path2 = os.path.join(self.root_dir, utt_path2)
                    
                    # Check if both files exist
                    if os.path.exists(audio_path1) and os.path.exists(audio_path2):
                        self.pairs.append((audio_path1, audio_path2))
    
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        audio_path1, audio_path2 = self.pairs[idx]
        
        # Load and process first waveform
        waveform1 = preprocess_audio(audio_path1)
        
        # Load and process second waveform
        waveform2 = preprocess_audio(audio_path2)
        
        return {
            'input_values': waveform1.squeeze(),
            'input_values2': waveform2.squeeze()
        }
csv_path = "/kaggle/input/privatesvtest/prompts_sv.csv"
root_dir = "/kaggle/input/privatesvtest/audio"
test_dataset = PrivateSVDataset(csv_path, root_dir)

In [None]:
from safetensors.torch import load_file

device = "cuda" if torch.cuda.is_available() else "cpu"
model_0 = ECAPA_TDNN().to(device)

state_dict = load_file("/kaggle/working/checkpoint/checkpoint-2360/model.safetensors")
model_0.load_state_dict(state_dict)

In [None]:
from torch.nn import CosineSimilarity

cosine_sim = CosineSimilarity(dim = 1, eps = 1e-6)

model_0.eval()
with torch.inference_mode(), open("predictions.txt", "w") as txtfile:
    for idx in range(len(test_dataset)):
        data = test_dataset[idx]
        test_data = data["input_values"]
        enroll_data = data["input_values2"]

        test_output = model_0(test_data.unsqueeze(0).to(device))
        enroll_output = model_0(enroll_data.unsqueeze(0).to(device))

        logit = cosine_sim(test_output, enroll_output)
        out_logit = logit[0].detach().item()
        txtfile.write(f"{out_logit:.2f}")
        txtfile.write("\n")
        if (idx + 1) % 1000 == 0:
            print(f"Step : {idx + 1}")

In [None]:
with open("predictions.txt", "r") as f:
    print(len(f.read().splitlines()))