In [34]:
import torchaudio
from speechbrain.pretrained import EncoderClassifier
import wespeakerruntime as wespeaker

In [15]:
# Load the pretrained SpeechBrain encoder identified by `source` and run it on
# the first CUDA device.
# NOTE(review): the device is hard-coded to "cuda:0"; this cell presumably
# fails on a machine without a GPU -- confirm before sharing the notebook.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": "cuda:0"},
)

In [28]:
import torch
import kaldiio
import os
from tqdm import tqdm
from kaldi_io import read_vec_flt
from Process_Data.Datasets.KaldiDataset import ScriptVerifyDataset
from TrainAndTest.common_func import verification_test

In [37]:
def speechbrain_ecapa(upath, classifier, target_sr=16000):
    """Extract a single speaker embedding for one audio file.

    Args:
        upath: Path of the audio file; passed straight to ``torchaudio.load``.
        classifier: Object exposing ``encode_batch(wavs)``. The notebook passes
            the SpeechBrain ``EncoderClassifier`` created in an earlier cell.
        target_sr: Sample rate the waveform is converted to before encoding.
            Pass ``None`` to skip resampling.
            NOTE(review): 16 kHz is what the model's published card states for
            ``encode_batch`` input -- confirm if a different model is loaded.

    Returns:
        The embedding as a numpy array with all singleton dimensions removed.
    """
    signal, fs = torchaudio.load(upath)

    # torchaudio returns (channels, samples) and encode_batch receives the
    # tensor as-is, so a multi-channel file would yield one embedding per
    # channel instead of one vector per utterance. Downmix to mono first.
    if signal.dim() > 1 and signal.size(0) > 1:
        signal = signal.mean(dim=0, keepdim=True)

    # encode_batch is handed the raw tensor only, so the file's sample rate has
    # to be reconciled here.
    if target_sr is not None and fs != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=fs, new_freq=target_sr)
        signal = resampler(signal)

    embeddings = classifier.encode_batch(signal)
    return embeddings.squeeze().detach().cpu().numpy()

def wespeaker_resnet(wav_file, speaker, lang='chs'):
    """Extract an embedding for ``wav_file`` with a caller-supplied extractor.

    Args:
        wav_file: Audio file handed unchanged to ``speaker.extract_embedding``.
        speaker: Object providing ``extract_embedding(wav_file)``; the notebook
            builds one with ``wespeaker.Speaker``.
        lang: Accepted for interface compatibility; not used by this function.

    Returns:
        The result of ``extract_embedding`` after calling ``.squeeze()`` on it.
    """
    embedding = speaker.extract_embedding(wav_file)
    return embedding.squeeze()

In [3]:
# Project roots used to build the paths below.
# NOTE(review): both are absolute paths on one user's machine; adjust them
# before running the notebook anywhere else.

# Project whose data/<test_set>/test directory holds wav.scp and the trials file.
lstm_dir = '/home/yangwenhao/project/lstm_speaker_verification'

# Root of the SpeakerVerification-pytorch checkout.
root_path = '/home/yangwenhao/project/SpeakerVerification-pytorch/'

In [41]:
# Name of the corpus to evaluate; it is substituted into the data and output
# paths built from it. Uncomment exactly one alternative to switch corpus.
# test_set = 'aishell2'
# test_set = 'aidata'
test_set = 'cnceleb'

In [42]:
# Input lists for the selected corpus (wav.scp and the trials file).
wav_scp = lstm_dir + '/data/{}/test/wav.scp'.format(test_set)
trials = lstm_dir + '/data/{}/test/trials'.format(test_set)

# Output directory for the extracted embeddings. The path is relative, so it is
# resolved against the kernel's current working directory.
xvector_dir = 'data/resnet_test/{}'.format(test_set)
# exist_ok=True replaces the exists()/makedirs() pair: a single call that does
# not fail when the directory is already present or is created concurrently.
os.makedirs(xvector_dir, exist_ok=True)

scp_file = xvector_dir + '/xvectors.scp'
ark_file = xvector_dir + '/xvectors.ark'

# Writer that emits the ark archive together with its scp index.
# It holds open file handles: close it (writer.close()) once extraction is
# finished so the files are complete before anything reads them.
writer = kaldiio.WriteHelper('ark,scp:%s,%s' % (ark_file, scp_file))

In [46]:
# Extract one embedding per wav.scp entry and store it under its utterance id.
try:
    with open(wav_scp, 'r') as f:
        for line in tqdm(f.readlines(), ncols=50):
            fields = line.split()
            if not fields:
                # Blank line: there is no utterance id or path to read.
                continue

            if len(fields) == 2:
                # "<utt-id> <path>" entry.
                uid, upath = fields
            else:
                # Longer entries: the audio path is taken from the 5th token
                # (presumably a piped-command style entry -- verify against
                # the wav.scp files actually used).
                uid, upath = fields[0], fields[4]

            embeddings = speechbrain_ecapa(upath, classifier)
            # embeddings = wespeaker_resnet(upath, speaker)
            writer(str(uid), embeddings)
finally:
    # Close the writer so the ark/scp pair is flushed to disk before the
    # verification step reads it back. After this, re-run the cell that creates
    # `writer` before running this cell again.
    writer.close()

100%|███████| 17973/17973 [12:46<00:00, 23.45it/s]


In [None]:
# Build the trial dataset on top of the embeddings stored in `xvector_dir`.
verify_data_dir = lstm_dir + '/data/{}/test'.format(test_set)
verify_dir = ScriptVerifyDataset(
    dir=verify_data_dir,
    trials_file='trials',
    xvectors_dir=xvector_dir,
    loader=read_vec_flt,
)

# DataLoader options; batches are served in order (shuffle=False).
kwargs = dict(num_workers=4, pin_memory=False)
verify_loader = torch.utils.data.DataLoader(
    verify_dir,
    batch_size=128,
    shuffle=False,
    **kwargs
)

# Score the trials with dist_type='cos'; the call returns the four metrics
# unpacked below.
# NOTE(review): the meaning of epoch=12 for a pretrained model is not visible
# in this notebook -- confirm against verification_test.
eer, eer_threshold, mindcf_01, mindcf_001 = verification_test(
    test_loader=verify_loader,
    dist_type='cos',
    log_interval=10,
    xvector_dir=xvector_dir,
    epoch=12,
)

In [48]:
# Combined scores: EER in percent multiplied by one or both minDCF values.
eer_percent = 100. * eer
mix3 = eer_percent * mindcf_01 * mindcf_001
mix2 = eer_percent * mindcf_001
mix8 = eer_percent * mindcf_01

# One-line summary wrapped in ANSI colour escapes (\33[91m ... \33[0m).
# NOTE(review): the label reads "Train EER" -- confirm whether these are
# actually test-set metrics.
summary_template = ('          \33[91mTrain EER: {:.4f}%, Threshold: {:.4f}, '
                    'mindcf-0.01: {:.4f}, mindcf-0.001: {:.4f}, mix2,3: {:.4f}, {:.4f}. \33[0m')
print(summary_template.format(eer_percent, eer_threshold, mindcf_01, mindcf_001, mix2, mix3))

# Results recorded from earlier runs:
# ecapa-tdnn vox2
# aishell2
# Train EER: 7.8288%, Threshold: 0.3257, mindcf-0.01: 0.6036, mindcf-0.001: 0.7667, mix2,3: 6.0026, 3.6231.

# aidata
# Train EER: 8.8043%, Threshold: 0.3393, mindcf-0.01: 0.6619, mindcf-0.001: 0.9150, mix2,3: 8.0558, 5.3321.

# cnceleb
# Train EER: 15.2781%, Threshold: 0.2710, mindcf-0.01: 0.6522, mindcf-0.001: 0.7755, mix2,3: 11.8482, 7.7278

          [91mTrain EER: 15.2781%, Threshold: 0.2710, mindcf-0.01: 0.6522, mindcf-0.001: 0.7755, mix2,3: 11.8482, 7.7278. [0m


In [9]:
# Smoke test: load one AISHELL-2 utterance and encode it with the classifier.
aishell2_sample_wav = '/home/yangwenhao/dataset/AISHELL-2/iOS/data/wav/C0005/IC0005W0004.wav'
signal, fs = torchaudio.load(aishell2_sample_wav)
embeddings = classifier.encode_batch(signal)

In [35]:
# Smoke test for the WeSpeaker runtime: create the lang='chs' extractor and
# embed one AISHELL-2 utterance. The recorded output shows a model file being
# downloaded into ~/.wespeaker/chs when this cell ran.
wespeaker_sample_wav = '/home/yangwenhao/dataset/AISHELL-2/iOS/data/wav/C0005/IC0005W0004.wav'
speaker = wespeaker.Speaker(lang='chs')
ans = speaker.extract_embedding(wespeaker_sample_wav)

Downloading https://wespeaker-1256283475.cos.ap-shanghai.myqcloud.com/models/cnceleb/cnceleb_resnet34_LM.onnx to /home/yangwenhao/.wespeaker/chs


cnceleb_resnet34_LM.onnx: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 25.3M/25.3M [00:02<00:00, 11.9MB/s]


In [45]:
# Inspect what torchaudio returns for one CN-Celeb test file; the cell's value
# (waveform tensor, sample rate) is displayed as its output.
cnceleb_sample_flac = '/home/yangwenhao/dataset/CN-Celeb/eval/test/id00800-singing-01-001.flac'
torchaudio.load(cnceleb_sample_flac)

(tensor([[0., 0., 0.,  ..., 0., 0., 0.]]), 16000)