In [None]:
# ! pip install speechbrain whisper jiwer -q

In [192]:
import pandas as pd
import whisper
import jiwer
import intel_extension_for_pytorch as ipex
import torch
import torchaudio
import pathlib
import collections

import warnings
warnings.filterwarnings("ignore")


asr = whisper.load_model("base", device="cpu",)

def calculate_wer_cer(reference_text, audio_file_path):
    
    # Transcribe the audio file
    result = asr.transcribe(audio_file_path, )
    transcribed_text = result['text']
    
    # Calculate WER
    wer = jiwer.wer(reference_text, transcribed_text)
    
    # Calculate CER
    cer = jiwer.cer(reference_text, transcribed_text)
    
    return wer, cer


In [194]:
path = pathlib.Path("experiments/2300v2/")

with open(path.joinpath("valid ground truth.txt"), "r") as f:
    text = f.read().lower()

reference_audio = path.joinpath("valid_long.flac")

In [195]:
wer_ref, cer_ref = calculate_wer_cer(text, reference_audio.as_posix())
wer_ref, cer_ref

(0.11764705882352941, 0.017699115044247787)

In [196]:

asr_knn_dict = collections.defaultdict(dict)

for item in path.joinpath("to_male/results/knn-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_knn_dict[item.stem + "_male"] = {"wer": wer, "cer": cer}
    
for item in path.joinpath("to_female/results/knn-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_knn_dict[item.stem + "_female"] = {"wer": wer, "cer": cer}

In [197]:
asr_xnot_dict = collections.defaultdict(dict)

for item in path.joinpath("to_male/results/xnot-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_xnot_dict[item.stem + "_male"] = {"wer": wer, "cer": cer}
    
for item in path.joinpath("to_female/results/xnot-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_xnot_dict[item.stem + "_female"] = {"wer": wer, "cer": cer}

In [198]:
knn_df = pd.DataFrame.from_dict(asr_knn_dict).T
xnot_df = pd.DataFrame.from_dict(asr_xnot_dict).T

In [199]:
knn_df.describe()

Unnamed: 0,wer,cer
count,20.0,20.0
mean,0.145588,0.025885
std,0.02778,0.009334
min,0.117647,0.017699
25%,0.117647,0.017699
50%,0.147059,0.026549
75%,0.147059,0.026549
max,0.205882,0.053097


In [200]:
xnot_df.describe()

Unnamed: 0,wer,cer
count,20.0,20.0
mean,0.130882,0.021681
std,0.015012,0.005156
min,0.117647,0.017699
25%,0.117647,0.017699
50%,0.117647,0.017699
75%,0.147059,0.026549
max,0.147059,0.035398


### Synthvoice detection

In [111]:
# Load model directly
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification

extractor = AutoFeatureExtractor.from_pretrained("MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection")
model = AutoModelForAudioClassification.from_pretrained("MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection")

In [141]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("audio-classification", model="MattyB95/AST-ASVspoof5-Synthetic-Voice-Detection")

In [142]:
batch = extractor(*librosa.load("experiments/2300/valid_short.flac", sr=16000), return_tensors="pt")

In [143]:
torch.softmax(model(**batch).logits, dim=-1)

tensor([[1., 0.]], grad_fn=<SoftmaxBackward0>)

In [148]:
pipe("experiments/2300/to_female/female_3575_v2.flac")

[{'score': 0.999961256980896, 'label': 'Bonafide'},
 {'score': 3.877157359966077e-05, 'label': 'Spoof'}]

In [108]:
asd_knn = []

for item in path.joinpath("to_male/results/knn-vc").iterdir():
    
    score = pipe(item.as_posix())[0]["score"]
    asd_knn.append(score)
    
for item in path.joinpath("to_female/results/knn-vc").iterdir():
    
    score = pipe(item.as_posix())[0]["score"]
    asd_knn.append(score)
    
pd.Series(asd_knn)

0     1.0
1     1.0
2     1.0
3     1.0
4     1.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
11    1.0
12    1.0
13    1.0
14    1.0
15    1.0
16    1.0
17    1.0
18    1.0
19    1.0
dtype: float64

### EER calculation

In [218]:
path = pathlib.Path("experiments/2300v2/")

In [219]:
from speechbrain.inference import EncoderClassifier
speaker_enc = EncoderClassifier.from_hparams(
  "speechbrain/spkrec-xvect-voxceleb"
)

speaker_enc.eval();


2024-08-11 03:11:19,388 - speechbrain.utils.fetching - INFO - Fetch hyperparams.yaml: Using existing file/symlink in pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/hyperparams.yaml.
2024-08-11 03:11:19,389 - speechbrain.utils.fetching - INFO - Fetch custom.py: Delegating to Huggingface hub, source speechbrain/spkrec-xvect-voxceleb.
2024-08-11 03:11:19,830 - speechbrain.utils.fetching - INFO - Fetch embedding_model.ckpt: Using existing file/symlink in pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/embedding_model.ckpt.
2024-08-11 03:11:19,831 - speechbrain.utils.fetching - INFO - Fetch mean_var_norm_emb.ckpt: Using existing file/symlink in pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/mean_var_norm_emb.ckpt.
2024-08-11 03:11:19,832 - speechbrain.utils.fetching - INFO - Fetch classifier.ckpt: Using existing file/symlink in pretrained_models/EncoderClassifier-e3dcc8e5060144ec1668cd02c05772cd/classifier.ckpt.
2024-08-11 03:

In [220]:
def get_speaker_matrix(path, glob):

    matrix = None

    for p in path.joinpath("to_male").rglob(glob):

        signal, fs = torchaudio.load(p)
        embeddings = speaker_enc.encode_batch(signal)

        if matrix is None:
            matrix = embeddings
        else: 
            matrix = torch.cat((matrix, embeddings), dim=0)
            
    for p in path.joinpath("to_female").rglob(glob):

        signal, fs = torchaudio.load(p)
        embeddings = speaker_enc.encode_batch(signal)
        matrix = torch.cat((matrix, embeddings), dim=0)
            
    return matrix.squeeze(1)


In [224]:
ref_matrix = get_speaker_matrix(path, "*_30.flac")

In [226]:
true_matrix = get_speaker_matrix(path, "*_v2.flac")

In [227]:
knn_matrix = get_speaker_matrix(path, "knnvc*.flac")

In [228]:
xnot_matrix = get_speaker_matrix(path, "xnotvc*.flac")

In [229]:
torch.nn.functional.cosine_similarity(ref_matrix, true_matrix)

tensor([0.9294, 0.9473, 0.9373, 0.9429, 0.9315, 0.9551, 0.9448, 0.9471, 0.9487,
        0.9338, 0.9673, 0.9563, 0.9355, 0.9556, 0.8978, 0.9488, 0.9977, 0.9586,
        0.9258, 0.9605])

In [230]:
torch.nn.functional.cosine_similarity(ref_matrix, knn_matrix)

tensor([0.9423, 0.9498, 0.9358, 0.9440, 0.9435, 0.9475, 0.9773, 0.9360, 0.9557,
        0.9281, 0.9238, 0.9515, 0.9440, 0.9464, 0.9166, 0.9796, 0.9837, 0.9561,
        0.9306, 0.9413])

In [231]:
torch.nn.functional.cosine_similarity(ref_matrix, xnot_matrix)

tensor([0.9535, 0.9504, 0.9737, 0.9474, 0.9325, 0.9385, 0.9278, 0.9300, 0.9539,
        0.9487, 0.9546, 0.9809, 0.9560, 0.9624, 0.9098, 0.9500, 0.9626, 0.9336,
        0.9437, 0.9220])

##### KNN-VC EER

In [232]:
speechbrain.utils.metric_stats.EER(
    torch.nn.functional.cosine_similarity(ref_matrix, true_matrix),
    torch.nn.functional.cosine_similarity(ref_matrix, knn_matrix)
)

(0.44999998807907104, 0.9448345303535461)

##### XNOT-VC EER

In [233]:
speechbrain.utils.metric_stats.EER(
    torch.nn.functional.cosine_similarity(ref_matrix, true_matrix),
    torch.nn.functional.cosine_similarity(ref_matrix, xnot_matrix)
)

(0.550000011920929, 0.9473746418952942)