In [None]:
pip install speechbrain whisper jiwer pandas -q

In [None]:
pip install git+https://github.com/openai/whisper.git

In [2]:
import pandas as pd
import whisper
import jiwer
#import intel_extension_for_pytorch as ipex
import torch
import torchaudio
import pathlib
import collections

import warnings
warnings.filterwarnings("ignore")


asr = whisper.load_model("base", device="cpu",)

def calculate_wer_cer(reference_text, audio_file_path):
    
    # Transcribe the audio file
    result = asr.transcribe(audio_file_path, )
    transcribed_text = result['text']
    
    # Calculate WER
    wer = jiwer.wer(reference_text, transcribed_text)
    
    # Calculate CER
    cer = jiwer.cer(reference_text, transcribed_text)
    
    return wer, cer


In [2]:
path = pathlib.Path("experiments/2300/")

with open(path.joinpath("valid ground truth.txt"), "r") as f:
    text = f.read().lower()

reference_audio = path.joinpath("valid_long.flac")

In [None]:
wer_ref, cer_ref = calculate_wer_cer(text, reference_audio.as_posix())
wer_ref, cer_ref

In [None]:

asr_knn_dict = collections.defaultdict(dict)

for item in path.joinpath("to_male/results/linreg").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_knn_dict[item.stem + "_male"] = {"wer": wer, "cer": cer}
    
for item in path.joinpath("to_female/results/linreg").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_knn_dict[item.stem + "_female"] = {"wer": wer, "cer": cer}

In [6]:
asr_xnot_dict = collections.defaultdict(dict)

for item in path.joinpath("to_male/results/xnot-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_xnot_dict[item.stem + "_male"] = {"wer": wer, "cer": cer}
    
for item in path.joinpath("to_female/results/xnot-vc").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_xnot_dict[item.stem + "_female"] = {"wer": wer, "cer": cer}

In [6]:
asr_linreg_dict = collections.defaultdict(dict)

for item in path.joinpath("to_male/results/linreg").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_linreg_dict[item.stem + "_male"] = {"wer": wer, "cer": cer}
    
for item in path.joinpath("to_female/results/linreg").iterdir():
    
    wer, cer = calculate_wer_cer(text, item.as_posix())
    
    asr_linreg_dict[item.stem + "_female"] = {"wer": wer, "cer": cer}

In [7]:
linreg_df = pd.DataFrame.from_dict(asr_linreg_dict).T

In [None]:
linreg_df.describe()

### EER calculation

In [3]:
path = pathlib.Path("experiments/2300/")

In [20]:
import speechbrain.inference

In [4]:
from speechbrain.inference import EncoderClassifier
speaker_enc = EncoderClassifier.from_hparams(
  "speechbrain/spkrec-xvect-voxceleb"
)

speaker_enc.eval();


In [5]:
def get_speaker_matrix(path, glob):

    matrix = None

    for p in path.joinpath("to_male").rglob(glob):

        signal, fs = torchaudio.load(p)
        embeddings = speaker_enc.encode_batch(signal)

        if matrix is None:
            matrix = embeddings
        else: 
            matrix = torch.cat((matrix, embeddings), dim=0)
            
    for p in path.joinpath("to_female").rglob(glob):

        signal, fs = torchaudio.load(p)
        embeddings = speaker_enc.encode_batch(signal)
        matrix = torch.cat((matrix, embeddings), dim=0)
            
    return matrix.squeeze(1)


In [6]:
ref_matrix = get_speaker_matrix(path, "*_30.flac")

AttributeError: 'NoneType' object has no attribute 'squeeze'

In [None]:
true_matrix = get_speaker_matrix(path, "*_v2.flac")

In [30]:
knn_matrix = get_speaker_matrix(path, "knnvc*.flac")

In [31]:
xnot_matrix = get_speaker_matrix(path, "xnotvc*.flac")

In [None]:
torch.nn.functional.cosine_similarity(ref_matrix, true_matrix)

In [None]:
torch.nn.functional.cosine_similarity(ref_matrix, knn_matrix)

In [None]:
torch.nn.functional.cosine_similarity(ref_matrix, xnot_matrix)

##### KNN-VC EER

In [26]:
import speechbrain

In [None]:
speechbrain.utils.metric_stats.EER(
    torch.nn.functional.cosine_similarity(ref_matrix, true_matrix),
    torch.nn.functional.cosine_similarity(ref_matrix, knn_matrix)
)

##### XNOT-VC EER

In [None]:
speechbrain.utils.metric_stats.EER(
    torch.nn.functional.cosine_similarity(ref_matrix, true_matrix),
    torch.nn.functional.cosine_similarity(ref_matrix, xnot_matrix)
)