In [1]:
import os

# Move to the parent of the directory the kernel started in, so that the
# relative paths used below ("./zoo/...", "./notebooks/...") resolve.
# The starting directory is remembered on first execution: a bare
# os.chdir("..") would climb one level further every time this cell is
# re-run, silently breaking every relative path afterwards.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, ".."))

In [2]:
import torch
import soundfile as sf
import whisper

from torchaudio import transforms as T
from srtool.models import SRModelPreTrained
from IPython.display import Audio, display

In [3]:
# Clips that are embedded and compared pairwise further down.
audio_files = [
    "./notebooks/demo-data/my1.ogg",
    "./notebooks/demo-data/my2.ogg",
    "./notebooks/demo-data/another.wav",
]

# Checkpoint file handed to SRModelPreTrained and torch.load.
pt_path = "./zoo/sr-model.pt"

# Decision threshold: a pair whose similarity score exceeds this value is
# reported as verified.
# NOTE(review): the origin of 0.6122 is not shown in this notebook - confirm.
threshhold = 0.6122

In [4]:
# Build the model, keep it on the CPU and call .eval() on it.
# NOTE(review): the constructor also receives pt_path; whether it already
# loads weights itself is not visible from this notebook.
model = SRModelPreTrained(pt_path).cpu().eval()

# map_location="cpu" remaps any tensors stored from a GPU onto the CPU, so the
# checkpoint also loads on machines without CUDA (the model lives on the CPU
# anyway).
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load(pt_path, map_location="cpu")["model_state"])

<All keys matched successfully>

In [5]:
def read_audio(path: str, resample_rate: int = 16_000):
    """Load an audio file and resample it to ``resample_rate`` Hz.

    Args:
        path: Location of an audio file readable by ``soundfile``.
        resample_rate: Target sampling rate in Hz.

    Returns:
        A float32 ``torch.Tensor``: 1-D ``(time,)`` for mono files and
        ``(channels, time)`` for multi-channel files.
    """
    wav, sr = sf.read(path, dtype="float32")
    waveform = torch.from_numpy(wav)
    if waveform.ndim == 2:
        # soundfile yields (frames, channels) for multi-channel audio, while
        # Resample filters along the last dimension; put time last so the
        # resampling runs over samples instead of over channels.
        waveform = waveform.T.contiguous()

    resampler = T.Resample(sr, resample_rate)
    return resampler(waveform)


In [None]:
# Decode every demo clip with read_audio and render one audio player per clip.
# rate=16_000 matches read_audio's default resample_rate.
wav_lst = list(map(read_audio, audio_files))
for wav in wav_lst:
    player = Audio(wav, rate=16_000)
    display(player)

In [7]:
# One log-mel spectrogram per clip, computed by whisper straight from the file
# path (wav_lst is not used here); a leading axis of size 1 is added to each.
mels = [whisper.log_mel_spectrogram(path).unsqueeze(0) for path in audio_files]

In [8]:
# Run the model on each spectrogram without tracking gradients and stack the
# per-clip outputs along the first axis.
with torch.no_grad():
    embeddins = torch.concat([model(mel) for mel in mels])

# L2-normalise every row so that the dot products taken later are cosine
# similarities. F.normalize clamps the norm with a small eps, so an all-zero
# embedding stays finite instead of turning into NaN through a 0/0 division.
embeddins = torch.nn.functional.normalize(embeddins, p=2, dim=1)

In [11]:
# Pairwise similarity matrix: entry (i, j) is the dot product of embedding i
# with embedding j. The rows of embeddins are unit-normalised where they are
# computed, so these are cosine similarities.
scores = torch.matmul(embeddins, embeddins.T)
scores

tensor([[ 1.0000,  0.7672, -0.0127],
        [ 0.7672,  1.0000,  0.0462],
        [-0.0127,  0.0462,  1.0000]])

In [15]:
# A pair counts as verified when its similarity is strictly above the threshold.
is_ver = scores > threshhold
print(f"{threshhold=}")

# (label, row, column) for each pair of distinct clips in the score matrix.
# NOTE(review): "srore" looks like a typo for "score", and "m1"/"m2" presumably
# mean my1/my2; the text is left as is so it matches the recorded output.
pairs = (
    ("m1 vs my2", 0, 1),
    ("m1 vs another", 0, 2),
    ("m2 vs another", 1, 2),
)
for label, i, j in pairs:
    print(f"{label}: {is_ver[i, j]}, srore {scores[i, j]: .4f}")

threshhold=0.6122
m1 vs my2: True, srore  0.7672
m1 vs another: False, srore -0.0127
m2 vs another: False, srore  0.0462
