In [None]:
!pip install speechbrain torchaudio torch

Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (f

In [None]:
from speechbrain.pretrained import EncoderClassifier
import torchaudio
import torch
import os
import numpy as np

# Lazily-populated cache for the pretrained encoder so that repeated calls to
# audio_to_embedding() do not rebuild the model each time.
_SPEAKER_ENCODER = None


def _get_speaker_encoder():
    """Return the pretrained speaker encoder, loading it on first use only."""
    global _SPEAKER_ENCODER
    if _SPEAKER_ENCODER is None:
        _SPEAKER_ENCODER = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )
    return _SPEAKER_ENCODER


def audio_to_embedding(audio_path):
    """Compute a speaker embedding for a single audio file.

    The audio is loaded with torchaudio, down-mixed to one channel, resampled
    to 16 kHz when needed, zero-padded to at least 16000 samples, and passed
    to the "speechbrain/spkrec-ecapa-voxceleb" encoder.

    Args:
        audio_path: Path to the audio file to embed.

    Returns:
        A NumPy array holding the squeezed encoder output (a 1-D vector for a
        single input file).

    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
        RuntimeError: If loading, preprocessing or encoding fails; the
            underlying exception is attached as ``__cause__``.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    target_sample_rate = 16000
    # Pad very short clips up to one second at the target rate.
    min_length = target_sample_rate

    try:
        # Decode the audio first so an unreadable file fails before the
        # (potentially slow) model load/download is attempted.
        waveform, sample_rate = torchaudio.load(audio_path)

        # Down-mix multi-channel audio by averaging the channels.
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        if sample_rate != target_sample_rate:
            resampler = torchaudio.transforms.Resample(sample_rate, target_sample_rate)
            waveform = resampler(waveform)

        if waveform.shape[1] < min_length:
            padding = min_length - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, padding))

        model = _get_speaker_encoder()

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            embedding = model.encode_batch(waveform)

        # detach()/cpu() keep the NumPy conversion valid even if the tensor
        # lives on an accelerator or is attached to a graph.
        return embedding.squeeze().detach().cpu().numpy()

    except Exception as e:
        # Chain explicitly so the original traceback is preserved for callers.
        raise RuntimeError(f"Error processing audio: {str(e)}") from e

def test_embedding(embedding):
    """Test if embedding is valid and print its details"""

    print("\n=== EMBEDDING TEST RESULTS ===")
    print(f"Type: {type(embedding)}")
    print(f"Shape: {embedding.shape}")
    print(f"Data type: {embedding.dtype}")
    print(f"Min value: {embedding.min():.6f}")
    print(f"Max value: {embedding.max():.6f}")
    print(f"Mean: {embedding.mean():.6f}")
    print(f"Standard deviation: {embedding.std():.6f}")

    # Check if it's a valid embedding
    if isinstance(embedding, np.ndarray) and len(embedding.shape) == 1 and embedding.shape[0] > 0:
        print("✓ Embedding appears valid!")
        print(f"First 10 values: {embedding[:10]}")
        print(f"Last 10 values: {embedding[-10:]}")
    else:
        print("✗ Embedding might be invalid")

    return True

# Example usage: embed one recording and print a summary of the result.
# In a notebook cell __name__ is "__main__", so this runs whenever the cell runs.
if __name__ == "__main__":
    # NOTE(review): hard-coded absolute path; point this at your own audio file.
    audio_file = "/content/Record (online-voice-recorder.com).wav"

    try:
        embedding = audio_to_embedding(audio_file)  # the embedding array, kept for later use
        test_embedding(embedding)  # print diagnostics for the embedding

    # audio_to_embedding raises FileNotFoundError for a missing file and
    # RuntimeError for any failure while processing it; report either one.
    except (FileNotFoundError, RuntimeError) as e:
        print(f"Error: {e}")

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretraine


=== EMBEDDING TEST RESULTS ===
Type: <class 'numpy.ndarray'>
Shape: (192,)
Data type: float32
Min value: -50.638191
Max value: 60.834442
Mean: -1.172150
Standard deviation: 19.791809
✓ Embedding appears valid!
First 10 values: [-35.883713   -2.964323    3.8657475  -3.2494326  20.805824   -6.5359387
   2.4346886 -40.008907   -2.3557591  12.9617405]
Last 10 values: [-11.734119   -17.552246     5.19502      0.16280097  41.12293
 -25.946217   -17.677643    -2.6609719  -32.74647      7.811638  ]
