In [44]:
!pip install speechbrain torchaudio torch pinecone



In [45]:
import os
import numpy as np
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier
import pinecone
import uuid
from typing import List, Union

In [46]:
# Secrets are read from the environment so they never end up in the notebook
# file, its outputs, or version control. Set PINECONE_API_KEY before running
# this cell, e.g. in Colab:
#     import getpass; os.environ["PINECONE_API_KEY"] = getpass.getpass()
# NOTE(review): the key that was previously hardcoded here must be treated as
# leaked and rotated in the Pinecone console.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")
PINECONE_ENV = "us-east-1"
INDEX_NAME = "voice-biometrics"
EMBEDDING_DIM = 192  # Matches ECAPA-TDNN output (spkrec-ecapa-voxceleb)
UPSERT_BATCH_SIZE = 100

In [47]:
# Loaded speaker encoders keyed by (source, savedir), so the checkpoint is
# instantiated once per kernel session instead of once per audio file.
_ENCODER_CACHE = {}


def _get_speaker_encoder(source="speechbrain/spkrec-ecapa-voxceleb",
                         savedir="pretrained_models/spkrec-ecapa-voxceleb"):
    """Return the pretrained speaker encoder, loading it only on first use."""
    cache_key = (source, savedir)
    if cache_key not in _ENCODER_CACHE:
        _ENCODER_CACHE[cache_key] = EncoderClassifier.from_hparams(
            source=source,
            savedir=savedir
        )
    return _ENCODER_CACHE[cache_key]


def audio_to_embedding(audio_path, model=None):
    """Compute a speaker embedding for a single audio file.

    The audio is loaded with torchaudio, down-mixed to mono, resampled to
    16 kHz, zero-padded to at least one second, and passed through the
    encoder's ``encode_batch``.

    Args:
        audio_path: Path to the audio file to embed.
        model: Optional already-loaded encoder exposing ``encode_batch``.
            When omitted, the pretrained ``speechbrain/spkrec-ecapa-voxceleb``
            encoder is loaded once and reused across calls.

    Returns:
        The squeezed embedding as a NumPy array.

    Raises:
        FileNotFoundError: If ``audio_path`` does not exist.
        RuntimeError: If loading, preprocessing, or encoding fails; the
            original exception is attached as ``__cause__``.
    """
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    try:
        # Reuse the cached encoder; re-creating it per file dominated runtime.
        encoder = model if model is not None else _get_speaker_encoder()

        # Load audio file
        waveform, sample_rate = torchaudio.load(audio_path)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16kHz if necessary
        target_rate = 16000
        if sample_rate != target_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
            waveform = resampler(waveform)

        # Pad audio if shorter than 1 second
        min_length = target_rate
        if waveform.shape[1] < min_length:
            padding = min_length - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, padding))

        # Inference only: no autograd graph is needed for the embedding.
        with torch.no_grad():
            embedding = encoder.encode_batch(waveform)

        # detach()/cpu() keep the NumPy conversion valid even if the encoder
        # runs on an accelerator or returns a tensor that tracks gradients.
        return embedding.squeeze().detach().cpu().numpy()

    except Exception as e:
        raise RuntimeError(f"Error processing audio: {str(e)}") from e

In [48]:
def test_embedding(embedding, file_name=""):
    print(f"\n=== TEST: {file_name} ===")
    print(f"Type: {type(embedding)}")
    print(f"Shape: {embedding.shape}")
    print(f"Min: {embedding.min():.4f}, Max: {embedding.max():.4f}, Mean: {embedding.mean():.4f}, Std: {embedding.std():.4f}")
    print("First 5:", embedding[:5], "... Last 5:", embedding[-5:])

    if isinstance(embedding, np.ndarray) and embedding.ndim == 1 and embedding.shape[0] > 0:
        print("✓ Valid embedding")
    else:
        print("✗ Invalid embedding")

In [49]:
def _create_voice_index(pc):
    """Create the INDEX_NAME pod index with EMBEDDING_DIM dimensions and log it."""
    pc.create_index(
        name=INDEX_NAME,
        dimension=EMBEDDING_DIM,
        metric="cosine",
        spec=pinecone.PodSpec(environment=PINECONE_ENV, pod_type="p1.x1")
    )
    print(f"Index '{INDEX_NAME}' created with dimension {EMBEDDING_DIM}.")


def init_pinecone():
    """Connect to Pinecone and return a handle to the INDEX_NAME index.

    Ensures an index named INDEX_NAME exists with dimension EMBEDDING_DIM:
      * missing index            -> it is created;
      * index with wrong dimension -> it is deleted and recreated;
      * index with right dimension -> it is reused as-is.

    WARNING: the wrong-dimension path calls ``delete_index`` on the existing
    index before recreating it, so make sure EMBEDDING_DIM is what you intend
    before running this against an index that holds data.

    Returns:
        The object returned by ``pc.Index(INDEX_NAME)``.
    """
    # Initialize Pinecone client
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Index '{INDEX_NAME}' not found. Creating index...")
        _create_voice_index(pc)
    else:
        index_description = pc.describe_index(INDEX_NAME)
        if index_description.dimension == EMBEDDING_DIM:
            print(f"Index '{INDEX_NAME}' already exists with correct dimension ({EMBEDDING_DIM}).")
        else:
            print(f"Index '{INDEX_NAME}' exists with dimension {index_description.dimension}, but {EMBEDDING_DIM} is required. Deleting and recreating...")
            pc.delete_index(INDEX_NAME)
            print(f"Deleted index '{INDEX_NAME}'.")
            _create_voice_index(pc)

    return pc.Index(INDEX_NAME)

In [50]:
def format_embeddings(vectors: List[np.ndarray], ids: List[str], metadata_list: List[dict] = None):
    """Pair each vector with its id (and metadata, if any) as upsert tuples.

    Args:
        vectors: Embeddings; each one is converted with ``.tolist()``.
        ids: Identifiers, looked up by the position of the vector.
        metadata_list: Optional per-vector metadata, looked up by position.

    Returns:
        A list with one tuple per vector: ``(id, values, metadata)`` when a
        truthy metadata entry exists for that position, otherwise
        ``(id, values)``.
    """
    records = []
    for position, vector in enumerate(vectors):
        record_id = ids[position]
        values = vector.tolist()

        entry_metadata = None
        if metadata_list:
            entry_metadata = metadata_list[position]

        # Falsy metadata (None or an empty dict) yields the two-element form.
        if entry_metadata:
            records.append((record_id, values, entry_metadata))
        else:
            records.append((record_id, values))
    return records

In [51]:
def batch_upsert(index, data: List[Union[tuple, list]], batch_size: int = 100):
    """Upsert ``data`` into ``index`` in consecutive chunks of ``batch_size``.

    Args:
        index: Object exposing ``upsert(vectors=...)``.
        data: Records to upsert, in the form accepted by ``index.upsert``.
        batch_size: Maximum number of records per ``upsert`` call; must be
            a positive integer.

    Raises:
        ValueError: If ``batch_size`` is not positive. Previously a negative
            value silently upserted nothing.
        RuntimeError: If any ``upsert`` call fails; the original exception is
            attached as ``__cause__``.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    try:
        for i in range(0, len(data), batch_size):
            batch = data[i:i + batch_size]
            index.upsert(vectors=batch)
            print(f"✅ Upserted batch {i // batch_size + 1}: {len(batch)} vectors")
    except Exception as e:
        raise RuntimeError(f"Error upserting vectors: {str(e)}") from e

In [52]:
def process_audio_directory(directory_path):
    """Embed every .wav/.mp3 file in a directory and upsert the results.

    The directory is listed and filtered first; the Pinecone index is only
    initialised when at least one candidate audio file exists, so an empty or
    mistyped directory causes no remote side effects. Files are processed in
    sorted name order. A file whose embedding fails is reported and skipped.

    Args:
        directory_path: Directory to scan (non-recursive).

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory_path`` is
            missing or is not a directory.
        RuntimeError: Propagated from ``batch_upsert`` when an upsert fails.
    """
    print("🔍 Scanning for audio files...")

    # Sort for a reproducible order and ignore directories that merely have
    # an audio-looking name.
    audio_files = sorted(
        file_name
        for file_name in os.listdir(directory_path)
        if file_name.lower().endswith((".wav", ".mp3"))
        and os.path.isfile(os.path.join(directory_path, file_name))
    )
    if not audio_files:
        print("\n⚠️ No valid audio files processed.")
        return

    # Initialize Pinecone index before the (slow) embedding step so that
    # credential or configuration problems surface early.
    index = init_pinecone()

    embeddings = []
    ids = []
    metadata_list = []

    for file_name in audio_files:
        file_path = os.path.join(directory_path, file_name)

        try:
            embedding = audio_to_embedding(file_path)
            test_embedding(embedding, file_name)

            embeddings.append(embedding)
            # NOTE(review): random ids mean re-running this function inserts
            # the same file again under a new id.
            ids.append(str(uuid.uuid4()))
            metadata_list.append({"file_name": file_name, "source": "audio-directory"})

        except Exception as e:
            # Deliberate best effort: one bad file must not abort the batch.
            print(f"❌ Skipping {file_name}: {e}")

    # Upsert embeddings if any were generated
    if embeddings:
        formatted = format_embeddings(embeddings, ids, metadata_list)
        batch_upsert(index, formatted, UPSERT_BATCH_SIZE)
        print(f"\n🎉 Finished processing {len(embeddings)} audio files.")
    else:
        print("\n⚠️ No valid audio files processed.")

In [53]:
process_audio_directory("/content/sample_data/audio_samples")

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Index 'voice-biometrics' already exists with correct dimension (192).
🔍 Scanning for audio files...


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["clas


=== TEST: Uma.mp4.wav ===
Type: <class 'numpy.ndarray'>
Shape: (192,)
Min: -52.4973, Max: 38.0868, Mean: -1.6393, Std: 17.8343
First 5: [ 30.635386  -19.247034    9.84472    -6.8594832  23.763046 ] ... Last 5: [ 4.378494 19.961443 18.883963 11.850898 -9.757335]
✓ Valid embedding
✅ Upserted batch 1: 1 vectors

🎉 Finished processing 1 audio files.
