In [12]:
import sounddevice as sd
from scipy.io.wavfile import write

number = 'test'  # Suffix used in the output file name
fs = 44100       # Sample rate
seconds = 5      # Duration of recording

# Capture `seconds` worth of frames at `fs` on two channels.
myrecording = sd.rec(int(seconds * fs), samplerate=fs, channels=2)
sd.wait()  # Wait until recording is finished

# Save as WAV file
output_path = f'authenticated_speaker_{number}.wav'
write(output_path, fs, myrecording)

In [1]:
from speechbrain.pretrained import SpeakerRecognition
import numpy as np
import librosa
import torch

# Load the pretrained ECAPA-TDNN speaker model; fetched files are kept in "tmpdir"
verifier = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="tmpdir",
)

def extract_embedding(audio_path):
    """Extract a speaker embedding from an audio file.

    The file is loaded with librosa at 16 kHz, wrapped into a batch of one and
    passed through the module-level ``verifier`` model.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        NumPy array holding the embedding with singleton dimensions removed
        (a 1-D vector for a single input file).

    Raises:
        ValueError: If the loaded audio contains no samples.
    """
    # Load the audio file, resampling to 16 kHz
    signal, _ = librosa.load(audio_path, sr=16000)
    if signal.size == 0:
        raise ValueError(f"No audio samples could be read from {audio_path!r}")

    # Convert to PyTorch tensor and add batch dimension -> shape [1, num_samples]
    batch = torch.tensor(signal).unsqueeze(0)

    # Inference only: skip autograd bookkeeping while encoding.
    with torch.no_grad():
        embedding = verifier.encode_batch(batch)

    # detach()/cpu() keep the NumPy conversion valid even if the model runs on
    # a GPU or the output still tracks gradients; .numpy() alone raises then.
    return embedding.squeeze().detach().cpu().numpy()

# Example usage: embed the test recording and report the vector shape
sample_path = "authenticated_speaker_test.wav"
embedding = extract_embedding(sample_path)
print("Extracted Embedding Shape:", embedding.shape)  # Should be a 1D vector


  from .autonotebook import tqdm as notebook_tqdm
INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
  from speechbrain.pretrained import SpeakerRecognition
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch 

Extracted Embedding Shape: (192,)


In [9]:
import faiss

# Initialize the FAISS index used for speaker lookup. Embeddings are
# L2-normalized before being added (add_to_index) or queried (verify_speaker),
# so the inner product computed by this index equals cosine similarity.
embedding_dim = 192  # Dimension of ECAPA-TDNN embeddings (matches the shape printed by extract_embedding)
index = faiss.IndexFlatIP(embedding_dim)  # IP = Inner Product (cosine similarity with normalized embeddings)

# Function to add embeddings to FAISS
def add_to_index(embedding, speaker_id, index, metadata):
    """
    Add one embedding to the FAISS index and record its speaker ID.

    The embedding is L2-normalized first so that inner-product search on the
    index yields cosine similarity.

    Args:
        embedding: A single embedding vector (any array-like; extra singleton
            dimensions are flattened away).
        speaker_id: Label stored in ``metadata`` at the same position the
            vector takes in the index.
        index: Object exposing ``add(array)``; receives a float32 array of
            shape (1, dim).
        metadata: List that ``speaker_id`` is appended to.

    Raises:
        ValueError: If the embedding is empty, all zeros, or contains
            NaN/inf, since it cannot be normalized.
    """
    # Force a (1, dim) float32 row regardless of the input's dtype/shape.
    vector = np.asarray(embedding, dtype=np.float32).reshape(1, -1)

    norm = np.linalg.norm(vector)
    if not np.isfinite(norm) or norm == 0:
        # Dividing by such a norm would silently store NaN/inf in the index.
        raise ValueError("Cannot normalize embedding: norm is zero or non-finite")

    # Validate before mutating anything so index and metadata stay aligned.
    index.add((vector / norm).astype(np.float32))  # Add to FAISS
    metadata.append(speaker_id)  # Save speaker ID for reference

# Speaker IDs, stored in the same order as the vectors in the index
metadata = []
number_of_samples = 3

# Enroll recordings authenticated_speaker_1.wav .. authenticated_speaker_<number_of_samples>.wav
for sample_no in range(1, number_of_samples + 1):
    wav_path = f"authenticated_speaker_{sample_no}.wav"
    add_to_index(
        extract_embedding(wav_path),
        speaker_id="authenticated_user",
        index=index,
        metadata=metadata,
    )

print(f"Index contains {index.ntotal} embeddings.")


Index contains 3 embeddings.


In [13]:
def verify_speaker(test_audio_path,number_of_samples, index, metadata, threshold=0.8):
    """
    Verify if a test speaker matches the authenticated speaker.

    The test audio is embedded, L2-normalized and compared with the nearest
    enrolled vectors in ``index``. The decision is based on the mean of those
    similarities.

    Args:
        test_audio_path: Path of the audio file to verify.
        number_of_samples: Maximum number of nearest enrolled vectors to
            average over. It is clamped to the number of vectors in the index.
        index: FAISS index holding the normalized enrolled embeddings.
        metadata: Speaker IDs, positionally aligned with the index entries.
        threshold: Mean similarity that must be exceeded for a match.

    Returns:
        A human-readable result string ("Match Found! ..." or
        "No Match Found. ...") including the mean similarity.

    Raises:
        ValueError: If ``number_of_samples`` is < 1, the index is empty, or
            the test embedding cannot be normalized.
    """
    if number_of_samples < 1:
        raise ValueError("number_of_samples must be at least 1")
    if index.ntotal < 1:
        raise ValueError("The index is empty; enroll at least one sample first")

    # Extract embedding for the test audio
    test_embedding = np.asarray(extract_embedding(test_audio_path), dtype=np.float32).reshape(1, -1)
    norm = np.linalg.norm(test_embedding)
    if not np.isfinite(norm) or norm == 0:
        raise ValueError("Cannot normalize test embedding: norm is zero or non-finite")
    query = (test_embedding / norm).astype(np.float32)  # Normalize

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1 and a sentinel score, which would corrupt the mean.
    k = min(int(number_of_samples), int(index.ntotal))
    distances, indices = index.search(query, k)

    # Drop any padded hits defensively before averaging.
    valid = indices[0] >= 0
    similarities = distances[0][valid]
    matched_ids = indices[0][valid]
    if similarities.size == 0:
        raise ValueError("The index returned no valid matches")
    mean_similarity = np.mean(similarities)  # Average similarity

    print("Cosine Similarities:", similarities)

    # Check if similarity meets the threshold
    if mean_similarity > threshold:
        speaker_id = metadata[matched_ids[0]]  # ID of the closest enrolled sample
        return f"Match Found! Speaker ID: {speaker_id}, Similarity: {mean_similarity:.2f}"
    else:
        return f"No Match Found. Similarity: {mean_similarity:.2f}"

# Test verification against the enrolled samples, using a 0.5 threshold
result = verify_speaker(
    "authenticated_speaker_test.wav",
    number_of_samples,
    index,
    metadata,
    threshold=0.5,
)
print(result)


Cosine Similarities: [0.6344024  0.55637985 0.36214453]
Match Found! Speaker ID: authenticated_user, Similarity: 0.52
