In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install pydub moviepy ffmpeg-python noisereduce speechbrain torchaudio torch pinecone sentencepiece --prefer-binary

Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting noisereduce
  Downloading noisereduce-3.0.3-py3-none-any.whl.metadata (14 kB)
Collecting speechbrain
  Downloading speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading 

In [2]:
import os
import uuid
import torch
import torchaudio
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import noisereduce as nr
from pydub import AudioSegment
from speechbrain.pretrained import EncoderClassifier
import pinecone
from typing import List, Union, Dict, Any
import zipfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean, cosine
import itertools
import warnings
warnings.filterwarnings('ignore')

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
  from speechbrain.pretrained import EncoderClassifier


In [3]:
zip_path = '/content/drive/MyDrive/Capstone Reports/recordings.zip'
extract_path = '/content/sample_data/audio_samples/recordings'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print("✅ Extracted to:", extract_path)

✅ Extracted to: /content/sample_data/audio_samples/recordings


In [4]:
input_folder = "/content/sample_data/audio_samples/recordings/recordings"
output_wav_folder = "/content/sample_data/audio_samples/audio_wav"
embedding_output_path = "/content/sample_data/audio_samples/embeddings.csv"

In [5]:
def convert_to_wav_universal(input_folder, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    supported_formats = (".mp3", ".wav", ".ogg", ".oga", ".flac", ".mp4", ".m4a")

    for file_name in os.listdir(input_folder):
        input_path = os.path.join(input_folder, file_name)
        if file_name.lower().endswith(supported_formats):
            try:
                audio = AudioSegment.from_file(input_path)
                output_name = os.path.splitext(file_name)[0] + ".wav"
                output_path = os.path.join(output_folder, output_name)
                audio.export(output_path, format="wav")
                print(f"Converted: {file_name}")
            except Exception as e:
                print(f"Error converting {file_name}: {e}")
        else:
            print(f"Skipping unsupported file: {file_name}")

In [6]:
def audio_to_embedding_enhanced(audio_path):
    try:
        audio = AudioSegment.from_file(audio_path)
        normalized_audio = audio.apply_gain(-audio.dBFS)
        temp_path = audio_path.replace(".wav", "_normalized.wav")
        normalized_audio.export(temp_path, format="wav")

        waveform, sample_rate = torchaudio.load(temp_path)
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        waveform_np = waveform.squeeze().cpu().numpy()
        denoised = nr.reduce_noise(y=waveform_np, sr=16000)
        waveform = torch.tensor(denoised).unsqueeze(0)

        vad = torchaudio.transforms.Vad(sample_rate=16000)
        waveform = vad(waveform)

        if waveform.numel() == 0:
            raise ValueError("No voiced segment found.")

        waveform = waveform / waveform.abs().max()
        if waveform.shape[1] < 16000:
            pad_amt = 16000 - waveform.shape[1]
            waveform = torch.nn.functional.pad(waveform, (0, pad_amt))

        waveform = waveform.unsqueeze(0).squeeze(1)
        embedding = model.encode_batch(waveform)

        os.remove(temp_path)
        return embedding.squeeze().numpy()

    except Exception as e:
        print(f"Error in audio_to_embedding_enhanced: {e}")
        return None

In [7]:
PINECONE_API_KEY = "pcsk_2yzKnb_DusX4M95CU1KTjQxkZFPdYWbtFghFc7kUD2cHzpUT4hWPLmMbPgEgT5NgoX3Fib"
PINECONE_ENV = "us-east-1"
INDEX_NAME = "voice-biometrics"
EMBEDDING_DIM = 192
UPSERT_BATCH_SIZE = 100

In [8]:
model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
    run_opts={"skip_vad": True}
)

INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


hyperparams.yaml: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/hyperparams.yaml' -> '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered parameter transfer hook for _load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load_if_possible
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embeddin

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/embedding_model.ckpt' -> '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/mean_var_norm_emb.ckpt' -> '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/classifier.ckpt' -> '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["classifier"] = /content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


label_encoder.txt: 0.00B [00:00, ?B/s]

DEBUG:speechbrain.utils.fetching:Fetch: Local file found, creating symlink '/root/.cache/huggingface/hub/models--speechbrain--spkrec-ecapa-voxceleb/snapshots/0f99f2d0ebe89ac095bcc5903c4dd8f72b367286/label_encoder.txt' -> '/content/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["label_encoder"] = /content/pretrained_models/spkrec-ecapa-voxceleb/label_encoder.ckpt
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): embedding_model -> /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from local path): mean_var_norm_emb -> /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
DEBUG:speechbrain.utils.parameter_transfer:Redirecting (loading from l

In [9]:
def init_pinecone():
    pc = pinecone.Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)
    existing_indexes = pc.list_indexes().names()
    if INDEX_NAME not in existing_indexes:
        pc.create_index(
            name=INDEX_NAME,
            dimension=EMBEDDING_DIM,
            metric="cosine",
            spec=pinecone.PodSpec(environment=PINECONE_ENV)
        )
    return pc.Index(INDEX_NAME)

In [10]:
def format_embeddings(vectors: List[np.ndarray], ids: List[str], metadata_list: List[dict] = None):
    return [
        (ids[i], vectors[i].tolist(), metadata_list[i] if metadata_list else None)
        for i in range(len(vectors))
    ]

# Keep track of the total number of upserted vectors across all calls
total_upserted_vectors = 0

def batch_upsert(index, data: List[Union[tuple, list]], batch_size: int = 10):
    global total_upserted_vectors
    for i in range(0, len(data), batch_size):
        index.upsert(vectors=data[i:i + batch_size])
        current_batch_size = len(data[i:i + batch_size])
        total_upserted_vectors += current_batch_size
        current_batch_number = (total_upserted_vectors + batch_size - 1) // batch_size # Calculate batch number based on total upserted
        print(f"✅ Upserted batch {current_batch_number} (Total vectors: {total_upserted_vectors})")

In [16]:
def process_audio_directory(input_folder, wav_folder, save_csv=True, upsert_to_pinecone=True, limit=None):
    convert_to_wav_universal(input_folder, wav_folder)

    embeddings = []
    ids = []
    metadata_list = []

    index = init_pinecone() if upsert_to_pinecone else None

    # Get list of all wav files
    wav_files = [f for f in os.listdir(wav_folder) if f.endswith(".wav")]

    # Apply limit if specified
    if limit is not None:
        wav_files = wav_files[:limit]

    # Process in batches
    batch_size = 100
    for i in range(0, len(wav_files), batch_size):
        batch_files = wav_files[i:i + batch_size]
        batch_embeddings = []
        batch_ids = []
        batch_metadata = []

        for fname in batch_files:
            path = os.path.join(wav_folder, fname)
            emb = audio_to_embedding_enhanced(path)
            if emb is not None:
                uid = str(uuid.uuid4())
                batch_embeddings.append(emb)
                batch_ids.append(uid)
                batch_metadata.append({"file_name": fname})
                print(f"✓ Processed: {fname}")

        embeddings.extend(batch_embeddings)
        ids.extend(batch_ids)
        metadata_list.extend(batch_metadata)

        if upsert_to_pinecone and batch_embeddings:
            formatted = format_embeddings(batch_embeddings, batch_ids, batch_metadata)
            batch_upsert(index, formatted, UPSERT_BATCH_SIZE)

    if save_csv and embeddings:
        df = pd.DataFrame(embeddings, index=ids)
        df.index.name = 'id'
        df.to_csv(embedding_output_path)
        print(f"📄 Embeddings saved to: {embedding_output_path}")

    return embeddings, ids, metadata_list

In [12]:
def similarity_search(audio_path: str, index, top_k: int = 5):
    query_embedding = audio_to_embedding_enhanced(audio_path)
    return index.query(vector=query_embedding.tolist(), top_k=top_k, include_metadata=True)

def test_similarity_search(audio_path: str, index, top_k: int = 5):
    print(f"\n🔍 Similarity Search for: {os.path.basename(audio_path)}")
    try:
        results = similarity_search(audio_path, index, top_k)
        for i, match in enumerate(results['matches'], 1):
            metadata = match.get('metadata', {})
            print(f"{i}. {metadata.get('file_name', 'Unknown')} (Score: {match['score']:.4f})")
    except Exception as e:
        print(f"❌ Error during search: {e}")

In [17]:
# Process and upsert a limited number of audio files
embeddings, ids, metadata_list = process_audio_directory(
    input_folder,
    output_wav_folder,
    save_csv=True,         # Set to True if you want to save embeddings to a CSV
    upsert_to_pinecone=True, # Set to True to upsert to Pinecone
    limit=100              # Set a number if you want to process only a limited number of files
)

Converted: bulgarian8.mp3
Converted: english417.mp3
Converted: dutch11.mp3
Converted: english405.mp3
Converted: english387.mp3
Converted: serbian15.wav
Converted: arabic9.wav
Converted: spanish83.wav
Converted: albanian2.mp3
Converted: french2.mp3
Converted: kikongo1.mp3
Converted: polish30.wav
Converted: hindi14.mp3
Converted: amharic14.wav
Converted: english173.mp3
Converted: dutch15.wav
Converted: russian40.mp3
Converted: english551.wav
Converted: english244.mp3
Converted: english306.mp3
Converted: dutch6.mp3
Converted: italian22.mp3
Converted: taiwanese6.mp3
Converted: spanish97.mp3
Converted: swedish17.mp3
Converted: miskito1.wav
Converted: japanese21.mp3
Converted: amharic2.wav
Converted: english187.wav
Converted: english557.mp3
Converted: wolof3.mp3
Converted: mandarin7.wav
Converted: amharic20.wav
Converted: amharic10.wav
Converted: norwegian4.wav
Converted: swedish20.mp3
Converted: arabic40.wav
Converted: bengali3.mp3
Converted: arabic36.wav
Converted: spanish103.wav
Converted

In [39]:
# Test similarity search
index = init_pinecone()
test_audio = "/content/sample_data/audio_samples/audio_wav/tamil2.wav"
test_similarity_search(test_audio, index, top_k=3)


🔍 Similarity Search for: tamil2.wav
1. tamil2.wav (Score: 1.0000)
2. tamil2.wav (Score: 1.0000)
3. tamil2.wav (Score: 1.0000)


In [33]:
def select_random_subset(embeddings, metadata_list, subset_size=100):
    if len(embeddings) <= subset_size:
        return embeddings, metadata_list

    indices = np.random.choice(len(embeddings), subset_size, replace=False)
    subset_embeddings = [embeddings[i] for i in indices]
    subset_metadata = [metadata_list[i] for i in indices]

    return subset_embeddings, subset_metadata

In [34]:
def cosine_similarity(a, b):
  # Ensure inputs are numpy arrays and handle potential division by zero
  a = np.asarray(a)
  b = np.asarray(b)
  dot_product = np.dot(a, b)
  norm_a = np.linalg.norm(a)
  norm_b = np.linalg.norm(b)
  if norm_a == 0 or norm_b == 0:
      return 0.0  # Or handle as appropriate, returning 0 for zero vectors
  return dot_product / (norm_a * norm_b)

In [35]:
def evaluate_embeddings(embeddings, labels):
    n_samples = len(embeddings)
    similarity_scores = []
    y_true = []

    # Calculate pairwise similarity and true labels
    for i in range(n_samples):
        for j in range(i + 1, n_samples):
            # Calculate cosine similarity using the defined function
            score = cosine_similarity(embeddings[i], embeddings[j])
            similarity_scores.append(score)

            # Determine true label (1 if same label, 0 if different)
            true_label = 1 if labels[i] == labels[j] else 0
            y_true.append(true_label)

    y_true = np.array(y_true)
    similarity_scores = np.array(similarity_scores)

    threshold = np.mean(similarity_scores)

    y_pred = (similarity_scores >= threshold).astype(int)

    return y_true, y_pred, similarity_scores

In [37]:
# Check if embeddings and metadata_list are available
if 'embeddings' in locals() and embeddings and 'metadata_list' in locals() and metadata_list:
    # Pick a random subset for evaluation (for example, 100 samples)
    subset_embeddings, subset_metadata = select_random_subset(
        embeddings, metadata_list, subset_size=100)

    # Derive labels from the metadata_list
    subset_labels = [metadata['file_name'].split('.')[0].split('_')[0] for metadata in subset_metadata]

    # Evaluate the embeddings
    y_true, y_pred, similarity_scores = evaluate_embeddings(subset_embeddings, subset_labels)


    # Calculate evaluation metrics
    accuracy = accuracy_score(y_true, y_pred)
    # Use zero_division=1 to avoid warnings if there are no positive predictions
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=1)


    # Calculate EER
    if len(np.unique(y_true)) > 1:
      fpr, tpr, thresholds = roc_curve(y_true, similarity_scores)
      # Find the threshold where (1 - tpr) is closest to fpr
      eer = fpr[np.nanargmin(np.absolute((1 - tpr) - fpr))]
      print(f"Approximate EER: {eer:.4f}")

      # Plot ROC curve
      plt.figure(figsize=(10, 5))
      plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc(fpr, tpr))
      plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
      plt.xlim([0.0, 1.0])
      plt.ylim([0.0, 1.05])
      plt.xlabel('False Positive Rate (FPR)')
      plt.ylabel('True Positive Rate (TPR)')
      plt.title('Receiver Operating Characteristic (ROC) Curve')
      plt.legend(loc="lower right")
      plt.show()
    else:
      print("Cannot calculate EER or plot ROC curve: Not enough unique classes in the subset for evaluation.")


    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

else:
    print("Embeddings or metadata_list not found. Please run the audio processing step first.")

Cannot calculate EER or plot ROC curve: Not enough unique classes in the subset for evaluation.
Accuracy: 0.5493
Precision: 0.0000, Recall: 1.0000, F1: 0.0000
