In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [12]:
! pip install pydub moviepy ffmpeg-python

from pydub import AudioSegment
from moviepy.editor import VideoFileClip
import os




In [13]:
import pandas as pd
import numpy as np

# Convert dict to DataFrame
df = pd.DataFrame.from_dict(embeddings, orient='index')
df.index.name = 'filename'

# Save to CSV
embedding_output_path = "/content/drive/MyDrive/voicebiometrics/output_file/embeddings.csv"
df.to_csv(embedding_output_path)

print(f"Embeddings saved to: {embedding_output_path}")


Embeddings saved to: /content/drive/MyDrive/voicebiometrics/output_file/embeddings.csv


In [14]:
import os

input_folder = "/content/drive/MyDrive/voicebiometrics/audio_file"
files = os.listdir(input_folder)

print("Files in input folder:")
for f in files:
    print(f)


Files in input folder:
WhatsApp Ptt 2025-06-02 at 1.24.21 PM.ogg
WhatsApp Audio 2025-06-02 at 1.28.13 PM.mp4
WhatsApp Audio 2025-06-02 at 11.29.34 AM.mp4


In [15]:
from pydub import AudioSegment
import os

def convert_to_wav_universal(input_folder, output_folder):
    supported_formats = (".mp3", ".wav", ".ogg", ".oga", ".flac", ".mp4", ".m4a")
    os.makedirs(output_folder, exist_ok=True)

    for file_name in os.listdir(input_folder):
        input_path = os.path.join(input_folder, file_name)
        if not file_name.lower().endswith(supported_formats):
            print(f" Skipping unsupported file: {file_name}")
            continue

        try:
            # Try to load the file using pydub
            audio = AudioSegment.from_file(input_path)
            output_name = os.path.splitext(file_name)[0] + ".wav"
            output_path = os.path.join(output_folder, output_name)

            audio.export(output_path, format="wav")
            print(f"Converted: {file_name} → {output_name}")
        except Exception as e:
            print(f"Error converting {file_name}: {e}")

    print("✅ All supported media files have been processed.")


In [16]:
input_folder = "/content/drive/MyDrive/voicebiometrics/audio_file"
output_wav_folder = "/content/drive/MyDrive/voicebiometrics/output_file/audio_wav"

convert_to_wav_universal(input_folder, output_wav_folder)


Converted: WhatsApp Ptt 2025-06-02 at 1.24.21 PM.ogg → WhatsApp Ptt 2025-06-02 at 1.24.21 PM.wav
Converted: WhatsApp Audio 2025-06-02 at 1.28.13 PM.mp4 → WhatsApp Audio 2025-06-02 at 1.28.13 PM.wav
Converted: WhatsApp Audio 2025-06-02 at 11.29.34 AM.mp4 → WhatsApp Audio 2025-06-02 at 11.29.34 AM.wav
✅ All supported media files have been processed.


In [32]:
import torch
import torchaudio
import torchaudio.transforms as T
import noisereduce as nr
from speechbrain.pretrained import EncoderClassifier
import os

def audio_to_embedding_enhanced(audio_path):
    if not os.path.exists(audio_path):
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    try:
        # Load pretrained ECAPA-TDNN classifier
        # It's more efficient to load the model once outside the function if processing many files
        # but for a single file processing, loading here is fine.
        model = EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb",
            savedir="pretrained_models/spkrec-ecapa-voxceleb"
        )

        waveform, sample_rate = torchaudio.load(audio_path)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample to 16 kHz if needed
        if sample_rate != 16000:
            resampler = T.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
            sample_rate = 16000 # Update sample rate after resampling

        # Noise reduction with numpy
        # Ensure tensor is on CPU for numpy conversion
        waveform_np = waveform.squeeze().cpu().numpy()
        # Check if denoised has enough samples, nr.reduce_noise might return empty or very short array
        if waveform_np.size > 0:
            denoised = nr.reduce_noise(y=waveform_np, sr=sample_rate)
            waveform = torch.tensor(denoised).unsqueeze(0)
        else:
            # Handle cases where noise reduction results in empty data
            raise RuntimeError("Noise reduction resulted in empty audio.")


        # Silence removal via VAD
        vad = T.Vad(sample_rate=sample_rate)
        waveform = vad(waveform)

        if waveform.numel() == 0:
            raise RuntimeError("No voiced audio after VAD")

        # Normalize waveform amplitude
        # Handle case where waveform might be all zeros after processing
        if waveform.abs().max() > 1e-6: # Use a small threshold instead of checking for exactly zero
             waveform = waveform / waveform.abs().max()
        else:
            # Handle cases where audio is essentially silent
            raise RuntimeError("Audio is too quiet after processing.")


        # Pad waveform to minimum 1 second (16000 samples) with constant padding
        min_len = 16000
        if waveform.shape[1] < min_len:
            padding = min_len - waveform.shape[1]
            # pad at end, dim=1 because shape = [channels, time]
            waveform = torch.nn.functional.pad(waveform, (0, padding), mode='constant', value=0)

        # Add batch dimension -> shape: [batch, channels, time]
        # Then squeeze the channel dimension -> shape: [batch, time]
        waveform = waveform.unsqueeze(0).squeeze(1) # Changed line


        # Encode batch (NO lengths argument)
        # Pass the length of the waveform explicitly if needed, but ECAPA-TDNN
        # should handle batch inputs without lengths if all are padded to the same length.
        # Let's stick to the original call without lengths as the padding should handle it.
        embedding = model.encode_batch(waveform)


        return embedding.squeeze().cpu().numpy()

    except Exception as e:
        # Re-raise the exception with more context, including the filename
        raise RuntimeError(f"Error processing audio file {os.path.basename(audio_path)}: {str(e)}")

In [33]:
# List WAV files
import os

wav_dir = "/content/drive/MyDrive/voicebiometrics/output_file/audio_wav"
wav_files = [f for f in os.listdir(wav_dir) if f.endswith(".wav")]

# Pick one file to test
test_wav = os.path.join(wav_dir, wav_files[0])  # for example, first file

# Now call your function with that full path
embedding = audio_to_embedding_enhanced(test_wav)
print(embedding.shape)


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretraine

(192,)


In [34]:
import os

def process_all_wav_files(wav_folder):
    embeddings = {}
    wav_files = os.listdir(wav_folder)  # Get all files in the folder

    for fname in wav_files:
        full_path = os.path.join(wav_folder, fname)
        try:
            emb = audio_to_embedding_enhanced(full_path)
            embeddings[fname] = emb
            print(f"✅ Processed: {fname}")
        except Exception as e:
            print(f"❌ Failed: {fname} → {e}")

    return embeddings


In [35]:

# Convert to WAV
convert_to_wav_universal(input_folder, output_wav_folder)

#  Extract embeddings for all WAV files
embeddings = process_all_wav_files(output_wav_folder)


Converted: WhatsApp Ptt 2025-06-02 at 1.24.21 PM.ogg → WhatsApp Ptt 2025-06-02 at 1.24.21 PM.wav
Converted: WhatsApp Audio 2025-06-02 at 1.28.13 PM.mp4 → WhatsApp Audio 2025-06-02 at 1.28.13 PM.wav


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Converted: WhatsApp Audio 2025-06-02 at 11.29.34 AM.mp4 → WhatsApp Audio 2025-06-02 at 11.29.34 AM.wav
✅ All supported media files have been processed.


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["clas

✅ Processed: WhatsApp Ptt 2025-06-02 at 1.24.21 PM.wav


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["clas

❌ Failed: WhatsApp Audio 2025-06-02 at 1.28.13 PM.wav → Error processing audio file WhatsApp Audio 2025-06-02 at 1.28.13 PM.wav: No voiced audio after VAD


DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in pretrained_models/spkrec-ecapa-voxceleb.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/pretrained_models/spkrec-ecapa-voxceleb/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/pretrained_models/spkrec-ecapa-voxceleb/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/pretrained_models/spkrec-ecapa-voxceleb/classifier.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["clas

❌ Failed: WhatsApp Audio 2025-06-02 at 11.29.34 AM.wav → Error processing audio file WhatsApp Audio 2025-06-02 at 11.29.34 AM.wav: No voiced audio after VAD
