<a href="https://colab.research.google.com/github/Rahul6700/TalkToMe/blob/main/testVR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install resemblyzer numpy scipy librosa soundfile torch audioread

Collecting resemblyzer
  Downloading Resemblyzer-0.1.4-py3-none-any.whl.metadata (5.8 kB)
Collecting webrtcvad>=2.0.10 (from resemblyzer)
  Downloading webrtcvad-2.0.10.tar.gz (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting typing (from resemblyzer)
  Downloading typing-3.7.4.3.tar.gz (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Down

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
from resemblyzer import preprocess_wav, VoiceEncoder
from scipy.spatial.distance import cosine
import zipfile

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file, resample it, gate out low-level noise and RMS-normalize it.

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate

    Returns:
        numpy.ndarray or None: Preprocessed audio signal, or None when the
        file cannot be processed or the signal is empty/silent after noise
        reduction (the reason is printed in both cases)
    """
    try:
        # sr=None keeps the file's native rate so we only resample when needed
        audio, orig_sr = librosa.load(file_path, sr=None)

        if orig_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

        def noise_reduce(y):
            stft = librosa.stft(y)
            magnitude = np.abs(stft)

            # Median magnitude of each STFT row over time acts as that row's noise floor
            noise_thresh = np.median(magnitude, axis=1)

            # Keep only the cells strictly above their row's noise floor
            mask = magnitude > noise_thresh[:, np.newaxis]

            # Back to the time domain
            return librosa.istft(stft * mask)

        audio = noise_reduce(audio)

        # Guard the normalization: a zero RMS (silent or empty signal) would
        # otherwise turn the whole signal into NaN/inf without raising
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            print(f"Error processing {file_path}: audio is empty or silent")
            return None

        return audio / rms
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Lazily created VoiceEncoder shared by every call to extract_voice_embedding
_VOICE_ENCODER = None


def extract_voice_embedding(audio):
    """
    Extract voice embedding from preprocessed audio

    The VoiceEncoder is created on first use and then reused, so the model
    is loaded once instead of once per audio file.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal

    Returns:
        numpy.ndarray: Voice embedding vector, or None if the encoder could
        not be created or the embedding failed (the error is printed)
    """
    global _VOICE_ENCODER
    try:
        if _VOICE_ENCODER is None:
            _VOICE_ENCODER = VoiceEncoder()

        return _VOICE_ENCODER.embed_utterance(audio)
    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Rank reference speakers by cosine similarity to the input embedding

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Maps speaker name to reference embedding
        top_n (int): Number of top matches to return

    Returns:
        list: Up to top_n (name, similarity) tuples, most similar first
    """
    # scipy's cosine() is a distance; 1 - distance is the similarity,
    # so a HIGHER score means a closer match
    scored = [
        (speaker, 1 - cosine(input_embedding, candidate))
        for speaker, candidate in reference_embeddings.items()
    ]

    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Match an input voice file against every audio file in a reference folder

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Top matches as (speaker_name, similarity_score) tuples, or a
        single ("Error", message) tuple when the input file could not be
        preprocessed or embedded
    """
    input_audio = load_and_preprocess_audio(input_file)
    if input_audio is None:
        return [("Error", "Could not process input voice file")]

    input_embedding = extract_voice_embedding(input_audio)
    if input_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
    reference_embeddings = {}

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of tied scores and the winner of same-stem name collisions reproducible
    for filename in sorted(os.listdir(reference_folder)):
        if not filename.lower().endswith(audio_extensions):
            continue

        ref_audio = load_and_preprocess_audio(os.path.join(reference_folder, filename))
        if ref_audio is None:
            continue

        embedding = extract_voice_embedding(ref_audio)
        if embedding is None:
            continue

        # Filename without extension is used as the speaker name
        speaker_name = os.path.splitext(filename)[0]
        reference_embeddings[speaker_name] = embedding

    return compare_voices(input_embedding, reference_embeddings)

# Unzip the folder
def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Path to zip file
        extract_to (str): Destination folder
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

# Example usage
# First, unzip the folder
unzip_folder('present_maam.zip')

# Then run speaker recognition
input_voice_file = 'rahultest5.m4a'  # Replace with your input voice file
reference_voices_folder = 'present_maam'  # Folder extracted from zip

# Perform speaker recognition
results = process_speaker_recognition(input_voice_file, reference_voices_folder)

# Print results
print("Top 5 Matches:")
for name, score in results:
    if isinstance(score, str):
        # Failure entries carry a message instead of a number; formatting a
        # str with ':.4f' would raise ValueError
        print(f"{name}: {score}")
    else:
        print(f"Speaker: {name}, Similarity Score: {score:.4f}")

  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.03 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.
Loaded the voice encoder model on cpu in 0.02 seconds.
Loaded the voice encoder model on cpu in 0.01 seconds.
Top 5 Matches:
Speaker: RS_m_4, Similarity Score: 0.8081
Speaker: KP_M_3, Similarity Score: 0.7765
Speaker: RS_m_1, Similarity Score: 0.7408
Speaker: RM_M_7, Similarity Score: 0.7307
Speaker: RS_m_2, Similarity Score: 0.7194


Using SpeechBrain (takes a good amount of time to run, though)

In [None]:
!pip install speechbrain torchaudio librosa numpy scipy

Collecting speechbrain
  Downloading speechbrain-1.0.2-py3-none-any.whl.metadata (23 kB)
Collecting hyperpyyaml (from speechbrain)
  Downloading HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9->speechbrain)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9->speechbrain)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.9->speechbrain)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import torch
import torchaudio
from scipy.spatial.distance import cosine
import zipfile

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file, resample it, gate out low-level noise and RMS-normalize it.

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate

    Returns:
        numpy.ndarray or None: Preprocessed audio signal, or None when the
        file cannot be processed or the signal is empty/silent after noise
        reduction (the reason is printed in both cases)
    """
    try:
        # sr=None keeps the file's native rate so we only resample when needed
        audio, orig_sr = librosa.load(file_path, sr=None)

        if orig_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

        def noise_reduce(y):
            stft = librosa.stft(y)
            magnitude = np.abs(stft)

            # Median magnitude of each STFT row over time acts as that row's noise floor
            noise_thresh = np.median(magnitude, axis=1)

            # Keep only the cells strictly above their row's noise floor
            mask = magnitude > noise_thresh[:, np.newaxis]

            # Back to the time domain
            return librosa.istft(stft * mask)

        audio = noise_reduce(audio)

        # Guard the normalization: a zero RMS (silent or empty signal) would
        # otherwise turn the whole signal into NaN/inf without raising
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            print(f"Error processing {file_path}: audio is empty or silent")
            return None

        return audio / rms
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

# Lazily loaded SpeechBrain ECAPA model shared by every call to extract_voice_embedding
_SPEAKER_MODEL = None


def extract_voice_embedding(audio):
    """
    Extract voice embedding from preprocessed audio with SpeechBrain's ECAPA model

    The model is loaded on first use and then reused; reloading it for every
    file is what made this pipeline slow. The audio is truncated or
    zero-padded to exactly 3 seconds at 16 kHz before encoding.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal

    Returns:
        numpy.ndarray: Voice embedding vector, or None if the model could not
        be loaded or encoding failed (the error is printed)
    """
    global _SPEAKER_MODEL
    try:
        if _SPEAKER_MODEL is None:
            # SpeechBrain 1.0 moved the pretrained interfaces to
            # speechbrain.inference; fall back for older installations
            try:
                from speechbrain.inference.speaker import SpeakerRecognition
            except ImportError:
                from speechbrain.pretrained import SpeakerRecognition

            _SPEAKER_MODEL = SpeakerRecognition.from_hparams(
                source="speechbrain/spkrec-ecapa-voxceleb",
                savedir="pretrained_models/spkrec-ecapa-voxceleb"
            )

        # Shape (1, num_samples): a batch holding a single float32 waveform
        audio_tensor = torch.tensor(audio).float().unsqueeze(0)

        # Force the waveform to exactly 3 seconds at 16kHz
        target_length = 3 * 16000
        if audio_tensor.shape[1] > target_length:
            audio_tensor = audio_tensor[:, :target_length]
        elif audio_tensor.shape[1] < target_length:
            padding = torch.zeros(1, target_length - audio_tensor.shape[1])
            audio_tensor = torch.cat([audio_tensor, padding], dim=1)

        # Inference only: no autograd graph is needed, and .numpy() requires
        # a tensor that does not track gradients
        with torch.no_grad():
            embedding = _SPEAKER_MODEL.encode_batch(audio_tensor)

        return embedding.squeeze().detach().cpu().numpy()

    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Score each reference embedding against the input and return the best matches

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Maps speaker name to reference embedding
        top_n (int): Number of top matches to return

    Returns:
        list: Up to top_n (name, similarity) tuples, highest similarity first
    """
    def similarity_to_input(candidate):
        # cosine() returns a distance; subtracting from 1 gives the
        # similarity, where larger values mean a closer match
        return 1 - cosine(input_embedding, candidate)

    scored = [
        (speaker, similarity_to_input(candidate))
        for speaker, candidate in reference_embeddings.items()
    ]

    best_first = sorted(scored, key=lambda entry: entry[1], reverse=True)
    return best_first[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Match an input voice file against every audio file in a reference folder

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Top matches as (speaker_name, similarity_score) tuples, or a
        single ("Error", message) tuple when the input file could not be
        preprocessed or embedded
    """
    input_audio = load_and_preprocess_audio(input_file)
    if input_audio is None:
        return [("Error", "Could not process input voice file")]

    input_embedding = extract_voice_embedding(input_audio)
    if input_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
    reference_embeddings = {}

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of tied scores and the winner of same-stem name collisions reproducible
    for filename in sorted(os.listdir(reference_folder)):
        if not filename.lower().endswith(audio_extensions):
            continue

        ref_audio = load_and_preprocess_audio(os.path.join(reference_folder, filename))
        if ref_audio is None:
            continue

        embedding = extract_voice_embedding(ref_audio)
        if embedding is None:
            continue

        # Filename without extension is used as the speaker name
        speaker_name = os.path.splitext(filename)[0]
        reference_embeddings[speaker_name] = embedding

    return compare_voices(input_embedding, reference_embeddings)

# Unzip the folder
def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Path to zip file
        extract_to (str): Destination folder
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

# Example usage
# First, unzip the folder
unzip_folder('present_maam.zip')

# Then run speaker recognition
input_voice_file = 'rahultest1.m4a'  # Replace with your input voice file
reference_voices_folder = 'present_maam'  # Folder extracted from zip

# Perform speaker recognition
results = process_speaker_recognition(input_voice_file, reference_voices_folder)

# Print results
print("Top 5 Matches:")
for name, score in results:
    if isinstance(score, str):
        # Failure entries carry a message instead of a number; formatting a
        # str with ':.4f' would raise ValueError
        print(f"{name}: {score}")
    else:
        print(f"Speaker: {name}, Similarity Score: {score:.4f}")

  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, du

Top 5 Matches:
Speaker: RS_m_5, Similarity Score: 0.6402
Speaker: RM_M_3, Similarity Score: 0.5164
Speaker: RS_m_4, Similarity Score: 0.3876
Speaker: KP_M_6, Similarity Score: 0.3652
Speaker: RM_M_9, Similarity Score: 0.3405


Using MFCCs manually (with no model)


In [None]:
!pip install numpy librosa soundfile torch torchaudio scipy



In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import torch
import torchaudio
from scipy.spatial.distance import cosine
import zipfile

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file, resample it, gate out low-level noise and RMS-normalize it.

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate

    Returns:
        numpy.ndarray or None: Preprocessed audio signal, or None when the
        file cannot be processed or the signal is empty/silent after noise
        reduction (the reason is printed in both cases)
    """
    try:
        # sr=None keeps the file's native rate so we only resample when needed
        audio, orig_sr = librosa.load(file_path, sr=None)

        if orig_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

        def noise_reduce(y):
            stft = librosa.stft(y)
            magnitude = np.abs(stft)

            # Median magnitude of each STFT row over time acts as that row's noise floor
            noise_thresh = np.median(magnitude, axis=1)

            # Keep only the cells strictly above their row's noise floor
            mask = magnitude > noise_thresh[:, np.newaxis]

            # Back to the time domain
            return librosa.istft(stft * mask)

        audio = noise_reduce(audio)

        # Guard the normalization: a zero RMS (silent or empty signal) would
        # otherwise turn the whole signal into NaN/inf without raising
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            print(f"Error processing {file_path}: audio is empty or silent")
            return None

        return audio / rms
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def extract_voice_embedding(audio):
    """
    Summarize a signal as the mean and standard deviation of its MFCCs

    The signal is first forced to exactly 3 seconds at 16 kHz (48000
    samples): longer input is truncated, shorter input is zero-padded at
    the end.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal

    Returns:
        numpy.ndarray: The 13 per-coefficient MFCC means followed by the 13
        standard deviations, or None if feature extraction fails
    """
    try:
        n_samples = 3 * 16000  # 3 seconds at 16kHz
        clip = audio[:n_samples]
        shortfall = n_samples - len(clip)
        if shortfall > 0:
            clip = np.pad(clip, (0, shortfall), mode='constant')

        coeffs = librosa.feature.mfcc(y=clip, sr=16000, n_mfcc=13)

        # Collapse the second axis so each coefficient contributes one mean and one std
        stats = [np.mean(coeffs, axis=1), np.std(coeffs, axis=1)]
        return np.concatenate(stats)

    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Rank reference speakers by cosine similarity to the input embedding

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Maps speaker name to reference embedding
        top_n (int): Number of top matches to return

    Returns:
        list: Up to top_n (name, similarity) tuples, most similar first
    """
    names = list(reference_embeddings)

    # 1 - cosine distance = cosine similarity; higher means more similar
    scores = [1 - cosine(input_embedding, reference_embeddings[n]) for n in names]

    pairs = list(zip(names, scores))
    pairs.sort(key=lambda item: item[1], reverse=True)
    return pairs[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Match an input voice file against every audio file in a reference folder

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Top matches as (speaker_name, similarity_score) tuples, or a
        single ("Error", message) tuple when the input file could not be
        preprocessed or embedded
    """
    input_audio = load_and_preprocess_audio(input_file)
    if input_audio is None:
        return [("Error", "Could not process input voice file")]

    input_embedding = extract_voice_embedding(input_audio)
    if input_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
    reference_embeddings = {}

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of tied scores and the winner of same-stem name collisions reproducible
    for filename in sorted(os.listdir(reference_folder)):
        if not filename.lower().endswith(audio_extensions):
            continue

        ref_audio = load_and_preprocess_audio(os.path.join(reference_folder, filename))
        if ref_audio is None:
            continue

        embedding = extract_voice_embedding(ref_audio)
        if embedding is None:
            continue

        # Filename without extension is used as the speaker name
        speaker_name = os.path.splitext(filename)[0]
        reference_embeddings[speaker_name] = embedding

    return compare_voices(input_embedding, reference_embeddings)

# Unzip the folder
def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Path to zip file
        extract_to (str): Destination folder
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Always release the file handle, even if extraction fails
        archive.close()

# Example usage
# First, unzip the folder
unzip_folder('present_maam.zip')

# Then run speaker recognition
input_voice_file = 'rahultest5.m4a'  # Replace with your input voice file
reference_voices_folder = 'present_maam'  # Folder extracted from zip

# Perform speaker recognition
results = process_speaker_recognition(input_voice_file, reference_voices_folder)

# Print results
print("Top 5 Matches:")
for name, score in results:
    if isinstance(score, str):
        # Failure entries carry a message instead of a number; formatting a
        # str with ':.4f' would raise ValueError
        print(f"{name}: {score}")
    else:
        print(f"Speaker: {name}, Similarity Score: {score:.4f}")

  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, du

KeyboardInterrupt: 

Now trying MFCCs + LPCCs

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import torch
import torchaudio
from scipy.spatial.distance import cosine
import zipfile

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file, resample it, gate out low-level noise and RMS-normalize it.

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate

    Returns:
        numpy.ndarray or None: Preprocessed audio signal, or None when the
        file cannot be processed or the signal is empty/silent after noise
        reduction (the reason is printed in both cases)
    """
    try:
        # sr=None keeps the file's native rate so we only resample when needed
        audio, orig_sr = librosa.load(file_path, sr=None)

        if orig_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

        def noise_reduce(y):
            stft = librosa.stft(y)
            magnitude = np.abs(stft)

            # Median magnitude of each STFT row over time acts as that row's noise floor
            noise_thresh = np.median(magnitude, axis=1)

            # Keep only the cells strictly above their row's noise floor
            mask = magnitude > noise_thresh[:, np.newaxis]

            # Back to the time domain
            return librosa.istft(stft * mask)

        audio = noise_reduce(audio)

        # Guard the normalization: a zero RMS (silent or empty signal) would
        # otherwise turn the whole signal into NaN/inf without raising
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            print(f"Error processing {file_path}: audio is empty or silent")
            return None

        return audio / rms
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def extract_lpcc(audio, sr=16000, n_lpcc=13):
    """
    Extract Linear Prediction Cepstral Coefficients (LPCC) features

    The signal is pre-emphasized, LPC coefficients a[0..n_lpcc] are computed
    with librosa.lpc, and cepstral coefficients are derived with the
    recursion c[n] = -a[n] - (1/n) * sum_{k=1}^{n-1} k * c[k] * a[n-k].

    On any failure a deterministic all-zero vector is returned; returning
    random values would inject noise into the speaker features and make
    results irreproducible.

    Args:
        audio (numpy.ndarray): Audio signal
        sr (int): Sampling rate (not used by the computation; kept for
            interface compatibility)
        n_lpcc (int): Number of LPCC coefficients to extract

    Returns:
        numpy.ndarray: LPCC features of shape (n_lpcc,); all zeros when the
        coefficients could not be computed
    """
    try:
        # Pre-emphasis: y[t] = x[t] - 0.97 * x[t-1], first sample kept as is
        preemphasis = 0.97
        audio = np.append(audio[0], audio[1:] - preemphasis * audio[:-1])

        # Step 1: Compute LPC coefficients
        lpc_coeffs = librosa.lpc(audio, order=n_lpcc)

        # Unstable LPC solutions show up as NaN/Inf coefficients
        if not np.all(np.isfinite(lpc_coeffs)):
            print("Warning: LPC coefficients contain NaN or Inf values")
            return np.zeros(n_lpcc)

        # Step 2: Convert LPC to LPCC
        lpcc = np.zeros(n_lpcc)

        # First LPCC coefficient; evaluates to 0 when the leading LPC
        # coefficient is 1 (librosa documents a[0] == 1 — TODO confirm for
        # the installed version)
        lpcc[0] = -np.log(lpc_coeffs[0]) if lpc_coeffs[0] > 0 else 0

        for n in range(1, n_lpcc):
            k = np.arange(1, n)
            # The weight on each term is k/n (index of the cepstral
            # coefficient), not (n-k)/n
            lpcc[n] = -lpc_coeffs[n] - np.sum(k * lpcc[k] * lpc_coeffs[n - k]) / n

        return lpcc

    except Exception as e:
        print(f"Error extracting LPCC: {e}")
        return np.zeros(n_lpcc)

def extract_voice_embedding(audio):
    """
    Extract voice embedding from preprocessed audio using MFCC and LPCC features

    The signal is truncated or zero-padded to 3 seconds at 16 kHz. The
    embedding concatenates the per-coefficient mean and standard deviation
    of 13 MFCCs with the mean and standard deviation of 13 LPCCs computed
    on 25 ms frames taken every 10 ms.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal

    Returns:
        numpy.ndarray: Voice embedding vector (MFCC mean, MFCC std, LPCC
        mean, LPCC std), or None if feature extraction fails
    """
    try:
        # Ensure audio is the right length
        target_length = 3 * 16000  # 3 seconds at 16kHz
        if len(audio) > target_length:
            audio = audio[:target_length]
        elif len(audio) < target_length:
            audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')

        # Extract MFCC features
        mfccs = librosa.feature.mfcc(y=audio, sr=16000, n_mfcc=13)

        # Compute the mean and standard deviation of MFCCs
        mfcc_mean = np.mean(mfccs, axis=1)
        mfcc_std = np.std(mfccs, axis=1)

        # Extract LPCC features for several frames and compute statistics
        frame_length = int(0.025 * 16000)  # 25ms
        hop_length = int(0.010 * 16000)    # 10ms

        lpcc_features = np.array([
            extract_lpcc(audio[start:start + frame_length], sr=16000, n_lpcc=13)
            for start in range(0, len(audio) - frame_length, hop_length)
        ])

        # Compute mean and standard deviation of LPCC features across frames
        lpcc_mean = np.mean(lpcc_features, axis=0)
        lpcc_std = np.std(lpcc_features, axis=0)

        # Combine MFCC and LPCC features into a single embedding vector
        embedding = np.concatenate([mfcc_mean, mfcc_std, lpcc_mean, lpcc_std])

        # A private RandomState(42) produces the same jitter that
        # np.random.seed(42) + np.random.normal did, without resetting the
        # process-wide NumPy RNG on every call.
        # NOTE(review): the jitter is identical for every embedding, so it
        # cannot separate two identical inputs — confirm it is still wanted.
        rng = np.random.RandomState(42)
        embedding += rng.normal(0, 1e-5, embedding.shape)

        return embedding

    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Compare input voice embedding with reference embeddings

    Each reference is scored as 0.7 * cosine similarity plus
    0.3 * (1 / (1 + Euclidean distance)), clamped to [-1, 1].

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Dictionary of reference voice embeddings
        top_n (int): Number of top matches to return

    Returns:
        list: Top N matches as (name, similarity_score) tuples, best first.
        Returns [("Error", 0.0)] when the input embedding is unusable.
    """
    def _is_usable(vec):
        # Reject vectors with ANY NaN/inf entry, not just all-NaN ones: a
        # NaN score passed through max(-1.0, min(1.0, score)) comes out as
        # 1.0, which would rank a corrupt embedding as a perfect match.
        vec = np.asarray(vec, dtype=float)
        return bool(np.all(np.isfinite(vec)) and np.any(vec != 0))

    if not _is_usable(input_embedding):
        print("Warning: Input embedding contains NaN/inf or all-zero values")
        return [("Error", 0.0)]

    similarities = []
    for name, ref_embedding in reference_embeddings.items():
        # Skip invalid embeddings
        if not _is_usable(ref_embedding):
            print(f"Warning: Reference embedding for {name} contains NaN/inf or all-zero values")
            continue

        # 1. Cosine similarity (bounded between -1 and 1)
        cosine_sim = 1 - cosine(input_embedding, ref_embedding)

        # 2. Euclidean distance mapped into (0, 1]; identical vectors give 1
        euclidean_dist = np.linalg.norm(input_embedding - ref_embedding)
        euclidean_sim = 1 / (1 + euclidean_dist)

        # Combined score (weighted average), kept within [-1, 1]
        similarity_score = 0.7 * cosine_sim + 0.3 * euclidean_sim
        similarity_score = max(-1.0, min(1.0, similarity_score))

        similarities.append((name, similarity_score))

    # Sort similarities in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Perform comprehensive speaker recognition

    Builds an embedding for the input recording and for every supported audio
    file directly inside ``reference_folder`` (.wav, .mp3, .flac, .ogg, .m4a),
    then ranks the references against the input with ``compare_voices``.

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Top matches with similarity scores. When the input file cannot
        be loaded or embedded, a single ("Error", message) tuple is returned
        instead, so the second element may be a string.
    """
    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

    # Preprocess input voice
    input_audio = load_and_preprocess_audio(input_file)
    if input_audio is None:
        return [("Error", "Could not process input voice file")]

    # Extract embedding for input voice
    input_embedding = extract_voice_embedding(input_audio)
    if input_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    # Print some stats about the input embedding
    print(f"Input embedding shape: {input_embedding.shape}")
    print(f"Input embedding mean: {np.mean(input_embedding)}")
    print(f"Input embedding std: {np.std(input_embedding)}")

    # Extract embeddings for all reference voices; files that fail to load or
    # embed are skipped (the helpers report their own errors).
    reference_embeddings = {}
    for filename in os.listdir(reference_folder):
        if not filename.lower().endswith(audio_extensions):
            continue

        ref_audio = load_and_preprocess_audio(os.path.join(reference_folder, filename))
        if ref_audio is None:
            continue

        embedding = extract_voice_embedding(ref_audio)
        if embedding is None:
            continue

        # Use filename (without extension) as the speaker name
        speaker_name = os.path.splitext(filename)[0]
        reference_embeddings[speaker_name] = embedding

    # Before running comparisons, print some stats
    if reference_embeddings:
        print(f"Found {len(reference_embeddings)} reference embeddings")

        # Identical embeddings across different files point at a broken
        # feature extractor. The check is only meaningful with at least two
        # references; a single one is trivially "identical" to itself.
        if len(reference_embeddings) > 1:
            first_embedding = next(iter(reference_embeddings.values()))
            all_identical = all(np.array_equal(embedding, first_embedding)
                                for embedding in reference_embeddings.values())
            if all_identical:
                print("WARNING: All reference embeddings are identical!")
    else:
        print("No reference embeddings found!")

    # Compare embeddings and get top matches
    return compare_voices(input_embedding, reference_embeddings)

# Unzip the folder
def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Location of the zip archive to read
        extract_to (str): Directory that receives the extracted files
            (defaults to the current working directory)
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Release the file handle even if extraction fails part-way
        archive.close()

# Example usage: choose the files, unpack the references, then rank them
input_voice_file = 'rahultest5.m4a'  # Replace with your input voice file
reference_voices_folder = 'present_maam'  # Folder extracted from zip

# The archive has to be unpacked before the reference folder can be scanned
unzip_folder('present_maam.zip')

results = process_speaker_recognition(input_voice_file, reference_voices_folder)

print("Top 5 Matches:")
for name, score in results:
    # Failure results carry a message string in place of a numeric score
    if not isinstance(score, float):
        print(f"Speaker: {name}, Error: {score}")
        continue
    print(f"Speaker: {name}, Similarity Score: {score:.4f}")

  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Input embedding shape: (52,)
Input embedding mean: 5.07429958834272
Input embedding std: 44.485705163779315


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, du

Found 42 reference embeddings
Top 5 Matches:
Speaker: RM_M_2, Similarity Score: 0.6794
Speaker: RS_m_1, Similarity Score: 0.6736
Speaker: RS_m_2, Similarity Score: 0.6707
Speaker: ns_m_14, Similarity Score: 0.6706
Speaker: RM_M_4, Similarity Score: 0.6689


MFCCs + i-vectors

In [None]:
import os
import numpy as np
import librosa
import soundfile as sf
import torch
import torchaudio
from scipy.spatial.distance import cosine
import zipfile
from sklearn.decomposition import PCA
from sklearn import mixture

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load and preprocess audio file with noise reduction and normalization

    The file is loaded with librosa, resampled to ``target_sr`` when its rate
    differs, passed through a simple spectral gate and scaled to unit RMS.

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate

    Returns:
        numpy.ndarray: Preprocessed audio signal, or None if the file could
        not be processed (including recordings that are empty or silent after
        noise reduction, which cannot be RMS-normalized)
    """
    try:
        # Load audio file with librosa (supports multiple formats)
        audio, orig_sr = librosa.load(file_path, sr=None)

        # Resample if needed
        if orig_sr != target_sr:
            audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=target_sr)

        # Noise reduction using spectral gating
        def noise_reduce(y):
            # Compute the spectrogram
            stft = librosa.stft(y)

            # Per-row noise floor: median magnitude taken along axis 1
            noise_thresh = np.median(np.abs(stft), axis=1)

            # Keep only the bins that rise above their row's noise floor
            mask = np.abs(stft) > noise_thresh[:, np.newaxis]
            cleaned_stft = stft * mask

            # Convert back to time domain
            return librosa.istft(cleaned_stft)

        audio = noise_reduce(audio)

        # Normalize audio (RMS). A zero or non-finite RMS would turn the
        # whole signal into NaN/inf, so treat it as a processing failure.
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            raise ValueError("audio is empty or silent after noise reduction")
        audio = audio / rms

        return audio
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def extract_mfcc(audio, sr=16000, n_mfcc=20):
    """
    Compute MFCCs together with their first- and second-order deltas

    Args:
        audio (numpy.ndarray): Audio signal
        sr (int): Sampling rate
        n_mfcc (int): Number of MFCC coefficients to extract

    Returns:
        numpy.ndarray: Static, delta and delta-delta coefficients stacked
        row-wise (3 * n_mfcc rows), or None if extraction fails
    """
    try:
        # n_fft=512 / hop_length=160 is a 32ms window with a 10ms hop at 16kHz
        static = librosa.feature.mfcc(
            y=audio,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=512,
            hop_length=160,
        )

        # Velocity (order 1) and acceleration (order 2) of the static MFCCs
        dynamics = [librosa.feature.delta(static, order=k) for k in (1, 2)]

        return np.vstack([static, *dynamics])
    except Exception as e:
        print(f"Error extracting MFCC: {e}")
        return None

def extract_ivector(mfcc_features, ubm_model, tv_matrix, num_components=64, tv_dim=100):
    """
    Extract i-vector from MFCC features using a Universal Background Model (UBM)
    and Total Variability (TV) matrix

    This is a simplified projection, not a MAP point estimate: the first-order
    statistics of the frames under the UBM are flattened, multiplied by the
    transposed TV matrix and length-normalized.

    Args:
        mfcc_features (numpy.ndarray): MFCC features, one column per frame
        ubm_model: GMM-UBM model exposing ``predict_proba``; if None, a
            diagonal-covariance GMM is fitted on ``mfcc_features`` itself
        tv_matrix (numpy.ndarray): Total Variability matrix of shape
            (components * feature_dim, ivector_dim); if None, a random one
            is drawn
        num_components (int): Number of Gaussian components, used only when
            the fallback UBM has to be created
        tv_dim (int): Dimension of i-vector, used only when the TV matrix has
            to be created and for the failure fallback

    Returns:
        numpy.ndarray: i-vector. NOTE: on any failure a random vector of
        length ``tv_dim`` is returned after printing the error.
    """
    try:
        # scikit-learn expects one frame per row
        frames = mfcc_features.T

        # If we don't have a pre-trained UBM, create a simple one
        if ubm_model is None:
            # Simulated UBM (in real applications, this would be pre-trained)
            ubm_model = mixture.GaussianMixture(n_components=num_components,
                                                covariance_type='diag')
            ubm_model.fit(frames)

        # Get frame posteriors for each Gaussian component
        posteriors = np.asarray(ubm_model.predict_proba(frames))

        # Size the statistics from the model that was actually used. Relying
        # on num_components breaks (and silently falls into the random
        # fallback below) whenever a supplied UBM has a different size.
        n_components = posteriors.shape[1]

        # First-order statistics: F[c] = sum_t posteriors[t, c] * frames[t]
        F = posteriors.T @ frames

        # If we don't have a pre-trained TV matrix, create a simulated one
        if tv_matrix is None:
            tv_matrix = np.random.normal(0, 1, (n_components * frames.shape[1], tv_dim))

        # Simplified extraction: project the stacked statistics through T^T
        ivector = np.dot(tv_matrix.T, F.flatten())

        # Length-normalize the i-vector; an all-zero projection is returned
        # as-is rather than being divided by zero.
        norm = np.linalg.norm(ivector)
        if norm != 0:
            ivector = ivector / norm

        return ivector
    except Exception as e:
        print(f"Error extracting i-vector: {e}")
        return np.random.normal(0, 0.1, tv_dim)  # Return random vector in case of failure

def train_ubm_and_tv(audio_files, n_components=64, tv_dim=100):
    """
    Fit a diagonal-covariance GMM (the UBM) on frames pooled from a set of
    audio files and draw a random Total Variability matrix to go with it

    Args:
        audio_files (list): List of audio file paths
        n_components (int): Number of Gaussian components
        tv_dim (int): Dimension of i-vector

    Returns:
        tuple: (UBM model, TV matrix), or (None, None) when no file yields
        usable features
    """
    # One (frames x features) array per file that loads and featurizes
    per_file_frames = []
    for path in audio_files:
        signal = load_and_preprocess_audio(path)
        if signal is None:
            continue

        feats = extract_mfcc(signal)
        if feats is None:
            continue

        per_file_frames.append(feats.T)  # rows = frames, as scikit-learn expects

    if not per_file_frames:
        print("No valid features found for UBM training")
        return None, None

    # Pool the frames of all files into one training matrix
    pooled = np.concatenate(per_file_frames, axis=0)

    print(f"Training UBM with {pooled.shape[0]} frames...")
    ubm = mixture.GaussianMixture(n_components=n_components, covariance_type='diag')
    ubm.fit(pooled)

    # The TV matrix is random here rather than learned from data; it has one
    # row per (component, feature) pair.
    feature_dim = per_file_frames[0].shape[1]
    tv_shape = (n_components * feature_dim, tv_dim)
    tv_matrix = np.random.normal(0, 1, tv_shape)

    return ubm, tv_matrix

def extract_voice_embedding(audio, ubm_model=None, tv_matrix=None):
    """
    Extract voice embedding from preprocessed audio using MFCC and i-vector features

    The signal is truncated or zero-padded to 3 seconds (at a hard-coded
    16 kHz rate). The embedding is the per-row mean and standard deviation of
    the MFCC feature matrix followed by the i-vector, scaled to unit length.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal
        ubm_model: GMM-UBM model (forwarded to ``extract_ivector``)
        tv_matrix: Total Variability matrix (forwarded to ``extract_ivector``)

    Returns:
        numpy.ndarray: Unit-length voice embedding vector, or None if feature
        extraction fails or the embedding cannot be normalized
    """
    try:
        # Ensure audio is the right length
        target_length = 3 * 16000  # 3 seconds at 16kHz
        if len(audio) > target_length:
            audio = audio[:target_length]
        elif len(audio) < target_length:
            audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')

        # Extract MFCC features
        mfcc_features = extract_mfcc(audio, sr=16000, n_mfcc=20)
        if mfcc_features is None:
            return None

        # Compute MFCC statistics
        mfcc_mean = np.mean(mfcc_features, axis=1)
        mfcc_std = np.std(mfcc_features, axis=1)

        # Extract i-vector
        ivector = extract_ivector(mfcc_features, ubm_model, tv_matrix)

        # Combine MFCC statistics and i-vector into a single embedding
        embedding = np.concatenate([mfcc_mean, mfcc_std, ivector])

        # Add a tiny fixed jitter. A private RandomState seeded with 42 draws
        # exactly the values that np.random.seed(42) + np.random.normal would,
        # but without resetting NumPy's global RNG, which other code in this
        # pipeline also draws from.
        rng = np.random.RandomState(42)
        embedding += rng.normal(0, 1e-5, embedding.shape)

        # Normalize the embedding. A zero or non-finite norm would yield a
        # NaN/inf vector, so report it through the normal failure path.
        norm = np.linalg.norm(embedding)
        if not np.isfinite(norm) or norm == 0:
            raise ValueError("embedding has zero or non-finite norm")
        embedding = embedding / norm

        return embedding

    except Exception as e:
        print(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Compare input voice embedding with reference embeddings

    Each reference is scored as 0.7 * cosine similarity plus
    0.3 * (1 / (1 + Euclidean distance)), clamped to [-1, 1].

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Dictionary of reference voice embeddings
        top_n (int): Number of top matches to return

    Returns:
        list: Top N matches as (name, similarity_score) tuples, best first.
        Returns [("Error", 0.0)] when the input embedding is unusable.
    """
    def _is_usable(vec):
        # Reject vectors with ANY NaN/inf entry, not just all-NaN ones: a
        # NaN score passed through max(-1.0, min(1.0, score)) comes out as
        # 1.0, which would rank a corrupt embedding as a perfect match.
        vec = np.asarray(vec, dtype=float)
        return bool(np.all(np.isfinite(vec)) and np.any(vec != 0))

    if not _is_usable(input_embedding):
        print("Warning: Input embedding contains NaN/inf or all-zero values")
        return [("Error", 0.0)]

    similarities = []
    for name, ref_embedding in reference_embeddings.items():
        # Skip invalid embeddings
        if not _is_usable(ref_embedding):
            print(f"Warning: Reference embedding for {name} contains NaN/inf or all-zero values")
            continue

        # 1. Cosine similarity (bounded between -1 and 1)
        cosine_sim = 1 - cosine(input_embedding, ref_embedding)

        # 2. Euclidean distance mapped into (0, 1]; identical vectors give 1
        euclidean_dist = np.linalg.norm(input_embedding - ref_embedding)
        euclidean_sim = 1 / (1 + euclidean_dist)

        # Combined score (weighted average), kept within [-1, 1]
        similarity_score = 0.7 * cosine_sim + 0.3 * euclidean_sim
        similarity_score = max(-1.0, min(1.0, similarity_score))

        similarities.append((name, similarity_score))

    # Sort similarities in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Perform comprehensive speaker recognition

    A UBM and TV matrix are first trained on every supported audio file
    directly inside ``reference_folder`` (.wav, .mp3, .flac, .ogg, .m4a) plus
    the input file itself. Embeddings are then extracted with that shared
    model and the references are ranked with ``compare_voices``.

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Top matches with similarity scores. When the input file cannot
        be loaded or embedded, a single ("Error", message) tuple is returned
        instead, so the second element may be a string.
    """
    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')

    # Scan the folder once so that the UBM training set and the set of
    # reference speakers are guaranteed to be the same files.
    reference_files = [
        filename for filename in os.listdir(reference_folder)
        if filename.lower().endswith(audio_extensions)
    ]

    # Training data: all references plus the input file
    all_audio_files = [os.path.join(reference_folder, filename)
                       for filename in reference_files]
    all_audio_files.append(input_file)

    # Train UBM and generate TV matrix
    ubm_model, tv_matrix = train_ubm_and_tv(all_audio_files)

    # Preprocess input voice
    input_audio = load_and_preprocess_audio(input_file)
    if input_audio is None:
        return [("Error", "Could not process input voice file")]

    # Extract embedding for input voice
    input_embedding = extract_voice_embedding(input_audio, ubm_model, tv_matrix)
    if input_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    # Print some stats about the input embedding
    print(f"Input embedding shape: {input_embedding.shape}")
    print(f"Input embedding mean: {np.mean(input_embedding)}")
    print(f"Input embedding std: {np.std(input_embedding)}")

    # Extract embeddings for all reference voices; files that fail to load or
    # embed are skipped (the helpers report their own errors).
    reference_embeddings = {}
    for filename in reference_files:
        ref_audio = load_and_preprocess_audio(os.path.join(reference_folder, filename))
        if ref_audio is None:
            continue

        embedding = extract_voice_embedding(ref_audio, ubm_model, tv_matrix)
        if embedding is None:
            continue

        # Use filename (without extension) as the speaker name
        speaker_name = os.path.splitext(filename)[0]
        reference_embeddings[speaker_name] = embedding

    # Before running comparisons, print some stats
    if reference_embeddings:
        print(f"Found {len(reference_embeddings)} reference embeddings")

        # Identical embeddings across different files point at a broken
        # feature extractor. The check is only meaningful with at least two
        # references; a single one is trivially "identical" to itself.
        if len(reference_embeddings) > 1:
            first_embedding = next(iter(reference_embeddings.values()))
            all_identical = all(np.array_equal(embedding, first_embedding)
                                for embedding in reference_embeddings.values())
            if all_identical:
                print("WARNING: All reference embeddings are identical!")
    else:
        print("No reference embeddings found!")

    # Compare embeddings and get top matches
    return compare_voices(input_embedding, reference_embeddings)

# Unzip the folder
def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Location of the zip archive to read
        extract_to (str): Directory that receives the extracted files
            (defaults to the current working directory)
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Release the file handle even if extraction fails part-way
        archive.close()

# Example usage
if __name__ == "__main__":
    # First, unzip the folder
    unzip_folder('present_maam.zip')

    # Then run speaker recognition
    input_voice_file = 'rahultest5.m4a'  # Replace with your input voice file
    reference_voices_folder = 'present_maam'  # Folder extracted from zip

    # Perform speaker recognition
    results = process_speaker_recognition(input_voice_file, reference_voices_folder)

    # Print results
    print("\nTop 5 Matches:")
    for name, score in results:
        # process_speaker_recognition reports failures as ("Error", message)
        # tuples; formatting that message with :.4f would raise ValueError.
        if isinstance(score, str):
            print(f"Speaker: {name}, Error: {score}")
        else:
            print(f"Speaker: {name}, Similarity Score: {score:.4f}")

  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, du

Training UBM with 7910 frames...


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Input embedding shape: (220,)
Input embedding mean: 0.002736559266223857
Input embedding std: 0.06736442524683922


  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
  audio, orig_sr = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, du

Found 42 reference embeddings

Top 5 Matches:
Speaker: RM_M_2, Similarity Score: 0.9436
Speaker: RS_m_2, Similarity Score: 0.9435
Speaker: RM_M_7, Similarity Score: 0.9415
Speaker: RS_m_1, Similarity Score: 0.9407
Speaker: RS_m_4, Similarity Score: 0.9375


# using the speechbrain model (optimized)

In [None]:
import os
import time
import logging
import numpy as np
import torchaudio
import torch
import zipfile
from speechbrain.pretrained import SpeakerRecognition

# Timestamped INFO-level logging for the timing messages emitted below
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the pretrained SpeechBrain speaker model a single time at module level
# so every embedding call reuses the same instance
model = SpeakerRecognition.from_hparams(
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
    source="speechbrain/spkrec-ecapa-voxceleb",
)

def load_and_preprocess_audio(file_path, target_sr=16000):
    """
    Load an audio file with torchaudio and return a mono, RMS-normalized signal

    Args:
        file_path (str): Path to audio file
        target_sr (int): Target sampling rate; the audio is resampled when the
            file's rate differs

    Returns:
        numpy.ndarray: 1-D preprocessed audio signal, or None if the file
        could not be processed (including empty or silent recordings, which
        cannot be RMS-normalized). Failures are logged, not raised.
    """
    start_time = time.time()
    try:
        # Load audio with torchaudio
        audio, orig_sr = torchaudio.load(file_path)

        # Resample if needed
        if orig_sr != target_sr:
            transform = torchaudio.transforms.Resample(orig_sr, target_sr)
            audio = transform(audio)

        # Convert to mono
        if audio.shape[0] > 1:
            audio = torch.mean(audio, dim=0, keepdim=True)

        # Drop only the channel axis. A bare squeeze() would also collapse a
        # one-sample signal to a 0-d array and break downstream indexing.
        audio = audio.squeeze(0).numpy()

        # Normalize audio (RMS). A zero or non-finite RMS would turn the
        # whole signal into NaN/inf, so treat it as a processing failure.
        rms = np.sqrt(np.mean(audio**2)) if audio.size else 0.0
        if not np.isfinite(rms) or rms == 0:
            raise ValueError("audio is empty or silent")
        audio = audio / rms

        logging.info(f"Processed {file_path} in {time.time() - start_time:.2f}s")
        return audio
    except Exception as e:
        logging.error(f"Error processing {file_path}: {e}")
        return None

def extract_voice_embedding(audio):
    """
    Encode a preprocessed signal with the module-level SpeechBrain ``model``

    The signal is converted to a float tensor with a leading batch axis and
    cut or zero-padded to 3 seconds at 16 kHz before encoding.

    Args:
        audio (numpy.ndarray): Preprocessed audio signal

    Returns:
        numpy.ndarray: Embedding produced by ``model.encode_batch`` with
        singleton axes squeezed out, or None if anything fails (the error
        is logged)
    """
    start_time = time.time()
    try:
        waveform = torch.tensor(audio).float().unsqueeze(0)

        # Positive shortfall -> pad with zeros, negative -> truncate
        target_length = 3 * 16000
        shortfall = target_length - waveform.shape[1]
        if shortfall < 0:
            waveform = waveform[:, :target_length]
        elif shortfall > 0:
            silence = torch.zeros(1, shortfall)
            waveform = torch.cat([waveform, silence], dim=1)

        encoded = model.encode_batch(waveform)
        embedding = encoded.squeeze().numpy()

        logging.info(f"Extracted embedding in {time.time() - start_time:.2f}s")
        return embedding
    except Exception as e:
        logging.error(f"Error generating embedding: {e}")
        return None

def compare_voices(input_embedding, reference_embeddings, top_n=5):
    """
    Rank reference embeddings by cosine similarity to the input embedding

    Args:
        input_embedding (numpy.ndarray): Embedding of input voice
        reference_embeddings (dict): Mapping of speaker name to embedding
        top_n (int): Number of top matches to return

    Returns:
        list: Up to ``top_n`` (name, similarity_score) tuples, best first.
        References whose norm is zero or non-finite are skipped. An empty
        list is returned when the input embedding itself is unusable.
    """
    start_time = time.time()

    # A zero or NaN/inf norm makes every cosine score NaN, and NaN scores
    # leave the sort order below undefined, so such vectors are rejected.
    input_norm = np.linalg.norm(input_embedding)
    if not np.isfinite(input_norm) or input_norm == 0:
        logging.error("Input embedding is all zeros or contains non-finite values")
        return []

    similarities = []
    for name, ref_embedding in reference_embeddings.items():
        ref_norm = np.linalg.norm(ref_embedding)
        if not np.isfinite(ref_norm) or ref_norm == 0:
            logging.warning(f"Skipping reference {name}: embedding is all zeros or non-finite")
            continue

        # Compute cosine similarity using NumPy
        similarity_score = np.dot(input_embedding, ref_embedding) / (input_norm * ref_norm)
        similarities.append((name, similarity_score))

    # Sort by similarity
    similarities.sort(key=lambda x: x[1], reverse=True)

    logging.info(f"Compared voices in {time.time() - start_time:.2f}s")
    return similarities[:top_n]

def process_speaker_recognition(input_file, reference_folder):
    """
    Rank the recordings in a folder by similarity to an input recording

    Args:
        input_file (str): Path to input voice file
        reference_folder (str): Path to folder with reference voice files

    Returns:
        list: Result of ``compare_voices`` for the input against every
        reference that could be embedded, or a single ("Error", message)
        tuple when the input file cannot be loaded or embedded
    """
    query_audio = load_and_preprocess_audio(input_file)
    if query_audio is None:
        return [("Error", "Could not process input voice file")]

    query_embedding = extract_voice_embedding(query_audio)
    if query_embedding is None:
        return [("Error", "Could not generate input voice embedding")]

    supported = ('.wav', '.mp3', '.flac', '.ogg', '.m4a')
    enrolled = {}
    for entry in os.listdir(reference_folder):
        if not entry.lower().endswith(supported):
            continue

        signal = load_and_preprocess_audio(os.path.join(reference_folder, entry))
        if signal is None:
            continue

        vector = extract_voice_embedding(signal)
        if vector is None:
            continue

        # The file name without its extension identifies the speaker
        stem, _ext = os.path.splitext(entry)
        enrolled[stem] = vector

    return compare_voices(query_embedding, enrolled)

def unzip_folder(zip_path, extract_to='.'):
    """
    Extract every member of a zip archive into a directory

    Args:
        zip_path (str): Location of the zip archive to read
        extract_to (str): Directory that receives the extracted files
            (defaults to the current working directory)
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    try:
        archive.extractall(path=extract_to)
    finally:
        # Release the file handle even if extraction fails part-way
        archive.close()

# Example usage
unzip_folder('present_maam.zip')
input_voice_file = 'rahultest5.m4a'
reference_voices_folder = 'present_maam'
results = process_speaker_recognition(input_voice_file, reference_voices_folder)
print("Top 5 Matches:")
for name, score in results:
    # process_speaker_recognition reports failures as ("Error", message)
    # tuples; formatting that message with :.4f would raise ValueError.
    if isinstance(score, str):
        print(f"Speaker: {name}, Error: {score}")
    else:
        print(f"Speaker: {name}, Similarity Score: {score:.4f}")


Top 5 Matches:
Speaker: RS_m_5, Similarity Score: 0.5395
Speaker: RM_M_4, Similarity Score: 0.4146
Speaker: RM_M_9, Similarity Score: 0.3657
Speaker: RM_M_7, Similarity Score: 0.3604
Speaker: RS_m_1, Similarity Score: 0.3444
