In [58]:
# Uncomment on a fresh environment (%pip installs into this kernel's environment):
# %pip install speechbrain

In [73]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import random
import pickle

In [83]:
# load the dataset
data_path = "/kaggle/input/speakerid-dataset"

# Clip reserved for the final end-to-end check; it must not be used for training.
held_out_audio = "test_audio_ranwa.wav"

# os.listdir returns entries in arbitrary order, so sort for a reproducible run.
# Filtering (rather than list.remove) does not raise if the held-out clip is absent.
audios = sorted(name for name in os.listdir(data_path) if name != held_out_audio)

In [84]:
import torchaudio
import torch
from speechbrain.inference.speaker import EncoderClassifier

# Load embeddings model
# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs on a CPU-only runtime instead of failing at model load.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

In [85]:
def get_audio_embeddings(data_path, audios):
    """Embed every listed audio file with the module-level ``classifier``.

    Args:
        data_path: Directory that contains the audio files.
        audios: File names inside ``data_path``. The speaker label is built
            from the first two ``_``-separated tokens of each file name.

    Returns:
        A list of ``(label, embedding)`` tuples, one per file, where
        ``embedding`` is the squeezed model output converted to a Python list.
    """
    import warnings  # local import keeps this cell self-contained

    # The SpeechBrain model card states that encode_batch expects 16 kHz input.
    # No resampling is performed here, so a mismatch is only reported.
    expected_sr = 16000

    # Follow the device the classifier was loaded on instead of assuming CUDA.
    device = classifier.device

    data = []
    with torch.no_grad():  # inference only, no autograd bookkeeping needed
        for audio in tqdm(audios, total=len(audios)):
            # Load audio
            signal, fs = torchaudio.load(os.path.join(data_path, audio))
            if fs != expected_sr:
                warnings.warn(
                    f"audio sampled at {fs} Hz is passed to the encoder without "
                    f"resampling (expected {expected_sr} Hz)",
                    stacklevel=2,
                )

            # Average the channels: [channels, num_samples] -> [1, num_samples]
            signal = torch.mean(signal, dim=0, keepdim=True).to(device)

            # Compute embeddings and drop the singleton dimensions
            embeddings = classifier.encode_batch(signal).squeeze()

            label = "_".join(audio.split("_")[:2])
            data.append((label, embeddings.cpu().tolist()))
    return data

In [86]:
# (label, embedding) pairs for every file in `audios`; split into train/test later
train_set = get_audio_embeddings(data_path=data_path, audios=audios)

100%|██████████| 180/180 [00:05<00:00, 34.44it/s]


In [87]:
SEED = 42

# shuffle our data
# A dedicated seeded RNG returns a new list, so the order is reproducible and
# re-running this cell does not reshuffle `train_set` in place.
shuffled_set = random.Random(SEED).sample(train_set, k=len(train_set))

# separate target and features (each sample is a (label, embedding) tuple)
x = [embedding for _, embedding in shuffled_set]
y = [label for label, _ in shuffled_set]

# encode the labels
name2idx = {"ranwa_khaled": 0, "nour_adel": 1, "nour_nader": 2}
idx2name = {idx: name for name, idx in name2idx.items()}  # derived, cannot drift

y = [name2idx[name] for name in y]

# 90% train, 10% test
# Split BEFORE fitting PCA so the held-out rows never influence the projection;
# stratify keeps every speaker represented in the small test split.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=SEED, stratify=y
)

# reduce size of embedding array (fit on the training rows only)
pca = PCA(n_components=20)
x_train = pca.fit_transform(x_train_raw)
x_test = pca.transform(x_test_raw)

# Projection of the full set, kept for any cell that still expects `x_pca`.
# It contains the test rows, so do not use it for evaluation.
x_pca = pca.transform(x)

# train logistic regression
LRmodel = LogisticRegression()
LRmodel.fit(x_train, y_train)

In [88]:
from sklearn.metrics import accuracy_score

# sklearn metrics take (y_true, y_pred) in that order; accuracy happens to be
# symmetric, but keeping the convention avoids mistakes when swapping metrics.
y_pred = LRmodel.predict(x_test)
accuracy_score(y_test, y_pred)

1.0

In [89]:
# Save the lr model to a file
filename = 'speakerID_model.pkl'
# The context manager closes (and flushes) the handle before the file is read back.
with open(filename, 'wb') as model_file:
    pickle.dump(LRmodel, model_file)

# To load the model later
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# Now you can use loaded_model for predictions

In [90]:
# 2. Persist the fitted PCA so inference can apply the same projection
filename = 'pca_model.pkl'
with open(filename, 'wb') as pca_out:
    pca_out.write(pickle.dumps(pca))

print(f"PCA model saved to {filename}")

# 3. Read the PCA back, the way a new session or script would
with open(filename, 'rb') as pca_in:
    loaded_pca = pickle.loads(pca_in.read())

print("PCA model loaded successfully.")

PCA model saved to pca_model.pkl
PCA model loaded successfully.


In [91]:
# Sanity check: predict on a one-row batch taken from the test split
LRmodel.predict(x_test[:1])

array([2])

In [96]:
# Held-out clip that was excluded from the training file list
sample_path = os.path.join(data_path, "test_audio_ranwa.wav")

def prep_audio_sample(audio):
    """Turn one audio file into the PCA-reduced feature row used by the classifier.

    Args:
        audio: Path to the audio file.

    Returns:
        The result of ``pca.transform`` on a single-row batch holding the
        file's embedding, ready to pass to ``LRmodel.predict``.
    """
    import warnings  # local import keeps this cell self-contained

    # The SpeechBrain model card states that encode_batch expects 16 kHz input.
    # No resampling is performed here, so a mismatch is only reported.
    expected_sr = 16000

    # Load audio
    signal, fs = torchaudio.load(audio)
    if fs != expected_sr:
        warnings.warn(
            f"audio sampled at {fs} Hz is passed to the encoder without "
            f"resampling (expected {expected_sr} Hz)",
            stacklevel=2,
        )

    # Average the channels: [channels, num_samples] -> [1, num_samples]
    signal = torch.mean(signal, dim=0, keepdim=True)

    # Follow the device the classifier was loaded on instead of assuming CUDA.
    signal = signal.to(classifier.device)

    # Compute embeddings (inference only, so skip autograd bookkeeping)
    with torch.no_grad():
        embeddings = classifier.encode_batch(signal)

    # Drop singleton dimensions and convert to a plain list
    embeddings_list = embeddings.squeeze().cpu().tolist()

    # Apply the same fitted PCA that was used on the training features
    return pca.transform([embeddings_list])
    
# Run the held-out clip through the full pipeline and report the speaker name
sample_features = prep_audio_sample(sample_path)
prediction = LRmodel.predict(sample_features)

predicted_speaker = idx2name[prediction[0]]
print(predicted_speaker)

ranwa_khaled
