In [1]:
import os
import numpy as np
import librosa
import librosa.display
import glob
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense, Dropout, MaxPooling1D, BatchNormalization
import warnings

warnings.filterwarnings("ignore")


In [2]:
DATASET_PATH = r"C:\Users\dell\.cache\kagglehub\datasets\vjcalling\speaker-recognition-audio-dataset\versions\1\50_speakers_audio_data"


In [3]:
def extract_mfcc(audio_path, n_mfcc=40, max_pad_length=200):
    """Extract MFCC features from an audio file."""
    try:
        y, sr = librosa.load(audio_path, sr=22050)  # Load audio
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)  # Extract MFCC
        
        # Pad or truncate MFCC features to a fixed length
        if mfcc.shape[1] < max_pad_length:
            pad_width = max_pad_length - mfcc.shape[1]
            mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
        else:
            mfcc = mfcc[:, :max_pad_length]
        
        return mfcc
    except Exception as e:
        print(f"Error processing {audio_path}: {e}")
        return None


In [4]:
def load_speaker_data(dataset_path, max_files_per_speaker=50):
    """Load all audio files and extract MFCC features, assigning labels based on speaker folders."""
    speakers = os.listdir(dataset_path)  # Get all speaker folder names
    speakers.sort()  # Ensure consistent labeling
    print(f"Found {len(speakers)} speakers.")
    
    X = []  # Features
    y = []  # Labels
    speaker_to_label = {speaker: idx for idx, speaker in enumerate(speakers)}

    for speaker in speakers:
        speaker_folder = os.path.join(dataset_path, speaker)
        if os.path.isdir(speaker_folder):  # Ensure it's a directory
            print(f"Processing {speaker}...")
            audio_files = glob.glob(os.path.join(speaker_folder, "*.wav"))
            random.shuffle(audio_files)  # Shuffle for randomness
            audio_files = audio_files[:max_files_per_speaker]  # Limit files per speaker
            
            for audio_file in audio_files:
                mfcc_features = extract_mfcc(audio_file)
                if mfcc_features is not None:
                    X.append(mfcc_features)
                    y.append(speaker_to_label[speaker])
    
    return np.array(X), np.array(y), speaker_to_label

# Load dataset
X, y, speaker_to_label = load_speaker_data(DATASET_PATH)

# Check dataset shape
print(f"Extracted {X.shape[0]} samples with shape {X.shape[1:]} features.")


Found 50 speakers.
Processing Speaker0026...
Processing Speaker0027...
Processing Speaker0028...
Processing Speaker0029...
Processing Speaker0030...
Processing Speaker0031...
Processing Speaker0032...
Processing Speaker0033...
Processing Speaker0034...
Processing Speaker0035...
Processing Speaker0036...
Processing Speaker0037...
Processing Speaker0038...
Processing Speaker0039...
Processing Speaker0040...
Processing Speaker0041...
Processing Speaker0042...
Processing Speaker0043...
Processing Speaker0044...
Processing Speaker0045...
Processing Speaker0046...
Processing Speaker0047...
Processing Speaker0048...
Processing Speaker0049...
Processing Speaker0050...
Processing Speaker_0000...
Processing Speaker_0001...
Processing Speaker_0002...
Processing Speaker_0003...
Processing Speaker_0004...
Processing Speaker_0005...
Processing Speaker_0006...
Processing Speaker_0007...
Processing Speaker_0008...
Processing Speaker_0009...
Processing Speaker_0010...
Processing Speaker_0011...
Process

In [5]:
# Encode labels as categorical values
num_classes = len(set(y))
y = tf.keras.utils.to_categorical(y, num_classes=num_classes)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features (mean=0, variance=1)
scaler = StandardScaler()
X_train = np.array([scaler.fit_transform(x) for x in X_train])
X_test = np.array([scaler.transform(x) for x in X_test])

# Reshape for CNN input (batch, time_steps, features)
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (1667, 40, 200, 1), y_train shape: (1667, 50)
X_test shape: (417, 40, 200, 1), y_test shape: (417, 50)


In [6]:
def build_model(input_shape, num_classes):
    model = Sequential([
        Conv1D(64, kernel_size=3, activation="relu", input_shape=(input_shape[0], input_shape[1])),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        Conv1D(128, kernel_size=3, activation="relu"),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Dropout(0.3),
        
        LSTM(128, return_sequences=True),
        LSTM(64),

        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax")  # Output layer
    ])
    
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Build model
input_shape = X_train.shape[1:3]  # (Time steps, Features)
model = build_model(input_shape, num_classes)
model.summary()


In [8]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=32)

# Save model
model.save("speaker_verification_model.h5")


Epoch 1/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.9268 - loss: 0.2564 - val_accuracy: 0.4053 - val_loss: 3.4829
Epoch 2/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9140 - loss: 0.2696 - val_accuracy: 0.4676 - val_loss: 3.4447
Epoch 3/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.9495 - loss: 0.1710 - val_accuracy: 0.4460 - val_loss: 3.3814
Epoch 4/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 59ms/step - accuracy: 0.9460 - loss: 0.1752 - val_accuracy: 0.4580 - val_loss: 3.4001
Epoch 5/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 50ms/step - accuracy: 0.9461 - loss: 0.1679 - val_accuracy: 0.4748 - val_loss: 3.1972
Epoch 6/100
[1m53/53[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9445 - loss: 0.1775 - val_accuracy: 0.4652 - val_loss: 3.5097
Epoch 7/100
[1m53/53[0m [



In [9]:
# Evaluate model on test set
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.5525 - loss: 3.0810
Test Accuracy: 52.04%


In [10]:
def verify_speaker(audio_path):
    """Predict the speaker for a given audio file."""
    mfcc_features = extract_mfcc(audio_path)
    if mfcc_features is None:
        print("Error processing file.")
        return
    
    mfcc_features = scaler.transform(mfcc_features)  # Scale features
    mfcc_features = mfcc_features.reshape(1, mfcc_features.shape[0], mfcc_features.shape[1], 1)

    prediction = model.predict(mfcc_features)
    speaker_label = list(speaker_to_label.keys())[np.argmax(prediction)]
    confidence = np.max(prediction) * 100

    print(f"Predicted Speaker: {speaker_label} with {confidence:.2f}% confidence.")


In [11]:
# Test with a random audio file
test_audio = glob.glob(os.path.join(DATASET_PATH, "Speaker_0005", "*.wav"))[0]
verify_speaker(test_audio)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted Speaker: Speaker0041 with 99.90% confidence.
