In [1]:
import os

import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Function to extract MFCC + delta features from audio files
def extract_features(file_path, n_mfcc=40):
    """Return one fixed-length feature vector summarising an audio file.

    The file is loaded with librosa, MFCCs and their first- and second-order
    deltas are computed, the three matrices are stacked along the coefficient
    axis, and the stack is averaged over time frames.

    Parameters
    ----------
    file_path : str or path-like
        Audio file to load.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute. Defaults to 40, the value
        that was previously hard-coded.

    Returns
    -------
    numpy.ndarray
        Per-coefficient time averages of the MFCCs, the deltas and the
        delta-deltas, concatenated in that order (``3 * n_mfcc`` values for
        the mono signal that ``librosa.load`` returns by default).

    Notes
    -----
    Errors raised by librosa (unreadable file, clip too short for the delta
    window, ...) are not caught here and propagate to the caller.
    """
    # 'kaiser_fast' selects librosa's faster resampling mode when the file's
    # native rate differs from the rate librosa.load resamples to.
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')

    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    mfccs_delta = librosa.feature.delta(mfccs)
    # order=2 is taken from the MFCCs themselves, not from mfccs_delta.
    mfccs_delta2 = librosa.feature.delta(mfccs, order=2)

    # Concatenate MFCC and delta features along the coefficient axis
    combined = np.concatenate((mfccs, mfccs_delta, mfccs_delta2), axis=0)

    # Average over frames so clips of any length give a same-sized vector.
    combined_mean = np.mean(combined.T, axis=0)

    return combined_mean


In [3]:
# Function to load the dataset and extract features for each file
def load_dataset(data_path):
    """Walk a ``<data_path>/<speaker>/<file>.wav`` tree and build feature/label arrays.

    Each immediate sub-directory of ``data_path`` is treated as one speaker,
    and its name is used as the label for every WAV file found directly
    inside it. Non-directory entries in ``data_path`` and non-WAV files are
    skipped.

    Parameters
    ----------
    data_path : str or path-like
        Root directory containing one sub-directory per speaker.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(features, labels)`` where ``features[i]`` is the result of
        ``extract_features`` for the i-th file and ``labels[i]`` is the name
        of the speaker directory it came from. Both arrays are empty when no
        WAV files are found.
    """
    labels = []
    features = []

    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded shuffle/split downstream) reproducible
    # across machines and runs.
    for speaker in sorted(os.listdir(data_path)):
        speaker_path = os.path.join(data_path, speaker)
        if not os.path.isdir(speaker_path):
            continue

        for file_name in sorted(os.listdir(speaker_path)):
            # Compare case-insensitively so files named '*.WAV' are not
            # silently dropped from the dataset.
            if not file_name.lower().endswith('.wav'):
                continue

            file_path = os.path.join(speaker_path, file_name)
            features.append(extract_features(file_path))
            labels.append(speaker)

    return np.array(features), np.array(labels)

# Load dataset
# The dataset location can be overridden with the SPEECH_DATASET_DIR
# environment variable so the notebook runs on other machines; the original
# machine-specific path is kept as the default.
data_path = os.environ.get(
    "SPEECH_DATASET_DIR",
    r"C:\Users\Hp\Downloads\Speech Recognition\dataset",
)
features, labels = load_dataset(data_path)


In [None]:
# Encode labels (speaker names)
# LabelEncoder and train_test_split are scikit-learn names and must be
# imported before this cell runs (see the notebook's import cell).
# The fitted encoder is kept so integer predictions can later be mapped back
# to speaker names with label_encoder.inverse_transform.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split dataset into training and test sets
TEST_SIZE = 0.2      # fraction of samples held out for evaluation
RANDOM_STATE = 42    # fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    encoded_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)