In [124]:
import os
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
import pickle

In [125]:
SR = 16000
WINDOW_LENGTH = 0.02 
HOP_LENGTH = 0.01
N_MFCC = 13
FMAX=None

In [126]:
def extract_mfcc(file_path, sr=SR, window_length=WINDOW_LENGTH, hop_length=HOP_LENGTH, n_mfcc=N_MFCC, fmax=FMAX):
    y, sr = librosa.load(file_path, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, 
                                n_fft=int(sr * window_length), 
                                hop_length=int(sr * hop_length),
                                fmax=fmax)
    return mfcc.T

In [127]:
def train_gmm_for_speaker(data_folder):
    speakers = os.listdir(data_folder)
    gmm_models = {}
    
    for speaker in speakers:
        speaker_path = os.path.join(data_folder, speaker)
        if os.path.isdir(speaker_path):
            mfccs = []
            for file in os.listdir(speaker_path):
                file_path = os.path.join(speaker_path, file)
                if file_path.endswith('.wav'):
                    mfcc = extract_mfcc(file_path)
                    mfccs.append(mfcc)
            mfccs = np.vstack(mfccs)
            gmm = GaussianMixture(n_components=16, covariance_type='diag', n_init=3)
            gmm.fit(mfccs)
            gmm_models[speaker] = gmm
    
    return gmm_models

In [129]:
def predict_speaker(gmm_models, file_path):
    mfcc = extract_mfcc(file_path)
    best_speaker = None
    highest_score = float('-inf')

    for speaker, gmm in gmm_models.items():
        score = gmm.score(mfcc)
        if score > highest_score:
            highest_score = score
            best_speaker = speaker

    return best_speaker

In [130]:
data_folder = r"C:\Users\User\Desktop\speech technologies\16000_pcm_speeches"
gmm_model_path = 'gmm_models.pkl'

In [131]:
gmm_models = train_gmm_for_speaker(data_folder)
with open(gmm_model_path, 'wb') as f:
    pickle.dump(gmm_models, f)

  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)


In [132]:
with open(gmm_model_path, 'rb') as f:
    gmm_models = pickle.load(f)

In [134]:
sample_file_path = r"C:\Users\User\Desktop\speech technologies\16000_pcm_speeches\other\exercise_bike.wav"
predicted_speaker = predict_speaker(gmm_models, sample_file_path)
print(f'The predicted speaker is: {predicted_speaker}')

The predicted speaker is: other


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
