In [3]:
import numpy as np
import librosa
from pydub import AudioSegment
from pydub.utils import mediainfo
from sklearn import preprocessing
import warnings
warnings.filterwarnings('ignore')

In [4]:
def mfcc_extraction(audio_filename, #.wav filename
                    hop_duration, #hop_length in seconds, e.g., 0.015s (i.e., 15ms)
                    num_mfcc #number of mfcc features
                   ):
    """Compute the MFCC features of one .wav file.

    Parameters
    ----------
    audio_filename : path of the .wav file to read.
    hop_duration : hop between successive analysis frames, in seconds;
        it is converted to samples with the file's own sampling rate.
    num_mfcc : number of MFCC coefficients computed per frame.

    Returns
    -------
    The transpose of the matrix returned by ``librosa.feature.mfcc``,
    i.e. one row per frame and one column per coefficient.
    """
    speech = AudioSegment.from_wav(audio_filename) #Read audio data from file
    sampling_rate = speech.frame_rate #sampling rate f
    samples = np.asarray(speech.get_array_of_samples(), dtype=np.float32) #samples x(t)

    if speech.channels > 1:
        # pydub serialises multi-channel audio as interleaved samples
        # (L, R, L, R, ...). Feeding that stream to librosa as if it were
        # mono distorts the signal, so de-interleave and average the
        # channels into a single mono track first.
        samples = samples.reshape(-1, speech.channels).mean(axis=1)

    # The waveform is passed as `y=`: recent librosa releases accept
    # keyword arguments only, and older releases accept the keyword too.
    mfcc = librosa.feature.mfcc(y = samples,
                                sr = sampling_rate,
                                hop_length = int(sampling_rate * hop_duration),
                                n_mfcc = num_mfcc)

    return mfcc.T

In [5]:
from sklearn.mixture import GaussianMixture
def learningGMM(features, #list of feature vectors, each feature vector is an array
                n_components, #the number of components
                max_iter #maximum number of iterations
               ):
    """Fit a Gaussian mixture model to ``features`` and return it.

    ``n_components`` and ``max_iter`` are forwarded unchanged to
    ``GaussianMixture``; every other setting keeps its library default.
    """
    # GaussianMixture.fit returns the fitted estimator itself.
    return GaussianMixture(max_iter = max_iter,
                           n_components = n_components).fit(features)

In [7]:
import os
path = 'SpeakerData/'
# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. OS metadata files). Keep only sub-directories and sort them so that a
# speaker's index is the same on every run and every machine.
speakers = sorted(
    name for name in os.listdir(path + 'Train/')
    if os.path.isdir(os.path.join(path, 'Train', name))
)
print(speakers)

['Anthony', 'AppleEater', 'Ara', 'Argail', 'Ariyan', 'Arjuan', 'Artem', 'Arthur', 'Artk', 'Arun', 'Arvala', 'Asalkeld', 'Asladic', 'Asp', 'Azmisov', 'B', 'Bachroxx', 'Bae', 'Bahoke', 'Bareford', 'Bart', 'Bassel', 'Beady', 'Beez', 'BelmontGuy']


In [8]:
#this list is used to store the MFCC features of all training data of all speakers
mfcc_all_speakers = []
hop_duration = 0.015 #15ms
num_mfcc = 12
for s in speakers:
    sub_path = path + 'Train/' + s + '/'
    # Sorted listing keeps the row order of the stacked features reproducible.
    sub_file_names = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path))]
    # Extract every file first and stack once; calling np.vstack inside the
    # loop re-copies all previously accumulated rows on each iteration.
    mfcc_per_file = [mfcc_extraction(fn, hop_duration, num_mfcc) for fn in sub_file_names]
    if mfcc_per_file:
        mfcc_one_speaker = np.vstack(mfcc_per_file)
    else:
        # No files for this speaker: keep a well-formed, empty feature matrix.
        mfcc_one_speaker = np.empty((0, num_mfcc))
    mfcc_all_speakers.append(mfcc_one_speaker)

In [35]:
#this list is used to store the MFCC features of all test data of all speakers
test_mfcc_all_speakers = []
hop_duration = 0.015 #15ms
num_mfcc = 12
for s in speakers:
    sub_path = path + 'Test/' + s + '/'
    # Sorted listing keeps the row order of the stacked features reproducible.
    sub_file_names = [os.path.join(sub_path, f) for f in sorted(os.listdir(sub_path))]
    # Extract every file first and stack once; calling np.vstack inside the
    # loop re-copies all previously accumulated rows on each iteration.
    mfcc_per_file = [mfcc_extraction(fn, hop_duration, num_mfcc) for fn in sub_file_names]
    if mfcc_per_file:
        mfcc_one_speaker = np.vstack(mfcc_per_file)
    else:
        # No files for this speaker: keep a well-formed, empty feature matrix.
        mfcc_one_speaker = np.empty((0, num_mfcc))
    test_mfcc_all_speakers.append(mfcc_one_speaker)

In [16]:
import pickle
# Create the output folder if needed; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('TrainingFeatures', exist_ok=True)
# Write one pickle file per speaker holding that speaker's stacked MFCC matrix.
for i, speaker_name in enumerate(speakers):
    with open('TrainingFeatures/' + speaker_name + '_mfcc.fea', 'wb') as f: #'wb' is for binary write
        pickle.dump(mfcc_all_speakers[i], f)

In [27]:
n_components = 5 #number of mixture components passed to learningGMM
max_iter = 50 #iteration cap passed to learningGMM
#list of GMMs, each is for a speaker; gmms[k] is trained on mfcc_all_speakers[k]
gmms = [
    learningGMM(features = mfcc_all_speakers[k],
                n_components = n_components,
                max_iter = max_iter)
    for k in range(len(speakers))
]

In [32]:
# Create the output folder if needed; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('Models', exist_ok=True)
# Write one pickle file per speaker holding that speaker's trained GMM.
for i, speaker_name in enumerate(speakers):
    with open('Models/' + speaker_name + '.gmm', 'wb') as f: #'wb' is for binary write
        pickle.dump(gmms[i], f)

In [34]:
# Rebuild the model list from disk, one file per entry of `speakers`, so that
# gmms[k] is the model stored under speakers[k].
# NOTE: pickle.load executes code embedded in the file; only load model files
# that were produced by this notebook.
gmms = []
for speaker_name in speakers:
    model_path = 'Models/' + speaker_name + '.gmm'
    with open(model_path, 'rb') as model_file: #'rb' is for binary read
        gmms.append(pickle.load(model_file))

---- 

In [78]:
def speaker_recognition(audio_file_name, gmms):
    """Return the position in ``gmms`` of the model that scores the file highest.

    MFCC features are extracted from ``audio_file_name`` using the
    notebook-level ``hop_duration`` and ``num_mfcc`` settings. Each model's
    ``score`` method is then evaluated on those features, and the index of
    the largest score (as given by ``np.argmax``) is returned.
    """
    features = mfcc_extraction(audio_file_name, hop_duration, num_mfcc)
    scores = [model.score(features) for model in gmms]
    return np.argmax(scores)

In [79]:
# Spot check on a single test recording: print the name of the predicted speaker.
sample_file = 'SpeakerData/Test/Ara/a0522.wav'
speaker_id = speaker_recognition(sample_file, gmms)
print(speakers[speaker_id])

Ara


----

In [114]:
import os
path = 'SpeakerData/'
# Parallel lists: test_file_names[k] is a file path under Test/<speaker>/ and
# test_file_labels[k] is the index of that speaker in `speakers`.
test_file_names = []
test_file_labels = []
for label, speaker_name in enumerate(speakers):
    speaker_dir = path + 'Test/' + speaker_name + '/'
    entry_paths = [os.path.join(speaker_dir, entry) for entry in os.listdir(speaker_dir)]
    test_file_names.extend(entry_paths)
    test_file_labels.extend([label] * len(entry_paths))

In [133]:
# Map every numeric test label back to its speaker name.
truelabels = [speakers[label] for label in test_file_labels]
print(truelabels)

['Anthony', 'Anthony', 'Anthony', 'Anthony', 'Anthony', 'Anthony', 'Anthony', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Azmisov', 'Azmisov', 'Azmisov', 'Azmiso

In [107]:
testmfccfeatures = []
# Predicted speaker name for every test file, in the order of test_file_names.
predictlab = []
for test_file in test_file_names:
    speaker_id = speaker_recognition(test_file, gmms)
    predictlab.append(speakers[speaker_id])
print(predictlab)

['Anthony', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'AppleEater', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Ara', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Argail', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Ariyan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Arjuan', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Artem', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Arthur', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Artk', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arun', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Arvala', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asalkeld', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asladic', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Asp', 'Azmisov', 'Azmisov', 'Azmisov', 'Azmisov', 'A

In [134]:
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# Pin the class order to `speakers`: row/column k of the matrix then always
# refers to speakers[k]. Without `labels`, scikit-learn orders the classes by
# sorted label value and leaves out any class absent from both lists.
cm = confusion_matrix(truelabels, predictlab, labels=speakers)
# Fraction of test files whose predicted name equals the true name.
acc = accuracy_score(truelabels, predictlab)
print(cm)
print(acc*100) # accuracy as a percentage

[[1 0 0 0 0 0 0 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 0 0 1 