diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..c1eaaa6 Binary files /dev/null and b/.DS_Store differ diff --git a/Code/.DS_Store b/Code/.DS_Store new file mode 100644 index 0000000..5008ddf Binary files /dev/null and b/Code/.DS_Store differ diff --git a/Code/ModelsTrainer.py b/Code/ModelsTrainer.py index dc16da2..ffe7543 100644 --- a/Code/ModelsTrainer.py +++ b/Code/ModelsTrainer.py @@ -2,7 +2,7 @@ import pickle import warnings import numpy as np -from sklearn.mixture import GMM +from sklearn.mixture import GaussianMixture as GMM from FeaturesExtractor import FeaturesExtractor from SilenceEliminator import SilenceEliminator @@ -52,7 +52,7 @@ print("ValueError: Shape mismatch") # adapt gmm - gmm = GMM(n_components = 16, n_iter = 200, covariance_type='diag', n_init = 3) + gmm = GMM(n_components = 16, covariance_type='diag', n_init = 3) gmm.fit(features) # dumping the trained gaussian model diff --git a/Code/SilenceEliminator.py b/Code/SilenceEliminator.py index ae87796..2ba629e 100644 --- a/Code/SilenceEliminator.py +++ b/Code/SilenceEliminator.py @@ -7,6 +7,8 @@ import subprocess import numpy as np from subprocess import Popen, PIPE +import scipy.io.wavfile + class SilenceEliminator: @@ -26,8 +28,8 @@ def ffmpeg_silence_eliminator(self, input_path, output_path): representing the certainty of the decision. """ # filter silence in mp3 file - filter_command = ["ffmpeg", "-i", input_path, "-af", "silenceremove=1:0:0.05:-1:1:-36dB", "-ac", "1", "-ss", "0","-t","90", output_path, "-y"] - out = subprocess.Popen(filter_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + filter_command = "ffmpeg -i "+ input_path +" -af silenceremove=1:0:-36dB "+"-ac"+" 1"+" -ss"+" 0"+" -t"+" 90 " + output_path + " -y" + out = subprocess.Popen(filter_command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True) out.wait() with_silence_duration = os.popen("ffprobe -i '" + input_path + "' -show_format -v quiet | sed -n 's/duration=//p'").read() @@ -41,11 +43,8 @@ def ffmpeg_silence_eliminator(self, input_path, output_path): print("Cannot convert float to string") # convert file to wave and read array - load_command = ["ffmpeg", "-i", output_path, "-f", "wav", "-" ] - p = Popen(load_command, stdin=PIPE, stdout=PIPE, stderr=PIPE) - data = p.communicate()[0] - audio_np = np.frombuffer(data[data.find(b'\x00data')+ 9:], np.int16) + sample_rate, signal = scipy.io.wavfile.read(output_path) # delete temp silence free file, as we only need the array os.remove(output_path) - return audio_np, no_silence_duration + return signal, no_silence_duration \ No newline at end of file diff --git a/Code/SpeakerIdentifier.py b/Code/SpeakerIdentifier.py index 874b901..ea9119c 100644 --- a/Code/SpeakerIdentifier.py +++ b/Code/SpeakerIdentifier.py @@ -2,8 +2,8 @@ import pickle import warnings import numpy as np -from SilenceEliminator import SilenceEliminator from FeaturesExtractor import FeaturesExtractor +import scipy.io.wavfile warnings.filterwarnings("ignore") @@ -32,11 +32,9 @@ for path in file_paths[:]: if os.path.basename(path).split('_')[0] in db.keys(): features_extractor = FeaturesExtractor() - silence_eliminator = SilenceEliminator() - silence_eliminated_wave_file_path ="temp-" + os.path.basename(path).split('.')[0] + ".wav" - audio, duration_string = silence_eliminator.ffmpeg_silence_eliminator(path, silence_eliminated_wave_file_path) - vector = features_extractor.accelerated_get_features_vector(path, audio, 8000) + sample_rate, signal = scipy.io.wavfile.read(path) + vector = features_extractor.accelerated_get_features_vector(path, signal, 8000) if vector.shape != (0,): print(vector.shape) diff --git a/README.md b/README.md index 8ceeca1..06ae2e1 100644 --- a/README.md +++ b/README.md @@ -41,4 +41,4 @@ This script require the follwing modules/libraries: ## Results and disscussion - The code can be further optimized using multi-threading, acceleration libs and multi-processing. -- The accuracy can be further improved using GMM normalization aka a UBM-GMM system. +- The accuracy can be further improved using GMM normalization aka a UBM-GMM system. \ No newline at end of file diff --git a/Run.py b/Run.py index 58963b1..b8b01d9 100644 --- a/Run.py +++ b/Run.py @@ -4,19 +4,20 @@ if __name__== "__main__": - # download dataset - print("# Download dataset zip file") - zip_url = "http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS.tgz" - urllib.request.urlretrieve(zip_url, 'SLR45.tgz') - - # extract and manage dataset files - print("# Mange and organize files") - os.system('python3 Code/DataManager.py') - - # train speakers gmm models - print("# Train gender models") - os.system('python3 Code/ModelsTrainer.py') - - # test system and recognise/identify speakers - print(" # Identify genders") - os.system('python3 Code/SpeakerIdentifier.py') + + # download dataset + print("# Download dataset zip file") + zip_url = "http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS.tgz" + urllib.request.urlretrieve(zip_url, 'SLR45.tgz') + + # extract and manage dataset files + print("# Mange and organize files") + os.system('python3 Code/DataManager.py') + + # train speakers gmm models + print("# Train gender models") + os.system('python3 Code/ModelsTrainer.py') + + # test system and recognise/identify speakers + print(" # Identify genders") + os.system('python3 Code/SpeakerIdentifier.py')