-
Notifications
You must be signed in to change notification settings - Fork 66
/
ModelsTrainer.py
146 lines (116 loc) · 5.81 KB
/
ModelsTrainer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import pickle
import warnings
import numpy as np
from FeaturesExtractor import FeaturesExtractor
from hmmlearn import hmm
import pydub
import subprocess
import speech_recognition as sr
from pydub import AudioSegment
from subprocess import Popen, PIPE
from pydub.silence import split_on_silence, detect_nonsilent
warnings.filterwarnings("ignore")
class ModelsTrainer:
    """Train per-gender voice models (stacked HMM mean super-vectors) from
    labeled training folders and save them to disk with pickle."""

    def __init__(self, females_files_path, males_files_path):
        # Directories holding the raw training samples for each gender.
        self.females_training_path = females_files_path
        self.males_training_path = males_files_path
        self.features_extractor = FeaturesExtractor()

    def ffmpeg_silence_eliminator(self, input_path, output_path):
        """
        Eliminate silence from a voice file using ffmpeg.

        Args:
            input_path  (str): Path of the original voice file.
            output_path (str): Path to save the silence-filtered file to.

        Returns:
            tuple: (int16 numpy array of the filtered audio samples,
                    duration of the filtered file as a string from ffprobe,
                    possibly empty if ffprobe failed).
        """
        # Strip silence, force mono, and trim to the first 90 seconds.
        filter_command = ["ffmpeg", "-i", input_path, "-af",
                          "silenceremove=1:0:0.05:-1:1:-36dB",
                          "-ac", "1", "-ss", "0", "-t", "90", output_path, "-y"]
        subprocess.Popen(filter_command, stdout=subprocess.PIPE,
                         stderr=subprocess.STDOUT).wait()

        def _duration(path):
            # Query the container duration with an argument list instead of a
            # shell-interpolated string: paths containing quotes or spaces can
            # no longer break the command or inject into a shell.
            probe = subprocess.run(
                ["ffprobe", "-i", path, "-show_entries", "format=duration",
                 "-v", "quiet", "-of", "csv=p=0"],
                stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
            return probe.stdout.decode("utf-8", errors="replace")

        with_silence_duration = _duration(input_path)
        no_silence_duration = _duration(output_path)

        # Print duration specs; ffprobe may return an empty string on failure.
        try:
            print("%-32s %-7s %-50s" % ("ORIGINAL SAMPLE DURATION", ":", float(with_silence_duration)))
            print("%-23s %-7s %-50s" % ("SILENCE FILTERED SAMPLE DURATION", ":", float(no_silence_duration)))
        except ValueError:
            print("WaveHandlerError: Cannot convert duration to float", with_silence_duration, no_silence_duration)

        # Convert the filtered file to wav on stdout and read the raw samples.
        load_command = ["ffmpeg", "-i", output_path, "-f", "wav", "-"]
        p = Popen(load_command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
        data = p.communicate()[0]
        # Skip past the RIFF 'data' chunk marker to the PCM payload.
        audio_np = np.frombuffer(data[data.find(b'\x00data') + 9:], np.int16)
        # The temp silence-free file is intentionally kept on disk; callers
        # such as collect_features() delete it themselves.
        return audio_np, no_silence_duration

    def process(self):
        """Fit a 16-state GaussianHMM per training file, stack the state means
        into per-gender super-vector matrices, and save both models."""
        females, males = self.get_file_paths(self.females_training_path,
                                             self.males_training_path)
        files = females + males
        # Collect voice features, keyed by the gender label of each file.
        features = {"female": np.asarray(()), "male": np.asarray(())}
        for file in files:
            print("%10s %8s %1s" % ("--> TESTING", ":", os.path.basename(file)))
            print(features["female"].shape, features["male"].shape)
            try:
                # Extract MFCC & delta-MFCC features from the audio file.
                vector = self.features_extractor.extract_features(file)
                spk_hmm = hmm.GaussianHMM(n_components=16)
                spk_hmm.fit(vector)
                spk_vec = spk_hmm.means_
                # Gender label = parent directory name minus the trailing 's':
                # "TrainingData/females/x.wav" -> "female". os.path is used so
                # this also works with OS-native separators.
                gender = os.path.basename(os.path.dirname(file))[:-1]
                print(gender)
                # Stack this speaker's super vector onto the gender matrix.
                if features[gender].size == 0:
                    features[gender] = spk_vec
                else:
                    features[gender] = np.vstack((features[gender], spk_vec))
            except Exception as err:
                # Best-effort: skip files that cannot be read or fitted,
                # but report why instead of swallowing the error silently.
                print("Skipping", file, ":", err)
        # Save models.
        self.save_gmm(features["female"], "females")
        self.save_gmm(features["male"], "males")

    def get_file_paths(self, females_training_path, males_training_path):
        """Return (female_paths, male_paths): every entry of each folder,
        joined onto its folder path, in os.listdir order."""
        females = [os.path.join(females_training_path, f)
                   for f in os.listdir(females_training_path)]
        males = [os.path.join(males_training_path, f)
                 for f in os.listdir(males_training_path)]
        return females, males

    def collect_features(self, files):
        """
        Collect voice features from various speakers of the same gender.

        Args:
            files (list): List of voice file paths.

        Returns:
            (array): Extracted features matrix (vertically stacked per file).
        """
        features = np.asarray(())
        # Extract features for each speaker.
        for file in files:
            print("%5s %10s" % ("PROCESSNG ", file))
            silence_free = file.split('.')[0] + "_without_silence.wav"
            self.ffmpeg_silence_eliminator(file, silence_free)
            try:
                # Extract MFCC & delta-MFCC features from the filtered audio.
                vector = self.features_extractor.extract_features(silence_free)
                # Stack the features.
                if features.size == 0:
                    features = vector
                else:
                    features = np.vstack((features, vector))
            except Exception as err:
                # Best-effort: skip files whose features cannot be extracted.
                print("Skipping", file, ":", err)
            finally:
                # Always remove the temporary file; guard against ffmpeg
                # having failed to create it, which would crash os.remove.
                if os.path.exists(silence_free):
                    os.remove(silence_free)
        return features

    def save_gmm(self, gmm, name):
        """
        Save a trained model next to this module as "<name>.hmm" via pickle.

        Args:
            gmm : Model object (here the stacked mean-vector matrix).
            name (str) : Base file name without extension.
        """
        # os.path.join handles the empty-dirname case (script run from its own
        # directory), where naive "dir + '/' + name" would produce "/name.hmm".
        filename = os.path.join(os.path.dirname(__file__), name + ".hmm")
        with open(filename, 'wb') as gmm_file:
            pickle.dump(gmm, gmm_file)
        print("%5s %10s" % ("SAVING", filename))
if __name__ == "__main__":
    # Script entry point: train gender models from the bundled training data.
    trainer = ModelsTrainer("TrainingData/females", "TrainingData/males")
    trainer.process()