diff --git a/README.md b/README.md index 470a9384db..a66906e29c 100755 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ TensorFlowASR implements some automatic speech recognition architectures such as ## What's New? +- (11/3/2020) Reduce differences between `librosa.stft` and `tf.signal.stft` - (10/31/2020) Update DeepSpeech2 and Supported Jasper [https://arxiv.org/abs/1904.03288](https://arxiv.org/abs/1904.03288) - (10/18/2020) Supported Streaming Transducer [https://arxiv.org/abs/1811.06621](https://arxiv.org/abs/1811.06621) - (10/15/2020) Add gradients accumulation and Refactor to TensorflowASR diff --git a/setup.py b/setup.py index 76f20fcf05..1fbdda09f5 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ setuptools.setup( name="TensorFlowASR", - version="0.2.9", + version="0.2.10", author="Huy Le Nguyen", author_email="nlhuy.cs.16@gmail.com", description="Almost State-of-the-art Automatic Speech Recognition using Tensorflow 2", diff --git a/tensorflow_asr/featurizers/speech_featurizers.py b/tensorflow_asr/featurizers/speech_featurizers.py index 03e4470787..25283d4a8e 100755 --- a/tensorflow_asr/featurizers/speech_featurizers.py +++ b/tensorflow_asr/featurizers/speech_featurizers.py @@ -245,7 +245,7 @@ def shape(self) -> list: def stft(self, signal): return np.square( np.abs(librosa.core.stft(signal, n_fft=self.nfft, hop_length=self.frame_step, - win_length=self.frame_length, center=True, window="hann"))) + win_length=self.frame_length, center=False, window="hann"))) def power_to_db(self, S, ref=1.0, amin=1e-10, top_db=80.0): return librosa.power_to_db(S, ref=ref, amin=amin, top_db=top_db) @@ -302,7 +302,7 @@ def compute_pitch(self, signal: np.ndarray) -> np.ndarray: pitches, _ = librosa.core.piptrack( y=signal, sr=self.sample_rate, n_fft=self.nfft, hop_length=self.frame_step, - fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=True + fmin=0.0, fmax=int(self.sample_rate / 2), win_length=self.frame_length, center=False ) pitches = pitches.T diff --git a/tests/speech_featurizer_test.py b/tests/speech_featurizer_test.py index a7934defbf..b4ae71ba39 100755 --- a/tests/speech_featurizer_test.py +++ b/tests/speech_featurizer_test.py @@ -25,25 +25,11 @@ import matplotlib.pyplot as plt from tensorflow_asr.featurizers.speech_featurizers import read_raw_audio, \ TFSpeechFeaturizer, NumpySpeechFeaturizer -from tensorflow_asr.augmentations.augments import UserAugmentation def main(argv): speech_file = argv[1] feature_type = argv[2] - augments = { - # "after": { - # "time_masking": { - # "num_masks": 10, - # "mask_factor": 100, - # "p_upperbound": 0.05 - # }, - # "freq_masking": { - # "mask_factor": 27 - # } - # }, - } - au = UserAugmentation(augments) speech_conf = { "sample_rate": 16000, "frame_ms": 25, @@ -57,9 +43,13 @@ def main(argv): } signal = read_raw_audio(speech_file, speech_conf["sample_rate"]) - sf = NumpySpeechFeaturizer(speech_conf) + nsf = NumpySpeechFeaturizer(speech_conf) + sf = TFSpeechFeaturizer(speech_conf) + ft = nsf.stft(signal) + print(ft.shape, np.mean(ft)) + ft = sf.stft(signal).numpy() + print(ft.shape, np.mean(ft)) ft = sf.extract(signal) - ft = au["after"].augment(ft)[:, :, 0] plt.figure(figsize=(16, 2.5)) ax = plt.gca()