In [1]:
#!/usr/bin/env python3
"""
A file for preprocessing audio recordings from tarteel.io for input into
TensorFlow models.

Filter bank and MFCC background referenced from
[1] https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
and
http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/.

Tensorflow implementation details inspired by API at
https://www.tensorflow.org/api_guides/python/contrib.signal.

Author: Hamzah Khan
Date: Jan. 12, 2019
"""
# Argument constants.
ALL_SURAHS = 0
NUM_SURAHS = 114

OUTPUT_MFCC = "mfcc"
OUTPUT_MEL_FILTER_BANK = "mel_filter_bank"
OUTPUT_LOG_MEL_FILTER_BANK = "log_mel_filter_bank"

# Define constants.

# Unsupported sampling frequencies.
SUPPORTED_FREQUENCIES = [8000, 16000, 32000, 48000]

# Select a pre_emphasis coefficient. 
"""
Typical values for the [pre-emphasis] filter coefficient are 0.95 or 0.97.
"""
pre_emphasis_factor_1 = 0.95
pre_emphasis_factor_2 = 0.97
PRE_EMPHASIS_FACTOR = pre_emphasis_factor_1

# Select the frame splitting constants. Note that frames can and should overlap.
"""
"Typical frame sizes in speech processing range from 20 ms to 40 ms with 50% (+/-10%) overlap between consecutive
frames. Popular settings are 25 ms for the frame size, frame_size = 0.025 and a 10 ms stride (15 ms overlap),
frame_stride = 0.01" [1].
"""
FRAME_SIZE_S = 0.025
FRAME_STRIDE_S = 0.01

"""
For the "Short-Time Fourier-Transform (STFT) (over N points),... N is typically 256 or 512." [1]
"""
# Select the number of points used in the Short-Time Fourier Transform.
STFT_NUM_POINTS_1 = 256
STFT_NUM_POINTS_2 = 512
STFT_NUM_POINTS = STFT_NUM_POINTS_2


"""
"typically 40 filters... The Mel-scale aims to mimic the non-linear human ear perception of sound,
by being more discriminative at lower frequencies and less discriminative at higher frequencies.
[1]"
"""
# Select the number of triangular filters to apply to the power spectrum for frequency band extraction.
NUM_TRIANGULAR_FILTERS = 40

# Select the default number of mel-frequency cepstral coefficents to reduce to from filter banks. This number must be
# less than the number of filters.
NUM_MFCCS = 13

In [7]:
import functools
import json
import numpy as np
import os
import recording_utils
import scipy.io.wavfile
import tensorflow as tf
print(tf.__version__)

from tensorflow.contrib import signal as tf_signal

1.12.0


In [8]:
import tensorflow.contrib.signal


In [9]:
print(tensorflow.contrib.signal.stft)

<function stft at 0xb3a4adf28>


In [None]:
sess=tf.InteractiveSession()

In [11]:
sample_rate_hz, signal_np = scipy.io.wavfile.read("../.audio/s1/a7/1_7_2212954817.wav")
signal = tf.convert_to_tensor(signal_np.transpose(), tf.float32)
print(signal.shape)

frames = tf.contrib.signal.frame(signal, frame_length=400, frame_step=100)

stfts = tf_signal.stft(frames, frame_length=400, frame_step=100, fft_length=512, window_fn=tf.contrib.signal.hamming_window)

magnitude_spectrograms = tf.abs(stfts)
num_spectrogram_bins = 257

# Compute the conversion matrix to mel-frequency space.
linear_to_mel_weight_matrix = tf.contrib.signal.linear_to_mel_weight_matrix(num_mel_bins=40,
                                                                            num_spectrogram_bins=num_spectrogram_bins,
                                                                            sample_rate=sample_rate_hz,
                                                                            lower_edge_hertz=80.0,
                                                                            upper_edge_hertz=24000.00,
                                                                            dtype=tf.float32)

mel_spectrograms = tf.tensordot(magnitude_spectrograms, linear_to_mel_weight_matrix, 1)
mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))

mel_spectrograms.eval()


(2, 180224)


ValueError: Cannot evaluate tensor using `eval()`: No default session is registered. Use `with sess.as_default()` or pass an explicit session to `eval(session=sess)`