In [1]:
import tensorflow as tf
import scipy as sp
import numpy as np
import librosa as li
from preprocessing import Audio, get_MFCC
import IPython.display as ipd
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
model = tf.keras.Sequential([
    tf.keras.layers.InputLayer((16000, 1)),
#     tf.keras.layers.Reshape((400, 40))
    tf.keras.layers.Conv1D(filters=1, kernel_size=10, strides=1, padding='same', use_bias=False),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Conv1D(filters=20, kernel_size=100, strides=1, padding='same', use_bias=False),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Conv1D(filters=10, kernel_size=400, strides=400, use_bias=False),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Reshape((1, 400)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.GRU(400),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
], name='RootModel-v3.0')

model.summary()

Model: "RootModel-v3.0"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_3 (Conv1D)            (None, 16000, 1)          10        
_________________________________________________________________
dropout (Dropout)            (None, 16000, 1)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 16000, 20)         2000      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16000, 20)         0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 40, 10)            80000     
_________________________________________________________________
dropout_2 (Dropout)          (None, 40, 10)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 400)            

In [4]:
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[
        tf.keras.metrics.Recall(name='rec'),
        tf.keras.metrics.Precision(name='prec'),
        tf.keras.metrics.AUC(name='auc'),
        tf.keras.metrics.BinaryAccuracy(name='acc'),
    ]
)

In [5]:
model.save('models/conv/{}.h5'.format(model.name))

In [54]:
class AudioEnergies(tf.keras.layers.Layer):
    def __init__(self, num_energies, sr=44100):
        super(AudioEnergies, self).__init__()
        self.num_energies = num_energies
        self.sr = sr
        
    def call(self, inputs):
        coeff = sp.signal.firwin(999, [260, 700], fs=self.sr, pass_zero=False)
        x_filtered = sp.signal.lfilter(coeff, 1.0, inputs.numpy())
        x_normalized = x_filtered / x_filtered.max()
        x_squared = np.square(x_normalized)
        splited = np.array_split(x_squared, self.num_energies)
        e_parts = np.empty((0))
        for part in splited:
            e_parts = np.append(e_parts, sp.integrate.simps(part))
        return tf.Variable(e_parts)

In [55]:
class AudioToLen(tf.keras.layers.Layer):
    def __init__(self, seconds, sr=44100):
        super(AudioToLen, self).__init__()
        self.seconds = seconds
        self.sr = sr
        
    def call(self, inputs):
        inputs, _ = li.effects.trim(inputs)
        inputs = tf.keras.preprocessing.sequence.pad_sequences(
            [inputs],
            maxlen=int(self.sr * self.seconds),
            padding='post',
            truncating='post',
            dtype='float32'
        )[0]
        return tf.Variable(inputs)

In [56]:
get_energies = AudioEnergies(20)
to_len = AudioToLen(1.0)

In [57]:
path = li.util.find_files('AudioData/5 Мира/1 Мира')[10]
audio = Audio(path)

In [58]:
audio.get_energy(audio.data)

array([1.22789420e-01, 4.97587312e+02, 6.28985697e+02, 6.14400374e+02,
       5.04133606e+02, 3.66373191e+02, 2.71443722e+02, 1.60338479e+02,
       4.69507947e+01, 2.50052867e+01, 9.49896980e-02, 7.84737609e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [59]:
get_energies(to_len(audio.data))

<tf.Variable 'audio_energies_9/Variable:0' shape=(20,) dtype=float64, numpy=
array([1.22789420e-01, 4.97587312e+02, 6.28985697e+02, 6.14400374e+02,
       5.04133606e+02, 3.66373191e+02, 2.71443722e+02, 1.60338479e+02,
       4.69507947e+01, 2.50052867e+01, 9.49896980e-02, 7.84737609e-03,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])>

In [63]:
get_MFCC(audio.path)

array([-3.9191589e+02,  1.8042436e+02,  6.3252373e+00,  4.0873180e+01,
        4.7412006e+01,  2.0015783e+01,  6.6600814e+00, -6.7188888e+00,
        7.1545634e+00,  7.3719444e+00,  1.6517767e+00,  6.0478172e+00,
       -3.4924920e+00,  2.6498389e-01,  2.8514979e+00, -1.2290975e+01,
       -1.0826236e+01,  1.5297074e+00,  4.6876464e+00, -1.1106217e+00,
       -5.1002431e+00, -1.4671797e+00,  5.2215022e-01,  1.6060430e+00,
        2.5597248e+00,  2.0427062e-01, -9.3772864e-01, -4.5330834e-01,
       -1.5078343e+00, -5.4215493e+00, -6.8798103e+00, -3.5015306e+00,
       -2.1521769e+00, -4.4268870e+00, -5.8181953e+00, -2.8080699e+00,
       -1.9243237e+00, -5.8705649e+00, -4.7097144e+00, -1.8003622e+00],
      dtype=float32)

In [215]:
def tf_mfcc(audio, padding=False, n_mfcc=None):
    if padding:
        zero_padding = tf.zeros([44100] - tf.shape(audio.data), dtype=tf.float32)
        equal_length = tf.concat([audio.data, zero_padding], 0)
    else:
        equal_length = tf.Variable(audio.data)
        
    spectrograms = tf.signal.stft(equal_length, frame_length=255, frame_step=128)
    spectrograms = tf.abs(spectrograms)
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(1, sample_rate=audio.sr)
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    print(mel_spectrograms.shape)
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
    if n_mfcc is None:
        return mel_spectrograms
    return mel_spectrograms[..., :n_mfcc]

In [216]:
mfccs = tf_mfcc(audio, n_mfcc=20)

(189, 1)


In [217]:
get_MFCC(audio.path)

array([-3.9191589e+02,  1.8042436e+02,  6.3252373e+00,  4.0873180e+01,
        4.7412006e+01,  2.0015783e+01,  6.6600814e+00, -6.7188888e+00,
        7.1545634e+00,  7.3719444e+00,  1.6517767e+00,  6.0478172e+00,
       -3.4924920e+00,  2.6498389e-01,  2.8514979e+00, -1.2290975e+01,
       -1.0826236e+01,  1.5297074e+00,  4.6876464e+00, -1.1106217e+00,
       -5.1002431e+00, -1.4671797e+00,  5.2215022e-01,  1.6060430e+00,
        2.5597248e+00,  2.0427062e-01, -9.3772864e-01, -4.5330834e-01,
       -1.5078343e+00, -5.4215493e+00, -6.8798103e+00, -3.5015306e+00,
       -2.1521769e+00, -4.4268870e+00, -5.8181953e+00, -2.8080699e+00,
       -1.9243237e+00, -5.8705649e+00, -4.7097144e+00, -1.8003622e+00],
      dtype=float32)

In [224]:
li.feature.mfcc(audio.data, audio.sr)

array([[-5.62767883e+02, -5.47699463e+02, -5.29939331e+02,
        -4.74278748e+02, -4.07436798e+02, -3.80634003e+02,
        -3.68292664e+02, -3.61528931e+02, -3.64147125e+02,
        -3.70415009e+02, -3.74785828e+02, -3.59024536e+02,
        -3.38842712e+02, -3.37862976e+02, -3.28354126e+02,
        -3.32942291e+02, -3.47464355e+02, -3.52468048e+02,
        -3.52219269e+02, -3.58187805e+02, -3.58812195e+02,
        -3.62802399e+02, -3.69423676e+02, -3.63715210e+02,
        -3.50365570e+02, -3.37088501e+02, -3.23443848e+02,
        -3.12855652e+02, -3.23266418e+02, -3.50579803e+02,
        -3.40461029e+02, -2.99798615e+02, -2.88007416e+02,
        -3.04983154e+02, -3.36309143e+02, -3.57848297e+02,
        -3.77946503e+02, -3.82922485e+02, -3.67035980e+02,
        -3.84905640e+02, -3.97445312e+02, -4.21553711e+02,
        -4.84827423e+02, -5.24839722e+02, -5.40753357e+02,
        -5.54877197e+02, -5.72177246e+02, -5.73625244e+02],
       [ 8.00271912e+01,  9.31784821e+01,  1.09992462e+