In [6]:
import tensorflow as tf
import librosa
import os
import random
import numpy as np
import data_loader
import config

from tensorflow.keras.layers import Conv1D, MaxPool1D
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout

In [7]:
# Load the project configuration and restrict it to the single feature used as the label.
conf = config.get_config()
# Feature name; data_generator reads this global to look up each example's score (value[f]).
f = "soft_vs_hard"
conf["features"] = [f]
# Iterated with .items() by the split cell, so presumably a dict of example key -> feature
# values -- TODO confirm against data_loader.
examples = data_loader.data_loader(conf)

2021-05-08 22:51:22.936 | INFO     | data_loader:data_loader:29 - Loading csv and checking audio files
2021-05-08 22:51:22.948 | INFO     | data_loader:data_loader:32 - Creating dataset


In [8]:
# Randomly assign roughly 30% of the examples to validation and the rest to training.
# A dedicated, seeded RNG keeps the split identical across kernel restarts, so the
# reported split sizes and validation metrics are reproducible, and it does not
# disturb the global `random` state used elsewhere in the notebook.
SPLIT_SEED = 42
VALID_PERCENT = 30

split_rng = random.Random(SPLIT_SEED)

train = {}
valid = {}

for key, value in examples.items():
    # randint is inclusive on both ends: 100 equally likely outcomes, 30 of which
    # (0..29) send the example to the validation set.
    if split_rng.randint(0, 99) < VALID_PERCENT:
        valid[key] = value
    else:
        train[key] = value

In [5]:
# Report how many examples ended up in the training and validation splits.
print(*(len(split) for split in (train, valid)))

1357 533


In [27]:
def data_generator(dataset, batch_size = 4, window_size = 1024):
    """Yield endless random batches of raw-audio windows with one-hot class labels.

    Each sample is a random crop of ``window_size`` samples from a randomly
    chosen ``<base_dir>/<key>.wav`` file, loaded as 16 kHz mono. The label is
    derived from the example's score for the global feature ``f``:
    class 0 if the score is below 35, class 2 if above 65, class 1 otherwise.

    Relies on the notebook-level globals ``conf`` (for ``base_dir``) and ``f``.

    Args:
        dataset: Mapping of example key -> feature values (indexed by ``f``).
        batch_size: Number of samples per yielded batch.
        window_size: Number of audio samples per crop. Must match the model's
            input length (1024 for ``create_model``).

    Yields:
        Tuple ``(x_batch, y_batch)`` with shapes ``(batch_size, window_size, 1)``
        and ``(batch_size, 3)``.

    Raises:
        ValueError: If ``dataset`` is empty (raised on first iteration).
    """
    examples = list(dataset.items())
    if not examples:
        raise ValueError("data_generator received an empty dataset")

    while True:
        x_batch = np.zeros((batch_size, window_size, 1))
        y_batch = np.zeros((batch_size, 3))

        for i in range(batch_size):
            key, value = random.choice(examples)
            file_name = os.path.join(conf.get("base_dir"), f"{key}.wav")
            audio, _ = librosa.load(file_name, sr=16000, mono=True)

            # Derive the crop range from the actual clip length instead of assuming
            # every file has exactly 32000 samples: longer clips are sampled over
            # their full duration, and shorter clips no longer produce a short
            # slice that fails to fit into the batch row.
            last_start = max(len(audio) - window_size, 0)
            start_index = random.randint(0, last_start)
            clip = audio[start_index: start_index + window_size]
            # Clips shorter than the window keep the trailing zeros already in x_batch.
            x_batch[i, :len(clip), 0] = clip

            # Bucket the continuous score into three classes.
            c = 1
            if value[f] < 35:
                c = 0
            elif value[f] > 65:
                c = 2
            y_batch[i, c] = 1.
        yield x_batch, y_batch

In [28]:
def create_model():
    """Build and compile the 1-D convolutional classifier for raw-audio windows.

    The network takes a ``(1024, 1)`` input, applies six Conv1D + MaxPool1D
    blocks (kernel size 64; the first block uses stride 4, the rest stride 1),
    then a dropout-regularised dense head ending in a 3-way softmax.

    Returns:
        A ``tf.keras`` model compiled with categorical cross-entropy loss,
        the Adam optimizer and an accuracy metric.
    """
    def conv_block(inputs, filters, kernel_size, strides):
        """Apply a same-padded ReLU Conv1D followed by 2x max pooling."""
        convolved = Conv1D(
            filters,
            kernel_size,
            strides=strides,
            padding='same',
            activation='relu',
        )(inputs)
        return MaxPool1D(2)(convolved)

    _input = Input(shape=(1024, 1))

    # (filters, strides) for each convolutional block, in order.
    block_specs = [(64, 4), (128, 1), (128, 1), (128, 1), (256, 1), (512, 1)]

    features = _input
    for filters, strides in block_specs:
        features = conv_block(features, filters, 64, strides)

    # Classification head.
    hidden = Flatten()(features)
    hidden = Dropout(0.4)(hidden)
    hidden = Dense(128, activation="relu")(hidden)
    hidden = Dropout(0.4)(hidden)
    probabilities = Dense(3, activation="softmax")(hidden)

    model = tf.keras.models.Model(_input, probabilities)
    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return model

In [29]:
# Instantiate a freshly initialised, compiled model.
m = create_model()

In [30]:
# Print the layer-by-layer architecture, output shapes and parameter counts.
m.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1024, 1)]         0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 256, 64)           4160      
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 128, 64)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 128, 128)          524416    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 64, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 64, 128)           1048704   
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 128)          

In [33]:
# Both generators are infinite, so the epoch length is set explicitly via
# steps_per_epoch / validation_steps rather than by dataset size.
train_batches = data_generator(train, 4)
valid_batches = data_generator(valid, 4)

# Stop after 12 epochs without val_loss improvement; lower the learning rate
# after 5 such epochs.
training_callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=12),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=5, verbose=True),
]

_ = m.fit(
    train_batches,
    steps_per_epoch=2000,
    validation_data=valid_batches,
    validation_steps=500,
    epochs=100,
    callbacks=training_callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 7/100

KeyboardInterrupt: 