## TODO
* remove as_strided from preprocessing, because it's very unstable
* concat real time signal with old data, because CREPE uses big 1d convolutions (512 len), and it needs data from left side

In [1]:
import os
import re
import sys

from scipy.io import wavfile
import numpy as np
from numpy.lib.stride_tricks import as_strided

import sounddevice as sd

In [2]:
from tensorflow.keras.layers import Input, Reshape, Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
from tensorflow.keras.models import Model


2021-08-07 23:05:51.328080: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-07 23:05:51.328108: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [30]:
MODEL_SR = 16000
USE_CENTER_PAD = True

CALLS_PER_SEQ = 4  # TODO: increase to 10-20 (should be smooth)
BLOCK_SIZE = MODEL_SR // CALLS_PER_SEQ



In [4]:
def build_and_load_model(model_capacity, filename):
    """
    Build the CNN model and load the weights
    Parameters
    ----------
    model_capacity : 'tiny', 'small', 'medium', 'large', or 'full'
        String specifying the model capacity, which determines the model's
        capacity multiplier to 4 (tiny), 8 (small), 16 (medium), 24 (large),
        or 32 (full). 'full' uses the model size specified in the paper,
        and the others use a reduced number of filters in each convolutional
        layer, resulting in a smaller model that is faster to evaluate at the
        cost of slightly reduced pitch estimation accuracy.
    Returns
    -------
    model : tensorflow.keras.models.Model
        The pre-trained keras model loaded in memory
    """

    capacity_multiplier = {
        'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
    }[model_capacity]

    layers = [1, 2, 3, 4, 5, 6]
    filters = [n * capacity_multiplier for n in [32, 4, 4, 4, 8, 16]]
    widths = [512, 64, 64, 64, 64, 64]
    strides = [(4, 1), (1, 1), (1, 1), (1, 1), (1, 1), (1, 1)]

    x = Input(shape=(1024,), name='input', dtype='float32')
    y = Reshape(target_shape=(1024, 1, 1), name='input-reshape')(x)

    for l, f, w, s in zip(layers, filters, widths, strides):
        y = Conv2D(f, (w, 1), strides=s, padding='same',
                   activation='relu', name="conv%d" % l)(y)
        y = BatchNormalization(name="conv%d-BN" % l)(y)
        y = MaxPool2D(pool_size=(2, 1), strides=None, padding='valid',
                      name="conv%d-maxpool" % l)(y)
        y = Dropout(0.25, name="conv%d-dropout" % l)(y)

    y = Permute((2, 1, 3), name="transpose")(y)
    y = Flatten(name="flatten")(y)
    y = Dense(360, activation='sigmoid', name="classifier")(y)

    model = Model(inputs=x, outputs=y)

    model.load_weights(filename)
    model.compile('adam', 'binary_crossentropy')

    return model

In [5]:
model = build_and_load_model("tiny", "../models/model-tiny.h5")

2021-08-07 23:05:54.153073: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-08-07 23:05:54.153131: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-08-07 23:05:54.153160: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gldsn-hw): /proc/driver/nvidia/version does not exist
2021-08-07 23:05:54.153546: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x7faf9847d2b0>

In [122]:
def process_micro_audio(signal, frames, time, status):
    del frames, time, status
    signal = signal.astype(np.float32)
    # pad so that frames are centered around their timestamps (i.e. first frame
    # is zero centered).
#     if USE_CENTER_PAD:
#         signal = np.pad(signal, 512, mode='constant', constant_values=0)
        
    # make 1024-sample frames of the audio with hop length of 10 milliseconds
    step_size = 10
    hop_length = int(MODEL_SR * step_size / 1000)
    n_frames = 1 + int((len(signal) - 1024) / hop_length)
    
#     print(hop_length, n_frames, len(signal) // n_frames)
#     print("itemsize", (signal.itemsize, hop_length * signal.itemsize))
#     frames = as_strided(signal, shape=(1024, n_frames),
#                         strides=(signal.itemsize, hop_length * signal.itemsize))
#     frames = frames.transpose().copy()
    split_idxs = np.arange(1024, len(signal), 1024)
    frames = np.split(signal, split_idxs)

    last = frames[-1]
    if len(last) < 1024:
        need_to_pad = 1024 - len(last)
        right_zeros = need_to_pad // 2
        left_zeros = need_to_pad - right_zeros
        
        frames[-1] = np.concatenate([np.zeros((left_zeros, 1)), last, np.zeros((right_zeros, 1))])
        
    frames = np.concatenate(frames, axis=1)
    

    # normalize each frame -- this is expected by the model
    frames -= np.mean(frames, axis=1)[:, np.newaxis]
    frames /= np.std(frames, axis=1)[:, np.newaxis]

In [123]:
with sd.InputStream(
                    device=6,
                    channels=1, 
                    callback=process_micro_audio,
                    blocksize = BLOCK_SIZE,
                    samplerate = MODEL_SR,
                   ):
    while True:
        pass
#         response = input()
#         if response in ('', 'q', 'Q'):
#             break


[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.00028427806682884693, -0.00023258355213329196, -0.0006642674561589956, -0.0010155584895983338, -0.000752815802115947, -0.0009193705627694726, -0.0011547489557415247, -0.0004204581491649151, -0.00011848619760712609, -0.00030270370189100504, -0.0001690186618361622, -0.0003154212317895144, 0.0005489490577019751, 0.0015600172337144613, 0.0009836333338171244, 0.0001221410057041794, 0.00038523803232237697, -7.36740548745729e-06, -0.0007605932187289, 0.0005516889505088329, 0.0006948462687432766, -0.0005800567450933158, -0.0002064119908027351, -0.00016766840417403728, -0.0010801729513332248, -0.0009149691322818398, -0.0005286458181217313, -0.0010722329607233405, -0.0009310862515121698, -0.0004596635408233851, -0.0008082840358838439, -0.001009028404951095

KeyboardInterrupt: 

In [52]:
sd.query_devices()

  0 HD-Audio Generic: HDMI 0 (hw:0,3), ALSA (0 in, 8 out)
  1 HD-Audio Generic: HDMI 1 (hw:0,7), ALSA (0 in, 8 out)
  2 HD-Audio Generic: HDMI 2 (hw:0,8), ALSA (0 in, 8 out)
  3 HD-Audio Generic: ALC256 Analog (hw:1,0), ALSA (2 in, 2 out)
  4 hdmi, ALSA (0 in, 8 out)
  5 pulse, ALSA (32 in, 32 out)
* 6 default, ALSA (32 in, 32 out)