## TODO
* Add timing/performance tests.

In [1]:
from typing import Optional, List

import os
import re
import sys
import time
import datetime

from scipy.io import wavfile
import numpy as np
from numpy.lib.stride_tricks import as_strided

import sounddevice as sd

%matplotlib inline
from IPython import display
import matplotlib.pyplot as plt
import matplotlib as mpl

from tensorflow.keras.layers import Input, Reshape, Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
from tensorflow.keras.models import Model


2021-08-12 12:49:45.612180: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-08-12 12:49:45.612214: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# Sample rate, in Hz, requested from the audio input stream.
MODEL_SR = 16_000

# Number of audio frames read from the stream per loop iteration; equal to
# the 1024-sample input length of the model.
# An earlier variant derived this as MODEL_SR // CALLS_PER_SEC with the aim of
# 10-20 reads per second for a smooth display; 1024 frames at 16 kHz is
# roughly 15.6 reads per second.
BLOCK_SIZE = 1024


In [3]:
# Show the backend names known to this matplotlib install, to pick a GUI
# backend for the live plot.
# NOTE(review): newer matplotlib releases appear to deprecate this attribute in
# favour of matplotlib.backends.backend_registry — confirm against the
# installed version.
mpl.rcsetup.all_backends

['GTK3Agg',
 'GTK3Cairo',
 'MacOSX',
 'nbAgg',
 'Qt4Agg',
 'Qt4Cairo',
 'Qt5Agg',
 'Qt5Cairo',
 'TkAgg',
 'TkCairo',
 'WebAgg',
 'WX',
 'WXAgg',
 'WXCairo',
 'agg',
 'cairo',
 'pdf',
 'pgf',
 'ps',
 'svg',
 'template']

In [4]:
# Switch matplotlib to the Qt5Agg GUI backend.
# NOTE(review): presumably required so the live plot opens in its own window
# and can be redrawn via canvas blitting; `%matplotlib inline` was activated in
# the import cell, so confirm the switch actually takes effect in your setup.
mpl.use("Qt5Agg")

In [5]:
def build_and_load_model(model_capacity, filename):
    """
    Construct the convolutional pitch model and restore its trained weights.

    Parameters
    ----------
    model_capacity : 'tiny', 'small', 'medium', 'large', or 'full'
        Name of the model size. It selects the multiplier applied to the
        per-layer filter counts: 4 (tiny), 8 (small), 16 (medium),
        24 (large) or 32 (full). 'full' is the size specified in the paper;
        the smaller sizes use fewer filters per convolutional layer, which
        makes them faster to evaluate at the cost of slightly reduced pitch
        estimation accuracy.
    filename : str
        Path of the weights file passed to ``model.load_weights``.

    Returns
    -------
    model : tensorflow.keras.models.Model
        The compiled Keras model with the weights loaded.

    Raises
    ------
    KeyError
        If ``model_capacity`` is not one of the names listed above.
    """
    multipliers = {
        'tiny': 4, 'small': 8, 'medium': 16, 'large': 24, 'full': 32
    }
    scale = multipliers[model_capacity]

    # One entry per convolutional block, in order (blocks are numbered from 1):
    # (base filter count, kernel height, strides).
    block_specs = [
        (32, 512, (4, 1)),
        (4, 64, (1, 1)),
        (4, 64, (1, 1)),
        (4, 64, (1, 1)),
        (8, 64, (1, 1)),
        (16, 64, (1, 1)),
    ]

    inputs = Input(shape=(1024,), name='input', dtype='float32')
    net = Reshape(target_shape=(1024, 1, 1), name='input-reshape')(inputs)

    for index, (base_filters, kernel_height, stride) in enumerate(block_specs, start=1):
        conv = Conv2D(
            base_filters * scale,
            (kernel_height, 1),
            strides=stride,
            padding='same',
            activation='relu',
            name="conv%d" % index,
        )
        norm = BatchNormalization(name="conv%d-BN" % index)
        pool = MaxPool2D(
            pool_size=(2, 1),
            strides=None,
            padding='valid',
            name="conv%d-maxpool" % index,
        )
        drop = Dropout(0.25, name="conv%d-dropout" % index)
        net = drop(pool(norm(conv(net))))

    net = Permute((2, 1, 3), name="transpose")(net)
    net = Flatten(name="flatten")(net)
    outputs = Dense(360, activation='sigmoid', name="classifier")(net)

    model = Model(inputs=inputs, outputs=outputs)

    # Weights are restored before compiling, as in the original ordering.
    model.load_weights(filename)
    model.compile('adam', 'binary_crossentropy')

    return model

In [6]:
# Build the smallest ("tiny") model variant and load its weights from the
# models directory one level above the notebook's working directory.
model = build_and_load_model(
    model_capacity="tiny",
    filename="../models/model-tiny.h5",
)

2021-08-12 11:22:26.710123: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-08-12 11:22:26.710166: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2021-08-12 11:22:26.710199: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (gldsn-hw): /proc/driver/nvidia/version does not exist
2021-08-12 11:22:26.710651: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Single-channel input stream at the model's sample rate.
# The block size comes from BLOCK_SIZE (rather than a repeated literal) so it
# cannot drift from the number of frames requested in `stream.read(BLOCK_SIZE)`.
stream = sd.InputStream(
    samplerate=MODEL_SR,
    blocksize=BLOCK_SIZE,
    channels=1,
)

In [9]:
def detect_pitch_realtime(signal):
    """
    Run the global ``model`` on one block of audio and return its top output.

    Parameters
    ----------
    signal : array-like
        Audio samples. Only the first 1024 entries along the first axis are
        used; they are flattened into a single model input frame. The caller's
        array is left unmodified.

    Returns
    -------
    batch_pitch : np.ndarray
        For each frame, the index of the largest model output.
    confidence : np.ndarray
        For each frame, the value of that largest model output.
    """
    # Work on a float32 copy: the in-place normalisation below must not write
    # through a view into the caller's buffer, and must also work when the
    # input has an integer dtype.
    frames = np.array(signal[:1024], dtype=np.float32).reshape(1, -1)

    # Normalise each frame to zero mean and unit variance.
    frames -= np.mean(frames, axis=1)[:, np.newaxis]
    # Clip the std away from zero so a constant (e.g. fully silent) block gives
    # an all-zero frame instead of NaNs from a division by zero.
    frames /= np.clip(np.std(frames, axis=1)[:, np.newaxis], 1e-8, None)

    # Call the model directly in inference mode and convert the result to numpy.
    model_preds = model(frames, training=False).numpy()

    batch_pitch = model_preds.argmax(axis=1)
    confidence = model_preds.max(axis=1)

    return batch_pitch, confidence

In [10]:
# Live pitch display: read audio block by block, run the model on each block
# and redraw a sliding window of the most recent predictions using blitting.
RECORD_SECONDS = 300

n_points = 200  # number of most recent predictions shown in the plot
points = []

fig, ax = plt.subplots()
# Fix the limits on the axes that are actually drawn. (Calling plt.ylim before
# plt.subplots() would apply to a separate, implicitly created figure.)
ax.set_xlim(0, n_points - 1)
ax.set_ylim(60, 300)
ax.set_xlabel("block (most recent on the right)")
ax.set_ylabel("predicted pitch bin index")

# animated=True tells matplotlib to only draw the artist when we
# explicitly request it. The line starts out empty (all NaN).
(plotted_data,) = ax.plot(
    np.arange(n_points), np.full(n_points, np.nan), animated=True
)
plt.show(block=False)
plt.pause(0.1)

# Cache the rendered figure without the animated line, to restore per frame.
bg = fig.canvas.copy_from_bbox(fig.bbox)
# draw the animated artist, this uses a cached renderer
ax.draw_artist(plotted_data)
# show the result to the screen, this pushes the updated RGBA buffer from the
# renderer to the GUI framework so you can see it
fig.canvas.blit(fig.bbox)

n_blocks = int(MODEL_SR / BLOCK_SIZE * RECORD_SECONDS)

# Start capturing only once the figure is ready, so the plot setup above does
# not let the input buffer fill up before the first read.
stream.start()
try:
    for block_idx in range(n_blocks):
        audio_arr, is_overflowed = stream.read(BLOCK_SIZE)
        if is_overflowed:
            raise OverflowError(
                "audio input overflowed: blocks are not being read fast enough"
            )

        model_preds, confidence = detect_pitch_realtime(audio_arr)
        model_preds = model_preds.astype(np.float32)
        # Low-confidence predictions become gaps in the plotted line.
        model_preds[confidence < 0.5] = np.nan
        points.extend(model_preds.tolist())
        # Keep only the visible window so the history does not grow unbounded.
        del points[:-n_points]

        # Nothing is drawn until a full window of predictions is available.
        if len(points) >= n_points:
            fig.canvas.restore_region(bg)
            plotted_data.set_data(np.arange(n_points), np.asarray(points))
            ax.draw_artist(plotted_data)
            fig.canvas.blit(fig.bbox)
            fig.canvas.flush_events()
finally:
    # Always release the audio device, including on overflow or interruption.
    stream.stop()

In [None]:
# Display the audio devices reported by sounddevice (useful for checking which
# input device the stream will use).
sd.query_devices()