## Get predictions from wav file

We use https://github.com/devicehive/devicehive-audio-analysis
We need the code from th audio directory from above repo

We need the vvgish models in the models directory

We need our own trained model in teh models directory

We need to point to our model and our class labels csv file

https://github.com/tensorflow/models/tree/master/research/audioset

In [1]:
## Import packages
import os
import csv
import numpy as np
from scipy.io import wavfile

import tensorflow as tf
from tensorflow import flags

import vggish_input
import vggish_postprocess
import vggish_params
import vggish_slim

In [2]:
# volgens mij heb ik dit niet nodig : alle parameters zitten in vggish_params
# FLAGS = flags.FLAGS

# flags.DEFINE_string(
#     'wav_file', 'wav_files/audio.wav',
#     'Path to a wav file. Should contain signed 16-bit PCM samples.')

# flags.DEFINE_string(
#     'pca_params', 'models/vggish_pca_params.npz',
#     'Path to the VGGish PCA parameters file.')

# flags.DEFINE_string(
#     'checkpoint', 'models/vggish_model.ckpt',
#     'Path to the VGGish checkpoint file.')

wav_file = 'wav_files/audio.wav'

MODEL_CHECKPOINT_FILE = 'models/serval03/model.ckpt-13810'
CLASS_LABELS_INDICES = 'audioset/class_labels_indices_amsterdam2.csv'

In [3]:
## Ik verdeel het parsen van een wav-file naar embeddings in stappen:
# Stap 1a: lezen van wav-file, input is array met samples die db aanduiden. Ook sample rate (per sec) wordt gelezen
# Stap 1b: Bij 2d array (stereo, ipv mono) bereken gemiddelde, daarna normaliseren (delen door 32.768)
# Stap 2: Bepaal examples in vorm [batch size, num frames, num bands].
    # Hierbij worden voor verschillende batches (omdat alles tegelijk niet in 1x in NN kan),
    # een log mel spectrogram gemaakt (in vorm [num_frames, num_bands])
# Stap 3: Bepaal features: nu wordt de embedding laag gemaakt (PCA-components, discreet maken etc)
    # Hiervoor worden model-parameters opgehaald die eerder zijn opgeslagen
# Stap 4: Maken van predictions

In [4]:
## Stap 1 en 2
## This function reads the wav file and converts the samples into np arrays of [batch size, num frames, num bands]
examples_batch = vggish_input.wavfile_to_examples(wav_file)
print(examples_batch.shape)

(251, 96, 64)


In [5]:
## Read csv-file with labels
class_map = {}
with open(CLASS_LABELS_INDICES) as f:
    next(f)  # skip header
    reader = csv.reader(f)
    for row in reader:
        class_map[int(row[0])] = row[2]

'index,mid,display_name\n'

In [6]:
## Stap 3
with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model: load the checkpoint and locate input and output tensors
    # Input: [batch_size, num_frames, num_bands] 
    # where [num_frames, num_bands] represents log-mel-scale spectrogram
    # Output: embeddings
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, vggish_params.VGGISH_MODEL)
    
    pca_params = np.load(vggish_params.VGGISH_PCA_PARAMS)
    pca_matrix = pca_params[vggish_params.PCA_EIGEN_VECTORS_NAME]
    pca_means = pca_params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)
    
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.VGGISH_INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.VGGISH_OUTPUT_TENSOR_NAME)
    vggish_slim.load_youtube_model(sess, MODEL_CHECKPOINT_FILE) # HK vggish_params.YOUTUBE_CHECKPOINT_FILE
    
    # Run inference and postprocessing
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    
    postprocessed_batches = np.dot(
            pca_matrix, (embedding_batch.T - pca_means)
        ).T
    #print(postprocessed_batches)
    pred_vals =  []
    # we need to loop over batch size of 10
    for i in range(10,postprocessed_batches.shape[0]):
        # per 10 batches
        postprocessed_batch = postprocessed_batches[i-10:i]

        num_frames = np.minimum(postprocessed_batch.shape[0], vggish_params.MAX_FRAMES)
        data = vggish_postprocess.resize(postprocessed_batch, 0, vggish_params.MAX_FRAMES)
        data = np.expand_dims(data, 0)
        num_frames = np.expand_dims(num_frames, 0)

        input_tensor = sess.graph.get_collection("input_batch_raw")[0]
        num_frames_tensor = sess.graph.get_collection("num_frames")[0]
        predictions_tensor = sess.graph.get_collection("predictions")[0]

        predictions_tensors = sess.graph.get_collection("predictions")

        ## Stap 4
        predictions_val, = sess.run(
            [predictions_tensor],
            feed_dict={
                input_tensor: data,
                num_frames_tensor: num_frames
            })
        pred_vals.append(predictions_val)

<tf.Tensor 'vggish/embedding:0' shape=(?, 128) dtype=float32>

INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/serval03/model.ckpt-13810


In [7]:
## Filter predictions (give top 20 where p>0.1)
count = vggish_params.PREDICTIONS_COUNT_LIMIT
hit = vggish_params.PREDICTIONS_HIT_LIMIT
top_indices = np.argpartition(predictions_val[0], -count)[-count:]
line = ((class_map[i], float(predictions_val[0][i])) for i in top_indices if predictions_val[0][i] > hit)
predictions = sorted(line, key=lambda p: -p[1])

In [8]:
print(predictions)

[('Motor vehicle (road)', 0.274502158164978), ('Motorcycle', 0.21720299124717712), ('Dog', 0.19438397884368896), ('Aircraft', 0.18517421185970306), ('Boat, Water vehicle', 0.11269407719373703)]


In [9]:
## Probabilities per increment
preds=[]
for p in pred_vals :
    line = [(class_map[i], float(p[0][i])) for i in range(1,len(p[0]))]
    preds.append(line)


In [10]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

#type(preds[0][0])
df = pd.DataFrame(preds[0], columns = ['label',"aps"])
#df
plt.barh(y=df.label, width=df.aps, align='center', alpha=0.5 )
plt.show()


TypeError: barh() missing 1 required positional argument: 'bottom'

In [1]:
import matplotlib as mpl
print(mpl.__version__)

2.0.2
