In [None]:
## Import packages
import os
import csv
import numpy as np
import pandas as pd
from scipy.io import wavfile

import tensorflow as tf
from tensorflow import gfile

import vggish_input
import vggish_postprocess
import vggish_params
import vggish_slim

In [None]:
# CSV file whose rows are parsed into the class-index -> label map.
labels_csv = 'models/class_labels_indices_jungle.csv'

# Directory that is globbed for the *.wav recordings to classify.
# Alternative data set: 'elephants/angela_background'
wavfile_path = "elephants/angela_elephants"

In [None]:
# TensorFlow flag registry; FLAGS.model_checkpoint is read by getPredictions.
flags = tf.app.flags
FLAGS = flags.FLAGS

# Dummy 'f' flag. NOTE(review): presumably absorbs the `-f <connection file>`
# argument that Jupyter passes to the kernel so flag parsing does not fail -
# confirm before removing.
flags.DEFINE_string('f', '', 'kernel')

# Checkpoint that getPredictions hands to vggish_slim.load_youtube_model.
flags.DEFINE_string('model_checkpoint',
                    'models/models_jungle/model.ckpt',
                    'Path to the VGGish checkpoint file.')

In [None]:
## Read csv-file with labels
# Builds class_map as {class index (column 0, as int): label (column 2)}.
with open(labels_csv) as f:
    reader = csv.reader(f)
    # Skip the header through the csv reader; the default keeps an empty
    # file from raising StopIteration.
    next(reader, None)
    # csv.reader yields [] for blank lines; skip those instead of failing on
    # row[0]. Rows that are present but malformed still raise.
    class_map = {int(row[0]): row[2] for row in reader if row}

In [None]:
# Bare expression: the notebook renders the parsed {class index: label} map
# as this cell's output.
class_map

In [None]:
## Parsing a wav file into embeddings is split into steps:
# Step 1a: read the wav file; the input is an array of samples indicating the signal level (dB). The sample rate (per second) is read as well.
# Step 1b: for a 2-D array (stereo instead of mono) take the mean, then normalise (divide by 32768).
# Step 2: build the examples in the shape [batch size, num frames, num bands].
    # For each batch (the whole recording cannot be pushed through the NN in one go),
    # a log-mel spectrogram is computed (in the shape [num_frames, num_bands]).
# Step 3: compute the features: this is where the embedding layer is produced (PCA components, discretisation, etc.).
    # This uses model parameters that were saved earlier.
# Step 4: make the predictions.

In [None]:
def getPredictions(examples_batch):
    """Run the VGGish network and the downstream classifier on one recording.

    A new TensorFlow graph and session are created on every call. The VGGish
    network turns the examples into embeddings, the embeddings are projected
    with the stored PCA parameters, passed through
    ``vggish_postprocess.resize(..., 0, vggish_params.MAX_FRAMES)`` and fed to
    the tensors registered in the "input_batch_raw" / "num_frames" /
    "predictions" graph collections.

    NOTE(review): those collections are presumably created by
    ``vggish_slim.load_youtube_model`` from ``FLAGS.model_checkpoint`` -
    confirm against that helper.

    Args:
        examples_batch: array fed to the VGGish input tensor. Per the
            original notes it is shaped [batch_size, num_frames, num_bands],
            each [num_frames, num_bands] slice being a log-mel spectrogram.

    Returns:
        The evaluated "predictions" tensor. filterPredictions reads row 0 of
        it as the per-class scores.
    """
    # Step 3: compute the embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        # Build the VGGish network and restore its weights.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, vggish_params.VGGISH_MODEL)

        # The parameters are indexed by name, so np.load is expected to
        # return an NpzFile here. NpzFile keeps the archive open until it is
        # closed; the context manager releases the handle once both arrays
        # have been read, so calling this per wav file does not leak
        # file descriptors.
        with np.load(vggish_params.VGGISH_PCA_PARAMS) as pca_params:
            pca_matrix = pca_params[vggish_params.PCA_EIGEN_VECTORS_NAME]
            pca_means = pca_params[vggish_params.PCA_MEANS_NAME].reshape(-1, 1)

        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.VGGISH_INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.VGGISH_OUTPUT_TENSOR_NAME)
        # Kept after the VGGish tensor lookups, in the original call order.
        vggish_slim.load_youtube_model(sess, FLAGS.model_checkpoint)

        # Run inference: one embedding row per example.
        embedding_batch = sess.run(
            embedding_tensor, feed_dict={features_tensor: examples_batch})

        # PCA projection: subtract the means, apply the eigenvector matrix,
        # and transpose back to one row per example.
        postprocessed_batch = np.dot(
            pca_matrix, (embedding_batch.T - pca_means)).T

        # The frame count is capped at MAX_FRAMES, the same length the data
        # is resized to.
        num_frames = np.minimum(postprocessed_batch.shape[0], vggish_params.MAX_FRAMES)
        data = vggish_postprocess.resize(postprocessed_batch, 0, vggish_params.MAX_FRAMES)
        # Add a leading axis of size 1 to both inputs (a batch of one clip).
        data = np.expand_dims(data, 0)
        num_frames = np.expand_dims(num_frames, 0)

        input_tensor = sess.graph.get_collection("input_batch_raw")[0]
        num_frames_tensor = sess.graph.get_collection("num_frames")[0]
        predictions_tensor = sess.graph.get_collection("predictions")[0]

        # Step 4: make the predictions.
        predictions_val = sess.run(
            predictions_tensor,
            feed_dict={
                input_tensor: data,
                num_frames_tensor: num_frames,
            })
    return predictions_val

In [None]:
def filterPredictions(predictions_val, count=None, hit=None, labels=None):
    """Return the best-scoring classes as (label, score) pairs, highest first.

    Only the ``count`` highest scores are considered, and of those only the
    ones strictly greater than ``hit`` are kept. The result is also printed.

    Args:
        predictions_val: indexable whose element 0 is the 1-D sequence of
            per-class scores (as returned by getPredictions).
        count: maximum number of classes to consider. Defaults to
            ``vggish_params.PREDICTIONS_COUNT_LIMIT`` (20 according to the
            original note - confirm in vggish_params).
        hit: score threshold. Defaults to
            ``vggish_params.PREDICTIONS_HIT_LIMIT`` (0.1 according to the
            original note - confirm in vggish_params).
        labels: mapping from class index to label. Defaults to the
            notebook-level ``class_map``.

    Returns:
        List of ``(label, float score)`` tuples sorted by descending score;
        empty when nothing exceeds the threshold.
    """
    if count is None:
        count = vggish_params.PREDICTIONS_COUNT_LIMIT
    if hit is None:
        hit = vggish_params.PREDICTIONS_HIT_LIMIT
    if labels is None:
        labels = class_map

    scores = np.asarray(predictions_val[0])

    # np.argpartition raises ValueError when kth lies outside the array, so
    # never ask for more classes than there are scores.
    count = min(int(count), scores.shape[0])
    if count > 0:
        top_indices = np.argpartition(scores, -count)[-count:]
    else:
        # Guard needed because a slice of [-0:] would select every class.
        top_indices = []

    candidates = [(labels[int(i)], float(scores[i]))
                  for i in top_indices if scores[i] > hit]
    predictions = sorted(candidates, key=lambda p: -p[1])
    print(predictions)
    return predictions

In [None]:
# Collects the top-scoring label of each processed recording; filled by the
# wav-file loop and tallied with Counter afterwards.
all_predictions = []

In [None]:
## Read wav files
# Start from an empty tally so that re-running this cell does not count the
# same recordings twice.
all_predictions = []

# Sorted so the recordings are processed in a reproducible order.
files = sorted(gfile.Glob(wavfile_path + "/*.wav"))

for wav_path in files:
    # Steps 1 and 2: read the wav file and convert its samples into an array
    # of [batch size, num frames, num bands].
    examples_batch = vggish_input.wavfile_to_examples(wav_path)
    print(examples_batch.shape)
    predictions = filterPredictions(getPredictions(examples_batch))
    # Record only the best label; recordings with no score above the
    # threshold contribute nothing.
    if predictions:
        all_predictions.append(predictions[0][0])

In [None]:
from collections import Counter

sum_occur = 0

# Tally how often each label was the top prediction of a recording.
cnt_predictions = Counter(all_predictions)

# Print the ten most frequent labels as (label, occurrences) tuples.
top_labels = cnt_predictions.most_common(10)
for label_and_count in top_labels:
    print(label_and_count)