In [1]:
# This script adds more layers on top of the VGGish model and then trains the larger model
# The model is fed with log mel spectrogram examples with associated labels

import numpy as np
import csv
import os
import re
import argparse
import sys
from random import shuffle

from scipy.io import wavfile
import tensorflow as tf
from tensorflow.python.platform import gfile

from audio.processor import WavProcessor, format_predictions
from audio import params
from audio.utils import vggish, youtube8m

In [2]:
flags = tf.app.flags
slim = tf.contrib.slim

FLAGS = flags.FLAGS

In [3]:
# location original model
flags.DEFINE_string('checkpoint','C:\\Users\\myrna\\Documents\\SensingClues\\devicehive-audio-analysis-master\\models\\vggish_model.ckpt','checkpoint')

# Epoch = entire dataset passing forward and backward through the neural network once
# For higher accuracy, take several epochs
flags.DEFINE_integer('num_epochs',10,'Number of times dataset is fed to model')

# One epoch is too big to feed at once, so it is divided into smaller batches
# Batch size = nr of training samples in order to make one update to the model parameters

# Deze setten we pas als we een grote dataset hebben
flags.DEFINE_integer('num_batches',50,'Number of batches of examples to feed into the model')

In [4]:
## Even een korte uitleg over wat er gebeurt met een wav-bestand
## Samples vormt de geluidsgolf, waarbij de getallen het aantal decibel voorstellen
## sr=sample rate is het aantal samples per seconden (de samples worden later ge-resampled met de gewenste sr)
## In de functie 'waveform_to_examples' (zie vvgish/input) worden de mel spectogrammen gemaakt
## Om niet alle samples in 1 keer aan het model te voeren, worden ze in overlappende windows onderverdeeld
## Voor elk window wordt een spectogram gemaakt (x-as: tijd (in samples), y-as: frequency, getallen zijn db)

## In get_features worden deze examples vervolgens weer verwerkt voordat ze het model in gaan
## Hierbij worden ook tf-parameters en embeddings opgehaald
## Embeddings worden bepaald door o.a. PCA, discretisering

# Function that extracts features (input: filename, output: features)
def extract_features(wav_file):
    proc = WavProcessor()
    sr, data = wavfile.read(wav_file)
    samples = data / 32768.0 #each sample is 2bytes (-32767 t/m 32768), convert into nrs between -1 and 1
    # This function converts the samples into np arrays of [batch size, num frames, num bands]
    examples_batch = vggish.input.waveform_to_examples(samples, sr)
    # This function uses parameters to convert examples into features
    #features = proc._get_features(examples_batch)
    return(examples_batch)#features)

In [32]:
# Function that returns set of shuffled batches consisting of log mel spectogram examples and labels

# Features is a NumPy array of shape [batch_size, num_frames, num_bands]
# Each row is a log mel spectrogram patch of shape [num_frames, num_bands]
# Frames are 'windows' over time
# Bands are frequency intervals
# (oftewel: voor verschillende momenten in de tijd en voor allerlei frequency-intervallen, wordt het aantal dB gegeven met een getal)

# Labels is a NumPy array of shape[batch_size, num_classes]
# where each row is a multi-hot label vector
# Hier moeten we nog wat voor verzinnen. Want hoe maken we een multi-hot vector van alle labels samen (527 + elephant rumble)

def get_examples_batch():
    all_examples = []
    all_labels = []

    for s in range(len(sub_dirs)):
        sub_dir = sub_dirs[s]
        label_name = os.path.basename(sub_dir)
        print(label_name)
        file_list = []
        file_glob = os.path.join(sub_dir, '*.' + extension)
        file_list.extend(tf.gfile.Glob(file_glob))
        for file_name in file_list:
            print(file_name)
            examples = extract_features(file_name)
            ## Volgens mij verdwijnen door onderstaande regel wel de gunshot-sounds, doordat ze te kort zijn
            if examples.shape[0] > 0: # if sound is less than 10 seconds, there is no output, so skip
                # Make multi-hot vector
                label_vector = [0] * len(sub_dirs)
                label_vector[s] = 1
                print(label_vector)
                labels = np.array([label_vector] * examples.shape[0])
                all_examples.append(examples)
                all_labels.append(labels)

    all_examples = np.concatenate(all_examples)
    all_labels = np.concatenate(all_labels)

    labeled_examples = list(zip(all_examples, all_labels))
    shuffle(labeled_examples)
    
    # Separate and return the features and labels.
    features = [example for (example, _) in labeled_examples]
    labels = [label for (_, label) in labeled_examples]
    return (features, labels)

In [31]:
## Get directory name where sounds are stored in subfolders (which represent the labels)
directory = "C:\\Users\\myrna\\Documents\\SensingClues\\Sounds"
sub_dirs = [x[0] for x in gfile.Walk(directory)]
## Remove root folder
sub_dirs = sub_dirs[1:]
num_classes = len(sub_dirs)

extension = "wav"

In [33]:
with tf.Graph().as_default(), tf.Session() as sess:
    embeddings = vggish.model.define_vggish_slim(True)
    with tf.variable_scope('mymodel'):
        
        # Add a fully connected layer with 100 units.
        num_units = 100
        fc = slim.fully_connected(embeddings, num_units)
        
        # Add a classifier layer at the end, consisting of parallel logistic
        # classifiers, one per class. This allows for multi-class tasks.
        logits = slim.fully_connected(fc, num_classes, activation_fn=None, scope='logits')
        tf.sigmoid(logits, name='prediction')
        
        # Add training ops.
        with tf.variable_scope('train'):
            global_step = tf.Variable(
                0, name='global_step', trainable=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                             tf.GraphKeys.GLOBAL_STEP])

            # Labels are assumed to be fed as a batch multi-hot vectors, with
            # a 1 in the position of each positive class label, and 0 elsewhere.
            labels = tf.placeholder(
                tf.float32, shape=(None, 2), name='labels')

            # Cross-entropy label loss.
            xent = tf.nn.sigmoid_cross_entropy_with_logits(
                logits=logits, labels=labels, name='xent')
            loss = tf.reduce_mean(xent, name='loss_op')
            tf.summary.scalar('loss', loss)
            # We use the same optimizer and hyperparameters as used to train VGGish.
            optimizer = tf.train.AdamOptimizer(
                learning_rate=1e-4,
                epsilon=1e-8)
            optimizer.minimize(loss, global_step=global_step, name='train_op')
        
        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish.model.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    
         # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name('vggish/input_features:0')
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name('mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')
        
        # The training loop.

        # [MvdB] Volgende twee loops staan even in commentaar, zodat ik niet lang hoef te wachten
        
        #for epoch in range(FLAGS.num_epochs)
        #for i in range(FLAGS.num_batches):
        (features, labels) = get_examples_batch()
        [num_steps, loss, i] = sess.run(
            [global_step_tensor, loss_tensor, train_op],
            feed_dict={features_tensor: features, labels_tensor: labels})
        print('Step %d: loss %g' % (num_steps, loss))


INFO:tensorflow:Restoring parameters from C:\Users\myrna\Documents\SensingClues\devicehive-audio-analysis-master\models\vggish_model.ckpt
Elephant rumble
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B07h15m37s24jul2007y_BRM_20.24695___27.92262.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B07h15m37s24jul2007y_BRM_27.52690___34.00537.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B07h15m37s24jul2007y_BRM_36.21818___39.13330.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B07h15m37s24jul2007y_BRM_37.4

[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B10h59m47s23jul2007y_BRM_54.11499___59.67574.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B10h59m47s23jul2007y_BRM_6.16209___10.37443.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B10h59m47s23jul2007y_BRM_81.91876___87.91565.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B11h10m13s21may2007y_BRM_1.86977___3.04282.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.

INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B14h50m28s23jul2007y_BRM_7.90074___9.41539.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B15h02m38s10apr2007y_BRM_134.57074___136.24527.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B15h02m38s10apr2007y_BRM_139.19405___141.65565.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B15h02m38s10apr2007y_BRM_150.04045___15

INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B16h19m14s19oct2007y_BRM_25.57084___29.80277.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B16h19m14s19oct2007y_BRM_44.00394___48.93416.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B16h19m14s19oct2007y_BRM_6.00212___10.64103.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B16h24m21s30jul2007y_BRM_10.47145___11.682

[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B17h16m44s02nov2007y_BRM_6.00616___8.44882.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\B17h52m18s06aug2007y_BRM_308.69723___310.45726.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B07h15m37s24jul2007y_BRM_20.24695___27.92262.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B07h15m37s24jul2007y_BRM_27.52690___34.00537.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from mode

INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B10h29m02s05apr2007y_BRM_10.83633___12.35301.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B10h29m02s05apr2007y_BRM_4.51684___7.73075.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B10h59m47s23jul2007y_BRM_54.11499___59.67574.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B10h59m47s23jul2007y_BRM_6.16209___10.37443.wav
INFO:tensorflow:Restoring paramete

INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B14h43m01s08feb2007y_BRM_93.94503___95.75793.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B14h43m01s08feb2007y_BRM_98.53061___100.45016.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B14h44m29s11may2007y_BRM_24.34446___28.02359.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B14h50m28s23

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000002559048E730>
Traceback (most recent call last):
  File "C:\Users\myrna\Anaconda3\lib\weakref.py", line 357, in remove
    self = selfref()
KeyboardInterrupt


INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B15h17m50s05apr2007y_BRM_698.17948___700.26830.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B15h18m02s19oct2007y_BRM_1.11890___3.18435.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B15h20m23s24apr2007y_BRM_15.59986___18.74578.wav
INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt
[1, 0]
C:\Users\myrna\Documents\SensingClues\Sounds\Elephant rumble\padded_B15h24m29s19o

KeyboardInterrupt: 