In [1]:
import argparse
import numpy as np
from scipy.io import wavfile

In [2]:
from audio.processor import WavProcessor, format_predictions

  from ._conv import register_converters as _register_converters


In [3]:
import csv
import os
import tensorflow as tf

from audio import params
from audio.utils import vggish, youtube8m

In [4]:
# Load the sample clip used throughout this notebook.
# scipy.io.wavfile.read returns a (sample_rate, samples) pair; the path is
# relative, so it resolves against the kernel's current working directory.
wav_file = "wav_files/07063137_car.wav"
sr, data = wavfile.read(wav_file)



In [5]:
# Build the project's WavProcessor. Per the log output recorded below,
# construction restores two TensorFlow checkpoints from the models/ directory
# (vggish_model.ckpt and youtube_model.ckpt), so those files must be present.
proc = WavProcessor()

INFO:tensorflow:Restoring parameters from models/vggish_model.ckpt
INFO:tensorflow:Restoring parameters from models/youtube_model.ckpt


In [6]:
# Normalize 16-bit PCM samples to floats in [-1.0, 1.0).
# 32768 == 2**15 is the magnitude of the most negative int16 value, so this
# divisor is only correct for int16 data. scipy.io.wavfile.read can also return
# uint8, int32 or float32 arrays depending on the file, which would be scaled
# wrongly without any error; fail loudly instead.
assert data.dtype == np.int16, "expected 16-bit PCM samples, got %s" % data.dtype
samples = data / 32768.0

In [7]:
# Convert the normalized waveform into a batch of input examples for VGGish,
# passing the sample rate read from the WAV file alongside the samples.
# The triple-quoted string below is a no-op expression kept as an inline note;
# it looks like the "Returns" section of waveform_to_examples' own docstring
# (TODO confirm against audio/utils/vggish).
"""Returns:
      3-D np.array of shape [num_examples, num_frames, num_bands] which represents
      a sequence of examples, each of which contains a patch of log mel
      spectrogram, covering num_frames frames of audio and num_bands mel frequency
      bands, where the frame length is params.STFT_HOP_LENGTH_SECONDS"""
examples_batch = vggish.input.waveform_to_examples(samples, sr)

In [8]:
# Inspect the batch dimensions. Read against the [num_examples, num_frames,
# num_bands] layout quoted in the previous cell, the recorded (31, 96, 64)
# means 31 examples of 96 frames by 64 mel bands for this clip.
examples_batch.shape

(31, 96, 64)

In [9]:
# Run the example batch through WavProcessor's private feature extractor.
# The following cell shows the result has shape (31, 128): one 128-value row
# per input example. Presumably these are VGGish embeddings — confirm in
# audio/processor.py.
features = proc._get_features(examples_batch)

In [10]:
# Inspect the feature matrix dimensions; the first axis (31) matches the
# number of examples in examples_batch.
features.shape

(31, 128)

In [16]:
# Turn the per-example features into class scores using another private
# WavProcessor helper. The following cell shows the result has shape (1, 527):
# a single row of 527 scores for the whole clip. Presumably one score per entry
# of the class-label CSV — confirm in audio/processor.py.
predictions = proc._process_features(features)

In [17]:
# Inspect the score array dimensions (recorded output: one row, 527 columns).
predictions.shape

(1, 527)

In [24]:
# Reproduce the prediction-filtering step by hand: select the highest-scoring
# class indices, drop the ones at or below the score threshold, and rank the
# remaining (label, score) pairs best-first.
#
# NOTE(review): the original cell used `top_indices`, `count` and `hit` without
# defining them in any visible cell (the `top_indices` line was commented out),
# so it raised NameError on a fresh kernel. The params attribute names and the
# fallback values below are assumptions — confirm against audio/params.py.
count = getattr(params, "PREDICTIONS_COUNT_LIMIT", 5)  # max classes to keep
hit = getattr(params, "PREDICTIONS_HIT_LIMIT", 0.1)    # minimum score to report

scores = predictions[0]  # the single row of the (1, num_classes) score array
count = min(count, scores.size)  # argpartition requires kth to be within bounds

# argpartition places the `count` largest scores in the last `count` slots
# (in no particular order), which is cheaper than a full sort.
top_indices = np.argpartition(scores, -count)[-count:]
line = ((proc._class_map[i], float(scores[i])) for
        i in top_indices if scores[i] > hit)

# Consumes the generator above; highest score first.
ranked = sorted(line, key=lambda p: -p[1])

In [26]:
# Print every data row of the class-label CSV so the index -> label mapping
# can be eyeballed. Rows come out as three-item lists, e.g.
# ['0', '/m/09x0r', 'Speech'] (see the recorded output below).
with open(params.CLASS_LABELS_INDICES) as labels_file:
    next(labels_file)  # discard the header line before CSV parsing starts
    for record in csv.reader(labels_file):
        # Scratch note from the original cell: a class map would be keyed as
        # {int(record[0]): record[2]}, i.e. integer index -> label text.
        print(record)

['0', '/m/09x0r', 'Speech']
['1', '/m/05zppz', 'Male speech, man speaking']
['2', '/m/02zsn', 'Female speech, woman speaking']
['3', '/m/0ytgt', 'Child speech, kid speaking']
['4', '/m/01h8n0', 'Conversation']
['5', '/m/02qldy', 'Narration, monologue']
['6', '/m/0261r1', 'Babbling']
['7', '/m/0brhx', 'Speech synthesizer']
['8', '/m/07p6fty', 'Shout']
['9', '/m/07q4ntr', 'Bellow']
['10', '/m/07rwj3x', 'Whoop']
['11', '/m/07sr1lc', 'Yell']
['12', '/m/04gy_2', 'Battle cry']
['13', '/t/dd00135', 'Children shouting']
['14', '/m/03qc9zr', 'Screaming']
['15', '/m/02rtxlg', 'Whispering']
['16', '/m/01j3sz', 'Laughter']
['17', '/t/dd00001', 'Baby laughter']
['18', '/m/07r660_', 'Giggle']
['19', '/m/07s04w4', 'Snicker']
['20', '/m/07sq110', 'Belly laugh']
['21', '/m/07rgt08', 'Chuckle, chortle']
['22', '/m/0463cq4', 'Crying, sobbing']
['23', '/t/dd00002', 'Baby cry, infant cry']
['24', '/m/07qz6j3', 'Whimper']
['25', '/m/07qw_06', 'Wail, moan']
['26', '/m/07plz5l', 'Sigh']
['27', '/m/015lz1', 'S

In [14]:
predictions = proc._filter_predictions(predictions)

In [15]:
predictions

[('Vehicle', 0.9999918937683105),
 ('Music', 0.7827696204185486),
 ('Car', 0.7718221545219421),
 ('Bus', 0.7142980098724365),
 ('Toot', 0.46270951628685)]

In [29]:
# Length along the first axis. The recorded output (1) matches `predictions`
# being the (1, 527) score array returned by proc._process_features.
len(predictions)

1

In [39]:
# Total element count. `.size` is a NumPy array attribute, and the recorded
# 527 equals 1 * 527 from the (1, 527) score array.
predictions.size

527