In [None]:
# Base
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# TensorFlow
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# YAMNet
# Model trained on the AudioSet-YouTube data
# Each 10-second video clip is labelled with one or more of 521 possible classes
# It can also be used for feature extraction
# The output is a 521-dimensional vector (one per analysed frame) holding the probability of each possible class

In [None]:
# Load YAMNet model
yamnet = hub.load('https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1')

# Load class names (521 AudioSet labels)
class_map_path = tf.keras.utils.get_file('yamnet_class_map.csv', 'https://raw.githubusercontent.com/tensorflow/models/master/research/audioset/yamnet/yamnet_class_map.csv')
class_names = pd.read_csv(class_map_path)['display_name'].to_list()

# Download UrbanSound8K for testing
!curl -O https://storage.googleapis.com/audioset/speech_whistling2.wav
!curl -O https://storage.googleapis.com/audioset/miaow_16k.wav

In [None]:
# Silence: 3 * 16000 zero-valued float32 samples (three seconds at 16 kHz)
waveform = np.zeros(3 * 16000, dtype=np.float32)

# Run model: it returns the class scores, the embeddings and the log-mel spectrogram
scores, embeddings, log_mel_spectrogram = yamnet(waveform)

# Keep NumPy copies of the two outputs that are inspected / plotted later
spectrogram = log_mel_spectrogram.numpy()
scores = scores.numpy()

# Average the scores over axis 0 and report the best-scoring class
top_class = np.argmax(np.mean(scores, axis=0))
print(class_names[top_class])

In [None]:
# Domestic animals, pets
# YAMNet expects a mono waveform sampled at 16 kHz. librosa.load resamples to
# 22050 Hz unless `sr` is given, which would feed the model audio at the wrong
# rate, so the target rate is requested explicitly.
waveform, sr = librosa.load("miaow_16k.wav", sr=16000)

# Run model: it returns the class scores, the embeddings and the log-mel spectrogram
scores, embeddings, log_mel_spectrogram = yamnet(waveform)
scores = scores.numpy()
spectrogram = log_mel_spectrogram.numpy()

# Average the scores over axis 0 and print the best-scoring class name
print(class_names[scores.mean(axis=0).argmax()])

In [None]:
# Whistle
# YAMNet expects a mono waveform sampled at 16 kHz. librosa.load resamples to
# 22050 Hz unless `sr` is given, which would feed the model audio at the wrong
# rate, so the target rate is requested explicitly.
waveform, sr = librosa.load("speech_whistling2.wav", sr=16000)

# Run model: it returns the class scores, the embeddings and the log-mel spectrogram
scores, embeddings, log_mel_spectrogram = yamnet(waveform)
scores = scores.numpy()
spectrogram = log_mel_spectrogram.numpy()

# Average the scores over axis 0 and print the best-scoring class name
print(class_names[scores.mean(axis=0).argmax()])

In [None]:
# Parameters
# Patch geometry, used below only to pad the x-axis of the score panel
patch_window_seconds: float = 0.96
patch_hop_seconds: float = 0.48

# Visualize: one figure with three vertically stacked panels
# (uses waveform / spectrogram / scores left behind by the most recently run inference cell)
fig, (ax_wave, ax_spec, ax_scores) = plt.subplots(3, 1, figsize=(10, 8))

# Panel 1: waveform, x-axis limited to the sample index range
ax_wave.plot(waveform)
ax_wave.set_xlim([0, len(waveform)])

# Panel 2: log-mel spectrogram, transposed so the first array axis runs along x
ax_spec.pcolormesh(spectrogram.T, shading='auto')

# Panel 3: model output scores for the top-scoring classes,
# ranked by their mean score over axis 0 (highest first)
top_N = 10
mean_scores = scores.mean(axis=0)
top_class_indices = np.argsort(mean_scores)[::-1][:top_N]
ax_scores.imshow(
    scores[:, top_class_indices].T,
    aspect='auto',
    interpolation='nearest',
    cmap='gray_r',
)

# Compensate for patch_window_seconds (0.96s) context window to align with spectrogram
patch_padding = (patch_window_seconds / 2) / patch_hop_seconds
ax_scores.set_xlim([-patch_padding, scores.shape[0] + patch_padding])

# Label top_N classes: one tick per row, named after the class it shows
yticks = range(0, top_N, 1)
ax_scores.set_yticks(yticks)
ax_scores.set_yticklabels([class_names[top_class_indices[x]] for x in yticks])

# Limits run from top_N - 0.5 down to -0.5, so row 0 (best class) is drawn at the top;
# assigning to `_` keeps the return value out of the cell output
_ = ax_scores.set_ylim(-0.5 + np.array([top_N, 0]))