In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import os 
import math
import random
import zipfile
import requests
import warnings
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
from tensorflow.keras import layers
from tensorflow.keras import models
from IPython.display import Audio
from IPython.core.display import display
from matplotlib import pyplot as plt
from pydub import AudioSegment, effects
from pydub.generators import WhiteNoise
from multiprocessing.pool import ThreadPool

**Listen to Parsed Samples**

In [3]:
# Directories holding the pre-cut positive (Capuchinbird) and negative clips.
capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips"
not_capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips"
capuchin_files = os.listdir(capuchin_dir)
not_capuchin_files = os.listdir(not_capuchin_dir)

# The positive example is pinned to one clip; swap in random.choice(capuchin_files)
# to hear a different one. The negative example is drawn at random.
Capuchin_File = "XC3776-6.wav"
Not_Capuchin_File = random.choice(not_capuchin_files)
capuchin_clip_path = os.path.join(capuchin_dir, Capuchin_File)
not_capuchin_clip_path = os.path.join(not_capuchin_dir, Not_Capuchin_File)

print(f"Displaying {Capuchin_File} which is an example of a Parsed Capuchinbird Call:")
display(Audio(capuchin_clip_path))
print(f"Displaying {Not_Capuchin_File} which is an example of a Parsed Other Noise:")
display(Audio(not_capuchin_clip_path))

**Data Augmentation**
> As in image recognition, I want to augment the training set with transformed samples in order to produce a robust model. Here I provide a few example functions that can be used to transform audio clips.

In [4]:
def Add_White_Noise(sound, decibels = 50):
    """
    Overlay white noise on an audio clip and return the mixed clip.

    sound    -- the clip to add noise to; should be an AudioSegment
    decibels -- amount subtracted from the generated noise segment before it
                is overlaid (a larger value gives quieter noise)

    The noise segment is generated with the same length as `sound`.
    """
    raw_noise = WhiteNoise().to_audio_segment(duration=len(sound))
    return sound.overlay(raw_noise - decibels)
def Normalize_Volume(sound):
    """
    Return a volume-normalized copy of a clip.

    sound -- the clip to normalize; should be an AudioSegment

    Delegates to pydub's `effects.normalize` with its default settings.
    """
    return effects.normalize(sound)
def Filter_Out_High_Frequency(sound,cutoff = 8e3):
    """
    Apply a low-pass filter to a clip and return the filtered clip.

    sound  -- the clip to filter; should be an AudioSegment
    cutoff -- cutoff frequency in Hz (default 8 kHz); content above it is
              filtered out
    """
    return effects.low_pass_filter(sound, cutoff)
def Filter_Out_Low_Frequency(sound,cutoff = 8e3):
    """
    Filter out Low Frequencies in a Clip (high-pass filter) and return the new clip
    Note: sound should be an AudioSegment and cutoff is in Hz (default is 8kHz)

    NOTE(review): the 8 kHz default is the same value used by the low-pass
    helper; for a high-pass filter it removes everything below 8 kHz, so
    callers will usually want to pass a much lower cutoff -- confirm intent.
    """
    # high_pass_filter keeps content ABOVE the cutoff, i.e. it removes the lows.
    filtered_sound = effects.high_pass_filter(sound, cutoff)
    return filtered_sound

***Model Training (Spectrogram and CNN)***

In [6]:
def decode_audio(audio_binary):
    """
    Decode WAV-encoded bytes into a 1-D `float32` waveform tensor.

    The file is decoded as a single channel (`desired_channels=1`) and the
    trailing `channels` axis is then dropped. The sample rate returned by the
    decoder is discarded.
    """
    samples, _sample_rate = tf.audio.decode_wav(contents=audio_binary, desired_channels=1)
    # Mono data: remove the size-1 channel axis so the result is just (samples,).
    return tf.squeeze(samples, axis=-1)
def get_label(file_path):
    """
    Return the name of the directory that directly contains `file_path`.

    The path is split on the OS path separator and the second-to-last
    component (the parent directory) is used as the class label.
    """
    path_components = tf.strings.split(input=file_path, sep=os.path.sep)
    # Index instead of tuple-unpacking so this also works inside a TensorFlow graph.
    parent_dir = path_components[-2]
    return parent_dir
def get_waveform_and_label(file_path):
    """
    Load one audio file and return `(waveform, label)`.

    The label comes from `get_label` (the parent directory name) and the
    waveform from `decode_audio` applied to the raw file contents.
    """
    label = get_label(file_path)
    waveform = decode_audio(tf.io.read_file(file_path))
    return waveform, label
def get_spectrogram(waveform, input_len=16000, frame_length=255, frame_step=128):
    """
    Convert a 1-D waveform into a magnitude spectrogram with a channel axis.

    waveform     -- 1-D audio samples
    input_len    -- number of samples every clip is forced to: longer clips
                    are truncated, shorter ones are zero-padded at the end
    frame_length -- STFT window length in samples
    frame_step   -- STFT hop size in samples

    Returns the STFT magnitude with a trailing `channels` axis added, so it
    can be fed to convolution layers as image-like input.

    NOTE(review): `input_len` is a sample count, not a duration. The decoder
    used upstream discards the file's sample rate, so how many seconds of
    audio 16000 samples covers depends on the clips -- confirm.
    """
    # Truncate clips that are longer than input_len samples.
    waveform = waveform[:input_len]
    # Cast to float32 so the waveform can be concatenated with the float32 padding.
    waveform = tf.cast(waveform, dtype=tf.float32)
    # Zero-pad shorter clips; the pad length uses the same input_len as the
    # truncation above so the two can never drift apart.
    zero_padding = tf.zeros([input_len] - tf.shape(waveform), dtype=tf.float32)
    equal_length = tf.concat([waveform, zero_padding], 0)
    # Short-time Fourier transform, keeping only the magnitude.
    spectrogram = tf.signal.stft(
        equal_length, frame_length=frame_length, frame_step=frame_step)
    spectrogram = tf.abs(spectrogram)
    # Add a `channels` dimension: convolution layers expect
    # (`batch_size`, `height`, `width`, `channels`).
    return spectrogram[..., tf.newaxis]
def plot_spectrogram(spectrogram, ax):
    """
    Draw a log-scaled spectrogram on the matplotlib axes `ax`.

    spectrogram -- 2-D array, or 3-D array whose last axis has size 1 (that
                   axis is squeezed away)
    ax          -- axes object providing `pcolormesh`

    The array is transposed before plotting so its first axis runs along x.
    """
    rank = len(spectrogram.shape)
    if rank > 2:
        assert rank == 3
        spectrogram = np.squeeze(spectrogram, axis=-1)
    # Log scale; the epsilon keeps log() away from zero.
    log_spec = np.log(spectrogram.T + np.finfo(float).eps)
    n_rows = log_spec.shape[0]
    n_cols = log_spec.shape[1]
    x_coords = np.linspace(0, np.size(spectrogram), num=n_cols, dtype=int)
    ax.pcolormesh(x_coords, range(n_rows), log_spec)
def get_spectrogram_and_label_id(audio, label):
    """
    Map `(waveform, label)` to `(spectrogram, label_id)`.

    `label_id` is the index at which `label` matches the module-level
    `commands` list.
    """
    matches = label == commands
    return get_spectrogram(audio), tf.argmax(matches)
def preprocess_dataset(files):
    """
    Build a dataset of `(spectrogram, label_id)` pairs from audio file paths.

    Each path is mapped through `get_waveform_and_label` and then through
    `get_spectrogram_and_label_id`, both with `num_parallel_calls=AUTOTUNE`.
    """
    return (
        tf.data.Dataset.from_tensor_slices(files)
        .map(map_func=get_waveform_and_label, num_parallel_calls=AUTOTUNE)
        .map(map_func=get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)
    )

**Build Dataset**

In [7]:
# Fix the TensorFlow and NumPy seeds so the shuffle and split below are repeatable.
seed = 1842
tf.random.set_seed(seed)
np.random.seed(seed)
# Silence Python warnings to keep the notebook output readable.
warnings.simplefilter('ignore')

data_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing"
# Class names are the clip sub-directory names; a label id is an index into this list.
commands = ["Parsed_Capuchinbird_Clips","Parsed_Not_Capuchinbird_Clips"]
filenames_Capuchinbird = tf.io.gfile.glob(str(data_dir) + '/Parsed_Capuchinbird_Clips/*')
filenames_Not_Capuchinbird = tf.io.gfile.glob(str(data_dir) + '/Parsed_Not_Capuchinbird_Clips/*')
# Pool both classes and shuffle once before slicing into splits.
filenames = tf.random.shuffle(
    tf.concat([filenames_Capuchinbird, filenames_Not_Capuchinbird], 0)
)
num_samples = len(filenames)
print('Number of total examples:', num_samples)

# 80% train / 10% validation / remainder test.
train_split = int(.8*num_samples)
val_split = int(.1*num_samples)
test_split = num_samples - train_split - val_split
val_end = train_split + val_split
train_files = filenames[:train_split]
val_files = filenames[train_split:val_end]
test_files = filenames[-test_split:]

for caption, split_files in (
    ('Training set size', train_files),
    ('Validation set size', val_files),
    ('Test set size', test_files),
):
    print(caption, len(split_files))

AUTOTUNE = tf.data.AUTOTUNE

# Waveform dataset for the training files only; validation/test are built later.
files_ds = tf.data.Dataset.from_tensor_slices(train_files)
waveform_ds = files_ds.map(map_func=get_waveform_and_label, num_parallel_calls=AUTOTUNE)

**Visualize Dataset**

In [8]:
# Plot the first rows*cols training waveforms in a grid, titled by class name.
rows = 3
cols = 3
n = rows * cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 12))
y_ticks = np.arange(-1.2, 1.2, 0.2)
for i, (audio, label) in enumerate(waveform_ds.take(n)):
    r, c = divmod(i, cols)
    ax = axes[r][c]
    ax.plot(audio.numpy())
    ax.set_yticks(y_ticks)
    # Labels arrive as byte strings; decode for display.
    label = label.numpy().decode('utf-8')
    ax.set_title(label)

plt.show()

In [9]:
# Take one training example and compute its spectrogram for inspection.
for waveform, label in waveform_ds.take(1):
    spectrogram = get_spectrogram(waveform)
    label = label.numpy().decode('utf-8')

for caption, value in (
    ('Label:', label),
    ('Waveform shape:', waveform.shape),
    ('Spectrogram shape:', spectrogram.shape),
):
    print(caption, value)
print('Audio playback')
display(Audio(waveform, rate=16000))

In [10]:
# Waveform (top) and spectrogram (bottom) of the example selected above.
fig, axes = plt.subplots(2, figsize=(12, 8))
waveform_ax, spectrogram_ax = axes
timescale = np.arange(waveform.shape[0])

waveform_ax.plot(timescale, waveform.numpy())
waveform_ax.set_title('Waveform')
waveform_ax.set_xlim([0, 16000])

plot_spectrogram(spectrogram.numpy(), spectrogram_ax)
spectrogram_ax.set_title('Spectrogram')
plt.show()

In [11]:
# Turn every training waveform into a (spectrogram, label_id) pair.
spectrogram_ds = waveform_ds.map(
    map_func=get_spectrogram_and_label_id, num_parallel_calls=AUTOTUNE)

# Show the first rows*cols spectrograms in a grid, titled by class name.
rows = 3
cols = 3
n = rows*cols
fig, axes = plt.subplots(rows, cols, figsize=(10, 10))

for i, (spectrogram, label_id) in enumerate(spectrogram_ds.take(n)):
    r, c = divmod(i, cols)
    ax = axes[r][c]
    plot_spectrogram(spectrogram.numpy(), ax)
    ax.set_title(commands[label_id.numpy()])
    ax.axis('off')

plt.show()

**BUILD AND TRAIN MODEL**

In [12]:
batch_size = 64

# Validation and test files still need the waveform -> spectrogram preprocessing;
# the training spectrograms were already built as `spectrogram_ds`.
val_ds = preprocess_dataset(val_files)
# test_ds is left unbatched: the evaluation cell iterates it one example at a time.
test_ds = preprocess_dataset(test_files)

# Batch, then cache and prefetch the training and validation pipelines.
train_ds = spectrogram_ds.batch(batch_size).cache().prefetch(AUTOTUNE)
val_ds = val_ds.batch(batch_size).cache().prefetch(AUTOTUNE)

In [13]:
# Read the input shape off the first training spectrogram.
for spectrogram, _ in spectrogram_ds.take(1):
    input_shape = spectrogram.shape
print('Input shape:', input_shape)
num_labels = len(commands)

# Normalization layer whose statistics are fitted to the training
# spectrograms (labels dropped) with `Normalization.adapt`.
norm_layer = layers.Normalization()
spectrograms_only = spectrogram_ds.map(map_func=lambda spec, label: spec)
norm_layer.adapt(data=spectrograms_only)

cnn_layers = [
    layers.Input(shape=input_shape),
    # Downsample the input.
    layers.Resizing(32, 32),
    # Normalize.
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    # One logit per class (no softmax; the loss is configured with from_logits=True).
    layers.Dense(num_labels),
]
cnn_model = models.Sequential(cnn_layers)

cnn_model.summary()

In [14]:
# The final Dense layer emits raw logits, hence from_logits=True.
cnn_model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)
EPOCHS = 20
# Stop when the monitored metric (EarlyStopping's default) has not improved for
# 5 epochs, and roll the model back to its best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    verbose=1,
    patience=5,
    restore_best_weights=True,
)
history = cnn_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    # `callbacks` is documented as a list of Callback instances.
    callbacks=[early_stopping],
)

****Investigate Model Performance****

In [15]:
# Training vs. validation loss per epoch.
metrics = history.history
plt.plot(history.epoch, metrics['loss'])
# Drawn against its implicit index (0..n-1), which is the same axis as history.epoch.
plt.plot(metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()

In [16]:
# Materialize the (unbatched) test set as NumPy arrays in a single pass.
test_examples = [(audio.numpy(), label.numpy()) for audio, label in test_ds]
test_audio = np.array([spec for spec, _ in test_examples])
test_labels = np.array([lab for _, lab in test_examples])

# Predicted class = index of the largest logit.
y_true = test_labels
y_pred = np.argmax(cnn_model.predict(test_audio), axis=1)

test_acc = sum(y_pred == y_true) / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')

In [17]:
# Confusion matrix of the CNN on the test set (rows: true label, columns: prediction).
confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
heatmap_options = dict(xticklabels=commands, yticklabels=commands, annot=True, fmt='g')
sns.heatmap(confusion_mtx, **heatmap_options)
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [18]:
# Pick one random Capuchinbird clip and show the CNN's class probabilities for it.
capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips"
capuchin_files = os.listdir(capuchin_dir)
sample_file = random.choice(capuchin_files)
print(sample_file)
sample_ds = preprocess_dataset([os.path.join(capuchin_dir, sample_file)])

for spectrogram, label in sample_ds.batch(1):
    prediction = cnn_model(spectrogram)
    # The model outputs logits; softmax turns them into probabilities for the bar chart.
    class_probabilities = tf.nn.softmax(prediction[0])
    plt.bar(commands, class_probabilities)
    plt.title(f'Predictions for "{commands[label[0]]}"')
    plt.show()

**Model Training (Transfer Learning)**

In [19]:
@tf.function
def load_wav_16k_mono(filename):
    """ Read a WAV file and return it as a 1-D float tensor: single channel, resampled to 16 kHz. """
    raw_bytes = tf.io.read_file(filename)
    samples, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    # Drop the size-1 channel axis left by the single-channel decode.
    mono = tf.squeeze(samples, axis=-1)
    # The resampler is given the input rate as int64.
    native_rate = tf.cast(native_rate, dtype=tf.int64)
    return tfio.audio.resample(mono, rate_in=native_rate, rate_out=16000)
def load_wav_for_map(filename, label):
    """ `Dataset.map` adapter: load `filename` via `load_wav_16k_mono` and pass `label` through unchanged. """
    waveform = load_wav_16k_mono(filename)
    return waveform, label
def extract_embedding(wav_data, label):
    """
    Run YAMNet on a waveform and return `(embeddings, labels)`.

    Only the second of the model's three outputs (the embeddings) is used.
    `label` is repeated once per embedding row so both outputs have the same
    leading length.
    """
    _scores, embeddings, _spectrogram = yamnet_model(wav_data)
    row_count = tf.shape(embeddings)[0]
    repeated_labels = tf.repeat(label, row_count)
    return embeddings, repeated_labels
def get_dataset_partitions_tf(ds, ds_size, train_split=0.8, val_split=0.1, test_split=0.1, shuffle=True, shuffle_size=10000):
    """
    Split a tf.data dataset into train, validation and test datasets.

    ds           -- dataset to split
    ds_size      -- number of elements in `ds`; it must match what `ds`
                    actually yields, because the train and validation sizes
                    are computed from it
    train_split, val_split, test_split
                 -- fractions of `ds_size`; they must sum to 1
    shuffle      -- shuffle `ds` (fixed seed) before splitting
    shuffle_size -- shuffle buffer size

    Returns `(train_ds, val_ds, test_ds)`. Train takes the first
    `int(train_split * ds_size)` elements, validation the next
    `int(val_split * ds_size)`, and test everything that is left.

    Raises AssertionError if the fractions do not sum to 1.
    """
    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in floating point.
    assert math.isclose(train_split + test_split + val_split, 1.0), \
        "train_split, val_split and test_split must sum to 1"

    if shuffle:
        # Fixed seed keeps the split identical between runs.
        # reshuffle_each_iteration=False keeps the order identical between
        # iterations too; otherwise take()/skip() would carve different
        # elements out on every pass and the three splits would overlap.
        ds = ds.shuffle(shuffle_size, seed=1842, reshuffle_each_iteration=False)

    train_size = int(train_split * ds_size)
    val_size = int(val_split * ds_size)

    train_ds = ds.take(train_size)
    remainder = ds.skip(train_size)
    val_ds = remainder.take(val_size)
    test_ds = remainder.skip(val_size)

    return train_ds, val_ds, test_ds
# Filter out annoying TensorFlow warnings: set the TensorFlow logger's level
# to ERROR for the rest of the session.
tf.get_logger().setLevel('ERROR')

**Download YAMNet pretrained model**

In [20]:
# TF Hub handle of the YAMNet model (version 1).
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
# Load the model from TF Hub; `yamnet_model` is called later to turn
# waveforms into (scores, embeddings, spectrogram).
yamnet_model = hub.load(yamnet_model_handle)

**Build Training Set**

In [21]:
capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips"
not_capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips"

# Collect the .wav paths of each class. os.listdir returns names in arbitrary
# order, so sort them to make the seeded shuffle below reproducible.
capuchin_files = os.listdir(capuchin_dir)
capuchin = [os.path.join(capuchin_dir, file)
            for file in sorted(capuchin_files) if file.endswith('.wav')]
not_capuchin_files = os.listdir(not_capuchin_dir)
not_capuchin = [os.path.join(not_capuchin_dir, file)
                for file in sorted(not_capuchin_files) if file.endswith('.wav')]

# One row per clip: target 1 = Capuchinbird, 0 = anything else.
capuchin_pd = pd.DataFrame({"filename":capuchin,"target":1})
not_capuchin_pd = pd.DataFrame({"filename":not_capuchin,"target":0})
dataset_pd = pd.concat([capuchin_pd,not_capuchin_pd],axis=0,ignore_index=True)

# Shuffle the CLIPS once, with a fixed seed, and split at clip level.
# Splitting after the embeddings are unbatched is wrong on two counts:
# len(dataset_pd) counts clips while the unbatched dataset holds one element
# per embedding frame, so the split sizes would not match the data; and frames
# from one clip could land in different splits.
shuffled_pd = dataset_pd.sample(frac=1, random_state=1842).reset_index(drop=True)
clip_ds = tf.data.Dataset.from_tensor_slices((shuffled_pd["filename"], shuffled_pd["target"]))
train_clips, val_clips, test_clips = get_dataset_partitions_tf(
    clip_ds, len(shuffled_pd), shuffle=False)  # already shuffled above


def _to_embeddings(clips):
    """Map a (filename, target) dataset to one (embedding, target) element per YAMNet frame."""
    return clips.map(load_wav_for_map).map(extract_embedding).unbatch()


# Per-frame embeddings over every clip, kept for inspection (e.g. main_ds.element_spec).
main_ds = _to_embeddings(clip_ds)

train_ds = _to_embeddings(train_clips).cache().shuffle(1000).batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = _to_embeddings(val_clips).cache().batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = _to_embeddings(test_clips).cache().batch(32).prefetch(tf.data.AUTOTUNE)

**Build Model**

In [22]:
# Small classifier head on top of the embeddings produced by YAMNet.
yamnet_transfer_learning_model = tf.keras.Sequential([
    # `shape` must be a tuple: `(1024)` is just the int 1024, `(1024,)` is a 1-tuple.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    # Two logits, one per class (the loss is configured with from_logits=True).
    tf.keras.layers.Dense(2)
], name='yamnet_transfer_learning_model')

yamnet_transfer_learning_model.summary()

In [23]:
# Stop once the training loss has not improved for 3 epochs and roll the
# model back to its best weights.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=3,
    restore_best_weights=True,
)

# The model outputs logits, hence from_logits=True.
yamnet_transfer_learning_model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

**Train model**

In [24]:
# Train the classifier head on the embedding dataset for up to 20 epochs.
history = yamnet_transfer_learning_model.fit(
    train_ds,
    epochs=20,
    validation_data=val_ds,
    # `callbacks` is documented as a list of Callback instances.
    callbacks=[callback],
)

**Investigate model performance**

In [25]:
# Report loss and accuracy on the held-out test split.
# The model is deliberately NOT fitted again in this cell. Training after the
# evaluation would make the numbers printed here describe a different model
# than the one used in the following cells, and would overwrite `history`.
# Re-run the training cell instead if more epochs are wanted.
loss, accuracy = yamnet_transfer_learning_model.evaluate(test_ds)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Use Model Outputs to Count Capuchinbird Calls

**Simple Test Case Generator**

In [26]:
def locations_to_approx_seconds(location):
    """Return `location * 3` as a string: the approximate start, in seconds, of a clip slot (3 per slot)."""
    approx_seconds = location * 3
    return str(approx_seconds)
def locations_to_approx_result_row(location):
    """Return `int(location * 6.5)` as a string: the approximate result row of a clip slot (6.5 rows per slot)."""
    approx_row = int(location * 6.5)
    return str(approx_row)
def make_tests(capuchin_path,not_capuchin_path,capuchin_count,total_clips=60,output_file="test.wav"):
    """
    Build a synthetic test recording with a known number of Capuchinbird calls.

    capuchin_path     -- WAV file used for every Capuchinbird call
    not_capuchin_path -- WAV file used as one of the two kinds of filler
    capuchin_count    -- number of calls to place in the recording
    total_clips       -- number of clip slots in the recording
    output_file       -- path the recording is exported to (WAV)

    `capuchin_count` distinct slots are drawn at random from
    `1 .. total_clips - 1` (slot 0 is never a call). Every other slot is
    filled, at random, with either 3 seconds of white noise or the
    not-Capuchinbird clip. The chosen slots are printed, and the clips are
    concatenated and exported.

    Returns `output_file`.
    Raises ValueError (from random.sample) if `capuchin_count` exceeds the
    number of available slots.
    """
    ms_per_second = 1000  # pydub durations are in milliseconds
    capuchin_sound = AudioSegment.from_wav(capuchin_path)
    not_capuchin_sound = AudioSegment.from_wav(not_capuchin_path)

    locations = sorted(random.sample(range(1, total_clips), capuchin_count))
    clip_positions = ",".join(map(str,locations))
    approx_locations_sec = ",".join(map(locations_to_approx_seconds,locations))
    approx_locations_result_row = ",".join(map(locations_to_approx_result_row,locations))
    print(f"Capuchin Calls are Located at [{clip_positions}] positions in the clip")
    print(f"Capuchin Calls are Located around [{approx_locations_sec}] seconds in the clip")
    print(f"Capuchin Calls are Located around [{approx_locations_result_row}] in the result rows")

    call_slots = set(locations)
    clips = []
    for i in range(total_clips):
        if i in call_slots:
            clips.append(capuchin_sound)
        # random.choice returns a scalar. The previous random.sample([0,1],1)
        # returned a one-element list, so comparing it with 0 was always False
        # and the white-noise filler could never be selected.
        elif random.choice([0, 1]) == 0:
            clips.append(WhiteNoise().to_audio_segment(duration=3*ms_per_second))
        else:
            clips.append(not_capuchin_sound)

    # Concatenate all slots in order.
    final_clip = clips[0]
    for clip in clips[1:]:
        final_clip = final_clip + clip
    final_clip.export(output_file, format="wav")
    return output_file

**Generate Test Case**

In [27]:
# Pick one random clip of each class and build a synthetic recording from them.
capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Capuchinbird_Clips"
not_capuchin_dir = "/kaggle/input/z-by-hp-unlocked-challenge-3-signal-processing/Parsed_Not_Capuchinbird_Clips"
capuchin_files = os.listdir(capuchin_dir)
not_capuchin_files = os.listdir(not_capuchin_dir)
Capuchin_File = random.choice(capuchin_files)
Not_Capuchin_File = random.choice(not_capuchin_files)
# Number of calls make_tests should place in the recording.
Num_Capuchin_Calls = 5
print(f"Using {Capuchin_File} and {Not_Capuchin_File} to generate {Num_Capuchin_Calls} Capuchinbird Calls")

not_capuchin_path = os.path.join(not_capuchin_dir, Not_Capuchin_File)
capuchin_path = os.path.join(capuchin_dir, Capuchin_File)
test_file_name = make_tests(capuchin_path, not_capuchin_path, Num_Capuchin_Calls)

**Generate Model Results**

In [28]:
# Load the generated recording, embed it with YAMNet, and score every
# embedding row with the transfer-learning classifier.
testing_wav_data = load_wav_16k_mono(test_file_name)
scores, embeddings, spectrogram = yamnet_model(testing_wav_data)
frame_logits = yamnet_transfer_learning_model(embeddings)
# One row per embedding; column 1 is the Capuchinbird logit read by the counter below.
result = frame_logits.numpy()
result

**Simple Capuchinbird Call Counter**
> All consecutive rows with a positive model score for the positive class (Capuchinbird) are counted as a single call. It is easy to see cases where this will fail to count the calls properly, so building a more complex call counter is up to you and is an important piece of your solution.

In [29]:
# Count calls as runs of consecutive rows whose Capuchinbird logit (column 1)
# is positive. A positive row starts a new call when it is the first positive
# row seen, or when at least one non-positive row separates it from the
# previous positive row.
count = 0
# None = no positive row seen yet. Starting at 0 would silently drop a call
# that begins at row 0 or 1, because `count - 0 > 1` is False for those rows.
previous_pos = None
capuchin_count = 0
print("Embeddings with Positive Val for Capuchinbird Call:")
for row in result:
    if row[1]>0:
        if previous_pos is None or count - previous_pos > 1:
            capuchin_count += 1
        previous_pos = count
        value = '%.2f' % round(row[1],2)
        # Extra space for rows below 100 keeps the printed columns aligned.
        if count <100:
            print(f"  row:  {count} value: {value}")
        else:
            print(f"  row: {count} value: {value}")
    count += 1
print(f"Found {capuchin_count} of {Num_Capuchin_Calls} Capuchin Calls!")