# Imports

In [9]:
# Look up the installed Jupyter kernel named "tensorflow" and print the
# directory that holds its kernel-spec resources.
from jupyter_client import kernelspec
spec = kernelspec.get_kernel_spec("tensorflow")
print(spec.resource_dir)

/Users/ryantran/Library/Jupyter/kernels/tensorflow


In [4]:
import os

from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio

ModuleNotFoundError: No module named 'tensorflow_io'

# Convert file to 16 kHz mono audio

In [5]:
@tf.function
def load_wav_16k_mono(filename):
    """Read a WAV file and return it as a 16 kHz, single-channel float tensor.

    Args:
        filename: Path of the WAV file; it is read with ``tf.io.read_file``.

    Returns:
        The decoded waveform with its channel axis squeezed away, resampled
        to an output rate of 16000.
    """
    raw_bytes = tf.io.read_file(filename)
    # Request exactly one channel from the decoder, then drop that axis.
    decoded, native_rate = tf.audio.decode_wav(raw_bytes, desired_channels=1)
    mono = tf.squeeze(decoded, axis=-1)
    # The decoded sample rate is cast to int64 before resampling.
    return tfio.audio.resample(
        mono,
        rate_in=tf.cast(native_rate, dtype=tf.int64),
        rate_out=16000,
    )

# Path declaration

In [7]:
# Locations of the four dataset folders, all directly under 'data'.
cleanTrain, cleanTest, noiseTrain, noiseTest = (
    os.path.join('data', folder)
    for folder in ('clean_train', 'clean_test', 'noise_train', 'noise_test')
)

# Testing the YAMNet model

In [3]:
# Load the YAMNet model from its tfhub.dev handle using tensorflow_hub.
yamnet_model_handle = 'https://tfhub.dev/google/yamnet/1'
yamnet_model = hub.load(yamnet_model_handle)

NameError: name 'hub' is not defined

In [6]:
# The model exposes the path of a CSV class map; its 'display_name' column
# holds the class names.
class_map_path = yamnet_model.class_map_path().numpy().decode('utf-8')
class_names = pd.read_csv(class_map_path)['display_name'].tolist()

# Preview the first 20 class names.
for name in class_names[:20]:
    print(name)
print('...')

Speech
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
...


In [8]:
# Pick one clip from the noise training folder to sanity-check loading and
# playback. os.listdir() returns entries in arbitrary order, so the listing is
# sorted first to make index 5 refer to the same file on every run/platform.
test = os.path.join(noiseTrain, sorted(os.listdir(noiseTrain))[5])
testing_wav_data = load_wav_16k_mono(test)
# Render an audio player for the clip at the 16 kHz rate it was resampled to.
display.Audio(testing_wav_data, rate=16000)





In [9]:
# Run YAMNet on the sample clip; the call returns scores, embeddings and a
# spectrogram.
scores, embeddings, spectrogram = yamnet_model(testing_wav_data)
# Average the scores over axis 0 (presumably the per-frame axis) to get one
# score per class for the clip.
class_scores = tf.reduce_mean(scores, axis=0)
# Index of the highest averaged score, used to look up the class name.
top_class = tf.math.argmax(class_scores)
inferred_class = class_names[top_class]

print(f'The main sound is: {inferred_class}')
print(f'The embeddings shape: {embeddings.shape}')

The main sound is: Vehicle
The embeddings shape: (61, 1024)


# Create and embed train dataset

In [10]:
# Build the training manifest: one row per file, labelled by source folder
# (category 0 = file from noiseTrain, category 1 = file from cleanTrain).
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row copies it on every iteration, so the rows are
# collected first and the frame is constructed once. Passing `columns` keeps
# the expected schema even when both folders are empty.
train_records = [
    {'filename': os.path.join(folder, entry), 'category': label}
    for folder, label in ((noiseTrain, 0), (cleanTrain, 1))
    for entry in os.listdir(folder)
]
pd_data = pd.DataFrame(train_records, columns=['filename', 'category'])

pd_data.head(10)

Unnamed: 0,filename,category
0,data\noise_train\AirConditioner_1.wav,0
1,data\noise_train\AirConditioner_10.wav,0
2,data\noise_train\AirConditioner_2.wav,0
3,data\noise_train\AirConditioner_3.wav,0
4,data\noise_train\AirConditioner_4.wav,0
5,data\noise_train\AirConditioner_5.wav,0
6,data\noise_train\AirConditioner_6.wav,0
7,data\noise_train\AirConditioner_7.wav,0
8,data\noise_train\AirConditioner_8.wav,0
9,data\noise_train\AirConditioner_9.wav,0


In [39]:
# Pair every file path with its label (cast to int64) and wrap the pairs in a
# tf.data pipeline.
filenames = pd_data['filename']
categories = pd_data['category'].astype('int64')

main_ds = tf.data.Dataset.from_tensor_slices((filenames, categories))
# Show the structure of a single dataset element.
main_ds.element_spec

(TensorSpec(shape=(), dtype=tf.string, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [18]:
def load_wav_for_map(filename, label):
  """Load the waveform for `filename` and pass `label` through unchanged."""
  waveform = load_wav_16k_mono(filename)
  return waveform, label

# Apply load_wav_for_map to every element, replacing the filename with the
# loaded waveform.
# NOTE(review): this rebinds main_ds, so re-running the cell maps an
# already-mapped dataset; rebuild main_ds before re-executing.
main_ds = main_ds.map(load_wav_for_map)
main_ds.element_spec





(TensorSpec(shape=<unknown>, dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

In [19]:
# Embedding extraction for a single (waveform, label) element.
def extract_embedding(wav_data, label):
  """Run YAMNet on `wav_data` and return its embeddings with matching labels.

  The label is repeated once per embedding row, so both returned tensors have
  the same leading dimension.
  """
  _, frame_embeddings, _ = yamnet_model(wav_data)
  repeated_labels = tf.repeat(label, tf.shape(frame_embeddings)[0])
  return frame_embeddings, repeated_labels

# Extract embeddings for every element, then unbatch so that each dataset
# element is a single (embedding, label) pair.
# NOTE(review): this rebinds main_ds; re-running the cell would feed
# embeddings back into extract_embedding.
main_ds = main_ds.map(extract_embedding).unbatch()
main_ds.element_spec

(TensorSpec(shape=(1024,), dtype=tf.float32, name=None),
 TensorSpec(shape=(), dtype=tf.int64, name=None))

# Create and embed test dataset

In [None]:
# Build the test manifest: one row per file, labelled by source folder
# (category 0 = file from noiseTest, category 1 = file from cleanTest).
# Note that this rebinds pd_data, replacing the training manifest.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# growing a frame row by row copies it on every iteration, so the rows are
# collected first and the frame is constructed once. Passing `columns` keeps
# the expected schema even when both folders are empty.
test_records = [
    {'filename': os.path.join(folder, entry), 'category': label}
    for folder, label in ((noiseTest, 0), (cleanTest, 1))
    for entry in os.listdir(folder)
]
pd_data = pd.DataFrame(test_records, columns=['filename', 'category'])

pd_data.head(10)

In [None]:
# Build the test pipeline from the test manifest currently held in pd_data:
# (filename, label) -> (waveform, label) -> one (embedding, label) per row.
filenames = pd_data['filename']
categories = pd_data['category'].astype('int64')

test_ds = tf.data.Dataset.from_tensor_slices((filenames, categories))
# Each step must chain on test_ds. Mapping main_ds here would discard the
# test files and evaluate the model on data derived from the training set.
test_ds = test_ds.map(load_wav_for_map)
test_ds = test_ds.map(extract_embedding).unbatch()
test_ds.element_spec

In [None]:
def _split_train_val(dataset, val_every=5):
    """Split `dataset` into (train, val) by element position.

    Every `val_every`-th element (positions 0, val_every, 2*val_every, ...)
    goes to the validation split and the rest go to the training split, so
    the default of 5 gives roughly an 80/20 split. Works on datasets of
    unknown cardinality because it never needs the total element count.

    Args:
        dataset: The tf.data.Dataset to split.
        val_every: Send one element out of this many to validation.

    Returns:
        A (train_dataset, val_dataset) tuple with the same element structure
        as `dataset`.
    """
    indexed = dataset.enumerate()
    strip_index = lambda position, element: element
    train_part = indexed.filter(
        lambda position, element: position % val_every != 0)
    val_part = indexed.filter(
        lambda position, element: position % val_every == 0)
    return train_part.map(strip_index), val_part.map(strip_index)


# Separate the two classes so each one is split with the same ratio.
# cache() is applied once and shared instead of building two separate caches.
cached_ds = main_ds.cache()
noise_ds = cached_ds.filter(lambda embedding, category: category == 0)
clean_ds = cached_ds.filter(lambda embedding, category: category == 1)

# tfds.load() loads a named catalogue dataset and cannot split an existing
# tf.data.Dataset (tfds is also never imported), so split by position instead.
# NOTE(review): if consecutive elements come from the same audio file, train
# and validation will share files; split at file level if that matters.
trainNoise_ds, valNoise_ds = _split_train_val(noise_ds)
trainClean_ds, valClean_ds = _split_train_val(clean_ds)

train_ds = trainNoise_ds.concatenate(trainClean_ds)
val_ds = valNoise_ds.concatenate(valClean_ds)

In [54]:
# Cache every split, shuffle only the training data, then batch by 32 and
# prefetch.
# 23205 is the shuffle buffer size — presumably the number of training
# embeddings; TODO confirm and update it if the dataset changes.
# NOTE(review): each line rebinds its dataset, so re-running this cell
# batches already-batched data.
train_ds = train_ds.cache().shuffle(23205).batch(32).prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.cache().batch(32).prefetch(tf.data.AUTOTUNE)

23205

# Create and train model

In [None]:
# Class names indexed by label. The file manifests assign category 0 to the
# noise folders and category 1 to the clean folders. This name was previously
# never defined, so building the model raised NameError.
my_classes = ['noise', 'clean']

# Small classifier head on top of the 1024-dimensional embeddings: one hidden
# ReLU layer followed by one output logit per class.
my_model = tf.keras.Sequential([
    # shape must be a tuple; `(1024)` is just the int 1024, which newer Keras
    # versions refuse to convert to a shape.
    tf.keras.layers.Input(shape=(1024,), dtype=tf.float32,
                          name='input_embedding'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(len(my_classes))
], name='my_model')

my_model.summary()

In [None]:
# The final Dense layer has no activation, so the loss is told to expect
# logits (from_logits=True) with integer class labels.
my_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 optimizer="adam",
                 metrics=['accuracy'])

# Stop after 3 epochs without improvement and restore the best weights.
# NOTE(review): this monitors the training 'loss' even though validation data
# is supplied below — confirm whether 'val_loss' was intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                            patience=3,
                                            restore_best_weights=True)

# Train for at most 20 epochs, evaluating on val_ds after each epoch.
history = my_model.fit(train_ds,
                       epochs=20,
                       validation_data=val_ds,
                       callbacks=callback)

# Evaluate on the test set to check for overfitting

In [None]:
# Evaluate the trained model on test_ds; with the single 'accuracy' metric
# configured at compile time, evaluate() yields the loss and the accuracy.
loss, accuracy = my_model.evaluate(test_ds)

print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Test model

In [None]:
# End-to-end check on the sample clip: YAMNet embeddings -> classifier logits.
scores, embeddings, spectrogram = yamnet_model(testing_wav_data)
result = my_model(embeddings).numpy()

# Average the logits over axis 0 and take the highest-scoring class.
# NOTE(review): my_classes must be a sequence of class names ordered by label
# index — confirm it is defined before this cell runs.
inferred_class = my_classes[result.mean(axis=0).argmax()]
print(f'The main sound is: {inferred_class}')