# Task 2: Setup

In [1]:
!pip install -q pydub

In [5]:
!pip install tensorflow-datasets

Collecting tensorflow-datasets
  Downloading tensorflow_datasets-4.4.0-py3-none-any.whl (4.0 MB)
Collecting promise
  Downloading promise-2.3.tar.gz (19 kB)
Collecting importlib-resources; python_version < "3.9"
  Downloading importlib_resources-5.2.2-py3-none-any.whl (27 kB)
Collecting tensorflow-metadata
  Downloading tensorflow_metadata-1.2.0-py3-none-any.whl (48 kB)
Collecting dill
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
Collecting googleapis-common-protos<2,>=1.52.0
  Downloading googleapis_common_protos-1.53.0-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: promise
  Building wheel for promise (setup.py): started
  Building wheel for promise (setup.py): finished with status 'done'
  Created wheel for promise: filename=promise-2.3-py3-none-any.whl size=21499 sha256=877cafbeda28a0266b75edff54fd99c434f9cf1ad01a68147f3ea3970d52b908
  Stored in directory: c:\users\tua_f\appdata\local\pip\cache\wheels\54\aa\01\724885182f93150035a2a91bce34a12877e8067a

In [6]:
import tensorflow as tf
import tensorflow_datasets as tfds

from IPython.display import Audio
from matplotlib import pyplot as plt
from tqdm import tqdm

# Record the library versions used for this run, one per line.
for lib in (tf, tfds):
    print(lib.__version__)

2.3.1
4.4.0


In [7]:
# Load the dataset by name; the result is indexed by split name further down.
# On first use this downloads the data, so it needs network access.
dataset = tfds.load(name="gtzan_music_speech")

[1mDownloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to C:\Users\tua_f\tensorflow_datasets\gtzan_music_speech\1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…






ConnectionError: HTTPConnectionPool(host='opihi.cs.uvic.ca', port=80): Max retries exceeded with url: /sound/music_speech.tar.gz (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E7B7FEE100>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))

# Task 3: Explore the Data

In [None]:
# Select the "train" split of the loaded dataset.
train = dataset["train"]

# Keep one iterator around so later cells can pull examples with next().
data_iter = iter(train)

In [None]:
# Advance the shared iterator by one example.
example = next(data_iter)

# Bare expression so the notebook displays the example's contents.
example

In [None]:
# Pull out the two fields of the example that the rest of the notebook uses.
audio, label = example.get("audio"), example.get("label")

# Show the raw class index of this example.
print(label.numpy())

# Inline audio player for the clip, played back at 22050 samples per second.
Audio(audio, rate=22050)

In [None]:
# Class index -> class name, plus the inverse lookup derived from it.
index_to_class = dict(enumerate(("music", "speech")))
class_to_index = {name: index for index, name in index_to_class.items()}

In [None]:
def plot_waveform(audio):
    """Draw an audio signal as amplitude versus sample index and show it.

    Args:
        audio: Sequence of amplitude values accepted by matplotlib's ``plot``.
    """
    # Work on the current axes, which is what the pyplot helpers draw on.
    axes = plt.gca()
    axes.plot(audio)
    axes.set_xlabel("samples")
    axes.set_ylabel("amplitude")
    plt.show()

In [None]:
# Convert the tensor to a NumPy array before handing it to matplotlib.
plot_waveform(audio.numpy())

# Task 4: Spectrogram

In [None]:
def get_stft(audio, frame_length=2048, frame_step=512, fft_length=256):
    """Compute the short-time Fourier transform of an audio signal.

    Args:
        audio: Signal to transform; it is cast to float32 first.
        frame_length: Forwarded to ``tf.signal.stft`` as ``frame_length``.
        frame_step: Forwarded to ``tf.signal.stft`` as ``frame_step``.
        fft_length: Forwarded to ``tf.signal.stft`` as ``fft_length``.

    Returns:
        The tensor returned by ``tf.signal.stft``.
    """
    # NOTE(review): the default fft_length (256) is smaller than the default
    # frame_length (2048) -- confirm this is intended; the model's input shape
    # further down depends on these defaults.
    stft_options = dict(
        frame_length=frame_length,
        frame_step=frame_step,
        fft_length=fft_length,
    )
    signal = tf.cast(audio, tf.float32)
    return tf.signal.stft(signal, **stft_options)

In [None]:
# STFT of the example clip using get_stft's default settings.
audio_stft = get_stft(audio)
# Keep only the magnitude of the STFT values.
audio_spec = tf.abs(audio_stft)

In [None]:
def plot_spec(spec):
    """Show a spectrogram as an image with a colorbar.

    Args:
        spec: 2-D spectrogram; it is transposed before being drawn.
    """
    figure = plt.figure(figsize=(12, 4))
    # Transpose so the first axis of ``spec`` runs horizontally in the image.
    image = plt.imshow(tf.transpose(spec), cmap="viridis")
    figure.colorbar(image)
    plt.show()

In [None]:
# Plot only the first 200 entries along the first axis of the spectrogram.
plot_spec(audio_spec[:200])

In [None]:
# Take the natural log of the magnitudes before plotting.
# NOTE(review): a magnitude that is exactly zero becomes -inf under the log --
# confirm that cannot occur for this data, or add a small offset.
audio_spec_log = tf.math.log(audio_spec)

# Same 200-entry slice as the linear-scale plot, now on the log scale.
plot_spec(audio_spec_log[:200])

In [None]:
def get_spectrogram(audio, eps=1e-6):
    """Return the log-magnitude spectrogram of an audio signal.

    The STFT is computed with ``get_stft``'s defaults, reduced to magnitudes,
    transposed, and log-scaled.

    Args:
        audio: Signal to transform (passed straight to ``get_stft``).
        eps: Small positive offset added to the magnitudes before the log so
            that an exactly-zero magnitude yields a finite value instead of
            -inf.

    Returns:
        The transposed log-magnitude spectrogram tensor.
    """
    magnitude = tf.abs(get_stft(audio))
    # log(0) is -inf, which would propagate as inf/NaN through training;
    # the offset keeps every value finite.
    return tf.math.log(tf.transpose(magnitude) + eps)

# Task 5: Prepare the Data

In [None]:
# Chunking constants read by preprocess_example.
sr = 22_050  # sample rate, in samples per second
chunk_len = 5  # length of one chunk, in seconds


def preprocess_example(example, n_chunks=6):
    """Split one dataset example into equal-length chunks of spectrograms.

    The audio is cut into ``n_chunks`` consecutive pieces of
    ``chunk_len * sr`` samples each (both are notebook-level constants), and
    every piece is converted with ``get_spectrogram``.

    Args:
        example: Mapping with an "audio" entry and a "label" entry.
        n_chunks: Number of consecutive chunks to take from the start of the
            audio. The default of 6 matches the previously hard-coded value.

    Returns:
        A tuple ``(x_batch, y_batch)``: the chunk spectrograms stacked along a
        new leading axis, and the example's label repeated once per chunk.

    Raises:
        ValueError: If ``n_chunks`` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError(f"n_chunks must be at least 1, got {n_chunks}")

    audio = example.get("audio")
    label = example.get("label")

    samples_per_chunk = chunk_len * sr

    # Collect all chunk spectrograms first and stack once, instead of
    # re-concatenating a growing tensor on every iteration.
    specs = [
        get_spectrogram(audio[start:start + samples_per_chunk])
        for start in range(0, n_chunks * samples_per_chunk, samples_per_chunk)
    ]

    x_batch = tf.stack(specs, axis=0)
    # Every chunk inherits the label of the clip it was cut from.
    y_batch = tf.stack([label] * n_chunks, axis=0)
    return x_batch, y_batch

In [None]:
# Preprocess every training example and gather the per-example batches.
# The batches are collected in lists and concatenated once at the end;
# concatenating onto a growing tensor inside the loop copies all previously
# accumulated data on every iteration.
x_batches, y_batches = [], []

for example in tqdm(iter(train)):
    x_batch, y_batch = preprocess_example(example)
    x_batches.append(x_batch)
    y_batches.append(y_batch)

# As before, the results stay None when the split yields no examples.
x_train = tf.concat(x_batches, axis=0) if x_batches else None
y_train = tf.concat(y_batches, axis=0) if y_batches else None

In [None]:
# Sanity check: the leading dimensions of inputs and labels should match.
print(x_train.shape, y_train.shape)

In [None]:
# Shuffle all samples with one shared permutation, then hold out the first
# n_val of them for validation.
# The sample count is read from the data rather than hard-coded, so this cell
# stays correct if the dataset size or the number of chunks per clip changes.
n_samples = int(x_train.shape[0])
n_val = 300

if n_val >= n_samples:
    raise ValueError(
        f"n_val ({n_val}) must be smaller than the number of samples ({n_samples})"
    )

indices = tf.random.shuffle(tf.range(n_samples))

# Apply the same permutation to inputs and labels so pairs stay aligned.
x_shuffled = tf.gather(x_train, indices)
y_shuffled = tf.gather(y_train, indices)

x_valid = x_shuffled[:n_val, ...]
y_valid = y_shuffled[:n_val, ...]

# NOTE: x_train / y_train are overwritten with the training portion only, so
# this cell must not be re-run without re-running the cell that builds them.
x_train = x_shuffled[n_val:, ...]
y_train = y_shuffled[n_val:, ...]

In [None]:
# Show six consecutive training samples, starting at offset `st`, in a
# 3 x 2 grid titled with their class names.
st = 0
n_rows, n_cols = 3, 2

plt.figure(figsize=(12, 12))

for i in range(n_rows * n_cols):
    x = x_train[st + i]
    y = y_train[st + i]

    plt.subplot(n_rows, n_cols, i + 1)
    plt.imshow(x, cmap="viridis")
    plt.title(index_to_class[y.numpy()])
    plt.colorbar()

plt.show()

# Task 6: Create the Model

In [None]:
from tensorflow.keras.layers import Input, Lambda, Conv2D, BatchNormalization
from tensorflow.keras.layers import Activation, MaxPool2D, Flatten, Dropout, Dense


# Each sample is a single 129 x 212 spectrogram without a channel axis.
input_ = Input(shape=(129, 212))

# Conv2D expects a channel axis, so append one of size 1.
x = Lambda(lambda t: tf.expand_dims(t, axis=-1))(input_)

# Four convolutional stages; the filter count doubles at each stage.
for n_filters in (32, 64, 128, 256):
    stage = (
        Conv2D(n_filters, 3),
        BatchNormalization(),
        Activation("tanh"),
        MaxPool2D(2),
    )
    for layer in stage:
        x = layer(x)

# Classifier head ending in a single sigmoid unit for the two classes.
head = (
    Flatten(),
    Dropout(0.4),
    Dense(128, activation="relu"),
    Dropout(0.4),
    Dense(1, activation="sigmoid"),
)
for layer in head:
    x = layer(x)

model = tf.keras.models.Model(inputs=input_, outputs=x)

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss,
# accuracy as the tracked metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-6)

model.compile(
    optimizer=optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Print the layer-by-layer overview of the model.
model.summary()

# Task 7: Model Training

In [None]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Log metrics every few epochs and stop once validation accuracy is met.

    Args:
        target_acc: Validation accuracy at or above which training is
            stopped. Defaults to 0.95.
        log_epoch: A metrics line is printed every ``log_epoch`` epochs.
            Defaults to 5.

    Raises:
        ValueError: If ``log_epoch`` is smaller than 1.
    """

    def __init__(self, *args, target_acc=0.95, log_epoch=5, **kwargs):
        # target_acc / log_epoch are consumed here rather than being forwarded
        # to the base class, which would reject them as unknown keywords.
        super().__init__(*args, **kwargs)
        if log_epoch is None:
            log_epoch = 5
        if log_epoch < 1:
            raise ValueError(f"log_epoch must be at least 1, got {log_epoch}")
        self.target_acc = 0.95 if target_acc is None else target_acc
        self.log_epoch = log_epoch

    @staticmethod
    def _fmt(value):
        """Format a metric with four decimals, or 'n/a' when it is missing."""
        return "n/a" if value is None else f"{value:.4f}"

    def on_epoch_end(self, epoch, logs=None):
        """Print metrics on logging epochs and stop training at the target.

        Args:
            epoch: Index of the epoch that just finished.
            logs: Metrics for the epoch; may be None.
        """
        logs = logs or {}
        loss = logs.get("loss")
        acc = logs.get("accuracy")
        val_loss = logs.get("val_loss")
        val_acc = logs.get("val_accuracy")

        if (epoch + 1) % self.log_epoch == 0:
            print(
                f"Epoch: {epoch:3d}, Loss: {self._fmt(loss)}, Acc: {self._fmt(acc)}, "
                f"Val Loss: {self._fmt(val_loss)}, Val Acc: {self._fmt(val_acc)}"
            )

        # val_accuracy is absent when training runs without validation data.
        if val_acc is not None and val_acc >= self.target_acc:
            print("Target val accuracy achieved", val_acc)
            # Stop the model this callback is attached to, not whatever the
            # notebook-level name `model` happens to refer to.
            self.model.stop_training = True

In [None]:
# Train quietly; CustomCallback prints periodic progress and stops training
# once its validation-accuracy target is reached.
fit_options = dict(
    batch_size=12,
    epochs=500,
    validation_data=(x_valid, y_valid),
    verbose=False,
    callbacks=[CustomCallback()],
)

# The returned history object is not used afterwards.
_ = model.fit(x_train, y_train, **fit_options)

# Task 8: Predictions

In [None]:
# NOTE: data_iter iterates over the training split, so this example is not
# held-out data; treat the output as a smoke test, not an evaluation.
example = next(data_iter)

x_test, y_test = preprocess_example(example)

preds = model.predict(x_test)
# Drop only the trailing size-1 output axis. Squeezing every axis would turn
# a single-chunk batch into a scalar, which cannot be iterated over below.
pred_classes = tf.cast(tf.squeeze(preds, axis=-1) > 0.5, tf.int8)

print("Preds:", [index_to_class[c.numpy()] for c in pred_classes])
print("GT:", [index_to_class[c.numpy()] for c in y_test])