In [1]:
import os
import pandas as pd
import numpy as np
from scipy import signal
import librosa
import pandas as pd
import tensorflow as tf
import cv2
from sklearn.model_selection import train_test_split

In [2]:
DATASET_DIR = "./Dataset/train/audio/"
# DATASET_DIR_TEST = './Dataset/train/audio/'

In [3]:
def log_specgram(audio, sample_rate, window_size=20, step_size=10, eps=1e-10):
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(
        audio,
        fs=sample_rate,
        window="hann",
        nperseg=nperseg,
        noverlap=noverlap,
        detrend=False,
    )
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

In [4]:
# This block of code was insipired by the following link:
# https://github.com/OldBonhart/TensorFlow_Speech_Recognition_Challenge/blob/master/Train___EfficientNet.ipynb
labels = os.listdir(DATASET_DIR)
data_map = {}
for ind, label in enumerate(labels):
    data_path = DATASET_DIR + label
    data_map[data_path] = label, os.listdir(data_path), ind
data_map.keys()
data_map = {}
for ind, label in enumerate(labels):
    data_path = DATASET_DIR + label
    data_map[ind] = label, os.listdir(data_path), ind

df = pd.DataFrame(data_map.values(), columns=["label_name", "path", "label"])
df = df.set_index(["label_name", "label"])
df.reset_index(inplace=True)
background_df = df[df["label_name"] == "_background_noise_"]
df = df[df["label_name"] != "_background_noise_"]
element_to_remove = "_background_noise_"
if element_to_remove in labels:
    labels.remove(element_to_remove)
rows = []
_ = df.apply(
    lambda row: [rows.append([row["label_name"], row["label"], nn]) for nn in row.path],
    axis=1,
)
df_new = pd.DataFrame(rows, columns=df.columns)
df_new.to_csv(r"speech_commands_dataset_all_labels.csv", index=True)
df_new["label"] = df_new["label"].apply(lambda x: x if x < 12 else x - 1)
data = df_new

In [6]:
train_df, val_df = train_test_split(data, stratify=data["label"], test_size=0.1)
train_df, test_df = train_test_split(
    train_df, stratify=train_df["label"], test_size=0.1
)
# train_background, test_background = train_test_split(background_df, stratify=train_df['label'], test_size=0.1)
print(train_df.shape, val_df.shape)
print(train_df.dtypes)
print(train_df.head())

(52423, 3) (6473, 3)
label_name    object
label          int64
path          object
dtype: object
      label_name  label                   path
46478      house     22  00b01445_nohash_1.wav
63391       four     29  3a70ab7f_nohash_1.wav
58207       five     27  39a12648_nohash_1.wav
19163        wow      9  023a61ad_nohash_0.wav
40641         up     19  f17be97f_nohash_2.wav


In [7]:
input_shape = (32, 32, 3)
num_classes = len(labels)
epochs = 2
batch_size = 64

In [8]:
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Flatten, Reshape
from keras.optimizers import Adam

model = Sequential(
    [
        Flatten(input_shape=input_shape),
        Reshape((-1, 128)),
        LSTM(128),
        Dense(num_classes, activation="softmax"),
    ]
)
model.compile(optimizer=Adam(), loss="categorical_crossentropy", metrics=["accuracy"])
model.summary()

  super().__init__(**kwargs)


In [9]:
def preprocess_data(row, num_classes):
    signal_path = DATASET_DIR + row["label_name"] + "/" + row["path"]
    samples, sample_rate = librosa.load(signal_path, mono=True, sr=None)
    freqs, times, spectrogram = log_specgram(samples, sample_rate)
    img = cv2.resize(spectrogram, (32, 32))
    img = np.divide((img - np.mean(img)), np.std(img), out=img, where=np.std(img) != 0)
    img = np.stack((img,) * 3, axis=-1).astype(np.float32)
    label = np.zeros(num_classes)
    label[row["label"]] = 1
    return img, label


def create_dataset(df, num_classes, batch_size):
    def generator():
        for index, row in df.iterrows():
            img, label = preprocess_data(row, num_classes)
            yield img, label

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(32, 32, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(num_classes,), dtype=tf.float32),
        ),
    )
    # dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size)
    # dataset = dataset.prefetch(2)
    return dataset

In [10]:
train_dataset = create_dataset(train_df, num_classes, batch_size)
val_dataset = create_dataset(val_df, num_classes, batch_size)
model.fit(
    train_dataset, epochs=epochs, batch_size=batch_size, validation_data=val_dataset
)

Epoch 1/2
    819/Unknown [1m41s[0m 48ms/step - accuracy: 0.3993 - loss: 2.0547

2024-05-14 06:34:34.810831: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(typ, value, traceback)


[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 53ms/step - accuracy: 0.3998 - loss: 2.0531 - val_accuracy: 0.7516 - val_loss: 0.8257
Epoch 2/2
[1m  3/820[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m36s[0m 45ms/step - accuracy: 0.7917 - loss: 0.8257

2024-05-14 06:34:39.236625: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.7693 - loss: 0.7723

2024-05-14 06:35:18.181869: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m820/820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 54ms/step - accuracy: 0.7693 - loss: 0.7722 - val_accuracy: 0.8123 - val_loss: 0.6185


2024-05-14 06:35:23.342145: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


<keras.src.callbacks.history.History at 0x32df38090>

In [11]:
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Flatten, Reshape
from keras.optimizers import Adam

model_background = Sequential(
    [
        Conv2D(32, (3, 3), activation="relu", input_shape=input_shape),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Flatten(),
        Reshape((-1, 128)),
        LSTM(128),
        Dense(1, activation="sigmoid"),
    ]
)
model_background.compile(
    optimizer=Adam(), loss="binary_crossentropy", metrics=["accuracy"]
)
model_background.summary()

  super().__init__(


In [12]:
# This block of code was insipired by the following link:
# https://github.com/OldBonhart/TensorFlow_Speech_Recognition_Challenge/blob/master/Train___EfficientNet.ipynb
labels = os.listdir(DATASET_DIR)
data_map = {}
for ind, label in enumerate(labels):
    data_path = DATASET_DIR + label
    data_map[data_path] = label, os.listdir(data_path), ind
data_map.keys()
data_map = {}
for ind, label in enumerate(labels):
    data_path = DATASET_DIR + label
    data_map[ind] = label, os.listdir(data_path), ind
df = pd.DataFrame(data_map.values(), columns=["label_name", "path", "label"])
df = df.set_index(["label_name", "label"])
df.reset_index(inplace=True)
# background_df = df[df['label_name'] == '_background_noise_']
# df = df[df['label_name'] != '_background_noise_']
# element_to_remove = "_background_noise_"
# if element_to_remove in labels:
#     labels.remove(element_to_remove)
rows = []
_ = df.apply(
    lambda row: [rows.append([row["label_name"], row["label"], nn]) for nn in row.path],
    axis=1,
)
df_new = pd.DataFrame(rows, columns=df.columns)
df_new.to_csv(r"speech_commands_dataset_all_labels.csv", index=True)
df_new["label"] = df_new["label"].apply(lambda x: x if x < 12 else x - 1)
data = df_new

In [13]:
train_df, val_df = train_test_split(data, stratify=data["label"], test_size=0.1)
train_df, test_df = train_test_split(
    train_df, stratify=train_df["label"], test_size=0.1
)
# train_background, test_background = train_test_split(background_df, stratify=train_df['label'], test_size=0.1)
print(train_df.shape, val_df.shape)
print(train_df.dtypes)
print(train_df.head())

(84290, 3) (10407, 3)
label_name    object
label          int64
path          object
dtype: object
                label_name  label                   path
81260                   up     19  7c1d8533_nohash_1.wav
86828                house     22  96ab6565_nohash_0.wav
73468                 bird     16  de650823_nohash_0.wav
103403                four     29  be7a5b2d_nohash_1.wav
54833   _background_noise_     13    pink_noise_1824.wav


In [6]:
train_background = train_df.copy()
val_background = val_df.copy()
test_background = test_df.copy()
train_background["label"] = train_df["label_name"].apply(
    lambda x: 1 if x == "_background_noise_" else 0
)
val_background["label"] = val_df["label_name"].apply(
    lambda x: 1 if x == "_background_noise_" else 0
)
test_background["label"] = test_df["label_name"].apply(
    lambda x: 1 if x == "_background_noise_" else 0
)

In [7]:
train_background

Unnamed: 0,label_name,label,path
62389,_background_noise_,1,pink_noise_5615.wav
28640,three,0,587f3271_nohash_1.wav
64348,_background_noise_,1,exercise_bike_4964.wav
93868,yes,0,28e47b1a_nohash_0.wav
31617,_background_noise_,1,doing_the_dishes_8893.wav
...,...,...,...
11531,happy,0,c5570933_nohash_0.wav
54488,_background_noise_,1,dude_miaowing_3599.wav
14515,dog,0,c24d96eb_nohash_0.wav
33242,_background_noise_,1,white_noise_3728.wav


In [8]:
input_shape = (32, 32, 3)
num_classes = len(labels)
epochs = 2
batch_size = 64

In [18]:
def preprocess_data(row):
    signal_path = DATASET_DIR + row["label_name"] + "/" + row["path"]
    samples, sample_rate = librosa.load(signal_path, mono=True, sr=None)
    freqs, times, spectrogram = log_specgram(samples, sample_rate)
    img = cv2.resize(spectrogram, (32, 32))
    img = np.divide((img - np.mean(img)), np.std(img), out=img, where=np.std(img) != 0)
    img = np.stack((img,) * 3, axis=-1).astype(np.float32)
    label = np.array([row["label"]], dtype=np.float32)
    return img, label


def create_dataset(df, batch_size):
    def generator():
        for index, row in df.iterrows():
            img, label = preprocess_data(row)
            yield img, label

    dataset = tf.data.Dataset.from_generator(
        generator,
        output_signature=(
            tf.TensorSpec(shape=(32, 32, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(1,), dtype=tf.float32),
        ),
    )
    # dataset = dataset.shuffle(buffer_size=len(df))
    dataset = dataset.batch(batch_size)
    # dataset = dataset.prefetch(2)
    return dataset

In [19]:
# train_background[train_background['label'] == 1]
count_1s = len(train_background[train_background["label"] == 1])
count_0s = len(train_background[train_background["label"] == 0])

print("Number of 1s:", count_1s)
print("Number of 0s:", count_0s)

Number of 1s: 31867
Number of 0s: 52423


In [20]:
train_dataset = create_dataset(train_background, batch_size)
val_dataset = create_dataset(val_background, batch_size)

In [21]:
model_background.fit(
    train_dataset, epochs=epochs, batch_size=batch_size, validation_data=val_dataset
)

Epoch 1/2
    292/Unknown [1m15s[0m 52ms/step - accuracy: 0.9603 - loss: 0.1188

KeyboardInterrupt: 