In [2]:
import tensorflow as tf
import os
import librosa
import IPython.display as ipd

In [3]:
# Class names in alphanumeric order. `audio_dataset_from_directory` derives its
# integer labels from the sorted *sub-directories* of the dataset root, so plain
# files (README, LICENSE, file lists, ...) and hidden entries such as
# `.ipynb_checkpoints` must be excluded here; otherwise every index after them
# would be shifted and the wrong classes would be picked below.
_dataset_dir = "./../speech_commands_v0.02"
labels = sorted(
    entry
    for entry in os.listdir(_dataset_dir)
    if os.path.isdir(os.path.join(_dataset_dir, entry)) and not entry.startswith(".")
)

# NOTE: despite its name this dict maps label name -> integer index.
idx_label_dict = {label: idx for idx, label in enumerate(labels)}

# Integer class ids of the four direction commands (KeyError if a folder is missing).
up_label = idx_label_dict["up"]
down_label = idx_label_dict["down"]
right_label = idx_label_dict["right"]
left_label = idx_label_dict["left"]

In [4]:
# Load the dataset unbatched and split it 80/20 with a fixed seed. Every clip is
# padded/truncated to 16000 samples. Note that `test_ds` holds the *validation*
# subset returned by `subset='both'`.
train_ds, test_ds = tf.keras.utils.audio_dataset_from_directory(
    directory="./../speech_commands_v0.02",
    subset='both',
    validation_split=0.2,
    seed=0,
    batch_size=None,
    output_sequence_length=16000,
)

Found 105835 files belonging to 36 classes.
Using 84668 files for training.
Using 21167 files for validation.


2024-05-19 13:28:31.866628: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-19 13:28:31.901222: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-05-19 13:28:31.905086: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

In [5]:
# Background-noise recordings used by `add_noise`, loaded directly at 16 kHz.
# Without an explicit `sr`, `librosa.load` first resamples to its 22050 Hz
# default, so the previous load-then-resample sequence resampled every clip
# twice. After this cell `sr` is 16000.
idel_wav_file_path = "./../../LESSI noise/Idel.wav"
idel_wav, sr = librosa.load(idel_wav_file_path, sr=16000)

walk_wav_file_path = "./../../LESSI noise/Walk.wav"
# Skip the beginning of the recording. The original code dropped the first
# 50000 samples of the 22050 Hz signal, i.e. 50000 / 22050 ~= 2.27 seconds.
walk_wav, sr = librosa.load(walk_wav_file_path, sr=16000, offset=50000 / 22050)

def _superimpose_noise(speech_wav, noise_wav, min_alpha, max_alpha, num_samples=16000):
    """Blend a random `num_samples`-long excerpt of `noise_wav` into `speech_wav`.

    Args:
        speech_wav: 1-D float waveform with `num_samples` samples.
        noise_wav: 1-D float array holding the noise recording; it must be
            longer than `num_samples`.
        min_alpha, max_alpha: bounds of the uniformly drawn speech weight.
        num_samples: length of the noise excerpt to cut out.

    Returns:
        `alpha * speech_wav + (1 - alpha) * noise_excerpt`.
    """
    alpha = tf.random.uniform(shape=(), minval=min_alpha, maxval=max_alpha, dtype=tf.float32)

    # Random start position such that the excerpt fits inside the recording.
    pos = tf.random.uniform(shape=(), minval=0, maxval=len(noise_wav) - num_samples, dtype=tf.int32)
    # A contiguous slice is equivalent to gathering range(pos, pos + num_samples)
    # and keeps the excerpt length statically known.
    noise = tf.slice(noise_wav, [pos], [num_samples])

    return alpha * speech_wav + (1 - alpha) * noise


def add_noise(speech_wav):
    """Superimpose one of the two background noises on a speech clip.

    With equal probability either the "idel" or the "walk" recording is chosen
    (module-level `idel_wav` / `walk_wav`), a random 16000-sample excerpt is cut
    from it and mixed with the speech. The speech weight is drawn from
    [0.2, 0.4) for "idel" and from [0.4, 0.5) for "walk".

    Args:
        speech_wav: 1-D float waveform of 16000 samples.

    Returns:
        The noisy waveform, same length as `speech_wav`.
    """
    r = tf.random.uniform(shape=(), minval=0, maxval=2, dtype=tf.int32)

    # idel
    if r == 0:
        return _superimpose_noise(speech_wav, idel_wav, min_alpha=0.2, max_alpha=0.4)
    # walk
    else:
        return _superimpose_noise(speech_wav, walk_wav, min_alpha=0.4, max_alpha=0.5)

In [6]:
def relabel(label):
    """Collapse the dataset's class ids into five target classes.

    Mapping: up -> 0, right -> 1, down -> 2, left -> 3, every other class -> 4.
    The comparisons use the module-level `up_label`, `right_label`,
    `down_label` and `left_label` ids.
    """
    if label == up_label:
        return 0
    if label == right_label:
        return 1
    if label == down_label:
        return 2
    if label == left_label:
        return 3
    # Anything that is not one of the four direction commands.
    return 4

In [7]:
def prepare_data(dataset):
    """Turn a raw `(wav, label)` dataset into noisy, relabelled training examples.

    Steps currently active:
      1. map the class id to the 5-class target via `relabel`,
      2. flatten the waveform to a 1-D tensor,
      3. mix in random background noise via `add_noise`.

    The remaining stages (one-hot, cache, spectrogram, resize, shuffle/batch/
    prefetch) are still disabled and kept below as comments.
    """

    def _relabel_and_flatten(wav, label):
        # Flatten: (num_samples, 1) -> (num_samples)
        return tf.reshape(wav, (-1,)), relabel(label)

    dataset = dataset.map(_relabel_and_flatten)

    # One hot target
    #dataset = dataset.map(lambda wav, label: (wav, tf.one_hot(label, depth=5)))

    #
    # Cache (placed before the noise step so the noise would be redrawn on every pass)
    #
    #dataset = dataset.cache()

    def _with_noise(wav, target):
        return add_noise(wav), target

    dataset = dataset.map(_with_noise)

    # Spectrogram shape = (124, 129, 1)
    #dataset = dataset.map(lambda wav, target:(get_spectrogram(wav), target))

    # resize: (124, 129) -> (128, 128)
    #dataset = dataset.map(lambda spectrogram, target: (tf.image.resize(spectrogram, (128, 128)), target))

    #
    # Shuffle, batch, prefetch
    #
    #dataset = dataset.shuffle(5000)
    #dataset = dataset.batch(BATCH_SIZE)
    #dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset

In [8]:
# NOTE: this rebinds `train_ds`; re-running the cell would relabel and add noise
# a second time, so re-run the loading cell first.
train_ds = prepare_data(train_ds)

In [9]:
# Spot-check the first 50 prepared examples: print the target class and render
# an audio player for the noisy 16 kHz waveform.
preview_ds = train_ds.take(50)
for wav, label in preview_ds:
    print(label)
    player = ipd.Audio(data=wav, rate=16000)
    ipd.display(player)

tf.Tensor(1, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(3, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(2, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(0, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(1, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(0, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


tf.Tensor(4, shape=(), dtype=int32)


2024-05-19 13:28:35.803810: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
