In [110]:
import torch
import torch.nn as nn
import torchaudio
import librosa
import os
import IPython.display as ipd

## Importar Dataset

In [82]:
# Load the dataset

class CustomSpeechCommands(torchaudio.datasets.SPEECHCOMMANDS):
    """Speech Commands v0.02 dataset restricted to a custom split file.

    The parent dataset is created with root ``""`` (the current working
    directory), ``download=True`` and ``url="speech_commands_v0.02"``. For a
    named subset, the list of clips the dataset iterates over (``_walker``)
    is then replaced by the paths read from a text file located in the
    current working directory.

    Args:
        subset_to_download: One of ``"training"``, ``"validation"``,
            ``"testing"`` or ``"all"``. ``"all"`` keeps whatever clip list
            the parent class built for ``subset=None``.

    Raises:
        RuntimeError: If ``subset_to_download`` is not one of the values above.
    """

    # Split name -> list file (relative to the current working directory).
    _SUBSET_LISTS = {
        "validation": "val_list.txt",
        "testing": "test_list.txt",
        "training": "train_list.txt",
    }

    # Directory that the entries of the list files are relative to.
    _DATA_DIR = "./SpeechCommands/speech_commands_v0.02/"

    def __init__(self, subset_to_download: str = "all"):
        # Validate before touching the network/disk so a typo fails fast.
        if subset_to_download != "all" and subset_to_download not in self._SUBSET_LISTS:
            raise RuntimeError("Invalid subset")

        super().__init__("", download=True, subset=None, url="speech_commands_v0.02")

        # "all" (the default) keeps the parent's walker untouched; previously
        # the default value fell through to the error branch.
        if subset_to_download != "all":
            self._walker = self.load_list(self._SUBSET_LISTS[subset_to_download])

    def load_list(self, *filenames: str) -> list:
        """Read one or more split files and return normalized clip paths.

        Each non-blank line of every file is stripped, joined onto the
        dataset directory and normalized with ``os.path.normpath``.

        Args:
            *filenames: Names of the list files, resolved against the current
                working directory.

        Returns:
            The clip paths from all files, in file order.
        """
        output = []
        for filename in filenames:
            filepath = os.path.join("", filename)
            with open(filepath) as fileobj:
                for line in fileobj:
                    entry = line.strip()
                    # A blank (e.g. trailing) line would otherwise become an
                    # entry pointing at the dataset directory itself.
                    if not entry:
                        continue
                    output.append(os.path.normpath(os.path.join(self._DATA_DIR, entry)))
        return output


train_dataset = CustomSpeechCommands("training")
val_dataset = CustomSpeechCommands("validation")
test_dataset = CustomSpeechCommands("testing")

# Class vocabulary, derived from the clip paths instead of iterating the
# dataset: indexing the dataset decodes every audio file just to read its
# label. Assumes the label of a clip is the name of its parent directory
# (this is how torchaudio's SPEECHCOMMANDS derives it -- confirm for the
# installed version).
# Sorting makes the label -> index mapping identical on every kernel restart;
# the iteration order of a set of strings is not stable between runs.
valid_labels = sorted(
    {os.path.basename(os.path.dirname(path)) for path in train_dataset._walker}
)

In [107]:
# Feature-extraction settings used by collate_and_transform.
_SAMPLE_RATE = 16_000
_N_FFT = int(_SAMPLE_RATE * 40e-3)  # 40 ms analysis window
_HOP_LENGTH = _N_FFT // 2  # 50% overlap between consecutive windows
_N_MFCC = 20

# Built once at definition time instead of on every batch: the transform's
# parameters never change between calls.
_MFCC_TRANSFORM = torchaudio.transforms.MFCC(
    sample_rate=_SAMPLE_RATE,
    n_mfcc=_N_MFCC,
    melkwargs={
        "hop_length": _HOP_LENGTH,
        "n_fft": _N_FFT,
    },
)


def collate_and_transform(batch):
    """Collate raw dataset items into MFCC feature tensors and label indices.

    Args:
        batch: Iterable of dataset items whose first element is the waveform
            and whose third element is the label string (remaining fields are
            ignored). Waveforms are assumed to be mono, shaped (1, time) --
            TODO confirm for any non-Speech-Commands source.

    Returns:
        A tuple ``(features, labels)``:
        ``features`` stacks the MFCCs, their deltas and their delta-deltas
        along a new dimension 1, i.e. (batch, 3, n_mfcc, frames) for mono
        input; ``labels`` is a 1-D tensor with the position of each
        lower-cased label in ``valid_labels``.

    Raises:
        ValueError: If a label is not present in ``valid_labels``.
    """
    audios, labels = [], []

    for audio, _, label, *_ in batch:
        # pad_sequence pads along the first dimension, so put time first.
        audios.append(audio.t())
        labels.append(torch.tensor(valid_labels.index(label.lower())))

    labels = torch.stack(labels)

    # Zero-pad every clip to the longest one in the batch, then drop only the
    # trailing channel axis. A bare squeeze() would also drop the batch axis
    # for a batch with a single clip (e.g. the last, incomplete batch).
    audios = torch.nn.utils.rnn.pad_sequence(audios, batch_first=True).squeeze(-1)

    # Per the torchaudio docs MFCC returns (..., n_mfcc, time) and
    # compute_deltas differentiates along the last (time) dimension.
    mfcc_vals = _MFCC_TRANSFORM(audios)
    # Original note: (batch, 20, 51) for these clips. TODO: change the number
    # of features.
    delta = torchaudio.functional.compute_deltas(mfcc_vals)
    delta2 = torchaudio.functional.compute_deltas(delta)

    audios = torch.stack((mfcc_vals, delta, delta2), dim=1)

    return audios, labels

BATCH_SIZE = 32


def _make_loader(dataset, shuffle):
    """Build a DataLoader that batches `dataset` through collate_and_transform."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        collate_fn=collate_and_transform,
    )


# Only the training data is shuffled; validation and test batches keep a fixed
# order so evaluation runs are reproducible.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader = _make_loader(val_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

In [35]:
# Listen to the last training clip (the original note labels it "Zero",
# presumably the spoken word -- confirm).
# The playback rate is taken from the dataset item itself rather than being
# hard-coded; the second field of an item is its sample rate.
last_waveform, last_sample_rate, *_ = train_dataset[-1]
ipd.Audio(last_waveform.numpy(), rate=last_sample_rate)