In [77]:
import torch
from torchaudio.datasets import SPEECHCOMMANDS
import os
import matplotlib.pyplot as plt
import IPython.display as ipd
import torchaudio
import glob
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as tud
import pandas as pd
import torch


In [51]:

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("../..", download=False)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.join(self._path, line.strip()) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

In [52]:
# Create training and testing split of the data. We do not use validation in this tutorial.
train_set = SubsetSC("training")
test_set = SubsetSC("testing")

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]


In [53]:

waveform, sample_rate, label, speaker_id, utterance_number

(tensor([[-0.0658, -0.0709, -0.0753,  ..., -0.0700, -0.0731, -0.0704]]),
 16000,
 'backward',
 '0165e0e8',
 0)

In [55]:
class AudioDataset(tud.Dataset):
    """
    A rapper class for the UrbanSound8K dataset.
    """

    def __init__(
        self,  
        annotation_file, 
        audio_dir, 
        folds,
        target_sample_rate=16000,
        num_samples=32000):
        self.audio_file = pd.read_csv(annotation_file)
        self.folds = folds
        self.audio_paths = glob.glob(audio_dir + '/*' + str(self.folds) + '/*')
        self.num_samples = num_samples
        self.target_sample_rate = target_sample_rate
    
    

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        
        audio_sample_path = self._get_audio_sample_path(idx)
        label = self._get_audio_sample_label(audio_sample_path)
        signal, sr = torchaudio.load(audio_sample_path)
        print(signal.shape, sr)
        signal = self._resample_if_necessary(signal, sr)
        print(signal.shape, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        print(signal.shape, sr)
        signal = self._right_pad_if_necessary(signal)
        print(signal.shape, sr)
        
        return signal, label

    def _cut_if_necessary(self, signal):
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            signal = resampler(signal)
        return signal

    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, idx):
        audio_path = self.audio_paths[idx]
        return audio_path

    def _get_audio_sample_label(self, audio_path):
        audio_name = audio_path.split(sep='/')[-1]
        label = self.audio_file.loc[self.audio_file.slice_file_name == audio_name].iloc[0,-2]
        return label

In [56]:
train_ds = AudioDataset(
    audio_dir="/home/hacene/Documents/UrbanSound8K/audio/", 
    annotation_file="/home/hacene/Documents/UrbanSound8K/metadata/UrbanSound8K.csv",
    folds=[1,2, 3, 4, 5, 6, 7, 8, 9],
    )
len(train_ds)

7895

In [57]:
train_ds[0][0].shape

torch.Size([1, 107520]) 96000
torch.Size([1, 17920]) 96000
torch.Size([1, 17920]) 96000
torch.Size([1, 32000]) 96000


torch.Size([1, 32000])

In [59]:
waveform.shape

torch.Size([1, 16000])

In [98]:
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=16000,
        n_fft=512,
        hop_length=128,
        n_mels=64
    )

In [99]:
spec = mel_spectrogram(waveform)

In [100]:
spec.shape

torch.Size([1, 64, 126])

In [119]:
def make_layers():
    layers = []
    in_channels = 1
    for v in [64, "M", 128, "M", 256, 256, "M", 512, 512]:
        if v == "M":
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
    layers += [nn.AvgPool2d(kernel_size=(2, 4), stride=(2, 2))]
    return nn.Sequential(*layers)

In [120]:
vgg = make_layers()

In [121]:
y = vgg(spec.unsqueeze(0))
print(y.shape)

torch.Size([1, 512, 4, 6])
