In [1]:
import torch
import torch.nn as nn
from torch.nn import init
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, random_split

# 1. Import dataset

In [2]:
# Download the UrbanSound8K archive from Zenodo (about 5.6 GB according to the
# wget log below), unpack it in the current working directory, then delete the
# archive so only the extracted files remain on disk.
!wget https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz -O urban8k.tgz
!tar -xzf urban8k.tgz
!rm urban8k.tgz

--2023-11-29 09:46:14--  https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz
Resolving zenodo.org (zenodo.org)... 188.184.98.238, 188.185.79.172, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.184.98.238|:443... connected.
HTTP request sent, awaiting response... 301 MOVED PERMANENTLY
Location: /records/1203745/files/UrbanSound8K.tar.gz [following]
--2023-11-29 09:46:15--  https://zenodo.org/records/1203745/files/UrbanSound8K.tar.gz
Reusing existing connection to zenodo.org:443.
HTTP request sent, awaiting response... 200 OK
Length: 6023741708 (5.6G) [application/octet-stream]
Saving to: ‘urban8k.tgz’


2023-11-29 09:50:12 (24.2 MB/s) - ‘urban8k.tgz’ saved [6023741708/6023741708]



# 2. Preprocess

In [3]:
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio

class AudioUtil():
    """Stateless helpers for loading, conforming and augmenting audio clips.

    Throughout this class ``aud`` is a ``(sig, sr)`` tuple as returned by
    :meth:`open`: ``sig`` is a 2-D tensor indexed ``[channel, sample]`` and
    ``sr`` is the sample rate in Hz. Every helper is a static method, so it can
    be called either on the class or on an instance.
    """

    @staticmethod
    def open(audio_file):
        """Load ``audio_file`` with torchaudio and return ``(sig, sr)``."""
        sig, sr = torchaudio.load(audio_file)
        return (sig, sr)

    @staticmethod
    def rechannel(aud, new_channel):
        """Return ``aud`` with ``new_channel`` channels.

        If the clip already has ``new_channel`` channels it is returned as-is.
        For ``new_channel == 1`` only the first channel is kept; otherwise the
        signal is concatenated with itself along the channel axis, which
        yields stereo when the input is mono.
        """
        sig, sr = aud

        if sig.shape[0] == new_channel:
            # Nothing to do
            return aud

        if new_channel == 1:
            # Convert from stereo to mono by selecting only the first channel
            resig = sig[:1, :]
        else:
            # Convert from mono to stereo by duplicating the first channel
            resig = torch.cat([sig, sig])

        return (resig, sr)

    @staticmethod
    def resample(aud, newsr):
        """Return ``aud`` resampled to ``newsr`` Hz (unchanged if already there).

        Declared static like its siblings so that ``AudioUtil().resample(...)``
        works as well as ``AudioUtil.resample(...)``.
        """
        sig, sr = aud

        if sr == newsr:
            # Nothing to do
            return aud

        # Resample works along the last (time) axis, so one call resamples every
        # channel and the resampling kernel is built only once.
        resig = torchaudio.transforms.Resample(sr, newsr)(sig)

        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, max_ms):
        """Force the clip to last exactly ``max_ms`` milliseconds.

        Longer signals are truncated at the end. Shorter signals are padded
        with zeros, with the padding split at random between the start and the
        end of the clip.
        """
        sig, sr = aud
        num_rows, sig_len = sig.shape
        # Multiply before dividing: ``sr // 1000`` would truncate 44100 Hz to 44
        # and make a 4 s clip 176000 samples long instead of 176400.
        max_len = int(sr * max_ms // 1000)

        if sig_len > max_len:
            # Truncate the signal to the given length
            sig = sig[:, :max_len]

        elif sig_len < max_len:
            # Length of padding to add at the beginning and end of the signal
            pad_begin_len = random.randint(0, max_len - sig_len)
            pad_end_len = max_len - sig_len - pad_begin_len

            # Pad with 0s
            pad_begin = torch.zeros((num_rows, pad_begin_len))
            pad_end = torch.zeros((num_rows, pad_end_len))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sr)

    @staticmethod
    def time_shift(aud, shift_limit):
        """Circularly shift the signal by a random fraction of its length.

        The shift is drawn uniformly from ``[0, shift_limit)`` of the signal
        length; samples pushed past the end wrap around to the start of the
        same channel.
        """
        sig, sr = aud
        _, sig_len = sig.shape
        shift_amt = int(random.random() * shift_limit * sig_len)
        # Roll along the time axis only. Without ``dims`` the tensor is
        # flattened first, which moves samples from one channel into the other.
        return (sig.roll(shift_amt, dims=1), sr)

    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of ``aud`` converted to decibels."""
        sig, sr = aud
        top_db = 80

        # spec has shape [channel, n_mels, time], where channel is mono, stereo etc
        spec = transforms.MelSpectrogram(sr, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels)(sig)

        # Convert to decibels
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return (spec)

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Apply frequency and time masking to a spectrogram.

        ``n_freq_masks`` frequency masks and ``n_time_masks`` time masks are
        applied in turn. Each mask is limited to ``max_mask_pct`` of the
        corresponding axis and is filled with the mean of the unmasked input.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec



In [4]:
class SoundDS(Dataset):
    """Dataset yielding ``(augmented Mel spectrogram, class id)`` pairs.

    Rows are looked up in ``df`` by label through the ``relative_path`` and
    ``classID`` columns; the audio file is found by appending the row's
    ``relative_path`` to ``data_path``.
    """

    def __init__(self, df, data_path):
        self.data_path = str(data_path)
        self.df = df
        # Every clip is conformed to this sample rate (Hz), channel count and
        # duration (ms) before the spectrogram is computed.
        self.sr = 44100
        self.channel = 2
        self.duration = 4000
        # Upper bound, as a fraction of the clip length, for the random time shift.
        self.shift_pct = 0.4

    def __len__(self):
        """Number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, conform and augment the clip described by row ``idx``."""
        row_path = self.df.loc[idx, 'relative_path']
        label = self.df.loc[idx, 'classID']

        clip = AudioUtil.open(self.data_path + row_path)

        # Clips differ in sample rate and channel count, so bring them all to a
        # common format first; otherwise padding to a fixed duration would
        # still leave tensors of different lengths.
        clip = AudioUtil.resample(clip, self.sr)
        clip = AudioUtil.rechannel(clip, self.channel)
        clip = AudioUtil.pad_trunc(clip, self.duration)

        # Augment in the time domain, convert to a Mel spectrogram, then
        # augment again with frequency and time masks.
        clip = AudioUtil.time_shift(clip, self.shift_pct)
        mel = AudioUtil.spectro_gram(clip, n_mels=64, n_fft=1024, hop_len=None)
        masked = AudioUtil.spectro_augment(mel, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

        return masked, label

In [5]:
import pandas as pd

metadata_file = './UrbanSound8K/metadata/UrbanSound8K.csv'
audio_path = './UrbanSound8K/audio'
df = pd.read_csv(metadata_file)
# Path of each clip relative to audio_path: '/fold<fold>/<slice_file_name>'
df['relative_path'] = '/fold' + df['fold'].astype(str) + '/' + df['slice_file_name']

myds = SoundDS(df, audio_path)

# Random split of 80:20
n_items = len(myds)
n_train = round(n_items * 0.8)
n_val = n_items - n_train
# Seed the split so that the same clips land in the validation set on every
# run. With an unseeded split, a checkpoint reloaded after a kernel restart
# would be evaluated on clips it was trained on.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, val_ds = random_split(myds, [n_train, n_val], generator=split_generator)

# DataLoader
train_dl = DataLoader(train_ds, batch_size=20, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=20)

# 2. Build model

In [6]:
class AudioClassifier(nn.Module):
    """CNN classifier: four strided convolution blocks, global pooling, linear head.

    Each block is Conv2d -> ReLU -> BatchNorm2d. The layers are reachable both
    as individual attributes (``conv1`` / ``relu1`` / ``bn1`` ... ``bn4``) and,
    chained in order, through the ``conv`` Sequential.
    """

    def __init__(self):
        super().__init__()

        # (in_channels, out_channels, kernel edge, padding) of each convolution
        # block, first to fourth. Every block uses a stride of 2 on both axes.
        block_specs = (
            (2, 8, 5, 2),
            (8, 16, 3, 1),
            (16, 32, 3, 1),
            (32, 64, 3, 1),
        )

        stack = []
        for number, (c_in, c_out, edge, pad) in enumerate(block_specs, start=1):
            conv = nn.Conv2d(c_in, c_out, kernel_size=(edge, edge), stride=(2, 2), padding=(pad, pad))
            act = nn.ReLU()
            norm = nn.BatchNorm2d(c_out)

            # Kaiming initialisation for the kernel, zeros for the bias
            init.kaiming_normal_(conv.weight, a=0.1)
            conv.bias.data.zero_()

            # Register under the numbered attribute names, in conv/relu/bn order
            setattr(self, f'conv{number}', conv)
            setattr(self, f'relu{number}', act)
            setattr(self, f'bn{number}', norm)
            stack.extend((conv, act, norm))

        # Linear classifier applied to the globally pooled feature map
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=64, out_features=10)

        # Wrap the convolution blocks so forward() can run them in one call
        self.conv = nn.Sequential(*stack)

    def forward(self, x):
        """Return the 10 class scores for each item of the batch ``x``."""
        features = self.conv(x)

        # Pool every channel down to one value, then flatten to (batch, features)
        pooled = self.ap(features)
        flat = pooled.view(pooled.size(0), -1)

        return self.lin(flat)

# 3. Train

In [7]:
from tqdm import tqdm

def training(model, train_dl, n_epochs, device='cuda'):
    """Train ``model`` on ``train_dl`` for ``n_epochs`` epochs.

    Uses cross-entropy loss, Adam and a one-cycle learning-rate schedule that
    is stepped once per batch. After every epoch the mean loss and the accuracy
    are written to TensorBoard and printed, and the model weights are saved to
    ``model_<epoch>.pt`` in the current working directory.

    Args:
        model: network to train; it is moved to ``device`` in place.
        train_dl: iterable of ``(inputs, labels)`` batches that supports ``len()``.
        n_epochs: number of passes over ``train_dl``.
        device: device on which the model and the batches are placed.
    """
    # Tensorboard
    writer = SummaryWriter()

    # Move the model first so the optimizer is built on the final parameters.
    model.to(device)
    print(model)

    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer,
                                                    steps_per_epoch=len(train_dl),
                                                    max_lr=0.001,
                                                    epochs=n_epochs)
    try:
        # Repeat for each epoch
        for epoch in range(n_epochs):
            # Make sure batch norm runs in training mode even if the caller
            # handed over a model that was left in eval mode.
            model.train()

            running_loss = .0
            correct_prediction = 0
            total_prediction = 0

            # Repeat for each batch
            print(f'Epoch {epoch + 1}/{n_epochs}')
            for i, data in enumerate(tqdm(train_dl)):
                inputs, labels = data[0].to(device), data[1].to(device)

                # Normalize inputs with the statistics of the current batch
                inputs_m, inputs_s = inputs.mean(), inputs.std()
                inputs = (inputs - inputs_m) / inputs_s

                # Zero the parameter gradients
                optimizer.zero_grad()

                # forward + backward + optimize
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()  # Update weights
                scheduler.step()  # Update lr of the param groups

                # Keep stats for loss and accuracy as plain Python numbers
                running_loss += loss.item()
                _, prediction = torch.max(outputs, 1)
                correct_prediction += (prediction == labels).sum().item()
                total_prediction += prediction.shape[0]

            # Print stats at the end of the epoch
            num_batches = len(train_dl)
            avg_loss = running_loss / num_batches
            avg_acc = correct_prediction / total_prediction
            writer.add_scalar("Loss/train", avg_loss, epoch)
            writer.add_scalar("Acc/train", avg_acc, epoch)
            print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Acc: {avg_acc:.2f}')

            # Save model
            torch.save(model.state_dict(), f'model_{epoch}.pt')
    finally:
        # Flush pending events and release the log file even when training is
        # interrupted or fails part-way through.
        writer.close()

    print('Finished Training')

In [8]:
from torchsummary import summary
# Number of passes over the training set
n_epochs = 100

# Per-block convolution hyper-parameters. They carry the same values that
# AudioClassifier hard-codes; nothing visible in this notebook reads them.
out_channels = [8, 16, 32, 64]
in_channels = [2] + out_channels[:-1]
kernel_sizes = [(5, 5)] + [(3, 3)] * 3
strides = [(2, 2)] * 4
paddings = [(2, 2)] + [(1, 1)] * 3

# Use the GPU when one is available
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

model = AudioClassifier()

In [None]:
# Run the full training loop; training() saves the weights to model_<epoch>.pt
# after every epoch.
training(model, train_dl, n_epochs, device)

AudioClassifier(
  (conv1): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
  (relu1): ReLU()
  (bn1): BatchNorm2d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu2): ReLU()
  (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv3): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu3): ReLU()
  (bn3): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (relu4): ReLU()
  (bn4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (ap): AdaptiveAvgPool2d(output_size=1)
  (lin): Linear(in_features=64, out_features=10, bias=True)
  (conv): Sequential(
    (0): Conv2d(2, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
    (2): BatchNorm2d(8, eps=1e-05, mome

100%|██████████| 350/350 [03:29<00:00,  1.67it/s]


Epoch: 0, Loss: 2.16, Acc: 0.22
Epoch 2/100


100%|██████████| 350/350 [03:16<00:00,  1.78it/s]


Epoch: 1, Loss: 2.02, Acc: 0.27
Epoch 3/100


100%|██████████| 350/350 [03:11<00:00,  1.82it/s]


Epoch: 2, Loss: 1.94, Acc: 0.32
Epoch 4/100


100%|██████████| 350/350 [03:12<00:00,  1.82it/s]


Epoch: 3, Loss: 1.86, Acc: 0.35
Epoch 5/100


100%|██████████| 350/350 [03:08<00:00,  1.86it/s]


Epoch: 4, Loss: 1.78, Acc: 0.39
Epoch 6/100


100%|██████████| 350/350 [03:10<00:00,  1.84it/s]


Epoch: 5, Loss: 1.71, Acc: 0.41
Epoch 7/100


100%|██████████| 350/350 [03:10<00:00,  1.84it/s]


Epoch: 6, Loss: 1.64, Acc: 0.43
Epoch 8/100


100%|██████████| 350/350 [03:07<00:00,  1.86it/s]


Epoch: 7, Loss: 1.58, Acc: 0.46
Epoch 9/100


100%|██████████| 350/350 [03:06<00:00,  1.88it/s]


Epoch: 8, Loss: 1.51, Acc: 0.48
Epoch 10/100


100%|██████████| 350/350 [03:05<00:00,  1.89it/s]


Epoch: 9, Loss: 1.46, Acc: 0.50
Epoch 11/100


100%|██████████| 350/350 [03:06<00:00,  1.88it/s]


Epoch: 10, Loss: 1.38, Acc: 0.52
Epoch 12/100


100%|██████████| 350/350 [03:02<00:00,  1.92it/s]


Epoch: 11, Loss: 1.33, Acc: 0.54
Epoch 13/100


100%|██████████| 350/350 [03:02<00:00,  1.91it/s]


Epoch: 12, Loss: 1.25, Acc: 0.57
Epoch 14/100


100%|██████████| 350/350 [03:06<00:00,  1.88it/s]


Epoch: 13, Loss: 1.18, Acc: 0.60
Epoch 15/100


100%|██████████| 350/350 [03:02<00:00,  1.92it/s]


Epoch: 14, Loss: 1.12, Acc: 0.62
Epoch 16/100


100%|██████████| 350/350 [03:01<00:00,  1.93it/s]


Epoch: 15, Loss: 1.06, Acc: 0.64
Epoch 17/100


100%|██████████| 350/350 [03:03<00:00,  1.91it/s]


Epoch: 16, Loss: 1.03, Acc: 0.66
Epoch 18/100


100%|██████████| 350/350 [02:58<00:00,  1.96it/s]


Epoch: 17, Loss: 0.99, Acc: 0.67
Epoch 19/100


100%|██████████| 350/350 [02:55<00:00,  2.00it/s]


Epoch: 18, Loss: 0.93, Acc: 0.69
Epoch 20/100


100%|██████████| 350/350 [02:54<00:00,  2.00it/s]


Epoch: 19, Loss: 0.89, Acc: 0.70
Epoch 21/100


100%|██████████| 350/350 [02:54<00:00,  2.00it/s]


Epoch: 20, Loss: 0.87, Acc: 0.71
Epoch 22/100


100%|██████████| 350/350 [02:52<00:00,  2.02it/s]


Epoch: 21, Loss: 0.84, Acc: 0.72
Epoch 23/100


100%|██████████| 350/350 [02:55<00:00,  1.99it/s]


Epoch: 22, Loss: 0.79, Acc: 0.73
Epoch 24/100


100%|██████████| 350/350 [02:57<00:00,  1.98it/s]


Epoch: 23, Loss: 0.77, Acc: 0.74
Epoch 25/100


100%|██████████| 350/350 [02:57<00:00,  1.98it/s]


Epoch: 24, Loss: 0.73, Acc: 0.76
Epoch 26/100


100%|██████████| 350/350 [02:55<00:00,  2.00it/s]


Epoch: 25, Loss: 0.70, Acc: 0.77
Epoch 27/100


100%|██████████| 350/350 [02:55<00:00,  1.99it/s]


Epoch: 26, Loss: 0.67, Acc: 0.78
Epoch 28/100


100%|██████████| 350/350 [02:54<00:00,  2.01it/s]


Epoch: 27, Loss: 0.65, Acc: 0.78
Epoch 29/100


100%|██████████| 350/350 [02:54<00:00,  2.00it/s]


Epoch: 28, Loss: 0.63, Acc: 0.79
Epoch 30/100


100%|██████████| 350/350 [02:56<00:00,  1.98it/s]


Epoch: 29, Loss: 0.60, Acc: 0.81
Epoch 31/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 30, Loss: 0.60, Acc: 0.80
Epoch 32/100


100%|██████████| 350/350 [02:54<00:00,  2.01it/s]


Epoch: 31, Loss: 0.58, Acc: 0.80
Epoch 33/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 32, Loss: 0.56, Acc: 0.82
Epoch 34/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 33, Loss: 0.56, Acc: 0.81
Epoch 35/100


100%|██████████| 350/350 [02:54<00:00,  2.01it/s]


Epoch: 34, Loss: 0.53, Acc: 0.82
Epoch 36/100


100%|██████████| 350/350 [02:52<00:00,  2.03it/s]


Epoch: 35, Loss: 0.52, Acc: 0.83
Epoch 37/100


100%|██████████| 350/350 [02:55<00:00,  2.00it/s]


Epoch: 36, Loss: 0.50, Acc: 0.83
Epoch 38/100


100%|██████████| 350/350 [02:57<00:00,  1.97it/s]


Epoch: 37, Loss: 0.50, Acc: 0.83
Epoch 39/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 38, Loss: 0.50, Acc: 0.83
Epoch 40/100


100%|██████████| 350/350 [02:52<00:00,  2.03it/s]


Epoch: 39, Loss: 0.46, Acc: 0.84
Epoch 41/100


100%|██████████| 350/350 [02:50<00:00,  2.06it/s]


Epoch: 40, Loss: 0.47, Acc: 0.85
Epoch 42/100


100%|██████████| 350/350 [02:49<00:00,  2.06it/s]


Epoch: 41, Loss: 0.47, Acc: 0.84
Epoch 43/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 42, Loss: 0.46, Acc: 0.84
Epoch 44/100


100%|██████████| 350/350 [02:52<00:00,  2.03it/s]


Epoch: 43, Loss: 0.46, Acc: 0.85
Epoch 45/100


100%|██████████| 350/350 [02:56<00:00,  1.98it/s]


Epoch: 44, Loss: 0.44, Acc: 0.86
Epoch 46/100


100%|██████████| 350/350 [02:56<00:00,  1.98it/s]


Epoch: 45, Loss: 0.44, Acc: 0.86
Epoch 47/100


100%|██████████| 350/350 [02:54<00:00,  2.01it/s]


Epoch: 46, Loss: 0.42, Acc: 0.86
Epoch 48/100


100%|██████████| 350/350 [02:55<00:00,  2.00it/s]


Epoch: 47, Loss: 0.42, Acc: 0.86
Epoch 49/100


100%|██████████| 350/350 [02:54<00:00,  2.00it/s]


Epoch: 48, Loss: 0.42, Acc: 0.86
Epoch 50/100


100%|██████████| 350/350 [02:51<00:00,  2.04it/s]


Epoch: 49, Loss: 0.39, Acc: 0.87
Epoch 51/100


100%|██████████| 350/350 [02:52<00:00,  2.02it/s]


Epoch: 50, Loss: 0.42, Acc: 0.86
Epoch 52/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 51, Loss: 0.39, Acc: 0.87
Epoch 53/100


100%|██████████| 350/350 [02:53<00:00,  2.02it/s]


Epoch: 52, Loss: 0.38, Acc: 0.87
Epoch 54/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 53, Loss: 0.39, Acc: 0.87
Epoch 55/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 54, Loss: 0.38, Acc: 0.88
Epoch 56/100


100%|██████████| 350/350 [02:50<00:00,  2.06it/s]


Epoch: 55, Loss: 0.37, Acc: 0.87
Epoch 57/100


100%|██████████| 350/350 [02:51<00:00,  2.04it/s]


Epoch: 56, Loss: 0.37, Acc: 0.88
Epoch 58/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 57, Loss: 0.36, Acc: 0.88
Epoch 59/100


100%|██████████| 350/350 [02:49<00:00,  2.06it/s]


Epoch: 58, Loss: 0.36, Acc: 0.88
Epoch 60/100


100%|██████████| 350/350 [02:49<00:00,  2.06it/s]


Epoch: 59, Loss: 0.35, Acc: 0.88
Epoch 61/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 60, Loss: 0.33, Acc: 0.89
Epoch 62/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 61, Loss: 0.33, Acc: 0.89
Epoch 63/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 62, Loss: 0.33, Acc: 0.89
Epoch 64/100


100%|██████████| 350/350 [02:50<00:00,  2.05it/s]


Epoch: 63, Loss: 0.33, Acc: 0.89
Epoch 65/100


100%|██████████| 350/350 [02:56<00:00,  1.98it/s]


Epoch: 64, Loss: 0.33, Acc: 0.89
Epoch 66/100


100%|██████████| 350/350 [02:55<00:00,  2.00it/s]


Epoch: 65, Loss: 0.31, Acc: 0.89
Epoch 67/100


100%|██████████| 350/350 [02:54<00:00,  2.00it/s]


Epoch: 66, Loss: 0.33, Acc: 0.89
Epoch 68/100


 49%|████▊     | 170/350 [01:24<03:28,  1.16s/it]

# 4. Test

In [None]:
def inference(model, test_dl, device='cpu'):
    """Measure the classification accuracy of ``model`` on ``test_dl``.

    The model is switched to eval mode for the duration of the call and put
    back into its previous mode afterwards. Each batch is normalized with its
    own mean and standard deviation, as in training.

    Args:
        model: network to evaluate; it must already live on ``device``.
        test_dl: iterable of ``(inputs, labels)`` batches.
        device: device the batches are moved to.

    Returns:
        The fraction of correctly classified items as a float, or ``0.0`` when
        ``test_dl`` yields no items. The same figure is printed.
    """
    correct_prediction = 0
    total_prediction = 0

    # Evaluate with batch-norm running statistics whatever mode the caller
    # left the model in, then restore that mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in test_dl:
                inputs, labels = data[0].to(device), data[1].to(device)

                # Normalize inputs with the statistics of the current batch
                inputs_m, inputs_s = inputs.mean(), inputs.std()
                inputs = (inputs - inputs_m) / inputs_s

                outputs = model(inputs)
                _, prediction = torch.max(outputs, 1)

                correct_prediction += (prediction == labels).sum().item()
                total_prediction += prediction.shape[0]
    finally:
        model.train(was_training)

    # An empty loader would otherwise raise ZeroDivisionError
    acc = correct_prediction / total_prediction if total_prediction else 0.0
    print(f'Acc: {acc:.2f}, Total items: {total_prediction}')
    return acc

In [None]:
model_inf = AudioClassifier()
model_inf = model_inf.to(device)
# NOTE(review): training() writes its checkpoints as model_<epoch>.pt in the
# working directory, so '/content/model.pt' has to be provided separately
# (e.g. a renamed copy of one of them) - confirm which checkpoint is meant.
# map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
model_inf.load_state_dict(torch.load('/content/model.pt', map_location=device))
model_inf.eval()
inference(model_inf, val_dl, device)

In [None]:
# Module.eval() returns the module itself, so this prints the architecture of
# the loaded model (and leaves it in eval mode).
print(model_inf.eval())