# In [3]:
import math
import random
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
import torchaudio
from torch.utils.data import DataLoader, Dataset, random_split
from torchaudio import transforms
from pathlib import Path
from IPython.display import Audio

def install_libraries():
    """Install the third-party packages this script depends on.

    Runs ``pip`` as a module of the interpreter that is executing this code,
    so the packages land in the active environment. Unlike the IPython
    ``!pip`` shell escape, this is plain Python and also works outside a
    notebook. A failing ``pip`` run is reported on stderr but does not raise.
    """
    import subprocess
    import sys

    command = [
        sys.executable, "-m", "pip", "install",
        "pandas", "torch", "torchaudio", "torchvision",
    ]
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        print(
            f"pip exited with status {completed.returncode}; "
            "required packages may be missing.",
            file=sys.stderr,
        )

def prepare_metadata(base_directory: Path):
    """Load the UrbanSound8K metadata CSV and reduce it to path + label.

    Reads ``<base_directory>/metadata/UrbanSound8K.csv`` and builds a
    ``relative_path`` column of the form ``/fold<fold>/<slice_file_name>``.

    Returns:
        A DataFrame with exactly two columns: ``relative_path`` and ``classID``.
    """
    csv_path = base_directory / 'metadata' / 'UrbanSound8K.csv'
    records = pd.read_csv(csv_path)

    # The leading slash is kept so the value can be appended directly to the
    # audio directory string.
    fold_part = records['fold'].astype(str)
    file_part = records['slice_file_name'].astype(str)
    records['relative_path'] = '/fold' + fold_part + '/' + file_part

    return records[['relative_path', 'classID']]

class AudioAugment:
    """Stateless helpers for loading, normalising and augmenting audio clips.

    Every helper that takes ``aud`` expects a ``(signal, sampling_rate)`` pair
    as returned by :meth:`open`; the signal is indexed as ``[channel, sample]``.
    """

    @staticmethod
    def open(wav_file):
        """Load an audio file and return ``(signal, sampling_rate)``."""
        sig, sampling_rate = torchaudio.load(wav_file)
        return (sig, sampling_rate)

    @staticmethod
    def rechannel(aud, new_channel):
        """Convert the audio to the requested number of channels.

        Returns ``aud`` itself when it already has ``new_channel`` channels.
        For ``new_channel == 1`` only the first channel is kept; otherwise the
        signal is stacked on top of itself, which turns mono into stereo.
        """
        sig, sampling_rate = aud

        if sig.shape[0] == new_channel:
            return aud

        if new_channel == 1:
            # Keep only the first channel.
            resig = sig[:1, :]
        else:
            # Duplicate the existing channel(s): mono -> stereo.
            resig = torch.cat([sig, sig])

        return (resig, sampling_rate)

    @staticmethod
    def resample(aud, newsr):
        """Resample the audio to ``newsr`` Hz.

        Returns ``aud`` itself when it is already at the requested rate.
        """
        sig, sampling_rate = aud

        if sampling_rate == newsr:
            return aud

        # Resample operates on the last (time) dimension, so all channels can
        # be converted in a single call.
        resig = torchaudio.transforms.Resample(sampling_rate, newsr)(sig)

        return (resig, newsr)

    @staticmethod
    def pad_trunc(aud, maximum_audio_length):
        """Truncate or zero-pad the signal to ``maximum_audio_length`` ms.

        Longer signals are cut at the end. Shorter signals get zero padding
        split at a random point between the beginning and the end.
        """
        sig, sampling_rate = aud
        num_rows, input_signal_length = sig.shape
        # Multiply before dividing: ``sampling_rate // 1000`` would drop the
        # sub-kHz part of the rate (44100 Hz would be treated as 44000 Hz).
        maximum_length = sampling_rate * maximum_audio_length // 1000

        if input_signal_length > maximum_length:
            # Reduce the signal to the specified length.
            sig = sig[:, :maximum_length]

        elif input_signal_length < maximum_length:
            # Padding length to be added at the beginning and end of the signal
            padding_begin_length = random.randint(0, maximum_length - input_signal_length)
            padding_end_length = maximum_length - input_signal_length - padding_begin_length

            # Pad with 0s, matching the signal's dtype and device.
            pad_begin = sig.new_zeros((num_rows, padding_begin_length))
            pad_end = sig.new_zeros((num_rows, padding_end_length))

            sig = torch.cat((pad_begin, sig, pad_end), 1)

        return (sig, sampling_rate)

    @staticmethod
    def time_shift(aud, shift_limit):
        """Shift the signal right by a random fraction of its length.

        The shift is at most ``shift_limit`` times the signal length. Samples
        pushed past the end wrap around to the beginning of the same channel.
        """
        sig, sampling_rate = aud
        _, input_signal_length = sig.shape
        shift_amt = int(random.random() * shift_limit * input_signal_length)
        # Roll along the time axis only. Without ``dims`` the tensor is
        # flattened first, which leaks samples from one channel into the next.
        return (sig.roll(shift_amt, dims=1), sampling_rate)

    @staticmethod
    def spectro_gram(aud, n_mels=64, n_fft=1024, hop_len=None):
        """Return the Mel spectrogram of the audio, converted to decibels."""
        sig, sampling_rate = aud
        top_db = 80

        # The shape of spec is [channel, n_mels, time].
        spec = transforms.MelSpectrogram(
            sampling_rate, n_fft=n_fft, hop_length=hop_len, n_mels=n_mels
        )(sig)

        # Decibel conversion
        spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
        return spec

    @staticmethod
    def spectro_augment(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
        """Mask random frequency and time bands of the spectrogram.

        Applies ``n_freq_masks`` frequency masks and ``n_time_masks`` time
        masks, each sized by ``max_mask_pct`` of the corresponding dimension.
        Masked sections are filled with the spectrogram's mean value. The aim
        is to reduce overfitting.
        """
        _, n_mels, n_steps = spec.shape
        mask_value = spec.mean()
        aug_spec = spec

        freq_mask_param = max_mask_pct * n_mels
        for _ in range(n_freq_masks):
            aug_spec = transforms.FrequencyMasking(freq_mask_param)(
                aug_spec, mask_value
            )

        time_mask_param = max_mask_pct * n_steps
        for _ in range(n_time_masks):
            aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

        return aug_spec

class SoundDS(Dataset):
    """Dataset yielding ``(augmented Mel spectrogram, class id)`` pairs.

    Args:
        dataframe: Table with ``relative_path`` and ``classID`` columns.
        audio_path: Directory the relative paths are appended to.
    """

    def __init__(self, dataframe, audio_path):
        self.dataframe = dataframe
        self.audio_path = str(audio_path)
        self.duration = 4000          # clip length in milliseconds
        self.sampling_rate = 44100    # target sample rate in Hz
        self.channel = 2              # target number of channels
        self.shift_pct = 0.4          # upper bound of the random time shift

    def __len__(self):
        """Return the number of items in the dataset."""
        return len(self.dataframe)

    def _locate(self, idx):
        """Return ``(audio_file, class_id)`` for the row at position ``idx``.

        Rows are looked up by position rather than by index label, so the
        dataset also works with a filtered or re-indexed DataFrame.
        """
        relative_path = self.dataframe["relative_path"].iloc[idx]
        class_id = self.dataframe["classID"].iloc[idx]
        # relative_path starts with '/', so plain string concatenation yields
        # the full file path.
        return self.audio_path + relative_path, class_id

    def __getitem__(self, idx):
        """Load, normalise and augment the ``idx``-th clip."""
        audio_file, class_id = self._locate(idx)

        aud = AudioAugment.open(audio_file)
        # Clips differ in sample rate and channel count. Bring them all to the
        # same rate and channel layout first; otherwise pad_trunc would
        # produce arrays of different lengths for the same duration.
        reaud = AudioAugment.resample(aud, self.sampling_rate)
        rechan = AudioAugment.rechannel(reaud, self.channel)

        dur_aud = AudioAugment.pad_trunc(rechan, self.duration)
        shift_aud = AudioAugment.time_shift(dur_aud, self.shift_pct)
        sgram = AudioAugment.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
        aug_sgram = AudioAugment.spectro_augment(
            sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2
        )

        return aug_sgram, class_id

def create_data_loaders(dataframe, audio_path, *, batch_size=16, train_fraction=0.8):
    """Build shuffled training and ordered validation loaders.

    Wraps the metadata in a ``SoundDS`` and splits it at random.

    Args:
        dataframe: Metadata table passed to ``SoundDS``.
        audio_path: Audio directory passed to ``SoundDS``.
        batch_size: Number of items per batch for both loaders.
        train_fraction: Share of the items assigned to the training split;
            the rest goes to validation.

    Returns:
        ``(train_dataloader, validation_dataloader)``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction!r}")

    myds = SoundDS(dataframe, audio_path)

    # NOTE: both splits share the same SoundDS instance, so validation items
    # go through the same random shift/masking as training items.
    num_items = len(myds)
    num_train = round(num_items * train_fraction)
    num_val = num_items - num_train
    train_ds, val_ds = random_split(myds, [num_train, num_val])

    train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    validation_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    return train_dataloader, validation_dataloader

class AudioClassifier(nn.Module):
    """Four-stage convolutional classifier for 2-channel spectrogram input.

    Each stage is Conv2d (stride 2) -> ReLU -> BatchNorm2d. The channel count
    grows 2 -> 8 -> 16 -> 32 -> 64. The result is average-pooled to one value
    per channel and mapped to 10 output scores by a linear layer.
    """

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel, pad):
        """Create one ``(conv, relu, batch_norm)`` stage.

        The convolution weight gets Kaiming-normal initialisation and the
        bias is zeroed.
        """
        conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=(kernel, kernel),
            stride=(2, 2),
            padding=(pad, pad),
        )
        relu = nn.ReLU()
        norm = nn.BatchNorm2d(out_channels)
        init.kaiming_normal_(conv.weight, a=0.1)
        conv.bias.data.zero_()
        return conv, relu, norm

    def __init__(self):
        super().__init__()

        # Stage 1: 2 -> 8 channels, 5x5 kernel.
        (
            self.convolutional_layer1,
            self.rectified_linear_unit1,
            self.batch_normalization1,
        ) = self._conv_block(2, 8, 5, 2)

        # Stage 2: 8 -> 16 channels, 3x3 kernel.
        (
            self.convolutional_layer2,
            self.rectified_linear_unit2,
            self.batch_normalization2,
        ) = self._conv_block(8, 16, 3, 1)

        # Stage 3: 16 -> 32 channels, 3x3 kernel.
        (
            self.convolutional_layer3,
            self.rectified_linear_unit3,
            self.batch_normalization3,
        ) = self._conv_block(16, 32, 3, 1)

        # Stage 4: 32 -> 64 channels, 3x3 kernel.
        (
            self.convolutional_layer4,
            self.rectified_linear_unit4,
            self.batch_normalization4,
        ) = self._conv_block(32, 64, 3, 1)

        # Classifier head.
        self.adaptive_average_pooling = nn.AdaptiveAvgPool2d(output_size=1)
        self.linear = nn.Linear(in_features=64, out_features=10)

        # Chain all stages so forward() can run them in one call.
        self.convolutional_layer_container = nn.Sequential(
            self.convolutional_layer1,
            self.rectified_linear_unit1,
            self.batch_normalization1,
            self.convolutional_layer2,
            self.rectified_linear_unit2,
            self.batch_normalization2,
            self.convolutional_layer3,
            self.rectified_linear_unit3,
            self.batch_normalization3,
            self.convolutional_layer4,
            self.rectified_linear_unit4,
            self.batch_normalization4,
        )

    def forward(self, x):
        """Return one row of 10 class scores per batch item."""
        features = self.convolutional_layer_container(x)

        # Pool to one value per channel, then flatten for the linear layer.
        pooled = self.adaptive_average_pooling(features)
        flat = pooled.view(pooled.shape[0], -1)

        return self.linear(flat)

def train_model(model, train_dataloader, number_of_epochs, device):
    """Train ``model`` with Adam, cross-entropy loss and a one-cycle schedule.

    Prints the average loss and the accuracy after every epoch.

    Args:
        model: Network mapping a batch of inputs to per-class scores.
        train_dataloader: Sized iterable of ``(inputs, labels)`` batches.
        number_of_epochs: Number of passes over ``train_dataloader``.
        device: Device the batches are moved to; the model is expected to
            already live there.
    """
    # Optimizer, Loss Function, and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=0.001,
        steps_per_epoch=int(len(train_dataloader)),
        epochs=number_of_epochs,
        anneal_strategy="linear",
    )

    # Make sure layers such as BatchNorm are in training mode, even if the
    # model was previously switched to eval mode.
    model.train()

    for epoch in range(number_of_epochs):
        accumlated_loss = 0.0
        number_correct = 0
        prediction_total = 0

        for data in train_dataloader:
            # Place the input features and target labels on the device (GPU or CPU).
            input_features, labels_of_batch = data[0].to(device), data[1].to(device)

            # Standardise the batch to zero mean and unit variance.
            inputs_mean_value, inputs_standard_deviation = input_features.mean(), input_features.std()
            input_features = (input_features - inputs_mean_value) / inputs_standard_deviation

            # Clear gradients left over from the previous step.
            optimizer.zero_grad()

            # forward + backward + optimize
            predicted_outputs = model(input_features)
            calculated_loss = criterion(predicted_outputs, labels_of_batch)
            calculated_loss.backward()
            optimizer.step()
            # The one-cycle schedule advances once per batch.
            scheduler.step()

            # Keep track of loss and accuracy statistics.
            accumlated_loss += calculated_loss.item()

            # Get the class with the highest predicted score.
            _, batch_predicted_classid = torch.max(predicted_outputs, 1)
            # Count the predictions that match the target label.
            number_correct += (batch_predicted_classid == labels_of_batch).sum().item()
            prediction_total += batch_predicted_classid.shape[0]

        # At the end of the epoch, print the statistics.
        num_batches = len(train_dataloader)
        avg_loss = accumlated_loss / num_batches
        accuracy = number_correct / prediction_total
        print(f"Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {accuracy:.2f}")

    print("Finished Training")

def test_inference(model, validation_dataloader, device):
    """Evaluate ``model`` on the validation batches and print its accuracy.

    The model is put into eval mode for the duration of the call, so layers
    such as BatchNorm use their running statistics and do not update them.
    The previous train/eval mode is restored afterwards.

    Args:
        model: Network mapping a batch of inputs to per-class scores.
        validation_dataloader: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.

    Returns:
        The fraction of correctly classified items, or ``0.0`` when the
        loader yields no items.
    """
    number_corrected = 0
    prediction_total = 0

    was_training = model.training
    model.eval()
    try:
        # Turn off gradient tracking.
        with torch.no_grad():
            for data in validation_dataloader:
                # Place the input features and target labels on the device (GPU or CPU).
                input_features, labels_of_batch = data[0].to(device), data[1].to(device)

                # Standardise the batch to zero mean and unit variance.
                inputs_mean_value, inputs_standard_deviation = input_features.mean(), input_features.std()
                input_features = (input_features - inputs_mean_value) / inputs_standard_deviation

                # Get predicted outputs
                predicted_outputs = model(input_features)

                # Get the class with the highest predicted score.
                _, batch_predicted_classid = torch.max(predicted_outputs, 1)
                # Count the predictions that match the target label.
                number_corrected += (batch_predicted_classid == labels_of_batch).sum().item()
                prediction_total += batch_predicted_classid.shape[0]
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    # An empty loader would otherwise divide by zero.
    accuracy = number_corrected / prediction_total if prediction_total else 0.0
    print(f"Accuracy: {accuracy:.2f}, Total items: {prediction_total}")
    return accuracy


# The name starts with "test_", so pytest would try to collect this helper as
# a test whenever it is imported into a test module. Mark it as not a test.
test_inference.__test__ = False

def main():
    """Run the whole pipeline: install, load data, train, then evaluate.

    The dataset is expected in an ``UrbanSound8K`` directory under the
    current working directory. Training runs for two epochs, on the first
    CUDA device when one is available and on the CPU otherwise.
    """
    install_libraries()

    dataset_root = Path.cwd() / 'UrbanSound8K'
    clips_directory = dataset_root / 'audio'
    train_dataloader, validation_dataloader = create_data_loaders(
        prepare_metadata(dataset_root), clips_directory
    )

    # Build the network, then move it to the selected device.
    classifier = AudioClassifier()
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_name)
    classifier = classifier.to(device)

    epoch_count = 2
    train_model(classifier, train_dataloader, epoch_count, device)
    test_inference(classifier, validation_dataloader, device)

if __name__ == "__main__":
    main()

# Output:
# Epoch: 0, Loss: 1.88, Accuracy: 0.33
# Epoch: 1, Loss: 1.54, Accuracy: 0.46
# Finished Training
# Accuracy: 0.48, Total items: 1746
