In [4]:
from google.colab import drive

In [5]:
import pandas as pd
import numpy as np
import matplotlib as plt
import re
import shutil
import os
import math, random
import torch
import torchaudio
from torchaudio import transforms
from IPython.display import Audio
from fastcore.utils import gt
from pathlib import Path

In [6]:
drive.mount('/content/drive')

Mounted at /content/drive


### Talkapalooza - TL BL SS 23

# A custom Pytorch CNN:

In hopes of getting better results and to better understand the purviews of deep learning, I have researched examples for the architecture of convolutional neural networks used in audio recognition.

## Preparing audio files and data frames

As per my previous experiences, I move ahead with a dataset filtering out recordings that have less than 5 recordings.

In [7]:
src = '/content/drive/MyDrive/talkapalooza/labelled_audio/'
audio_data = Path(src).ls()
audio_data

(#471) [Path('/content/drive/MyDrive/talkapalooza/labelled_audio/yellow'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/big'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/nose'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/where'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/hear'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/hair'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/vacuum'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/bedroom'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/picture'),Path('/content/drive/MyDrive/talkapalooza/labelled_audio/above')...]

In [8]:
filtered_wordbank_df = pd.read_csv('/content/drive/MyDrive/talkapalooza/data/wordbank_crossref_filt3.csv')

In [9]:
filtered_words_5 = filtered_wordbank_df[filtered_wordbank_df["count"] >= 5]["word"]
filtered_paths_5 = list()

In [10]:
for p in audio_data:
  if p.name.split('/')[-1] in list(filtered_words_5):
    filtered_paths_5.append(p)

In [11]:
len(filtered_paths_5)

53

### Creating the meta and classes dataframes

The meta dataframe stores the paths and class ID of each recording. The class dataframe has information on which word each class ID represents.

In [12]:
meta_df = pd.DataFrame(columns={'path': [], 'class_id': []})

In [13]:
class_df = pd.DataFrame(columns={'class_id': [], 'class_name': []})

In [14]:
class_id = 0
meta_id = 0
for p in filtered_paths_5:
  class_df.loc[class_id] = class_id, p.name
  for path, subdirs, files in os.walk(p):
    i = 0
    for file in files:
        p_src = os.path.abspath(Path(path, file))
        p_dst ='/content/drive/MyDrive/talkapalooza/labelled_audio_filter5/' + str(class_id) + '-' + str(i) + '.ogg'
        meta_df.loc[meta_id] = p_dst, class_id
        # shutil.copy(p_src, p_dst)
        i += 1
        meta_id += 1
  class_id += 1

In [15]:
class_df.head()

Unnamed: 0,class_id,class_name
0,0,big
1,1,where
2,2,foot
3,3,with
4,4,stick


In [16]:
meta_df.head()

Unnamed: 0,path,class_id
0,/content/drive/MyDrive/talkapalooza/labelled_a...,0
1,/content/drive/MyDrive/talkapalooza/labelled_a...,0
2,/content/drive/MyDrive/talkapalooza/labelled_a...,0
3,/content/drive/MyDrive/talkapalooza/labelled_a...,0
4,/content/drive/MyDrive/talkapalooza/labelled_a...,0


## Setting up audio preprocessing

### Preparing AudioUtil class with methods to load, convert audio and more

Adapted from [Audio Deep Learning by Ketan Doshi](https://towardsdatascience.com/audio-deep-learning-made-simple-sound-classification-step-by-step-cebc936bbe5), including the creation of an AudioUtil class that houses several methods that allow for the transformation and augmentation of loaded audio data.

In [17]:
class AudioUtil():
  # Open audio file using torchaudio
  @staticmethod
  def open(af):
    x, sr = torchaudio.load(af)
    return (x, sr)

  # Convert to desired num of channels
  @staticmethod
  def rechannel(aud, new_channel):
    x, sr = aud

    if (x.shape[0] == new_channel):
      return aud

    if (new_channel == 1):
      rex = x[:1, :]
    else:
      rex = torch.cat([x, x])

    return ((rex, sr))

  # Convert to desired sampling rate
  @staticmethod
  def resample(aud, newsr):
    x, sr = aud

    if (sr == newsr):
      return aud

    num_channels = x.shape[0]
    rex = torchaudio.transforms.Resample(sr, newsr)(x[:1,:])
    if (num_channels > 1):
      retwo = torchaudio.transforms.Resample(sr, newsr)(x[1:,:])
      rex = torch.cat([rex, retwo])

    return ((rex, newsr))

  # Resize audio to desired size by either truncating or padding with silence
  @staticmethod
  def resize(aud, max_ms):
    x, sr = aud
    num_rows, x_len = x.shape
    max_len = sr//1000 * max_ms

    if (x_len > max_len):
      x = x[:,:max_len]

    elif (x_len < max_len):
      pad_begin_len = random.randint(0, max_len - x_len)
      pad_end_len = max_len - x_len - pad_begin_len

      pad_begin = torch.zeros((num_rows, pad_begin_len))
      pad_end = torch.zeros((num_rows, pad_end_len))

      x = torch.cat((pad_begin, x, pad_end), 1)

    return (x, sr)

  # Data augmentation: shifting audio left or right randomly
  @staticmethod
  def time_shift(aud, max_shift):
    x,sr = aud
    _, x_len = x.shape
    shift_amt = int(random.random() * max_shift * x_len)
    return (x.roll(shift_amt), sr)

  # Create a spectrogram
  @staticmethod
  def to_spectro(aud, n_mels=64, n_fft=1024, hop_len=None):
    x,sr = aud
    top_db = 80

    spec = transforms.MFCC(sr, melkwargs={
        'n_fft': n_fft,
        'hop_length': hop_len,
        'n_mels': n_mels})(x)

    spec = transforms.AmplitudeToDB(top_db=top_db)(spec)
    return (spec)

  # Data augmentation: masking time||frequency with vert/horiz bars
  @staticmethod
  def mask_spectro(spec, max_mask_pct=0.1, n_freq_masks=1, n_time_masks=1):
    _, n_mels, n_steps = spec.shape
    mask_value = spec.mean()
    aug_spec = spec

    freq_mask_param = max_mask_pct * n_mels
    for _ in range(n_freq_masks):
      aug_spec = transforms.FrequencyMasking(freq_mask_param)(aug_spec, mask_value)

    time_mask_param = max_mask_pct * n_steps
    for _ in range(n_time_masks):
      aug_spec = transforms.TimeMasking(time_mask_param)(aug_spec, mask_value)

    return aug_spec

### Create custom data loader

In [18]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio

In [19]:
class WordDS(Dataset):
  def __init__(self, df):
    self.df = df
    # self.data_path = str(data_path)
    self.duration = 2000
    self.sr = 44100
    self.channel = 2
    self.shift_pct = 0.4

  # Number of items
  def __len__(self):
    return len(self.df)

  # Get item at index
  def __getitem__(self, idx):
    audio_file = self.df.loc[idx, 'path']
    # audio_file = self.data_path + self.df.loc[idx, 'path']
    class_id = self.df.loc[idx, 'class_id']

    aud = AudioUtil.open(audio_file)
    reaud = AudioUtil.resample(aud, self.sr)
    rechan = AudioUtil.rechannel(reaud, self.channel)

    dur_aud = AudioUtil.resize(rechan, self.duration)
    shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
    sgram = AudioUtil.to_spectro(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    aug_sgram = AudioUtil.mask_spectro(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

    return aug_sgram, class_id

### Split data into batches

In [20]:
from torch.utils.data import random_split

In [21]:
words_ds = WordDS(meta_df)

# Random split of 80:20 between training and validation
num_items = len(words_ds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(words_ds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

## Creating the model

In [23]:
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init

class AudioClassifier (nn.Module):
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 16, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Third Convolution Block
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Fourth Convolution Block
        self.conv4 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(128)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=128, out_features=53)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)

    # Forward pass computations
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

## Training

In [25]:
def training(model, train_dl, num_epochs):
  # Loss Function, Optimizer and Scheduler
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),lr=0.03)
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.03,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')

  # Repeat for each epoch
  for epoch in range(num_epochs):
    running_loss = 0.0
    correct_prediction = 0
    total_prediction = 0

    # Repeat for each batch in the training set
    for i, data in enumerate(train_dl):
        # Get the input features and target labels, and put them on the GPU
        inputs, labels = data[0].to(device), data[1].to(device)

        # Normalize the inputs
        inputs_m, inputs_s = inputs.mean(), inputs.std()
        inputs = (inputs - inputs_m) / inputs_s

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Keep stats for Loss and Accuracy
        running_loss += loss.item()

        # Get the predicted class with the highest score
        _, prediction = torch.max(outputs,1)
        # Count of predictions that matched the target label
        correct_prediction += (prediction == labels).sum().item()
        total_prediction += prediction.shape[0]

    # Print stats at the end of the epoch
    num_batches = len(train_dl)
    avg_loss = running_loss / num_batches
    acc = correct_prediction/total_prediction
    print(f'Epoch: {epoch}, Loss: {avg_loss:.2f}, Accuracy: {acc:.2f}')

  print('Finished Training')

In [None]:
num_epochs=20
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 4.09, Accuracy: 0.02
Epoch: 1, Loss: 4.14, Accuracy: 0.02
Epoch: 2, Loss: 4.07, Accuracy: 0.02
Epoch: 3, Loss: 4.15, Accuracy: 0.02
Epoch: 4, Loss: 4.16, Accuracy: 0.02
Epoch: 5, Loss: 4.13, Accuracy: 0.03
Epoch: 6, Loss: 4.27, Accuracy: 0.03
Epoch: 7, Loss: 4.09, Accuracy: 0.04
Epoch: 8, Loss: 4.00, Accuracy: 0.04
Epoch: 9, Loss: 3.87, Accuracy: 0.04
Epoch: 10, Loss: 3.82, Accuracy: 0.05
Epoch: 11, Loss: 3.72, Accuracy: 0.07
Epoch: 12, Loss: 3.69, Accuracy: 0.05
Epoch: 13, Loss: 3.58, Accuracy: 0.08
Epoch: 14, Loss: 3.57, Accuracy: 0.09
Epoch: 15, Loss: 3.54, Accuracy: 0.09
Epoch: 16, Loss: 3.41, Accuracy: 0.12
Epoch: 17, Loss: 3.57, Accuracy: 0.08
Epoch: 18, Loss: 3.37, Accuracy: 0.12
Epoch: 19, Loss: 3.40, Accuracy: 0.14
Finished Training


### Modifying Kernel size

I tried a variety of setups to gauge the impact of kernel size, stride etc. at different layers of the CNN.

In [None]:
class AudioClassifier (nn.Module):
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 16, kernel_size=(20, 20), stride=(6, 6), padding=(10, 10))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Third Convolution Block
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Fourth Convolution Block
        self.conv4 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(128)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=128, out_features=53)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)

    # Forward pass computations
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cpu')

In [None]:
num_epochs=20
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 4.02, Accuracy: 0.03
Epoch: 1, Loss: 3.98, Accuracy: 0.01
Epoch: 2, Loss: 3.87, Accuracy: 0.05
Epoch: 3, Loss: 3.76, Accuracy: 0.05
Epoch: 4, Loss: 3.62, Accuracy: 0.08
Epoch: 5, Loss: 3.56, Accuracy: 0.08
Epoch: 6, Loss: 3.47, Accuracy: 0.08
Epoch: 7, Loss: 3.36, Accuracy: 0.12
Epoch: 8, Loss: 3.10, Accuracy: 0.13
Epoch: 9, Loss: 2.91, Accuracy: 0.19
Epoch: 10, Loss: 2.75, Accuracy: 0.23
Epoch: 11, Loss: 2.67, Accuracy: 0.27
Epoch: 12, Loss: 2.36, Accuracy: 0.36
Epoch: 13, Loss: 2.39, Accuracy: 0.36
Epoch: 14, Loss: 2.16, Accuracy: 0.38
Epoch: 15, Loss: 2.03, Accuracy: 0.39
Epoch: 16, Loss: 1.96, Accuracy: 0.44
Epoch: 17, Loss: 1.75, Accuracy: 0.50
Epoch: 18, Loss: 1.73, Accuracy: 0.52
Epoch: 19, Loss: 1.64, Accuracy: 0.52
Finished Training


Returning to a small kernel size, but increasing the stride throughout helped increase the accuracy dramatically:

In [28]:
class AudioClassifier (nn.Module):
    def __init__(self):
        super().__init__()
        conv_layers = []

        # First Convolution Block with Relu and Batch Norm. Use Kaiming Initialization
        self.conv1 = nn.Conv2d(2, 16, kernel_size=(5, 5), stride=(3, 3), padding=(2, 2))
        self.relu1 = nn.ReLU()
        self.bn1 = nn.BatchNorm2d(16)
        init.kaiming_normal_(self.conv1.weight, a=0.1)
        self.conv1.bias.data.zero_()
        conv_layers += [self.conv1, self.relu1, self.bn1]

        # Second Convolution Block
        self.conv2 = nn.Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu2 = nn.ReLU()
        self.bn2 = nn.BatchNorm2d(32)
        init.kaiming_normal_(self.conv2.weight, a=0.1)
        self.conv2.bias.data.zero_()
        conv_layers += [self.conv2, self.relu2, self.bn2]

        # Third Convolution Block
        self.conv3 = nn.Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu3 = nn.ReLU()
        self.bn3 = nn.BatchNorm2d(64)
        init.kaiming_normal_(self.conv3.weight, a=0.1)
        self.conv3.bias.data.zero_()
        conv_layers += [self.conv3, self.relu3, self.bn3]

        # Fourth Convolution Block
        self.conv4 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
        self.relu4 = nn.ReLU()
        self.bn4 = nn.BatchNorm2d(128)
        init.kaiming_normal_(self.conv4.weight, a=0.1)
        self.conv4.bias.data.zero_()
        conv_layers += [self.conv4, self.relu4, self.bn4]

        # Linear Classifier
        self.ap = nn.AdaptiveAvgPool2d(output_size=1)
        self.lin = nn.Linear(in_features=128, out_features=53)

        # Wrap the Convolutional Blocks
        self.conv = nn.Sequential(*conv_layers)

    # Forward pass computations
    def forward(self, x):
        # Run the convolutional blocks
        x = self.conv(x)

        # Adaptive pool and flatten for input to linear layer
        x = self.ap(x)
        x = x.view(x.shape[0], -1)

        # Linear layer
        x = self.lin(x)

        # Final output
        return x

# Create the model and put it on the GPU if available
myModel = AudioClassifier()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
myModel = myModel.to(device)
# Check that it is on Cuda
next(myModel.parameters()).device

device(type='cuda', index=0)

In [29]:
num_epochs=20
training(myModel, train_dl, num_epochs)

Epoch: 0, Loss: 3.97, Accuracy: 0.02
Epoch: 1, Loss: 3.87, Accuracy: 0.04
Epoch: 2, Loss: 3.70, Accuracy: 0.07
Epoch: 3, Loss: 3.58, Accuracy: 0.07
Epoch: 4, Loss: 3.41, Accuracy: 0.09
Epoch: 5, Loss: 3.41, Accuracy: 0.09
Epoch: 6, Loss: 3.36, Accuracy: 0.11
Epoch: 7, Loss: 3.10, Accuracy: 0.17
Epoch: 8, Loss: 2.82, Accuracy: 0.19
Epoch: 9, Loss: 2.77, Accuracy: 0.22
Epoch: 10, Loss: 2.30, Accuracy: 0.37
Epoch: 11, Loss: 2.19, Accuracy: 0.37
Epoch: 12, Loss: 1.91, Accuracy: 0.41
Epoch: 13, Loss: 1.79, Accuracy: 0.49
Epoch: 14, Loss: 1.36, Accuracy: 0.64
Epoch: 15, Loss: 1.33, Accuracy: 0.58
Epoch: 16, Loss: 1.16, Accuracy: 0.69
Epoch: 17, Loss: 1.02, Accuracy: 0.77
Epoch: 18, Loss: 0.94, Accuracy: 0.73
Epoch: 19, Loss: 0.88, Accuracy: 0.78
Finished Training


## Closing thoughts

### Room for further improvement

I identified four remaining key avenues for future improvements:

1. Utility functions: Fine-tune utility functions responsible for audio data augmentation, including custom transformations and diverse augmentation ranges.
2. Neural network architecture: Further exploration and modification of the network's architecture.
3. Neural network parameters: Experimentation with learning rate optimizers, batch sizes, train/test splits and other parameters.
4. The underlying dataset: A thorough examination of the dataset to assess potential improvements, including the impact of restricting training to words with a minimum of eight recordings (instead of the current five). Collecting more training data is also critical.

### Looking past the model

As the second model is being refined, the API needs to be revisited, as well. The intelligibility score in particular needs to be reassessed. Considering the possible differences in dialect, accent, and speaker's native tongue relative to the training data negatively impact the intelligibility score as it is currently implemented. The score should serve as a general indicator of intelligibility rather than a definitive marker of success or failure. This also affects how the score is presented on the frontend, of course.