# Speech Recognition Model Training from Scratch

In this notebook, we will develop a simple speech recognition model using PyTorch and the `torchaudio` library. We will look at defining, training, and evaluating a neural network on the SPEECHCOMMANDS dataset. 


In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.datasets import SPEECHCOMMANDS
from torch.utils.data import DataLoader
import os

### Data Preparation

The SPEECHCOMMANDS dataset is a collection of spoken words designed for command recognition. Below, we define a subclass to handle data loading and preprocessing for different subsets of this dataset.


In [26]:
QUICK = True

class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.join(self._path, line.strip()) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]
        elif subset == "debug":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [self._walker[w] for w in range(len(self._walker)) if self._walker[w] not in excludes and w%10 == 0]
        elif subset == "dev":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [self._walker[w] for w in range(len(self._walker)) if self._walker[w] not in excludes and w%1000 == 0]

### Model Definition

We will define a simple neural network with fully connected layers to classify audio into one of 35 categories based on the command spoken.


In [27]:
class SimpleNet(nn.Module):
    
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.n_labels = 35
        self.net = nn.Sequential(
            nn.Linear(16000, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, self.n_labels)
        )

    def forward(self, x):
        x = self.net(x)
        return F.log_softmax(x, dim=1)


### Training the Model

We will train our model on the training subset, refining the data to only include samples that match our input dimensionality and training it for a number of epochs.


In [28]:
def refine(data):
    """ Refine data to ensure each sample is of correct dimensionality. """
    return [data[i] for i in range(len(data)) if data[i][0].shape[1] == 16000 ]

def train_model():
    NUM_EPOCHS = 20
    BATCH_SIZE = 100

    working_set = SubsetSC("training")
    if QUICK:
        working_set = SubsetSC("dev")
    working_set = refine(working_set)
    dataloader = DataLoader(working_set, batch_size=BATCH_SIZE, shuffle=True)
    model = SimpleNet()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

    for epoch  in range(1, NUM_EPOCHS + 1):
        for features, size, train_labels, serial, train_labels_indices in dataloader:
            optimizer.zero_grad()
            features = features.reshape(features.shape[0], features.shape[2])
            y_pred = model(features)
            loss = F.nll_loss(y_pred, train_labels_indices)
            loss.backward()
            optimizer.step()
        print(f'Epoch:{epoch}, Loss:{loss.item():.4f}')
    return model

model = train_model()

Epoch:1, Loss:3.5317
Epoch:2, Loss:3.4479
Epoch:3, Loss:3.3486
Epoch:4, Loss:3.2293
Epoch:5, Loss:3.0913
Epoch:6, Loss:2.9352
Epoch:7, Loss:2.7646
Epoch:8, Loss:2.5826
Epoch:9, Loss:2.3928
Epoch:10, Loss:2.1990
Epoch:11, Loss:2.0055
Epoch:12, Loss:1.8166
Epoch:13, Loss:1.6361
Epoch:14, Loss:1.4666
Epoch:15, Loss:1.3109
Epoch:16, Loss:1.1706
Epoch:17, Loss:1.0468
Epoch:18, Loss:0.9389
Epoch:19, Loss:0.8455
Epoch:20, Loss:0.7648


### Testing the Model

Finally, evaluate the performance of our trained model using the validation subset.

In [29]:
def test_model(model):
    working_set = SubsetSC("testing")
    if QUICK:
        working_set = SubsetSC("dev")
    working_set = refine(working_set)
    dataloader = DataLoader(working_set, batch_size=100, shuffle=True)
    model.eval()
    correct = 0
    with torch.no_grad():
        for features, size, train_labels, serial, train_labels_indices in dataloader:
            features = features.reshape(features.shape[0], features.shape[2])
            y_pred = model(features)
            pred = y_pred.data.max(1, keepdim=True)[1]
            correct += pred.eq(train_labels_indices.data.view_as(pred)).sum()
    print(f"{correct} correct among {len(working_set)} in test set")
    
test_model(model)

93 correct among 97 in test set
