# Packages

In [2]:
import os

import lightning
import torch
import torchmetrics
from torch.utils.data import DataLoader
from torchvision.transforms import Compose, Lambda

from .load_data import AudioTrainDataset, PaddingZeros, CustomSpectogram, TargetEncoder

# Simple loading

In [3]:
DATA_PATH = os.path.join("tensorflow-speech-recognition-challenge", "train", "audio")
dataset = AudioTrainDataset(DATA_PATH)

labels_list, labels_dict = dataset.find_classes(DATA_PATH)
labels_dict = {idx: name for name, idx in labels_dict.items()}
labels_dict

{1: 'bed',
 2: 'bird',
 3: 'cat',
 4: 'dog',
 5: 'down',
 6: 'eight',
 7: 'five',
 8: 'four',
 9: 'go',
 10: 'happy',
 11: 'house',
 12: 'left',
 13: 'marvin',
 14: 'nine',
 15: 'no',
 16: 'off',
 17: 'on',
 18: 'one',
 19: 'right',
 20: 'seven',
 21: 'sheila',
 22: 'silence',
 23: 'six',
 24: 'stop',
 25: 'three',
 26: 'tree',
 27: 'two',
 28: 'up',
 29: 'wow',
 30: 'yes',
 31: 'zero'}

In [4]:
NUM_WORKERS = 6
BATCH_SIZE = 512

# Simple Model

In [5]:
transforms = Compose([
    PaddingZeros(16000),
    CustomSpectogram(n_fft=1024, power=2),
])

features_dataset = AudioTrainDataset(DATA_PATH, transform=transforms,
                                     target_transform=TargetEncoder(class_dict=labels_dict))

In [6]:
gen = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(features_dataset, [0.7, 0.1, 0.2],
                                                                           generator=gen)
len(train_dataset), len(valid_dataset), len(test_dataset)

(45587, 6512, 13024)

In [7]:
train_dataset_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True,
                                  generator=torch.random.manual_seed(123))
valid_dataset_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
test_dataset_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
len(train_dataset_loader), len(valid_dataset_loader), len(test_dataset_loader)



(90, 13, 26)

In [8]:
class MyLSTM(lightning.LightningModule):
    def __init__(self, input_size, hidden_size, target_size):
        super().__init__()
        self.lstm = torch.nn.LSTM(input_size, hidden_size, num_layers=1, batch_first=True)
        self.hidden2label = torch.nn.Linear(hidden_size, target_size)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.valid_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.test_conf_mat = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=target_size)

    def forward(self, x):
        x_squeeze = x.squeeze()
        lstm_out, _ = self.lstm(x_squeeze)
        label_space = self.hidden2label(lstm_out[:, -1])
        return self.softmax(label_space)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        self.train_acc(y_hat, torch.argmax(y, dim=-1))
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc_step", self.train_acc)
        return loss

    def on_train_epoch_end(self):
        self.log('train_acc', self.train_acc)

    def predict_step(self, batch, batch_idx):
        x, y = batch
        return self(x)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        self.valid_acc(y_hat, torch.argmax(y, dim=-1))
        self.log('val_loss', loss, on_epoch=True)
        self.log('val_acc', self.valid_acc, on_epoch=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        y_class = torch.argmax(y, dim=-1)
        self.test_acc(y_hat, y_class)
        self.test_conf_mat(y_hat, y_class)
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', self.test_acc, on_epoch=True)
        self.log('test_confustion_matrix', self.test_conf_mat)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.3, patience=3),
                "monitor": "val_loss",
            }
        }

In [10]:
model = MyLSTM(513, 128, 12)
trainer = lightning.Trainer(max_epochs=2, logger=True)
torch.set_float32_matmul_precision('medium')
trainer.fit(model, train_dataloaders=train_dataset_loader, val_dataloaders=valid_dataset_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name          | Type                      | Params
------------------------------------------------------------
0 | lstm          | LSTM                      | 329 K 
1 | hidden2label  | Linear                    | 1.5 K 
2 | softmax       | Softmax                   | 0     
3 | train_acc     | MulticlassAccuracy        | 0     
4 | valid_acc     | MulticlassAccuracy        | 0     
5 | test_acc      | MulticlassAccuracy        | 0     
6 | test_conf_mat | MulticlassConfusionMatrix | 0     
------------------------------------------------------------
330 K     Trainable params
0         Non-trainable params
330 K     Total params
1.323     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


In [10]:
early_stopping = lightning.pytorch.callbacks.EarlyStopping('val_loss')
results = []
for i in range(5):
    lightning.pytorch.seed_everything(i)
    model = MyLSTM(32, 128, 12)
    trainer = lightning.Trainer(max_epochs=4, callbacks=[early_stopping])
    trainer.fit(model, train_dataloaders=train_dataset_loader, val_dataloaders=valid_dataset_loader)
    res = trainer.test(dataloaders=test_dataset_loader)
    results.append(res[0])
results

Global seed set to 0
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                      | Params
------------------------------------------------------------
0 | lstm          | LSTM                      | 82.9 K
1 | hidden2label  | Linear                    | 1.5 K 
2 | softmax       | Softmax                   | 0     
3 | train_acc     | MulticlassAccuracy        | 0     
4 | valid_acc     | MulticlassAccuracy        | 0     
5 | test_acc      | MulticlassAccuracy        | 0     
6 | test_conf_mat | MulticlassConfusionMatrix | 0     
------------------------------------------------------------
84.5 K    Trainable params
0         Non-trainable params
84.5 K    Total params
0.338     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 90/90 [00:28<00:00,  3.12it/s, v_num=5, train_loss_step=2.040]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:04,  2.89it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:02,  5.25it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:01,  7.16it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:01,  8.72it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 10.06it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 11.24it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 12.30it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 13.25it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 14.02it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00:00, 14.73it/s][A
Valid

`Trainer.fit` stopped: `max_epochs=4` reached.


Epoch 3: 100%|██████████| 90/90 [00:56<00:00,  1.59it/s, v_num=5, train_loss_step=1.930, train_loss_epoch=1.990]


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 26/26 [00:01<00:00, 20.42it/s]

ValueError: The metric `tensor([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  463,    1],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  463,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  483,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  462,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  482,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  492,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  497,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  450,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    1,    0,  466,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  500,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 8185,    0],
        [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   54,   25]],
       device='cuda:0')` does not contain a single element, thus it cannot be converted to a scalar.