# Packages

In [1]:
import os

import lightning
import pandas as pd
import torch
import torchmetrics
from torch.utils.data import DataLoader
from torchvision.transforms import Compose

from load_data import AudioTrainDataset, PaddingZeros, CustomSpectogram, TargetEncoder

# Simple loading

In [2]:
DATA_PATH = os.path.join("tensorflow-speech-recognition-challenge", "train", "audio")
dataset = AudioTrainDataset(DATA_PATH)

labels_list, labels_dict = dataset.find_classes(DATA_PATH)
labels_dict = {idx: name for name, idx in labels_dict.items()}
labels_dict

{1: 'bed',
 2: 'bird',
 3: 'cat',
 4: 'dog',
 5: 'down',
 6: 'eight',
 7: 'five',
 8: 'four',
 9: 'go',
 10: 'happy',
 11: 'house',
 12: 'left',
 13: 'marvin',
 14: 'nine',
 15: 'no',
 16: 'off',
 17: 'on',
 18: 'one',
 19: 'right',
 20: 'seven',
 21: 'sheila',
 22: 'silence',
 23: 'six',
 24: 'stop',
 25: 'three',
 26: 'tree',
 27: 'two',
 28: 'up',
 29: 'wow',
 30: 'yes',
 31: 'zero'}

In [3]:
NUM_WORKERS = 6
BATCH_SIZE = 512

# Simple Model

In [4]:
transforms = Compose([
    PaddingZeros(16000),
    CustomSpectogram(n_fft=1024, power=2),
])

features_dataset = AudioTrainDataset(DATA_PATH, transform=transforms,
                                     target_transform=TargetEncoder(class_dict=labels_dict))

In [5]:
gen = torch.Generator().manual_seed(42)
train_dataset, valid_dataset, test_dataset = torch.utils.data.random_split(features_dataset, [0.7, 0.1, 0.2],
                                                                           generator=gen)
len(train_dataset), len(valid_dataset), len(test_dataset)

(45587, 6512, 13024)

In [6]:
train_dataset_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True,
                                  generator=torch.random.manual_seed(123))
valid_dataset_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
test_dataset_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=False)
len(train_dataset_loader), len(valid_dataset_loader), len(test_dataset_loader)

(90, 13, 26)

In [7]:
class MyLSTM(lightning.LightningModule):
    def __init__(self,
                 input_features_size, # number of frequencies of spectogram
                 input_sequence_size, # length of spectogram
                 hidden_size,
                 conv_channels_out,
                 conv_kernel_size,
                 target_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.conv = torch.nn.Conv1d(input_sequence_size, conv_channels_out, kernel_size=conv_kernel_size, groups=input_sequence_size)
        lstm_input_size = input_features_size - (conv_kernel_size - 1)
        self.lstm = torch.nn.LSTM(lstm_input_size, hidden_size, num_layers=1, batch_first=True, dropout=0.3)
        self.hidden2label = torch.nn.Linear(hidden_size, target_size)
        self.softmax = torch.nn.Softmax(dim=-1)
        self.train_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.valid_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.test_acc = torchmetrics.Accuracy(task="multiclass", num_classes=target_size)
        self.test_conf_mat = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=target_size)

    def forward(self, x):
        x_squeeze = x.squeeze()
        x = self.conv(x_squeeze)
        lstm_out, (hc, _) = self.lstm(x)
        label_space = self.hidden2label(hc.squeeze())
        return self.softmax(label_space)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        self.train_acc(y_hat, torch.argmax(y, dim=-1))
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("train_acc_step", self.train_acc)
        return loss

    def on_train_epoch_end(self):
        self.log('train_acc', self.train_acc)

    def predict_step(self, batch, batch_idx, dataloader_idx = 0):
        x, y = batch
        return self(x)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        self.valid_acc(y_hat, torch.argmax(y, dim=-1))
        self.log('val_loss', loss, on_epoch=True)
        self.log('val_acc', self.valid_acc, on_epoch=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        y_class = torch.argmax(y, dim=-1)
        self.test_acc(y_hat, y_class)
        self.test_conf_mat(y_hat, y_class)
        self.log('test_loss', loss, on_epoch=True)
        self.log('test_acc', self.test_acc, on_epoch=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.3, patience=3),
                "monitor": "val_loss",
            }
        }

In [8]:
for batch_x,  batch_y in train_dataset_loader:
    print(batch_x.shape)
    break

torch.Size([512, 1, 32, 513])


In [9]:
model = MyLSTM(513, 32, 128, 32, 8, 12)
for batch_x,  batch_y in train_dataset_loader:
    y_hat = model(batch_x)
    break



In [10]:
model = MyLSTM(513, 32,  128, 128, 8, 12)
trainer = lightning.Trainer(max_epochs=2, logger=True)
torch.set_float32_matmul_precision('high')
trainer.fit(model, train_dataloaders=train_dataset_loader, val_dataloaders=valid_dataset_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                      | Params
------------------------------------------------------------
0 | conv          | Conv1d                    | 1.2 K 
1 | lstm          | LSTM                      | 325 K 
2 | hidden2label  | Linear                    | 1.5 K 
3 | softmax       | Softmax                   | 0     
4 | train_acc     | MulticlassAccuracy        | 0     
5 | valid_acc     | MulticlassAccuracy        | 0     
6 | test_acc      | MulticlassAccuracy        | 0     
7 | test_conf_mat | MulticlassConfusionMatrix | 0     
------------------------------------------------------------
328 K     Trainable params
0         Non-trainable params
328 K     Total params
1.313     Total estimated model params size (MB)


Epoch 0: 100%|██████████| 90/90 [00:19<00:00,  4.66it/s, v_num=29, train_loss_step=1.990]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:02,  5.88it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:01,  9.57it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:00, 12.50it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:00, 14.87it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 16.78it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 18.24it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 19.50it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 20.46it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 21.28it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00:00, 21.98it/s][A
Vali

`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 90/90 [00:34<00:00,  2.59it/s, v_num=29, train_loss_step=1.990, train_loss_epoch=1.990]


In [None]:
results = []
predictions = []
for i in range(5):
    lightning.pytorch.seed_everything(i)
    model = MyLSTM(513, 32,  128, 32, 8, 12)
    early_stopping = lightning.pytorch.callbacks.EarlyStopping('val_loss', verbose=True, patience=5)
    logger = lightning.pytorch.loggers.tensorboard.TensorBoardLogger(save_dir="cnn_lstm", version=i)
    trainer = lightning.Trainer(max_epochs=200, callbacks=[early_stopping], logger=logger)
    trainer.fit(model, train_dataloaders=train_dataset_loader, val_dataloaders=valid_dataset_loader)
    res = trainer.test(dataloaders=test_dataset_loader, ckpt_path='best')
    test_pred_tensor = torch.cat(trainer.predict(dataloaders=test_dataset_loader, ckpt_path='best'))
    results.append(res[0])
    predictions.append(test_pred_tensor)
torch.save(torch.stack(predictions), "spectogram_cnn_lstm_predictions.ts")
pd.DataFrame(results).to_csv("spectogram_cnn_lstm_metrics.csv")

Global seed set to 0
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name          | Type                      | Params
------------------------------------------------------------
0 | conv          | Conv1d                    | 288   
1 | lstm          | LSTM                      | 325 K 
2 | hidden2label  | Linear                    | 1.5 K 
3 | softmax       | Softmax                   | 0     
4 | train_acc     | MulticlassAccuracy        | 0     
5 | valid_acc     | MulticlassAccuracy        | 0     
6 | test_acc      | MulticlassAccuracy        | 0     
7 | test_conf_mat | MulticlassConfusionMatrix | 0     
------------------------------------------------------------
327 K     Trainable params
0         Non-trainable params
327 K     Total params
1.310    

Epoch 0: 100%|██████████| 90/90 [00:19<00:00,  4.59it/s, v_num=0, train_loss_step=2.040]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:02,  5.92it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:01, 10.05it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:00, 13.27it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:00, 15.87it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 18.05it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 19.86it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 18.37it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 19.70it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 20.74it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00:00, 21.64it/s][A
Valid

Metric val_loss improved. New best score: 1.978


Epoch 1: 100%|██████████| 90/90 [00:19<00:00,  4.51it/s, v_num=0, train_loss_step=2.040, train_loss_epoch=2.000]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:02,  4.02it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:01,  5.99it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:01,  8.09it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:00,  9.84it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 11.40it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 12.83it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 14.16it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 15.38it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 16.45it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 1.978


Epoch 2: 100%|██████████| 90/90 [00:19<00:00,  4.51it/s, v_num=0, train_loss_step=2.090, train_loss_epoch=1.990]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:00, 45.46it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:00, 27.03it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:00, 27.52it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:00, 26.82it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 26.15it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 26.41it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 20.82it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 21.55it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 21.83it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 1.978


Epoch 4: 100%|██████████| 90/90 [00:20<00:00,  4.29it/s, v_num=0, train_loss_step=1.990, train_loss_epoch=1.990]
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/13 [00:00<?, ?it/s][A
Validation DataLoader 0:   8%|▊         | 1/13 [00:00<00:02,  5.01it/s][A
Validation DataLoader 0:  15%|█▌        | 2/13 [00:00<00:01,  8.85it/s][A
Validation DataLoader 0:  23%|██▎       | 3/13 [00:00<00:00, 11.76it/s][A
Validation DataLoader 0:  31%|███       | 4/13 [00:00<00:00, 13.98it/s][A
Validation DataLoader 0:  38%|███▊      | 5/13 [00:00<00:00, 15.82it/s][A
Validation DataLoader 0:  46%|████▌     | 6/13 [00:00<00:00, 17.44it/s][A
Validation DataLoader 0:  54%|█████▍    | 7/13 [00:00<00:00, 18.81it/s][A
Validation DataLoader 0:  62%|██████▏   | 8/13 [00:00<00:00, 19.03it/s][A
Validation DataLoader 0:  69%|██████▉   | 9/13 [00:00<00:00, 20.13it/s][A
Validation DataLoader 0:  77%|███████▋  | 10/13 [00:00<00

Metric val_loss improved by 0.000 >= min_delta = 0.0. New best score: 1.978


Epoch 5:   0%|          | 0/90 [00:00<?, ?it/s, v_num=0, train_loss_step=1.990, train_loss_epoch=1.990]         