In [1]:
# List the GPUs on this machine together with their current memory use and load.
!nvidia-smi
# Restrict this kernel to GPU 0. This is set before torch is imported in the next
# cell. In the saved run, the nvidia-smi output shows GPU 0 idle and GPU 1 at 99%
# utilisation.
%env CUDA_VISIBLE_DEVICES=0
# Echo the variable back to confirm the setting took effect.
%env CUDA_VISIBLE_DEVICES

Fri May  7 13:42:41 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 465.19.01    CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA Tesla V1...  On   | 00000000:37:00.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA Tesla V1...  On   | 00000000:AF:00.0 Off |                    0 |
| N/A   73C    P0   134W / 250W |  15729MiB / 16160MiB |     99%      Default |
|       

'0'

In [None]:
######################################### ASR non-pretrained main script ################################################

# Import general libraries
import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.optim.lr_scheduler import CosineAnnealingLR
from datetime import datetime

import editdistance

# Import stuff from other ASR modules
from asr.data import BaseDataset
from asr.data.preprocessors import SpectrogramPreprocessor, TextPreprocessor
from asr.modules import ASRModel
from asr.utils.training import batch_to_tensor, epochs, Logger
from asr.utils.text import greedy_ctc
from asr.utils.metrics import ErrorRateTracker, LossTracker

def get_Stats(ref_batch,hyp_batch):
    """Compute per-utterance edit-distance statistics at word and character level.

    Args:
        ref_batch: Sequence of reference transcripts (strings).
        hyp_batch: Sequence of hypothesis transcripts (strings), aligned
            position-by-position with ``ref_batch``.

    Returns:
        A 6-tuple of lists, each with one entry per utterance:
        ``(WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens, hyp_CER_lens)``.
        ``WER``/``CER`` hold raw edit distances (counts, not rates) between the
        whitespace-split words / the individual characters of each pair; the
        ``*_lens`` lists hold the matching token counts of the reference and
        the hypothesis.
    """
    def _tally(tokenize):
        # Tokenise both sides the same way, then measure distances and lengths.
        ref_tokens = [tokenize(ref) for ref in ref_batch]
        hyp_tokens = [tokenize(hyp) for hyp in hyp_batch]
        distances = [
            editdistance.eval(ref, hyp)
            for ref, hyp in zip(ref_tokens, hyp_tokens)
        ]
        return distances, list(map(len, ref_tokens)), list(map(len, hyp_tokens))

    # Word level: tokens are whitespace-separated words.
    WER, ref_WER_lens, hyp_WER_lens = _tally(lambda text: text.split())
    # Character level: every character (spaces included) is a token.
    CER, ref_CER_lens, hyp_CER_lens = _tally(list)

    return WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens, hyp_CER_lens


""" Function: Train and test ASR model on input datasets
    Input:    2 txt-files with IDs to training and test files. One observation consists of an audio file
              (wav-feature) and a transcript (txt-target).
    Output:   Logs per-utterance validation statistics to ./results/<model_name>.csv and saves the ASR
              model weights to ./results/best_<model_name>.pt """


""" Part 1: Load and preprocess data """
# Names of the data sources used for training and for validation.
train_IDs = 'wavenet-cut'
test_IDs = 'dev-clean'

train_source, val_source = train_IDs, test_IDs

# Preprocessing is provided by the asr package and treated as a black box here:
# one preprocessor for the audio (spectrogram) and one for the transcript (text).
spec_preprocessor = SpectrogramPreprocessor(output_format='NFT', sample_rate=4000)
text_preprocessor = TextPreprocessor()
preprocessor = [spec_preprocessor, text_preprocessor]

train_dataset = BaseDataset(
    source=train_source,
    preprocessor=preprocessor,
    sort_by=0,
)
val_dataset = BaseDataset(
    source=val_source,
    preprocessor=preprocessor,
    sort_by=0,
)

# Data loaders: identical settings, each using its own dataset's collate function.
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    num_workers=4,
    pin_memory=True,
    collate_fn=train_dataset.collate,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=16,
    num_workers=4,
    pin_memory=True,
    collate_fn=val_dataset.collate,
)

# ---- Part 2: Setup model and loss ----
# Build the model on the GPU and report its size.
asr_model = ASRModel(input_size=40).cuda()
print(asr_model)
trainable_param_count = sum(
    param.numel() for param in asr_model.parameters() if param.requires_grad
)
print("Trainable parameters:", trainable_param_count)

# Loss (kept per-utterance via reduction='none'), optimizer and LR scheduler.
ctc_loss = nn.CTCLoss(reduction='none').cuda()
optimizer = torch.optim.Adam(asr_model.parameters(), lr=3e-4)
lr_scheduler = CosineAnnealingLR(optimizer, T_max=100, eta_min=5e-5)

# ---- Part 3: Train and evaluate model ----
# Running metrics; the same three trackers are handed to both loggers.
wer_metric = ErrorRateTracker(word_based=True)
cer_metric = ErrorRateTracker(word_based=False)
ctc_metric = LossTracker()

train_logger = Logger('Training', ctc_metric, wer_metric, cer_metric)
val_logger = Logger('Validation', ctc_metric, wer_metric, cer_metric)

def forward_pass(batch, training=False):
    """Run the model on one batch, update the running metrics and return the loss.

    Args:
        batch: One collated batch from a data loader. ``batch_to_tensor``
            unpacks it into ``(x, x_sl), (y, y_sl)``: model inputs and targets,
            each with its sequence lengths. Device placement happens inside
            ``batch_to_tensor`` (presumably CUDA; confirm in asr.utils.training).
        training: Selects the return value (see below). It does not switch the
            model between train/eval mode; the caller does that.

    Returns:
        If ``training`` is true: the per-utterance CTC loss tensor (the loss is
        built with ``reduction='none'``), left for the caller to reduce and
        backpropagate.
        Otherwise: a 7-tuple of per-utterance lists
        ``(CTC_loss, WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens,
        hyp_CER_lens)`` where ``CTC_loss`` holds plain Python floats and the
        rest comes from ``get_Stats``.
    """
    (x, x_sl), (y, y_sl) = batch_to_tensor(batch)

    # Call the module itself rather than .forward() so that any registered
    # forward hooks run. The input lengths are passed on the CPU.
    logits, output_sl = asr_model(x, x_sl.cpu())
    log_probs = F.log_softmax(logits, dim=2)
    loss = ctc_loss(log_probs, y, output_sl, y_sl)

    # Greedy decoding of the hypotheses; references are decoded from the targets.
    hyp_encoded_batch = greedy_ctc(logits, output_sl)
    hyp_batch = text_preprocessor.decode_batch(hyp_encoded_batch)
    ref_batch = text_preprocessor.decode_batch(y, y_sl)

    # Update the running metrics shown by the loggers. The loss tracker is
    # weighted by the total number of output frames in the batch.
    wer_metric.update(ref_batch, hyp_batch)
    cer_metric.update(ref_batch, hyp_batch)
    ctc_metric.update(loss.sum().item(), weight=output_sl.sum().item())

    if training:
        return loss

    WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens, hyp_CER_lens = get_Stats(ref_batch, hyp_batch)
    # One Python float per utterance, in a single transfer instead of one per item.
    CTC_loss = loss.tolist()
    return CTC_loss, WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens, hyp_CER_lens

# Run 200 epochs: train, evaluate, log per-utterance validation statistics and
# keep the checkpoint with the lowest validation word-error score.
model_name = f"{train_IDs}vs{test_IDs}{datetime.now().strftime('Y%Y-m%m-d%d-H%H-M%M')}"

# open() and torch.save() below both fail if the output directory is missing.
os.makedirs("./results", exist_ok=True)

# Best validation score seen so far. This must live outside the epoch loop:
# resetting it every epoch would make every epoch look like an improvement and
# overwrite the "best" checkpoint with the latest model each time.
best_wer = np.inf

with open(f"./results/{model_name}.csv", "a+") as o_f:
    for epoch in epochs(200):
        # Set PyTorch in training mode
        asr_model.train()

        # Train model on training set
        print("Running training:")
        for batch, files in train_logger(train_loader):
            loss = forward_pass(batch, training=True)
            optimizer.zero_grad()
            # The loss is per-utterance; sum it to a scalar before backprop.
            loss.sum().backward()
            optimizer.step()

        # Set PyTorch in test mode
        asr_model.eval()

        # Test model using test set
        print("Running evaluation:")
        N = 0        # number of validation utterances seen this epoch
        wer_sum = 0  # total word-level edit distance over those utterances
        with torch.no_grad():
            for batch, files in val_logger(val_loader):
                CTC_loss, WER, ref_WER_lens, hyp_WER_lens, CER, ref_CER_lens, hyp_CER_lens = forward_pass(batch, training=False)
                wer_sum += np.sum(WER)
                N += len(WER)
                # One tab-separated row per utterance, keyed by epoch and file stem.
                for i, flac_file in enumerate(files):
                    idx = os.path.splitext(os.path.basename(flac_file))[0]
                    print(f"{epoch}\t{idx}\t{CTC_loss[i]}\t{WER[i]}\t{ref_WER_lens[i]}\t{hyp_WER_lens[i]}\t{CER[i]}\t{ref_CER_lens[i]}\t{hyp_CER_lens[i]}", file=o_f)

        # Make this epoch's rows visible on disk while training is still running.
        o_f.flush()

        # Mean word-level edit distance per utterance. The validation set is the
        # same every epoch, so this ranks epochs the same way a corpus-level WER
        # would. max() guards against an empty validation loader.
        mean_word_errors = wer_sum / max(N, 1)
        if mean_word_errors < best_wer:
            best_wer = mean_word_errors
            torch.save(asr_model.state_dict(), f"./results/best_{model_name}.pt")

        # Keep the initial learning rate for the first part of training and
        # only start cosine annealing once epoch reaches 100.
        if epoch >= 100:
            lr_scheduler.step()

ASRModel(
  (conv1d_layer): Conv1d(40, 256, kernel_size=(5,), stride=(2,), padding=(2,))
  (lstm_block): LSTM(256, 256, num_layers=2, dropout=0.4, bidirectional=True)
  (output_layer): Linear(in_features=512, out_features=29, bias=True)
)
Trainable parameters: 2695965
[1m
Epoch 1[0m
Running training:
Training [63/63, 0.4 min(s)]: Loss=1.091, WER=100.00, CER=100.79
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.854, WER=100.00, CER=100.00
[1m
Epoch 2[0m
Running training:
Training [63/63, 0.4 min(s)]: Loss=0.844, WER=100.00, CER=100.00
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.853, WER=100.00, CER=100.00
[1m
Epoch 3[0m
Running training:
Training [63/63, 0.2 min(s)]: Loss=0.843, WER=100.00, CER=100.00
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.853, WER=100.00, CER=100.00
[1m
Epoch 4[0m
Running training:
Training [63/63, 0.3 min(s)]: Loss=0.842, WER=100.00, CER=100.00
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.853

Training [63/63, 0.4 min(s)]: Loss=0.749, WER=108.07, CER=72.11
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.759, WER=109.55, CER=71.07
[1m
Epoch 44[0m
Running training:
Training [63/63, 0.3 min(s)]: Loss=0.749, WER=109.87, CER=72.07
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.760, WER=105.21, CER=72.35
[1m
Epoch 45[0m
Running training:
Training [63/63, 0.3 min(s)]: Loss=0.748, WER=108.91, CER=72.11
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.757, WER=108.32, CER=71.84
[1m
Epoch 46[0m
Running training:
Training [63/63, 0.5 min(s)]: Loss=0.747, WER=110.26, CER=71.69
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.758, WER=112.82, CER=71.14
[1m
Epoch 47[0m
Running training:
Training [63/63, 0.6 min(s)]: Loss=0.747, WER=110.80, CER=71.68
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.758, WER=105.51, CER=72.84
[1m
Epoch 48[0m
Running training:
Training [63/63, 0.6 min(s)]: Loss=0.746, WER=110.42, CER=71

Validation [169/169, 1.2 min(s)]: Loss=0.764, WER=104.85, CER=71.87
[1m
Epoch 87[0m
Running training:
Training [63/63, 0.6 min(s)]: Loss=0.718, WER=110.04, CER=69.98
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.768, WER=104.15, CER=71.98
[1m
Epoch 88[0m
Running training:
Training [63/63, 0.6 min(s)]: Loss=0.718, WER=110.38, CER=69.94
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.769, WER=103.21, CER=72.40
[1m
Epoch 89[0m
Running training:
Training [63/63, 0.5 min(s)]: Loss=0.716, WER=110.28, CER=69.84
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.769, WER=103.76, CER=72.22
[1m
Epoch 90[0m
Running training:
Training [63/63, 0.5 min(s)]: Loss=0.713, WER=109.93, CER=69.78
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.770, WER=104.29, CER=71.63
[1m
Epoch 91[0m
Running training:
Training [63/63, 0.5 min(s)]: Loss=0.712, WER=109.56, CER=69.66
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.769, WER=104.33, CE

Training [63/63, 0.5 min(s)]: Loss=0.607, WER=100.79, CER=62.08
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.835, WER=109.28, CER=69.12
[1m
Epoch 131[0m
Running training:
Training [63/63, 0.4 min(s)]: Loss=0.603, WER=100.75, CER=61.80
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.838, WER=109.73, CER=69.21
[1m
Epoch 132[0m
Running training:
Training [63/63, 0.4 min(s)]: Loss=0.600, WER=101.08, CER=61.59
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.839, WER=108.33, CER=69.06
[1m
Epoch 133[0m
Running training:
Training [63/63, 0.4 min(s)]: Loss=0.597, WER=100.42, CER=61.45
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.838, WER=107.79, CER=69.19
[1m
Epoch 134[0m
Running training:
Training [63/63, 0.7 min(s)]: Loss=0.593, WER=100.26, CER=60.97
Running evaluation:
Validation [169/169, 1.2 min(s)]: Loss=0.843, WER=108.12, CER=69.43
[1m
Epoch 135[0m
Running training:
Training [63/63, 0.6 min(s)]: Loss=0.591, WER=100.07, C