In [1]:
import torch
import torch.nn as nn
from train import main
from processing import data_processing, TextTransform, char_map_str
from networks import SpeechRecognitionModel
import torch.utils.data as data
import torchaudio
from ctcdecode import CTCBeamDecoder
from predict import predict

In [2]:
# Single source of truth for the model architecture and the training run.
hyperparameters = dict(
    # --- architecture ---
    n_cnn_layers=3,      # number of CNN layers passed to the model
    n_rnn_layers=5,      # number of RNN layers passed to the model
    rnn_dim=512,         # RNN dimension passed to the model
    n_class=29,          # number of output classes
    n_feats=128,         # number of input features per frame
    stride=2,            # stride passed to the model
    dropout=0.1,         # dropout probability
    # --- optimisation ---
    learning_rate=5e-4,
    batch_size=20,       # also used as the DataLoader batch size
    epochs=10,
)

In [3]:
# Sample rate (Hz) handed to both mel-spectrogram front ends.
SAMPLE_RATE = 16000
# Number of mel bins, taken from the hyperparameters so the spectrogram height
# and the model's `n_feats` cannot drift apart when one of them is edited.
# NOTE(review): presumably the model requires n_mels == n_feats; confirm in networks.py.
N_MELS = hyperparameters["n_feats"]

# Training front end: mel spectrogram followed by frequency and time masking,
# which act as data augmentation and are therefore applied to training data only.
train_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=N_MELS),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

# Evaluation front end: the same mel spectrogram without masking. The arguments
# are spelled out (instead of relying on library defaults) so that training and
# evaluation features are guaranteed to be computed with identical settings.
test_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=SAMPLE_RATE, n_mels=N_MELS)

# Text <-> label-index conversion helper from the project's `processing` module.
text_transforms = TextTransform()

In [4]:
# Open both LibriSpeech splits under ./data (downloaded on first use);
# the training split is created first, then the test split.
train_dataset, test_dataset = (
    torchaudio.datasets.LIBRISPEECH('data', url=split, download=True)
    for split in ("train-clean-100", "test-clean")
)

# Extra DataLoader options are only supplied when CUDA is available.
if torch.cuda.is_available():
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}


def _collate_train(batch):
    """Collate a raw batch using the training audio transforms."""
    return data_processing(batch, train_transforms, text_transforms)


def _collate_test(batch):
    """Collate a raw batch using the evaluation audio transforms."""
    return data_processing(batch, test_transforms, text_transforms)


# Training batches are reshuffled every epoch; evaluation keeps dataset order.
train_loader = data.DataLoader(
    train_dataset,
    batch_size=hyperparameters['batch_size'],
    shuffle=True,
    collate_fn=_collate_train,
    **kwargs,
)
test_loader = data.DataLoader(
    test_dataset,
    batch_size=hyperparameters['batch_size'],
    shuffle=False,
    collate_fn=_collate_test,
    **kwargs,
)

In [5]:
# Constructor arguments are looked up in `hyperparameters` in the positional
# order in which SpeechRecognitionModel receives them.
model = SpeechRecognitionModel(
    *(
        hyperparameters[key]
        for key in (
            'n_cnn_layers', 'n_rnn_layers', 'rnn_dim', 'n_class',
            'n_feats', 'stride', 'dropout',
        )
    )
)

# Report the total number of scalar parameters in the model.
n_parameters = sum(weights.numel() for weights in model.parameters())
print('Number model parameters', n_parameters)

# Leave the model as the cell's last expression so its layer summary is displayed.
model

Number model parameters 23705373


SpeechRecognitionModel(
  (cnn): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (rescnn_layers): Sequential(
    (0): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
      (layer_norm2): CNNLayerNorm(
        (layer_norm): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      )
    )
    (1): ResidualCNN(
      (cnn1): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (cnn2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
      (layer_norm1): CNNLayerNorm(
        (layer_norm): LayerNorm((64,),

In [None]:
# Start the checkpoint with the run configuration and hand it to `main`.
# NOTE(review): `main` presumably runs the training loop and returns one
# checkpoint per saved step -- confirm against train.py.
checkpoint = dict(hyperparameters=hyperparameters)
checkpoints = main(
    train_loader,
    test_loader,
    hyperparameters,
    model,
    checkpoint=checkpoint,
)

# Persist only the most recent checkpoint from the returned sequence.
last_checkpoint = checkpoints[-1]
torch.save(last_checkpoint, 'last_checkpoint.pth')

In [6]:
# Beam-search settings for the CTC decoder, gathered in one place so they are
# easy to tune. The blank symbol is located by looking up '_' in the label map.
# NOTE(review): log_probs_input=False presumably means `predict` feeds plain
# probabilities (not log-probabilities) to the decoder -- confirm in predict.py.
decoder_options = dict(
    alpha=0.5,
    beta=0,
    cutoff_top_n=40,
    cutoff_prob=1.0,
    beam_width=100,
    num_processes=4,
    blank_id=char_map_str.index('_'),
    log_probs_input=False,
)
decoder = CTCBeamDecoder(char_map_str, **decoder_options)

# NOTE(review): in the saved run the training cell has no execution count and
# the decoded output is not readable text, so `model` was presumably untrained
# here -- verify trained weights are loaded before relying on this prediction.
predict(model, '121-123852-0001.flac', decoder)  # audio file without transformations

<class 'numpy.ndarray'>
tensor([13, 19, 16, 18, 17, 25, 20, 13, 17, 20, 12, 21, 17, 21, 20, 10, 17, 12,
         5, 14, 17, 20, 13,  2, 13, 18, 13, 17, 19, 13, 10], dtype=torch.int32)
[13 19 16 18 17 25 20 13 17 20 12 21 17 21 20 10 17 12  5 14 17 20 13  2
 13 18 13 17 19 13 10]


'lroqpxslpsktptsipkdmpslalqlprli'