<a href="https://colab.research.google.com/github/SavkinEgor/MADE_ML_Speech/blob/main/SPEECH_asr_lab_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Практика №4

Теперь мы построим и обучим простую end-to-end модель. Будем работать с пропатченной версией уже готового [пайплайна](https://www.assemblyai.com/blog/end-to-end-speech-recognition-pytorch). Также нам пригодится [ESPnet](https://github.com/espnet/espnet) для использования модели [Transformer](http://jalammar.github.io/illustrated-transformer/) в качестве энкодера.

### Bootstrap

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.activity.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fexperimentsandconfigs%20https%3a%2f%2fwww.googleapis.com%2fauth%2fphotos.native&response_type=code

Enter your authorization code:
4/1AY0e-g7xtHSKtXm-uaCqNjm0lmR-5-NglGmPys_OG96kpGd9IE1Xxghtoew


In [None]:
!pip install torchaudio

In [None]:
!gdown --id '1skrVbNyrhBLeceGS9CV9uIw_gvo1JiA6'

!unzip -q lab4.zip
!rm -rf lab4.zip sample_data

In [1]:
%cd lab4

/content/lab4


In [2]:
import os
import torch
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
import math

from utils import TextTransform
from utils import cer
from utils import wer

from espnet.nets.pytorch_backend.transformer.embedding import PositionalEncoding
from espnet.nets.pytorch_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.pytorch_backend.transformer.repeat import repeat
from espnet.nets.pytorch_backend.transformer.attention import MultiHeadedAttention
from espnet.nets.pytorch_backend.transformer.positionwise_feed_forward import PositionwiseFeedForward
from espnet.nets.pytorch_backend.transformer.layer_norm import LayerNorm
from espnet.nets.pytorch_backend.nets_utils import make_pad_mask

In [3]:
torch.cuda.get_device_name()

'Tesla K80'

In [38]:
train_audio_transforms = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, hop_length=160, n_mels=80),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000,
                                                              n_fft=400,
                                                              hop_length=160,
                                                              n_mels=80)

# text_transform = TextTransform()
# text_transform = TextTransformBPE()

# #-----------------------------TODO №2-----------------------------------
# # Заменить графемный токенайзер на сабвордовый TextTransformBPE
# #-----------------------------------------------------------------------


def data_processing(data, data_type="train"):
    spectrograms = []
    labels = []
    input_lengths = []
    label_lengths = []
    for (waveform, _, utterance, _, _, _) in data:
        if data_type == 'train':
            spec = train_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        elif data_type == 'valid':
            spec = valid_audio_transforms(waveform).squeeze(0).transpose(0, 1)
        else:
            raise Exception('data_type should be train or valid')
        spectrograms.append(spec)
        label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
        labels.append(label)
        input_lengths.append(spec.shape[0])
        label_lengths.append(len(label))

    spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
    labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)

    return spectrograms, labels, input_lengths, label_lengths


def GreedyDecoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
    # можно бимсерч прикрутить
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        decode = []
        targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets

In [5]:
class TransformerModel(torch.nn.Module):

    def __init__(
        self,
        input_size=80,
        output_size=29,
        conv2d_filters=32,
        attention_dim=360,
        attention_heads=8,
        feedforward_dim=1024,
        num_layers=10,
        dropout=0.1,
    ):
        super(TransformerModel, self).__init__()
        
        self.conv_in = torch.nn.Sequential(
            torch.nn.Conv2d(1, conv2d_filters, kernel_size=(3,3), stride=(2,2), padding=(1,1)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(conv2d_filters, conv2d_filters, kernel_size=(3,3), stride=(2,2), padding=(1,1)),
            torch.nn.ReLU(),
        )
        self.conv_out = torch.nn.Sequential(
            torch.nn.Linear(conv2d_filters * ((input_size // 2) // 2), attention_dim),
            PositionalEncoding(attention_dim, 0.1),
        )
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, feedforward_dim, dropout)
        self.encoder_layer = repeat(
            num_layers,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, dropout
                ),
                positionwise_layer(*positionwise_layer_args),
                dropout,
                normalize_before=True,
                concat_after=False,
            ),
        )
        self.after_norm = LayerNorm(attention_dim)
        self.final_layer = torch.nn.Linear(attention_dim, output_size)

    def forward(self, x, ilens):
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv_in(x)
        b, c, t, f = x.size()
        x = self.conv_out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        masks = (~make_pad_mask(ilens)[:, None, :])[:, :, ::4].to(x.device)
        x, _ = self.encoder_layer(x, masks)
        x = self.after_norm(x)
        x = self.final_layer(x)
        return x

In [6]:
def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
    model.train()
    data_len = len(train_loader.dataset)

    for batch_idx, _data in enumerate(train_loader):
        spectrograms, labels, input_lengths, label_lengths = _data 
        spectrograms, labels = spectrograms[:, :, :,:max(input_lengths)].to(device), labels.to(device) #(batch, 1, feat_dim, time)
        spectrograms = spectrograms.squeeze(1).transpose(1,2) # (batch, time, feat_dim,)
        optimizer.zero_grad()
        
        output = model(spectrograms, input_lengths)  # (batch, time, n_classes)
        output = F.log_softmax(output, dim=2)
        output = output.transpose(0, 1) # (time, batch, n_class)
        input_lengths = [x // 4 for x in input_lengths]

        loss = criterion(output, labels, input_lengths, label_lengths)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
        optimizer.step()
        scheduler.step()
        if batch_idx % 100 == 0 or batch_idx == data_len:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:.6f}'.format(
                epoch, batch_idx * len(spectrograms), data_len,
                100. * batch_idx / len(train_loader), loss.item(), scheduler.get_last_lr()[0]))


def test(model, device, test_loader, criterion, epoch):
    print('\nevaluating...')
    model.eval()
    test_loss = 0
    test_cer, test_wer = [], []
    with torch.no_grad():
        for i, _data in enumerate(test_loader):
            spectrograms, labels, input_lengths, label_lengths = _data 
            spectrograms, labels = spectrograms.to(device), labels.to(device)
            spectrograms = spectrograms.squeeze(1).transpose(1,2) # (batch time, feat_dim,)
            
            output = model(spectrograms, input_lengths)  # (batch, time, n_class)
            output = F.log_softmax(output, dim=2)
            output = output.transpose(0, 1) # (time, batch, n_class)
            input_lengths = [x // 4 for x in input_lengths]

            loss = criterion(output, labels, input_lengths, label_lengths)
            test_loss += loss.item() / len(test_loader)

            decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)
            for j in range(len(decoded_preds)):
                test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
                test_wer.append(wer(decoded_targets[j], decoded_preds[j]))

    avg_cer = sum(test_cer)/len(test_cer)
    avg_wer = sum(test_wer)/len(test_wer)

    print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))

In [7]:
def main(learning_rate=1e-5, batch_size=20, test_batch_size=7, epochs=10,
        train_url="train-clean-100", test_url="test-clean", vocab_size=29, transformer="transformer"):
    
    hparams = {
        "input_size": 80,
        "output_size": vocab_size,
        "conv2d_filters": 32,
        "attention_dim": 360,
        "attention_heads": 8,
        "cnn_module_kernel": 31,
        "feedforward_dim": 1024,
        "num_layers":10,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    use_cuda = torch.cuda.is_available()
    #use_cuda = False
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")

    if not os.path.isdir("./data"):
        os.makedirs("./data")

    train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
    test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

    kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
    train_loader = data.DataLoader(dataset=train_dataset,
                                batch_size=hparams['batch_size'],
                                shuffle=True,
                                collate_fn=lambda x: data_processing(x, 'train'),
                                **kwargs)
    test_loader = data.DataLoader(dataset=test_dataset,
                                batch_size=test_batch_size,
                                shuffle=False,
                                collate_fn=lambda x: data_processing(x, 'valid'),
                                **kwargs)
    
    if transformer == "transformer":
        model = TransformerModel(
            hparams['input_size'],
            hparams['output_size'],
            hparams['conv2d_filters'],
            hparams['attention_dim'],
            hparams['attention_heads'],
            hparams['feedforward_dim'],
            hparams['num_layers'],
            hparams['dropout']).to(device)

    elif transformer == "conformer":
        model = ConformerModel(
            hparams['input_size'],
            hparams['output_size'],
            hparams['conv2d_filters'],
            hparams['attention_dim'],
            hparams['attention_heads'],
            hparams['cnn_module_kernel'],
            hparams['feedforward_dim'],
            hparams['num_layers'],
            hparams['dropout']).to(device)
    else:
        raise NotImplementedError

    print(model)
    print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = torch.nn.CTCLoss(blank=vocab_size-1, zero_infinity=False).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')
    
    for epoch in range(1, epochs + 1):
        !date
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
        test(model, device, test_loader, criterion, epoch)
        torch.save(model, "/content/drive/MyDrive/MADE_ex/lab4/s2t_model.pth")


In [None]:
learning_rate = 1e-3
# batch_size = 7
batch_size = 5
# test_batch_size = 7
test_batch_size = 5
epochs = 10
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

main(learning_rate, batch_size, test_batch_size, epochs, libri_train_set, libri_test_set)

HBox(children=(FloatProgress(value=0.0, max=6387309499.0), HTML(value='')))




KeyboardInterrupt: ignored

In [None]:
torch.save(model, "/content/drive/MyDrive/MADE_ex/lab4/default_model.pth")

In [25]:
learning_rate = 1e-3
batch_size = 12
test_batch_size = 12
epochs = 8
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"

vocab_size=4001

train_url="train-clean-100"
test_url="test-clean"

transformer = "transformer"

In [26]:

hparams = {
    "input_size": 80,
    "output_size": vocab_size,
    "conv2d_filters": 32,
    "attention_dim": 360,
    "attention_heads": 8,
    "cnn_module_kernel": 31,
    "feedforward_dim": 1024,
    "num_layers":10,
    "dropout": 0.1,
    "learning_rate": learning_rate,
    "batch_size": batch_size,
    "epochs": epochs
}

use_cuda = torch.cuda.is_available()
#use_cuda = False
torch.manual_seed(7)
device = torch.device("cuda" if use_cuda else "cpu")

if not os.path.isdir("./data"):
    os.makedirs("./data")

train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}
train_loader = data.DataLoader(dataset=train_dataset,
                            batch_size=hparams['batch_size'],
                            shuffle=True,
                            collate_fn=lambda x: data_processing(x, 'train'),
                            **kwargs)
test_loader = data.DataLoader(dataset=test_dataset,
                            batch_size=test_batch_size,
                            shuffle=False,
                            collate_fn=lambda x: data_processing(x, 'valid'),
                            **kwargs)

if transformer == "transformer":
    model = TransformerModel(
        hparams['input_size'],
        hparams['output_size'],
        hparams['conv2d_filters'],
        hparams['attention_dim'],
        hparams['attention_heads'],
        hparams['feedforward_dim'],
        hparams['num_layers'],
        hparams['dropout']).to(device)

elif transformer == "conformer":
    model = ConformerModel(
        hparams['input_size'],
        hparams['output_size'],
        hparams['conv2d_filters'],
        hparams['attention_dim'],
        hparams['attention_heads'],
        hparams['cnn_module_kernel'],
        hparams['feedforward_dim'],
        hparams['num_layers'],
        hparams['dropout']).to(device)
else:
    raise NotImplementedError

print(model)
print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))

optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
criterion = torch.nn.CTCLoss(blank=4000, zero_infinity=False).to(device)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                        steps_per_epoch=int(len(train_loader)),
                                        epochs=hparams['epochs'],
                                        anneal_strategy='linear')

# for epoch in range(1, epochs + 1):
#     !date
#     train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
#     test(model, device, test_loader, criterion, epoch)
#     torch.save(model, "/content/drive/MyDrive/MADE_ex/lab4/s2t_model.pth")

TransformerModel(
  (conv_in): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
  )
  (conv_out): Sequential(
    (0): Linear(in_features=640, out_features=360, bias=True)
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (encoder_layer): MultiSequential(
    (0): EncoderLayer(
      (self_attn): MultiHeadedAttention(
        (linear_q): Linear(in_features=360, out_features=360, bias=True)
        (linear_k): Linear(in_features=360, out_features=360, bias=True)
        (linear_v): Linear(in_features=360, out_features=360, bias=True)
        (linear_out): Linear(in_features=360, out_features=360, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=360, out_features=1024, bias=True)
        (w_2): Linear(in_featur

In [None]:
# for epoch in range(1, epochs + 1):
#     !date
#     train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
#     test(model, device, test_loader, criterion, epoch)
#     torch.save(model, "/content/drive/MyDrive/MADE_ex/lab4/s2t_model.pth")

Fri May  7 18:55:52 UTC 2021


KeyboardInterrupt: ignored

In [None]:
epoch = 1

In [27]:
model.train()
data_len = len(train_loader.dataset)

for batch_idx, _data in enumerate(train_loader):
    spectrograms, labels, input_lengths, label_lengths = _data 
    spectrograms, labels = spectrograms[:, :, :,:max(input_lengths)].to(device), labels.to(device) #(batch, 1, feat_dim, time)
    spectrograms = spectrograms.squeeze(1).transpose(1,2) # (batch, time, feat_dim,)
    optimizer.zero_grad()
    
    output = model(spectrograms, input_lengths)  # (batch, time, n_classes)
    output = F.log_softmax(output, dim=2)
    output = output.transpose(0, 1) # (time, batch, n_class)
    input_lengths = [x // 4 for x in input_lengths]

    loss = criterion(output, labels, input_lengths, label_lengths)
    loss.backward()
    
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    optimizer.step()
    scheduler.step()
    if batch_idx % 100 == 0 or batch_idx == data_len:
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tLR: {:.6f}'.format(
            epoch, batch_idx * len(spectrograms), data_len,
            100. * batch_idx / len(train_loader), loss.item(), scheduler.get_last_lr()[0]))



KeyboardInterrupt: ignored

Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


In [None]:
label_lengths

[27, 22, 63, 55, 43, 52, 58, 34, 43, 65, 40, 63, 44, 53, 50, 38]

In [None]:
input_lengths

[259,
 124,
 378,
 365,
 350,
 356,
 369,
 249,
 305,
 413,
 257,
 398,
 364,
 401,
 396,
 319]

In [29]:
output[0][11]

tensor([ -8.3092, -13.9528, -13.1260,  ..., -10.3674, -12.4683,  -0.1574],
       device='cuda:0', grad_fn=<SelectBackward>)

In [30]:
output.shape

torch.Size([414, 12, 4001])

In [31]:
labels.shape

torch.Size([12, 62])

In [32]:
tmp = labels.cpu().numpy()

In [33]:
text_transform.int_to_text([0])

' ⁇ '

In [34]:
tmp[1]

array([3126.,  398.,   14., 2729.,   93.,   98.,    6., 2715., 1365.,
         59., 1199.,   18.,  390., 1292.,   57.,   25., 1501.,   64.,
        124.,  381., 1645.,   37., 1583.,  688.,  147.,    6., 2232.,
        167.,  228.,   47.,  911., 1351., 3981.,   26.,  225.,  213.,
       3981.,   64.,   57.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.,
          0.,    0.,    0.,    0.,    0.,    0.,    0.,    0.],
      dtype=float32)

In [35]:
decoded_preds, decoded_targets = GreedyDecoder(output.transpose(0, 1), labels, label_lengths)

In [36]:
decoded_preds

['', '', '', '', '', '', '', '', '', '', '', '']

In [37]:
decoded_targets

['the next a gush of gladness would swell her heart at the thought that now she had him at least safer for a while and that he might die and so escape the whole crowd of horrible possibilities',
 'remarked mister baxter at the tea table that evening i came past it today on my way cross lots home from the woods there will be bushels of plums on it',
 'you must have a vast and magnificent estate said candide to the turk i have only twenty acres replied the old man i and my children cultivate them our labour preserves us from three great evils weariness vice and want',
 'i saw a meadow lark on the first of march this day i heard blue birds and robins singing gaily it looked as though spring had come to stay i expected that day to reach dalton only eight miles distant',
 'and its situation is beyond all words of mine to describe i greatly admired the pulpit which is supported by five pillars sunk into the backs of squashed lions but mister copley when i asked him the period said pure brumm

### <b>Задание №1</b> (5 баллов):
На данный момент практически все E2E SOTA решения использую сабворды (subwords/wordpieces) в качестве таргетов нейронки для распознавания. Нам бы тоже не мешало перейти от графем к сабвордам. Теперь вместо букв (графем) будем распознавать кусочки слов. В качестве такого токенайзера предлагается использовать [Sentencepiece](https://github.com/google/sentencepiece). Главное правильно обернуть его в наш класс TextTransform. Текстовый файл (train_clean_100_text_clean.txt) для обучения токенайзера уже подготовлен и лежит в корневой папке проекта. 

In [None]:
lines = []
with open("/content/lab4/train_clean_100_text_clean.txt", "rt") as ft:
  for line in ft.readlines():
      lines.append(line.lower())
with open("/content/lab4/train_clean_100_text_clean_lower.txt", "wt") as ftt:
    for line in lines:
        ftt.write(line)

In [None]:
lines1 = []
with open("/content/lab4/train_clean_100_text_clean_lower.txt", "rt") as ft:
  for line in ft.readlines():
      lines1.append(line)

In [None]:
len(lines) == len(lines1)

True

In [None]:
del lines, lines1

In [None]:
!pip install sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 4.3MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


In [10]:
import sentencepiece as spm

class TextTransformBPE:
    def __init__(self, train_text_path='/content/lab4/train_clean_100_text_clean_lower.txt'):
        """ Обучение BPE модели на 4000 юнитов"""
        spm.SentencePieceTrainer.train(input=train_text_path,
                                       model_prefix='bpe', vocab_size=4000,
                                       model_type="bpe",
                                       normalization_rule_name="nfkc_cf",
                                       )
        self.model = spm.SentencePieceProcessor(model_file='/content/lab4/bpe.model')

    def text_to_int(self, text):
        """ Преобразование входного текста в последовательность сабвордов в формате их индекса в BPE модели """
        int_sequence = self.model.encode(text)
        return int_sequence

    def int_to_text(self, labels):
        """ Преобразование последовательности индексов сабвордов в текст """
        string = self.model.decode(labels)
        return string

In [11]:
train_audio_transforms = torch.nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=400, hop_length=160, n_mels=80),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100)
)

valid_audio_transforms = torchaudio.transforms.MelSpectrogram(sample_rate=16000,
                                                              n_fft=400,
                                                              hop_length=160,
                                                              n_mels=80)

text_transform = TextTransformBPE()

In [None]:
text_transform = TextTransform()

In [12]:
def GreedyDecoder(output, labels, label_lengths, blank_label=4000, collapse_repeated=True):
    # можно бимсерч прикрутить
    arg_maxes = torch.argmax(output, dim=2)
    decodes = []
    targets = []
    for i, args in enumerate(arg_maxes):
        decode = []
        targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].long().tolist()))
        for j, index in enumerate(args):
            if index != blank_label:
                if collapse_repeated and j != 0 and index == args[j -1]:
                    continue
                decode.append(index.item())
        decodes.append(text_transform.int_to_text(decode))
    return decodes, targets

### <b>Задание №2</b> (5 баллов):
Импровизация по улучшению качества распознавания.

In [39]:
from espnet.nets.pytorch_backend.conformer.convolution import ConvolutionModule
from espnet.nets.pytorch_backend.conformer.encoder_layer import EncoderLayer

class ConformerModel(torch.nn.Module):

    def __init__(
        self,
        input_size=80,
        output_size=29,
        conv2d_filters=32,
        attention_dim=360,
        attention_heads=8,
        cnn_module_kernel=31,
        feedforward_dim=1024,
        num_layers=10,
        dropout=0.1,
    ):
        super(ConformerModel, self).__init__()
        
        self.conv_in = torch.nn.Sequential(
            torch.nn.Conv2d(1, conv2d_filters, kernel_size=(3,3), stride=(2,2), padding=(1,1)),
            torch.nn.ReLU(),
            torch.nn.Conv2d(conv2d_filters, conv2d_filters, kernel_size=(3,3), stride=(2,2), padding=(1,1)),
            torch.nn.ReLU(),
        )
        self.conv_out = torch.nn.Sequential(
            torch.nn.Linear(conv2d_filters * ((input_size // 2) // 2), attention_dim),
            PositionalEncoding(attention_dim, 0.1),
        )
        positionwise_layer = PositionwiseFeedForward
        positionwise_layer_args = (attention_dim, feedforward_dim, dropout)
        convolution_layer = ConvolutionModule
        convolution_layer_args = (attention_dim, cnn_module_kernel)

        self.encoder_layer = repeat(
            num_layers,
            lambda lnum: EncoderLayer(
                attention_dim,
                MultiHeadedAttention(
                    attention_heads, attention_dim, dropout
                ),
                positionwise_layer(*positionwise_layer_args),
                None,
                convolution_layer(*convolution_layer_args),
                dropout,
                normalize_before=True,
                concat_after=False,
            ),
        )
        self.after_norm = LayerNorm(attention_dim)
        self.final_layer = torch.nn.Linear(attention_dim, output_size)

    def forward(self, x, ilens):
        x = x.unsqueeze(1)  # (b, c, t, f)
        x = self.conv_in(x)
        b, c, t, f = x.size()
        x = self.conv_out(x.transpose(1, 2).contiguous().view(b, t, c * f))
        masks = (~make_pad_mask(ilens)[:, None, :])[:, :, ::4].to(x.device)
        x, _, _ = self.encoder_layer(x, masks)
        x = self.after_norm(x)
        x = self.final_layer(x)
        return x

In [40]:
text_transform = TextTransform()

learning_rate = 1e-3
batch_size = 12
test_batch_size = 12
epochs = 8
libri_train_set = "train-clean-100"
libri_test_set = "test-clean"
torch.cuda.empty_cache()
main(learning_rate, batch_size, test_batch_size, epochs, libri_train_set, libri_test_set,
     vocab_size=29,
     transformer="conformer"
     )

ConformerModel(
  (conv_in): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
  )
  (conv_out): Sequential(
    (0): Linear(in_features=640, out_features=360, bias=True)
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (encoder_layer): MultiSequential(
    (0): EncoderLayer(
      (self_attn): MultiHeadedAttention(
        (linear_q): Linear(in_features=360, out_features=360, bias=True)
        (linear_k): Linear(in_features=360, out_features=360, bias=True)
        (linear_v): Linear(in_features=360, out_features=360, bias=True)
        (linear_out): Linear(in_features=360, out_features=360, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (feed_forward): PositionwiseFeedForward(
        (w_1): Linear(in_features=360, out_features=1024, bias=True)
        (w_2): Linear(in_features

Traceback (most recent call last):
  File "/usr/lib/python3.7/multiprocessing/queues.py", line 242, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.7/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe


KeyboardInterrupt: ignored