# Task description

## улучшение модели (4 балла)

Вы реализовали бейзлайн, пришло время улучшить качество модели. Т.к. это последнее задание, мы не будем предлагать конкретные шаги, а только дадим несколько советов.

1. Большой источник информации о работе командной строке — её документация, man. Один из способов улучшения модели - использование мана для генерации новых примеров. Структурированный ман можно найти по ссылке https://github.com/IBM/clai/blob/nlc2cmd/docs/manpage-data.md.
2. Ещё один способ улучшить модель, разделить предсказание утилит и флагов. Т.к. задача предсказания утилит более важная, вы можете натренировать модель, которая предсказывает последовательность утилит, а затем к каждой утилите генерировать флаги.
3. Можно аугментировать данные, чтобы увеличить выборку.
4. Можно в качество входа подавать не только текстовый запрос, но и описание из мана. Т.к. всё описание достаточно большое, нужно сделать дополнительную модель, которая будет выбирать команды, для которых нужно вытащить описание.
5. Найти дополнительные данные, улучшающие обучение
6. Как всегда можно просто сделать больше слоёв, увеличить размер скрытого слоя и т.д.

## comments

After hours of trying to randomly make score better, I decided to focus on reproducibility of runs, so we will log everything just in case not to lose any information about past runs.

In [85]:
# standart libraries
import io
import os
import re
import regex
import glob
import random
import shutil
from tqdm import tqdm, trange
from datetime import datetime
from collections import Counter, defaultdict
from functools import partial

# data handling
import numpy as np
import pandas as pd
import wandb

# cmd preprocessing & metric calculation
import sys
sys.path.append("./utils/")
from bashlint.data_tools import bash_parser, pretty_print, cmd2template
from metric.metric_utils import compute_metric

# text preprocessing
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import sentencepiece as spm

# model
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, EncoderDecoderConfig, EncoderDecoderModel
from torch.nn.utils.rnn import pad_sequence

In [2]:
seed = 322

os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
UNK_ID = 3

MAX_TEXT_LENGTH = 256
MAX_CODE_LENGTH = 40

# Data preparation

In [4]:
stopwords = stopwords.words("english")

def clean_text(text):
    text = text.lower()
    tokens = filter(lambda x: x not in stopwords, text.split())
    tokens = map(stemmer.stem, tokens)
    text = " ".join(tokens)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text

def get_invocations_cmds(path):
    with open(path, "r") as file:
        contents = file.read()
        invocations = re.findall(r"[^.]- (.+):\n", contents)
        cmds = re.findall(r"[^.]`(.+)`\n|\0", contents)
    return invocations, cmds


def get_invocations_cmds_from_dir(path):
    paths = glob.glob(os.path.join(path, "*.md"))
    all_invocations, all_cmds = [], []

    for path in paths:
        invocations, cmds = get_invocations_cmds(path)
        all_invocations.extend(invocations)
        all_cmds.extend(cmds)
    
    assert len(all_invocations) == len(all_cmds)
    
    return all_invocations, all_cmds

In [5]:
class TextToBashDataset(Dataset):
    
    def __init__(self, texts, cmds, text_tokenizer, cmd_tokenizer,
                 max_text_length=MAX_TEXT_LENGTH, max_code_length=MAX_CODE_LENGTH):
        
        self.text_tokenizer = text_tokenizer
        self.cmd_tokenizer = cmd_tokenizer
        self.max_text_length = max_text_length
        self.max_code_length = max_code_length
        
        self.items = []
        
        for text, cmd in zip(texts, cmds):
            text_tokenized = text_tokenizer.tokenize(text, add_bos=True, add_eos=True)
            cmd_tokenized = cmd_tokenizer.tokenize(cmd, add_bos=True, add_eos=True)
            if len(text_tokenized) > max_text_length:
                text_tokenized = text_tokenized[:max_text_length-1] + text_tokenized[-1:]
            if len(cmd_tokenized) > max_code_length:
                cmd_tokenized = cmd_tokenized[:max_code_length-1] + cmd_tokenized[-1:]
            self.items.append((text_tokenized, cmd_tokenized))
            
    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

In [6]:
def collate_fn(batch):

    text_idxs = [torch.tensor(item[0]) for item in batch]
    cmd_idxs = [torch.tensor(item[1]) for item in batch]
    
    text_idxs = pad_sequence(text_idxs, padding_value=PAD_ID, batch_first=True)
    cmd_idxs = pad_sequence(cmd_idxs, padding_value=PAD_ID, batch_first=True)
    
    return text_idxs, cmd_idxs 

In [31]:
def greedy_decode(text_tokenized, model, max_len=20):
    
    model.eval()
    model = model.to(DEVICE)
    text_tokenized = text_tokenized.unsqueeze(0).to(DEVICE)
    cmd_prediction = torch.tensor([BOS_ID]).unsqueeze(0).to(DEVICE)
    next_piece = BOS_ID
    
    while next_piece != EOS_ID and cmd_prediction.shape[-1] < max_len:
        #print(cmd_prediction)
        out = model(input_ids=text_tokenized, decoder_input_ids=cmd_prediction)
        next_piece = torch.argmax(out.logits.squeeze(0)[-1]).item()
        cmd_prediction = torch.cat((cmd_prediction, torch.tensor([[next_piece]]).to(DEVICE)), dim=1)
        
    return cmd_prediction[0]

In [32]:
def eval_decode(i, data, ds, model, max_len=20):
    
    for k, v in dict(data.iloc[i]).items():
        print(f"{k}: {v}")
    text_tokenized = torch.tensor(ds[i][0])
    cmd_prediction = greedy_decode(text_tokenized, model, max_len)
    cmd_prediction = [x.item() for x in cmd_prediction]
    print(f"cmd_predicted: {cmd_tokenizer.decode(cmd_prediction)}")

In [33]:
def random_eval(data, ds, model, sample_size=5, max_len=20):
    indexes = random.sample(range(len(ds)), sample_size)
    for i in indexes:
        eval_decode(i, data, ds, model, max_len)
        print()

In [86]:
class EarlyStopping():
    """
    Early stopping to stop the training when the loss does not improve after
    certain epochs.
    """
    def __init__(self, patience=3, min_delta=0.01):
        """
        :param patience: how many epochs to wait before stopping when loss is
               not improving
        :param min_delta: minimum difference between new loss and old loss for
               new loss to be considered as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.best_epoch = None
        self.early_stop = False
        
    def __call__(self, val_loss, step):
        if self.best_loss == None:
            self.best_loss = val_loss
            self.best_epoch = step
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.best_epoch = step
            # reset counter if validation loss improves
            self.counter = 0
        elif self.best_loss - val_loss < self.min_delta:
            self.counter += 1
            print(f"INFO: Early stopping counter {self.counter} of {self.patience}")
            if self.counter >= self.patience:
                print('INFO: Early stopping')
                self.early_stop = True

## loading

### load from disk

### upload to wandb

## train / val split

### load raw from wandb

### split

### upload to wandb

## preprocessing

In [7]:
splits = ["train_split", "val_split", "test", "tldr"]

### text

#### load raw from wandb

#### preprocess

#### upload to wandb

### cmd

#### load w/ preprocessed text from wandb

#### preprocess

#### upload to wandb

## tokenizing

### text

#### load preprocessed from wandb

#### prepare tokenizer

#### upload to wandb

### cmd

#### load preprocessed from wandb

#### prepare tokenizer

#### upload to wandb

# Training model

In [96]:
config = {
    "data/invocation/train/splits": ["train_split_invocation_preprocessed"],
    "data/invocation/val/splits": ["val_split_invocation_preprocessed"],
    "data/invocation/test/splits": ["test_invocation_preprocessed"],
    "data/invocation/stemmer": "porterstemmer",
    
    "data/cmd/train/splits": ["train_split_cmd_preprocessed"],
    "data/cmd/val/splits": ["val_split_cmd_preprocessed"],
    "data/cmd/test/splits": ["test_cmd_preprocessed"],
    "data/cmd/method": "bashlint",
    
    "tokenizer/cmd/vocab_size": 500,
    "tokenizer/invocation/vocab_size": 3800,
    
    "model/hidden_size": 256,
    "model/num_hidden_layers": 2,
    "model/num_attention_heads": 8,
    "model/intermediate_size": 256 * 4,
    "model/hidden_dropout_prob": 0.1,
    
    "training/batch_size": 64,
    "training/lr": 1e-3,
    "training/stopper/patience": 5,
    "training/stopper/min_delta": 0.01,
    "training/max_grad_norm": -1,
    "training/n_epochs": 30,
}

In [8]:
run = wandb.init(project="text2bash", job_type="train_model", config=config)

[34m[1mwandb[0m: Currently logged in as: [33mfuriousteabag[0m (use `wandb login --relogin` to force relogin)


## loading

### data

In [10]:
X = {"train": {}, "val": {}}
y = {"train": {}, "val": {}}

In [11]:
for key in X.keys():
    for split in config[f"data/invocation/{key}/splits"]:
        artifact = run.use_artifact(f"{split}:stemmer={config['data/invocation/stemmer']}", type="preprocessed_data")
        table = artifact.get(split)
        X[key][split] = pd.DataFrame(table.data, columns=table.columns)
    
    X[key]["merged"] = pd.concat(X[key].values())

In [12]:
for key in y.keys():
    for split in config[f"data/cmd/{key}/splits"]:
        artifact = run.use_artifact(f"{split}:method={config['data/cmd/method']}", type="preprocessed_data")
        table = artifact.get(split)
        y[key][split] = pd.DataFrame(table.data, columns=table.columns)
    
    y[key]["merged"] = pd.concat(y[key].values())

### tokenizers

In [28]:
tokenizer_name = "invocation_tokenizer"
artifact = run.use_artifact(f"{tokenizer_name}:splits={str(config['data/invocation/train/splits'])}|vocab_size={config['tokenizer/invocation/vocab_size']}|stemmer={config['data/invocation/stemmer']}",
                            type="tokenizers")
tokenizer_path = artifact.download()
invocation_tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path + "/sp_invocation.model")
assert invocation_tokenizer.vocab_size() == config["tokenizer/invocation/vocab_size"]

In [30]:
tokenizer_name = "cmd_tokenizer"
artifact = run.use_artifact(f"{tokenizer_name}:splits={str(config['data/cmd/train/splits'])}|vocab_size={config['tokenizer/cmd/vocab_size']}|method={config['data/cmd/method']}",
                            type="tokenizers")
tokenizer_path = artifact.download()
cmd_tokenizer = spm.SentencePieceProcessor(model_file=tokenizer_path + "/sp_cmd.model")
assert cmd_tokenizer.vocab_size() == config["tokenizer/cmd/vocab_size"]

### datasets & dataloaders

In [41]:
train_ds = TextToBashDataset(
    texts=X["train"]["merged"]["invocation_preprocessed"],
    cmds=y["train"]["merged"]["cmd_preprocessed"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

valid_ds = TextToBashDataset(
    texts=X["val"]["merged"]["invocation_preprocessed"],
    cmds=y["val"]["merged"]["cmd_preprocessed"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

In [61]:
loaders = {
    'train': DataLoader(train_ds, batch_size=config["training/batch_size"], shuffle=True, collate_fn=collate_fn),
    'val': DataLoader(valid_ds, batch_size=config["training/batch_size"], collate_fn=collate_fn),
}

In [62]:
invocations, cmds = next(iter(loaders["train"]))

i = 15
print(train_ds.text_tokenizer.decode(list(map(int, invocations[i]))))
print(train_ds.cmd_tokenizer.decode(list(map(int, cmds[i]))))

delet file current directori
find Path -mindepth Quantity -delete


In [69]:
invocations, cmds = next(iter(loaders["val"]))

i = 14
print(train_ds.text_tokenizer.decode(list(map(int, invocations[i]))))
print(train_ds.cmd_tokenizer.decode(list(map(int, cmds[i]))))

display long list file current folder access today start day
find Path -daystart -atime Timespan -ls


## model

In [76]:
text_model_config = BertConfig(
    vocab_size = invocation_tokenizer.vocab_size(),
    hidden_size = config["model/hidden_size"],
    num_hidden_layers = config["model/num_hidden_layers"],
    num_attention_heads = config["model/num_attention_heads"],
    intermediate_size = config["model/intermediate_size"],
    hidden_dropout_prob = config["model/hidden_dropout_prob"],
    pad_token_id = PAD_ID,
)

cmd_model_config = BertConfig(
    vocab_size = cmd_tokenizer.vocab_size(),
    hidden_size = config["model/hidden_size"],
    num_hidden_layers = config["model/num_hidden_layers"],
    num_attention_heads = config["model/num_attention_heads"],
    intermediate_size = config["model/intermediate_size"],
    hidden_dropout_prob = config["model/hidden_dropout_prob"],
    pad_token_id = PAD_ID,
    is_decoder = True,
    add_cross_attention = True
)

print(cmd_model_config.is_decoder)
print(cmd_model_config.add_cross_attention)

config = EncoderDecoderConfig.from_encoder_decoder_configs(text_model_config, cmd_model_config)
model = EncoderDecoderModel(config=config)

True
True


In [82]:
text_idxs, cmd_idxs = next(iter(loaders["val"]))
decoder_input = cmd_idxs[..., :-1]
target = cmd_idxs[..., 1:]

print(text_idxs.shape)
print(decoder_input.shape)
print(target.shape)

text_idxs_mask = torch.where(text_idxs != PAD_ID, 1, 0)
decoder_input_mask = torch.where(decoder_input != PAD_ID, 1, 0)

torch.Size([64, 70])
torch.Size([64, 29])
torch.Size([64, 29])


In [84]:
cmd_idxs[..., :-1]

tensor([[ 1,  6,  8,  ...,  0,  0,  0],
        [ 1,  6,  8,  ...,  0,  0,  0],
        [ 1,  6,  8,  ...,  0,  0,  0],
        ...,
        [ 1, 48, 89,  ...,  0,  0,  0],
        [ 1, 23, 16,  ...,  0,  0,  0],
        [ 1,  6,  8,  ...,  0,  0,  0]])

In [83]:
out = model(
    input_ids=text_idxs, decoder_input_ids=decoder_input,
    attention_mask=text_idxs_mask, decoder_attention_mask=decoder_input_mask)

logits = out.logits
print(logits.shape)

torch.Size([64, 29, 500])


## training

In [92]:
DEVICE = torch.device("cuda")

In [102]:
class Trainer:

    def __init__(
            self, 
            model, 
            criterion,
            optimizer, 
            pad_token_id,
            device,
            stopper,
            run=None,
            max_grad_norm=-1
    ):
        """
            model: объект класса BertModel
            optimizer: оптимизатор
            pad_token_id: индекс паддинга. Нужен для создания attention mask
            device: девайс (cpu или cuda), на котором надо производить вычисления
            logdir: директория для записи логов
            max_grad_norm: максимум нормы градиентов, для клиппинга
        """
        self._criterion = criterion
        self._optimizer = optimizer
        self._pad_token_id = pad_token_id
        self._device = device
        self._run = run
        self._stopper = stopper
        self._max_grad_norm = max_grad_norm
        
        self._model = model.to(self._device)

        self._n_epoch = 0
        self._n_iter = 0

    def train(self, dataloaders, n_epochs):
        for epoch in tqdm(range(n_epochs)):
            train_loss = self._train_step(dataloaders["train"])
            val_loss = self._val_step(dataloaders["val"])
            self._n_epoch += 1
            tqdm.write(f"Epoch: {self._n_epoch} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f}")
            
            if self._run is not None:
                self._run.log(data={"loss/train": train_loss, "loss/val": val_loss}, step=self._n_epoch)
                
            self._stopper(val_loss, self._n_epoch)
            torch.save(self._model.state_dict(), f"./checkpoints/{self._n_epoch}_{val_loss}.pt")
            
            if self._stopper.early_stop:
                break
        
        if self._run is not None:
            # they are not the best but the first one
            # to achieve loss score with given stopper
            # threshold
            self._run.summary["best_epoch"] = self._stopper.best_epoch
            self._run.summary["best_val_loss"] = self._stopper.best_loss
            
            # LOG MODEL CHECKPOINT
            
            
        self._model.load_state_dict(torch.load(f"./checkpoints/{self._stopper.best_epoch}_{self._stopper.best_loss}.pt"))

    def _train_step(self, dataloader):
        """
            dataloader: объект класса DataLoader для обучения
        """
        self._model.train()
        epoch_loss = 0
        for text_idxs, cmd_idxs in dataloader:
            
            text_idxs = text_idxs.to(self._device)
            decoder_input = cmd_idxs[..., :-1].to(self._device)
            target = cmd_idxs[..., 1:].to(self._device)
            
            text_idxs_mask = torch.where(text_idxs != self._pad_token_id, 1, 0).to(self._device)
            decoder_input_mask = torch.where(decoder_input != self._pad_token_id, 1, 0).to(self._device)
            
            self._optimizer.zero_grad()        
            
            out = self._model(
                input_ids=text_idxs, decoder_input_ids=decoder_input,
                attention_mask=text_idxs_mask, decoder_attention_mask=decoder_input_mask)

            logits = out.logits

            loss = self._criterion(logits.reshape(-1, cmd_tokenizer.vocab_size()), target.reshape(-1))
            epoch_loss += loss.item()
            
            loss.backward()
            
            if self._max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(self._model.parameters(), self._max_grad_norm)
                
            self._optimizer.step()
        return epoch_loss / len(dataloader)
    
    def _val_step(self, dataloader):
        """
            dataloader: объект класса DataLoader для обучения
        """
        self._model.eval()
        epoch_loss = 0
        for text_idxs, cmd_idxs in dataloader:
            
            text_idxs = text_idxs.to(self._device)
            decoder_input = cmd_idxs[..., :-1].to(self._device)
            target = cmd_idxs[..., 1:].to(self._device)
            
            text_idxs_mask = torch.where(text_idxs != self._pad_token_id, 1, 0).to(self._device)
            decoder_input_mask = torch.where(decoder_input != self._pad_token_id, 1, 0).to(self._device)
            
            with torch.no_grad():
                out = self._model(
                    input_ids=text_idxs, decoder_input_ids=decoder_input,
                    attention_mask=text_idxs_mask, decoder_attention_mask=decoder_input_mask)
                logits = out.logits
                loss = self._criterion(logits.reshape(-1, cmd_tokenizer.vocab_size()), target.reshape(-1))
            
            epoch_loss += loss.item()
            
        return epoch_loss / len(dataloader)

In [103]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
optimizer = torch.optim.Adam(model.parameters(), lr=config["training/lr"])
stopper = EarlyStopping(patience=config["training/stopper/patience"], min_delta=config["training/stopper/min_delta"])

In [104]:
trainer = Trainer(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    pad_token_id=PAD_ID,
    device=DEVICE,
    run=run,
    stopper=stopper,
    max_grad_norm=config["training/max_grad_norm"]
)

In [105]:
! mkdir -p ./checkpoints

In [106]:
trainer.train(loaders, config["training/n_epochs"])

  3%|▎         | 1/30 [00:18<08:55, 18.47s/it]

Epoch: 1 | train_loss: 1.116 | val_loss: 0.962


  7%|▋         | 2/30 [00:36<08:32, 18.32s/it]

Epoch: 2 | train_loss: 0.916 | val_loss: 0.857


 10%|█         | 3/30 [00:55<08:16, 18.40s/it]

Epoch: 3 | train_loss: 0.814 | val_loss: 0.824


 13%|█▎        | 4/30 [01:13<07:59, 18.45s/it]

Epoch: 4 | train_loss: 0.733 | val_loss: 0.786


 17%|█▋        | 5/30 [01:33<07:50, 18.81s/it]

Epoch: 5 | train_loss: 0.679 | val_loss: 0.746


 20%|██        | 6/30 [01:52<07:35, 18.98s/it]

Epoch: 6 | train_loss: 0.622 | val_loss: 0.753
INFO: Early stopping counter 1 of 5


 23%|██▎       | 7/30 [02:11<07:15, 18.95s/it]

Epoch: 7 | train_loss: 0.580 | val_loss: 0.735


 27%|██▋       | 8/30 [02:29<06:54, 18.85s/it]

Epoch: 8 | train_loss: 0.543 | val_loss: 0.738
INFO: Early stopping counter 1 of 5


 30%|███       | 9/30 [02:48<06:33, 18.75s/it]

Epoch: 9 | train_loss: 0.508 | val_loss: 0.746
INFO: Early stopping counter 2 of 5


 33%|███▎      | 10/30 [03:07<06:18, 18.92s/it]

Epoch: 10 | train_loss: 0.474 | val_loss: 0.727
INFO: Early stopping counter 3 of 5


 37%|███▋      | 11/30 [03:26<05:59, 18.91s/it]

Epoch: 11 | train_loss: 0.448 | val_loss: 0.732
INFO: Early stopping counter 4 of 5


 37%|███▋      | 11/30 [03:45<06:29, 20.49s/it]

Epoch: 12 | train_loss: 0.419 | val_loss: 0.731
INFO: Early stopping counter 5 of 5
INFO: Early stopping





Переучивать не будем:

![](./images/training.png)

Видим, что лучший лосс был на 8-й эпохе.
Подгрузим эти веса (при переобучении модель автоматически в конце подгружает лучшие веса):

In [107]:
run.config.update(config)

In [40]:
model.load_state_dict(torch.load("./checkpoints/0.9302761256694794.pt"))
trainer._model = model
print(f"val_loss: {round(trainer._val_step(loaders['valid']), 3)}")

val_loss: 0.93


## Генерация команд (2 балла)

**Задание**. Реализуйте алгоритм beam-search в классе BeamSearchGenerator ниже. Ваша реализация должна поддерживать задание температуры софтмакса. Выходы модели, полученные на предыдущих итерациях, необходимо кэшировать для повышения скорости алгоритма. Вместо подсчёта произведения любых вероятностей необходимо считать сумму их логарифмов.

Алгоритм должен возвращать список пар из получившихся выходных последовательностей и логарифмов их вероятностей. 

In [33]:
class BeamSearchGenerator:
    def __init__(
            self, pad_id, eos_id, bos_id,
            max_length=20, beam_width=5, temperature=1.5,
            device=torch.device('cuda'),
    ):
        """
        Parameters
        ----------
        pad_id : int
        eos_id : int
        bos_id : int
        max_length : int
            Maximum length of output sequence
        beam_width : int
            Width of the beam
        temperature : float
            Softmax temperature
        device : torch.device
            Your model device
        """
        self.pad_id = pad_id
        self.eos_id = eos_id
        self.bos_id = bos_id
        
        self.max_length = max_length
        self.beam_width = beam_width
        self.temperature = temperature
        
        self.device = device
        
    def get_result(self, model, input_text_tokens):
        """
        Parameters
        ----------
        model : TextToBashModel
        input_text_tokens : torch.tensor
            One object input tensor
        """
        
        chains = torch.full([self.beam_width, 1], self.bos_id).to(self.device)
        chain_probabilities = torch.zeros(self.beam_width).to(self.device)
        if_chain_ready = torch.full([self.beam_width], False).to(self.device)
        
        # saving encoder outputs
        input_text_tokens = input_text_tokens.repeat([self.beam_width, 1]).to(device)
        encoder_outputs = model._prepare_encoder_decoder_kwargs_for_generation(
            input_ids=input_text_tokens,
            model_kwargs={})["encoder_outputs"]
        
        idx = 0
        while idx <= self.max_length and not all(if_chain_ready):
            
            logits = model(encoder_outputs=encoder_outputs, decoder_input_ids=chains).logits[:, -1, :]
            
            probs = torch.log_softmax(logits / self.temperature, dim=-1)
            probs, tokens = torch.topk(probs, k=self.beam_width)
            
            if idx == 0:
                # pick all top tokens
                # 0 because current chains are same
                beam_tokens = tokens[0].view(self.beam_width, 1)
                chains = torch.cat([chains, beam_tokens], dim=-1)
                chain_probabilities += probs[0]
                idx += 1
                continue

            chain_probabilities_beam = chain_probabilities.reshape(-1, 1).repeat([1, self.beam_width])
            ready_chains_idxs = if_chain_ready.nonzero()
            
            # if chain is already ready,
            # keep one instance of it and add
            # padding
            probs[ready_chains_idxs, :] = -float("inf")
            probs[ready_chains_idxs, 0] = 0
            tokens[ready_chains_idxs, :] = self.pad_id

            chain_probabilities_beam = chain_probabilities_beam + probs
            
            # choosing best across all options
            best_sequences = torch.argsort(chain_probabilities_beam.flatten())[-self.beam_width:]

            chains = torch.cat([chains[best_sequences // self.beam_width, :], tokens.flatten()[best_sequences].view(-1, 1)], dim=-1)
            chain_probabilities = chain_probabilities_beam.flatten()[best_sequences]

            if_chain_ready = ((chains[:, -1] == self.eos_id) | (chains[:, -1] == self.pad_id))
                
            idx += 1
        
        return list(zip(chains, chain_probabilities))        

Протестируйте на нескольких примерах работу вашего алгоритма. Если всё реализовано правильно, то как минимум на трёх примерах из 5 всё должно работать правильно.

In [34]:
beam_search_engine = BeamSearchGenerator(
    pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
    max_length=MAX_CODE_LENGTH, beam_width=5,
    temperature=1, device=DEVICE
)

In [43]:
all_scores = []
with torch.no_grad():
    for i in range(5):
        print()
        print('text:', valid_data.invocation.iloc[i])
        print('text cleaned:', valid_data.text_cleaned.iloc[i])
        print('true:', valid_data.cmd.iloc[i])
        print('true cleaned:', valid_data.cmd_cleaned.iloc[i])

        src = torch.tensor(valid_ds[i][0])
        pred = beam_search_engine.get_result(model, src)
        
        #print('greedy decode:', cmd_tokenizer.decode(list(map(int, greedy_decode(src, model)))))
        scores = []
        for x, proba in pred[:5]:
            pred_cmd = cmd_tokenizer.decode(list(map(int, x)))
            score = compute_metric(pred_cmd, 1, valid_data.cmd.iloc[i])
            scores.append(score)
            print(pred_cmd, proba)
        print(max(scores))
        all_scores.append(max(scores))
        
print(f"average score: {np.mean(all_scores):.3f}")


text: searches through the root filesystem ("/") for the file named chapter1, and prints the location
text cleaned: search root filesystem  file name chapter print locat
true: find / -name Chapter1 -type f -print
true cleaned: find Path -name Regex -type f -print
find Path -name Regex -type f -printf "%f\n" tensor(-4.4631, device='cuda:0')
find Path -name Regex -or -name Regex tensor(-3.2608, device='cuda:0')
find Path -name Regex -type f -print tensor(-2.9428, device='cuda:0')
find Path -name Regex -type f tensor(-1.5682, device='cuda:0')
find Path -name Regex tensor(-0.7397, device='cuda:0')
1.0

text: searches through the root filesystem ("/") for the file named chapter1.
text cleaned: search root filesystem  file name chapter
true: find / -name Chapter1 -type f
true cleaned: find Path -name Regex -type f
find Path -name Regex -or -name Regex -type f tensor(-4.5106, device='cuda:0')
find Path -name Regex -or -name Regex tensor(-2.8590, device='cuda:0')
find Path -name Regex -type f

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


find Path -name Regex -or -name Regex -type f tensor(-4.5106, device='cuda:0')
find Path -name Regex -or -name Regex tensor(-2.8590, device='cuda:0')
find Path -name Regex -type f -print tensor(-2.0952, device='cuda:0')
find Path -name Regex -type f tensor(-1.5648, device='cuda:0')
find Path -name Regex tensor(-0.8799, device='cuda:0')
1.0

text: searching for all files with the extension mp3
text cleaned: search file extens mp
true: find / -name *.mp3
true cleaned: find Path -name Regex
find Path -type f -iname Regex -print tensor(-4.9602, device='cuda:0')
find Path -type f -iname Regex tensor(-2.0453, device='cuda:0')
find Path -iname Regex tensor(-1.9404, device='cuda:0')
find Path -name Regex tensor(-1.7023, device='cuda:0')
find Path -type f -name Regex tensor(-1.6675, device='cuda:0')
1.0

text: set myvariable to the value of variable_name
text cleaned: set myvari valu variablenam
true: myVariable=$(env  | grep VARIABLE_NAME | grep -oe '[^=]*$');
true cleaned: env | grep Regex | 

**Задание**. Дополните функцию для подсчёта качества. Посчитайте качество вашей модели на валидационном и тестовых датасетов.

In [35]:
def compute_all_scores(model, df, beam_engine):
    all_scores = []

    for i, (text, target_cmd) in tqdm(enumerate(zip(df.text_cleaned.values, df.cmd.values)), total=len(df)):
        
        input_tokens = text_tokenizer.tokenize(text, add_bos=True, add_eos=True)
        if len(input_tokens) > MAX_TEXT_LENGTH:
            input_tokens = input_tokens[:MAX_TEXT_LENGTH-1] + input_tokens[-1:]
        input_tokens = torch.tensor(input_tokens)
        
        predictions = beam_engine.get_result(model, input_tokens)
        
        # get only 5 top results
        predictions = predictions[:5]
        object_scores = []
        for output_tokens, proba in predictions:
            output_cmd = cmd_tokenizer.decode(list(map(int, output_tokens)))
            score = compute_metric(output_cmd, 1, target_cmd)
            object_scores.append(score)
        
        all_scores.append(max(object_scores))
    return all_scores

Если вы всё реализовали правильно, подобрали параметры BeamSearch то ваш средний скор на валидации должен быть >= 0.25, а скор на `handcrafted` части теста >= 0.13. На `mined` части датасета скор может быть низкий, т.к. некоторых команд из датасета нет в обучении.

In [45]:
def find_best_temperature(model, valid_data):
    scores = []
    temperatures = np.arange(0.5, 2.1, 0.1)
    for temperature in temperatures:
        beam_search_engine = BeamSearchGenerator(
            pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
            max_length=MAX_CODE_LENGTH, beam_width=5,
            temperature=temperature, device=DEVICE
        )
        scores.append(np.mean(compute_all_scores(model, valid_data, beam_search_engine)))
    return temperatures[np.argmax(scores)]

![](./images/searching_temperature.png)

In [46]:
beam_search_engine = BeamSearchGenerator(
    pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
    max_length=MAX_CODE_LENGTH, beam_width=5,
    temperature=1.5, device=DEVICE
)

In [47]:
val_scores = compute_all_scores(model, valid_data, beam_search_engine)
print(f"average score on validation: {np.mean(val_scores):.3f}")

100%|██████████| 100/100 [00:09<00:00, 10.72it/s]

average score on validation: 0.269





In [48]:
test_scores = compute_all_scores(model, test_data[test_data["origin"] == "handcrafted"], beam_search_engine)
print(f"average score on handcrafted: {np.mean(test_scores):.3f}")

100%|██████████| 129/129 [00:11<00:00, 11.24it/s]


average score on handcrafted: 0.149


In [49]:
mined_scores = compute_all_scores(model, test_data[test_data["origin"] == "mined"], beam_search_engine)
print(f"average score on mined part: {np.mean(mined_scores):.3f}")

100%|██████████| 592/592 [00:48<00:00, 12.11it/s]

average score on mined part: -0.279





От вас ожидается скор на `mined` >= 0 при скоре на `handrafted` >= 0.16.

### Добавим данных

Долго пользуюсь утилиткой [tldr](https://github.com/tldr-pages/tldr), вывод выглядит примерно так:
![](./images/tldr.png)

Скачаем и распарсим из [репозитория](https://github.com/tldr-pages/tldr/tree/main/pages) доки к `common` и `linux`.

In [None]:
additional_train_data['cmd_cleaned'] = additional_train_data['cmd'].apply(partial(cmd2template, loose_constraints=True))

print(f"number of empty CMDs: {len(additional_train_data[additional_train_data['cmd_cleaned'] == ''])} / {len(additional_train_data)}")
display(additional_train_data)

Почему-то после очистки `cmd` у нас почти все строчки пустые:
![](./images/empty_cmds.png)

Около 2-х часов пытался менять захардкоженные команды в исходнике функции очищения, но ничего не вышло.
Попробуем вообще не чистить `cmd`

Перебирал разные параметры, было на порядок хуже, чем раньше.

Нужен рефактор препроцессинга, чтобы он не выдавал пустые строки на незнакомые команды.

In [47]:
import json
lines = []
with open("./data/manpage-data.json", "r") as f:
    for line in f:
        lines.append(json.loads(line))

In [48]:
len(lines)

36669

In [53]:
from sklearn.metrics import f1_score

In [54]:
f1_score?

In [52]:
lines[-1]

{'_id': {'$oid': '5ef4f7386e9e960500981ce6'},
 'source': 'zzxordir.1.gz',
 'name': 'zzxordir',
 'synopsis': 'small tools using zziplib',
 'paragraphs': [{'idx': 0,
   'text': '       zzcat, zzdir, zzxorcat, zzxordir zzxorcopy - small tools using zziplib',
   'section': 'NAME',
   'is_option': False},
  {'idx': 1,
   'text': '       <b>zzcat</b>  <b>FILE</b>  <b>[...]</b>   <b>zzdir</b>  <b>DIR</b>  <b>[...]</b>   <b>zzxorcat</b> <b>[-HEX]</b> <b>FILE</b> <b>[...]</b>  <b>zzxordir</b> <b>[-HEX]</b> <b>DIR</b> <b>[...]</b>  <b>zzxorcopy</b>\n       <b>[-HEX]</b> <b>FILE</b> <b>OUTFILE</b>',
   'section': 'SYNOPSIS',
   'is_option': False},
  {'idx': 2,
   'text': '       <u>zzcat</u> prints the given files to stdout, so you may want to redirect the output. The FILE can be a  normal\n       file or an inflated part of a zip archive (see OPTIONS).',
   'section': 'DESCRIPTION',
   'is_option': False},
  {'idx': 3,
   'text': '       <u>zzdir</u> prints the content table to stdout of the gi