# Task 

## What I did

- added data from `man` pages:
    - 1st: pretraining only on `man`
    - 2nd: training only on provided data
- added scheduler;
- changed optimizer;
- added l2 regularization;
- made model bigger;

## What I achieved

- `0.453` on val part;
- `0.227` on handcrafted part;
- `-0.234` on mined part;

## How to use this notebook

- this notebook contains last run logs
- it will be easier to navigate w/ [table of contents jupyter extension](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/nbextensions/toc2/README.html)
- everything I did I put in [wandb logs](https://wandb.ai/furiousteabag/text2bash_final)
- **IMPORTANT**: I created new workspace where I cleaned everything out, so here is brief description of what it contains:
    - **hyperparams** panel: here I showed only necessary hyperparams agains last three most important ones: `val_score`, `handcrafted_score`, `mined_score`, so please take a look at it;
    - **base** panel: losses an lr's of 2nd part of training (training only on provided data);
    - **man** panel: losses an lr's of 1st part of training (pretraining only on `man`);
    - [artifacts](https://wandb.ai/furiousteabag/text2bash_final/artifacts/models/base_model/bd131a116fe133b574c5): I logged best model from each run so it is easy to reproduce any provided results;
    - if you click on any run (e.g. [v3](https://wandb.ai/furiousteabag/text2bash_final/runs/3qq32bq6)) you will see all parameters and gradients during training (so it is easy to catch exploding gradients)

## Conclusions

- model size matters much
- `man` data matters a bit
- `handcrafted` score does not strongly correlates w/ `mined` (`v1` run vs `v3` run)

In [1]:
# standart libraries
import io
import os
import re
import regex
import glob
import random
import shutil
import json
from tqdm.notebook import tqdm
from datetime import datetime
from collections import Counter, defaultdict
from functools import partial

# data handling
import numpy as np
import pandas as pd
import wandb

# cmd preprocessing & metric calculation
import sys
sys.path.append("./utils/")
from bashlint.data_tools import bash_parser, pretty_print, cmd2template
from metric.metric_utils import compute_metric

# text preprocessing
from sklearn.model_selection import train_test_split
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import sentencepiece as spm

# model
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertConfig, BertModel, EncoderDecoderConfig, EncoderDecoderModel
from torch.nn.utils.rnn import pad_sequence

Setting bashlex grammar using file: /home/furiousteabag/Projects/NLP/04_generation/./utils/bashlint/grammar/grammar100.txt
Bashlint grammar set up (148 utilities)



In [2]:
seed = 322

os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
PAD_ID = 0
BOS_ID = 1
EOS_ID = 2
UNK_ID = 3

MAX_TEXT_LENGTH = 256
MAX_CODE_LENGTH = 40

# Utils

In [4]:
stopwords = stopwords.words("english")

def clean_text(text):
    text = text.lower()
    tokens = filter(lambda x: x not in stopwords, text.split())
    tokens = map(stemmer.stem, tokens)
    text = " ".join(tokens)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    return text

def get_invocations_cmds(path):
    with open(path, "r") as file:
        contents = file.read()
        invocations = re.findall(r"[^.]- (.+):\n", contents)
        cmds = re.findall(r"[^.]`(.+)`\n|\0", contents)
    return invocations, cmds


def get_invocations_cmds_from_dir(path):
    paths = glob.glob(os.path.join(path, "*.md"))
    all_invocations, all_cmds = [], []

    for path in paths:
        invocations, cmds = get_invocations_cmds(path)
        all_invocations.extend(invocations)
        all_cmds.extend(cmds)
    
    assert len(all_invocations) == len(all_cmds)
    
    return all_invocations, all_cmds

In [5]:
class TextToBashDataset(Dataset):
    
    def __init__(self, texts, cmds, text_tokenizer, cmd_tokenizer,
                 max_text_length=MAX_TEXT_LENGTH, max_code_length=MAX_CODE_LENGTH):
        
        self.text_tokenizer = text_tokenizer
        self.cmd_tokenizer = cmd_tokenizer
        self.max_text_length = max_text_length
        self.max_code_length = max_code_length
        
        self.items = []
        
        for text, cmd in zip(texts, cmds):
            text_tokenized = text_tokenizer.tokenize(text, add_bos=True, add_eos=True)
            cmd_tokenized = cmd_tokenizer.tokenize(cmd, add_bos=True, add_eos=True)
            if len(text_tokenized) > max_text_length:
                text_tokenized = text_tokenized[:max_text_length-1] + text_tokenized[-1:]
            if len(cmd_tokenized) > max_code_length:
                cmd_tokenized = cmd_tokenized[:max_code_length-1] + cmd_tokenized[-1:]
            self.items.append((text_tokenized, cmd_tokenized))
            
    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

In [6]:
def collate_fn(batch):

    text_idxs = [torch.tensor(item[0]) for item in batch]
    cmd_idxs = [torch.tensor(item[1]) for item in batch]
    
    text_idxs = pad_sequence(text_idxs, padding_value=PAD_ID, batch_first=True)
    cmd_idxs = pad_sequence(cmd_idxs, padding_value=PAD_ID, batch_first=True)
    
    return text_idxs, cmd_idxs 

In [7]:
def greedy_decode(text_tokenized, model, max_len=20):
    
    model.eval()
    model = model.to(DEVICE)
    text_tokenized = text_tokenized.unsqueeze(0).to(DEVICE)
    cmd_prediction = torch.tensor([BOS_ID]).unsqueeze(0).to(DEVICE)
    next_piece = BOS_ID
    
    while next_piece != EOS_ID and cmd_prediction.shape[-1] < max_len:
        #print(cmd_prediction)
        out = model(input_ids=text_tokenized, decoder_input_ids=cmd_prediction)
        next_piece = torch.argmax(out.logits.squeeze(0)[-1]).item()
        cmd_prediction = torch.cat((cmd_prediction, torch.tensor([[next_piece]]).to(DEVICE)), dim=1)
        
    return cmd_prediction[0]

In [8]:
def eval_decode(i, data, ds, model, max_len=20):
    
    for k, v in dict(data.iloc[i]).items():
        print(f"{k}: {v}")
    text_tokenized = torch.tensor(ds[i][0])
    cmd_prediction = greedy_decode(text_tokenized, model, max_len)
    cmd_prediction = [x.item() for x in cmd_prediction]
    print(f"cmd_predicted: {cmd_tokenizer.decode(cmd_prediction)}")

In [9]:
def random_eval(data, ds, model, sample_size=5, max_len=20):
    indexes = random.sample(range(len(ds)), sample_size)
    for i in indexes:
        eval_decode(i, data, ds, model, max_len)
        print()

In [10]:
class EarlyStopping():
    """
    Early stopping to stop the training when the loss does not improve after
    certain epochs.
    """
    def __init__(self, patience=3, min_delta=0.01):
        """
        :param patience: how many epochs to wait before stopping when loss is
               not improving
        :param min_delta: minimum difference between new loss and old loss for
               new loss to be considered as an improvement
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.best_epoch = None
        self.early_stop = False
        
    def __call__(self, val_loss, step):
        if self.best_loss == None:
            self.best_loss = val_loss
            self.best_epoch = step
        elif self.best_loss - val_loss > self.min_delta:
            self.best_loss = val_loss
            self.best_epoch = step
            # reset counter if validation loss improves
            self.counter = 0
        elif self.best_loss - val_loss < self.min_delta:
            self.counter += 1
            print(f"INFO: Early stopping counter {self.counter} of {self.patience}")
            if self.counter >= self.patience:
                print('INFO: Early stopping')
                self.early_stop = True

In [11]:
class Trainer:

    def __init__(
            self, 
            model, 
            criterion,
            optimizer, 
            pad_token_id,
            device,
            stopper,
            watch=True,
            prefix="",
            run=None,
            scheduler=None,
            max_grad_norm=-1
    ):
        """
            model: объект класса BertModel
            optimizer: оптимизатор
            pad_token_id: индекс паддинга. Нужен для создания attention mask
            device: девайс (cpu или cuda), на котором надо производить вычисления
            logdir: директория для записи логов
            max_grad_norm: максимум нормы градиентов, для клиппинга
        """
        self._criterion = criterion
        self._optimizer = optimizer
        self._pad_token_id = pad_token_id
        self._device = device
        self._run = run
        self._stopper = stopper
        self._max_grad_norm = max_grad_norm
        self._scheduler = scheduler
        self._prefix = prefix
        
        self._model = model.to(self._device)
        
        if self._run is not None:
            if watch:
                wandb.watch(self._model, criterion=self._criterion, log="all", log_freq=1000, log_graph=True)

        self._n_epoch = 0
        self._n_iter = 0

    def train(self, dataloaders, n_epochs):
        for epoch in tqdm(range(n_epochs)):
            train_loss = self._train_step(dataloaders["train"])
            val_loss = self._val_step(dataloaders["val"])
            self._n_epoch += 1
            tqdm.write(f"Epoch: {self._n_epoch} | train_loss: {train_loss:.3f} | val_loss: {val_loss:.3f}")
            
            if self._run is not None:
                #self._run.log(data={"loss/train": train_loss, "loss/val": val_loss}, step=self._n_epoch)
                wandb.log(data={f"{self._prefix}loss/train": train_loss, f"{self._prefix}loss/val": val_loss, f"{self._prefix}epoch": self._n_epoch})
                wandb.log(data={f"{self._prefix}lr": self._optimizer.param_groups[0]["lr"], f"{self._prefix}epoch": self._n_epoch})
                
            self._stopper(val_loss, self._n_epoch)
            torch.save(self._model.state_dict(), f"./checkpoints/{self._prefix}{self._n_epoch}_{val_loss}.pt")
            
            if self._stopper.early_stop:
                break
        
        if self._run is not None:
            # they are not the best but the first one
            # to achieve loss score with given stopper
            # threshold
            #self._run.summary["best_epoch"] = self._stopper.best_epoch
            #self._run.summary["best_val_loss"] = self._stopper.best_loss
            wandb.summary[f"{self._prefix}best_epoch"] = self._stopper.best_epoch
            wandb.summary[f"{self._prefix}best_val_loss"] = self._stopper.best_loss
            
        best_model_path = f"./checkpoints/{self._prefix}{self._stopper.best_epoch}_{self._stopper.best_loss}.pt"
        
        artifact = wandb.Artifact(f"{self._prefix[:-1]}_model", type="models", metadata=config)
        artifact.add_file(best_model_path)
        #run.log_artifact(artifact)
        wandb.log_artifact(artifact)
            
        self._model.load_state_dict(torch.load(best_model_path))

    def _train_step(self, dataloader):
        """
            dataloader: объект класса DataLoader для обучения
        """
        self._model.train()
        epoch_loss = 0
        for text_idxs, cmd_idxs in dataloader:
            
            text_idxs = text_idxs.to(self._device)
            decoder_input = cmd_idxs[..., :-1].to(self._device)
            target = cmd_idxs[..., 1:].to(self._device)
            
            text_idxs_mask = torch.where(text_idxs != self._pad_token_id, 1, 0).to(self._device)
            decoder_input_mask = torch.where(decoder_input != self._pad_token_id, 1, 0).to(self._device)
            
            self._optimizer.zero_grad()        
            
            out = self._model(
                input_ids=text_idxs, decoder_input_ids=decoder_input,
                attention_mask=text_idxs_mask, decoder_attention_mask=decoder_input_mask)

            logits = out.logits

            loss = self._criterion(logits.reshape(-1, cmd_tokenizer.vocab_size()), target.reshape(-1))
            epoch_loss += loss.item()
            
            loss.backward()
            
            if self._max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(self._model.parameters(), self._max_grad_norm)
                
            self._optimizer.step()
            
            if self._scheduler:
                self._scheduler.step()
            
        return epoch_loss / len(dataloader)
    
    def _val_step(self, dataloader):
        """
            dataloader: объект класса DataLoader для обучения
        """
        self._model.eval()
        epoch_loss = 0
        for text_idxs, cmd_idxs in dataloader:
            
            text_idxs = text_idxs.to(self._device)
            decoder_input = cmd_idxs[..., :-1].to(self._device)
            target = cmd_idxs[..., 1:].to(self._device)
            
            text_idxs_mask = torch.where(text_idxs != self._pad_token_id, 1, 0).to(self._device)
            decoder_input_mask = torch.where(decoder_input != self._pad_token_id, 1, 0).to(self._device)
            
            with torch.no_grad():
                out = self._model(
                    input_ids=text_idxs, decoder_input_ids=decoder_input,
                    attention_mask=text_idxs_mask, decoder_attention_mask=decoder_input_mask)
                logits = out.logits
                loss = self._criterion(logits.reshape(-1, cmd_tokenizer.vocab_size()), target.reshape(-1))
            
            epoch_loss += loss.item()
            
        return epoch_loss / len(dataloader)

In [12]:
class BeamSearchGenerator:
    def __init__(
            self, pad_id, eos_id, bos_id,
            max_length=20, beam_width=5, temperature=1.5,
            device=torch.device('cuda'),
    ):
        """
        Parameters
        ----------
        pad_id : int
        eos_id : int
        bos_id : int
        max_length : int
            Maximum length of output sequence
        beam_width : int
            Width of the beam
        temperature : float
            Softmax temperature
        device : torch.device
            Your model device
        """
        self.pad_id = pad_id
        self.eos_id = eos_id
        self.bos_id = bos_id
        
        self.max_length = max_length
        self.beam_width = beam_width
        self.temperature = temperature
        
        self.device = device
        
    def get_result(self, model, input_text_tokens):
        """
        Parameters
        ----------
        model : TextToBashModel
        input_text_tokens : torch.tensor
            One object input tensor
        """
        
        chains = torch.full([self.beam_width, 1], self.bos_id).to(self.device)
        chain_probabilities = torch.zeros(self.beam_width).to(self.device)
        if_chain_ready = torch.full([self.beam_width], False).to(self.device)
        
        # saving encoder outputs
        input_text_tokens = input_text_tokens.repeat([self.beam_width, 1]).to(self.device)
        encoder_outputs = model._prepare_encoder_decoder_kwargs_for_generation(
            input_ids=input_text_tokens,
            model_kwargs={})["encoder_outputs"]
        
        idx = 0
        while idx <= self.max_length and not all(if_chain_ready):
            
            logits = model(encoder_outputs=encoder_outputs, decoder_input_ids=chains).logits[:, -1, :]
            
            probs = torch.log_softmax(logits / self.temperature, dim=-1)
            probs, tokens = torch.topk(probs, k=self.beam_width)
            
            if idx == 0:
                # pick all top tokens
                # 0 because current chains are same
                beam_tokens = tokens[0].view(self.beam_width, 1)
                chains = torch.cat([chains, beam_tokens], dim=-1)
                chain_probabilities += probs[0]
                idx += 1
                continue

            chain_probabilities_beam = chain_probabilities.reshape(-1, 1).repeat([1, self.beam_width])
            ready_chains_idxs = if_chain_ready.nonzero()
            
            # if chain is already ready,
            # keep one instance of it and add
            # padding
            probs[ready_chains_idxs, :] = -float("inf")
            probs[ready_chains_idxs, 0] = 0
            tokens[ready_chains_idxs, :] = self.pad_id

            chain_probabilities_beam = chain_probabilities_beam + probs
            
            # choosing best across all options
            best_sequences = torch.argsort(chain_probabilities_beam.flatten())[-self.beam_width:]

            chains = torch.cat([chains[best_sequences // self.beam_width, :], tokens.flatten()[best_sequences].view(-1, 1)], dim=-1)
            chain_probabilities = chain_probabilities_beam.flatten()[best_sequences]

            if_chain_ready = ((chains[:, -1] == self.eos_id) | (chains[:, -1] == self.pad_id))
                
            idx += 1
        
        return list(zip(chains, chain_probabilities))        

In [13]:
def show_examples(model, beam_search_engine, data, n=5):
    all_scores = []
    model.eval()
    with torch.no_grad():
        for i in range(n):
            print('invocation:', data["invocation"].iloc[i])
            print('invocation preprocessed:', data["invocation_preprocessed"].iloc[i])
            print('cmd:', data["cmd"].iloc[i])
            if "cmd_preprocessed" in data.columns:
                print('cmd preprocessed:', data["cmd_preprocessed"].iloc[i])

            text_tokenized = invocation_tokenizer.tokenize(data["invocation_preprocessed"].iloc[i], add_bos=True, add_eos=True)
            src = torch.tensor(text_tokenized)
            pred = beam_search_engine.get_result(model, src)

            #print('greedy decode:', cmd_tokenizer.decode(list(map(int, greedy_decode(src, model)))))
            scores = []
            for x, proba in pred[:5]:
                pred_cmd = cmd_tokenizer.decode(list(map(int, x)))
                score = compute_metric(pred_cmd, 1, data["cmd"].iloc[i])
                scores.append(score)
                print(pred_cmd, round(proba.item(), 2))
            print(max(scores))
            print()
            all_scores.append(max(scores))

    print(f"average score: {np.mean(all_scores):.3f}")

In [14]:
def get_optimizer(model, weight_decay=0.01, init_lr=1e-3, betas=(0.9, 0.999)):
    """
        model: инициализированная модель
        weight_decay: коэффициент l2 регуляризации
        
        returns: оптимизатор
    """
    decayed_parameters, not_decayed_parameters = [], []
    
    for name, params in model.named_parameters():
        if any(layer_off in name for layer_off in ["bias", "layer_norm"]):
            not_decayed_parameters.append(params)
        else:
            decayed_parameters.append(params)
            
    grouped_parameters = [
        {'params': decayed_parameters, 'weight_decay': weight_decay},
        {'params': not_decayed_parameters, 'weight_decay': 0.}
    ]

    return torch.optim.AdamW(grouped_parameters, lr=init_lr, betas=betas)

In [15]:
def compute_all_scores(model, df, beam_engine):
    all_scores = []

    for i, (text, target_cmd) in tqdm(enumerate(zip(df.invocation_preprocessed.values, df.cmd.values)), total=len(df)):
        
        input_tokens = invocation_tokenizer.tokenize(text, add_bos=True, add_eos=True)
        if len(input_tokens) > MAX_TEXT_LENGTH:
            input_tokens = input_tokens[:MAX_TEXT_LENGTH-1] + input_tokens[-1:]
        input_tokens = torch.tensor(input_tokens)
        
        predictions = beam_engine.get_result(model, input_tokens)
        
        # get only 5 top results
        predictions = predictions[:5]
        object_scores = []
        for output_tokens, proba in predictions:
            output_cmd = cmd_tokenizer.decode(list(map(int, output_tokens)))
            score = compute_metric(output_cmd, 1, target_cmd)
            object_scores.append(score)
        
        all_scores.append(max(object_scores))
    return all_scores

In [16]:
def find_best_temperature(model, valid_data):
    scores = []
    temperatures = np.arange(0.5, 2.1, 0.1)
    for temperature in temperatures:
        beam_search_engine = BeamSearchGenerator(
            pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
            max_length=MAX_CODE_LENGTH, beam_width=5,
            temperature=temperature, device=DEVICE
        )
        scores.append(np.mean(compute_all_scores(model, valid_data, beam_search_engine)))
    return temperatures[np.argmax(scores)]

# Data preparation

## loading

### baseline

In [17]:
data = {
    "train": pd.read_csv('data/train_data.csv'),
    "test": pd.read_csv('data/test_data.csv')
}

### tldr

In [18]:
invocations, cmds = get_invocations_cmds_from_dir("./tdlr_data/linux/")
invocations_common, cmds_common = get_invocations_cmds_from_dir("./tdlr_data/common/")
invocations.extend(invocations_common)
cmds.extend(cmds_common)

data["tldr"] = pd.DataFrame.from_records({"invocation": invocations, "cmd": cmds})

### man

In [19]:
man = []
with open("./data/manpage-data.json", "r") as f:
    for line in f:
        command = json.loads(line)
        man.append((command["synopsis"], command["name"]))
        
data["man"] = pd.DataFrame(man, columns=["invocation", "cmd"])
data["man"] = data["man"].dropna()

## preprocessing

### text

In [20]:
stemmer = PorterStemmer()
#stemmer = SnowballStemmer(language="english")

for split, df in data.items():
    data[split]["invocation_preprocessed"] = df["invocation"].apply(clean_text)

### cmd

In [21]:
for split in ["train"]:
    data[split]["cmd_preprocessed"] = data[split]["cmd"].apply(partial(cmd2template, loose_constraints=True))
    #data[split] = data[split][data[split]["cmd_cleaned"].str.strip().astype(bool)] # removing empty rows

## train / val split

In [22]:
data["train_train"], data["train_val"] = train_test_split(data["train"], test_size=0.05, random_state=seed)
data["man_train"], data["man_val"] = train_test_split(data["man"], test_size=0.05, random_state=seed)

## tokenizing

### text

In [23]:
INVOCATION_VOCAB_SIZE = 10000

invocation_tokenizer = io.BytesIO()
spm.SentencePieceTrainer.Train(
    sentence_iterator=iter(list(data["man_train"]["invocation_preprocessed"]) + list(data["train_train"]["invocation_preprocessed"])),
    model_writer=invocation_tokenizer,
    vocab_size=INVOCATION_VOCAB_SIZE,
    pad_id=PAD_ID,                
    bos_id=BOS_ID,
    eos_id=EOS_ID,
    unk_id=UNK_ID
)
invocation_tokenizer = spm.SentencePieceProcessor(model_proto=invocation_tokenizer.getvalue())

### cmd

In [24]:
CMD_VOCAB_SIZE = 12000

cmd_tokenizer = io.BytesIO()
spm.SentencePieceTrainer.Train(
    sentence_iterator=iter(list(data["man_train"]["cmd"]) + list(data["train_train"]["cmd_preprocessed"])),
    model_writer=cmd_tokenizer,
    vocab_size=CMD_VOCAB_SIZE,
    pad_id=PAD_ID,                
    bos_id=BOS_ID,
    eos_id=EOS_ID,
    unk_id=UNK_ID
)
cmd_tokenizer = spm.SentencePieceProcessor(model_proto=cmd_tokenizer.getvalue())

## datasets & dataloaders

In [25]:
print(data.keys())

dict_keys(['train', 'test', 'tldr', 'man', 'train_train', 'train_val', 'man_train', 'man_val'])


In [26]:
datasets = {}

datasets["man_train"] = TextToBashDataset(
    texts=data["man_train"]["invocation_preprocessed"],
    cmds=data["man_train"]["cmd"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

datasets["man_val"] = TextToBashDataset(
    texts=data["man_val"]["invocation_preprocessed"],
    cmds=data["man_val"]["cmd"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

datasets["train_train"] = TextToBashDataset(
    texts=data["train_train"]["invocation_preprocessed"],
    cmds=data["train_train"]["cmd_preprocessed"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

datasets["train_val"] = TextToBashDataset(
    texts=data["train_val"]["invocation_preprocessed"],
    cmds=data["train_val"]["cmd_preprocessed"],
    text_tokenizer=invocation_tokenizer,
    cmd_tokenizer=cmd_tokenizer)

In [27]:
MAN_BATCH_SIZE = 32
TRAIN_BATCH_SIZE = 32

loaders_base = {
    'train': DataLoader(datasets["train_train"], batch_size=TRAIN_BATCH_SIZE, shuffle=True, collate_fn=collate_fn),
    'val': DataLoader(datasets["train_val"], batch_size=TRAIN_BATCH_SIZE, collate_fn=collate_fn),
}
loaders_man = {
    'train': DataLoader(datasets["man_train"], batch_size=MAN_BATCH_SIZE, shuffle=True, collate_fn=collate_fn),
    'val': DataLoader(datasets["man_val"], batch_size=MAN_BATCH_SIZE, collate_fn=collate_fn),
}

# Training model

In [28]:
config = {
    "tokenizer/cmd/vocab_size": CMD_VOCAB_SIZE,
    "tokenizer/invocation/vocab_size": INVOCATION_VOCAB_SIZE,
    
    "model/hidden_size": 512,
    "model/num_hidden_layers": 4,
    "model/num_attention_heads": 8,
    "model/intermediate_size": 512 * 4,
    "model/hidden_dropout_prob": 0.2,
    
    "training/man/batch_size": MAN_BATCH_SIZE,
    "training/man/lr": 1e-4,
    "training/man/weight_decay": 0.01,
    "training/man/stopper/patience": 3,
    "training/man/stopper/min_delta": 0.01,
    "training/man/max_grad_norm": -1,
    "training/man/n_epochs": 15,
    
    "training/base/batch_size": TRAIN_BATCH_SIZE,
    "training/base/lr": 1e-4,
    "training/base/weight_decay": 0.01,
    "training/base/stopper/patience": 4,
    "training/base/stopper/min_delta": 0.01,
    "training/base/max_grad_norm": -1,
    "training/base/n_epochs": 25,
}

In [29]:
run = wandb.init(project="text2bash_final", job_type="train_model", config=config)

[34m[1mwandb[0m: Currently logged in as: [33mfuriousteabag[0m (use `wandb login --relogin` to force relogin)


## model

In [30]:
encoder_config = BertConfig(
    vocab_size = invocation_tokenizer.vocab_size(),
    hidden_size = config["model/hidden_size"],
    num_hidden_layers = config["model/num_hidden_layers"],
    num_attention_heads = config["model/num_attention_heads"],
    intermediate_size = config["model/intermediate_size"],
    hidden_dropout_prob = config["model/hidden_dropout_prob"],
    pad_token_id = PAD_ID,
)

decoder_config = BertConfig(
    vocab_size = cmd_tokenizer.vocab_size(),
    hidden_size = config["model/hidden_size"],
    num_hidden_layers = config["model/num_hidden_layers"],
    num_attention_heads = config["model/num_attention_heads"],
    intermediate_size = config["model/intermediate_size"],
    hidden_dropout_prob = config["model/hidden_dropout_prob"],
    pad_token_id = PAD_ID,
    is_decoder = True,
    add_cross_attention = True
)

print(decoder_config.is_decoder)
print(decoder_config.add_cross_attention)

model_config = EncoderDecoderConfig.from_encoder_decoder_configs(encoder_config, decoder_config)
model = EncoderDecoderModel(config=model_config)

True
True


In [31]:
DEVICE = torch.device("cuda")

## training on man

In [32]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
#optimizer = torch.optim.Adam(model.parameters(), lr=config["training/man/lr"])
optimizer = get_optimizer(model, weight_decay=config["training/man/weight_decay"], init_lr=config["training/man/lr"])
stopper = EarlyStopping(patience=config["training/man/stopper/patience"], min_delta=config["training/man/stopper/min_delta"])
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=config["training/man/lr"],
    epochs=config["training/man/n_epochs"], steps_per_epoch=len(loaders_man["train"]))

In [33]:
trainer = Trainer(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    pad_token_id=PAD_ID,
    device=DEVICE,
    run=run,
    stopper=stopper,
    scheduler=scheduler,
    prefix="man/",
    max_grad_norm=config["training/man/max_grad_norm"]
)

[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


In [34]:
! mkdir -p ./checkpoints

In [35]:
trainer.train(loaders_man, config["training/man/n_epochs"])

  0%|          | 0/20 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 6.725 | val_loss: 5.879
Epoch: 2 | train_loss: 5.304 | val_loss: 4.982
Epoch: 3 | train_loss: 4.585 | val_loss: 4.428
Epoch: 4 | train_loss: 4.108 | val_loss: 4.064
Epoch: 5 | train_loss: 3.682 | val_loss: 3.782
Epoch: 6 | train_loss: 3.240 | val_loss: 3.574
Epoch: 7 | train_loss: 2.799 | val_loss: 3.403
Epoch: 8 | train_loss: 2.394 | val_loss: 3.332
Epoch: 9 | train_loss: 2.020 | val_loss: 3.327
INFO: Early stopping counter 1 of 5
Epoch: 10 | train_loss: 1.688 | val_loss: 3.284
Epoch: 11 | train_loss: 1.401 | val_loss: 3.282
INFO: Early stopping counter 1 of 5
Epoch: 12 | train_loss: 1.156 | val_loss: 3.356
INFO: Early stopping counter 2 of 5
Epoch: 13 | train_loss: 0.961 | val_loss: 3.378
INFO: Early stopping counter 3 of 5
Epoch: 14 | train_loss: 0.806 | val_loss: 3.400
INFO: Early stopping counter 4 of 5
Epoch: 15 | train_loss: 0.689 | val_loss: 3.485
INFO: Early stopping counter 5 of 5
INFO: Early stopping


### small test

In [36]:
beam_search_engine = BeamSearchGenerator(
    pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
    max_length=MAX_CODE_LENGTH, beam_width=5,
    temperature=1.5, device=DEVICE
)

In [37]:
show_examples(model, beam_search_engine, data["man_val"])

invocation: command-line interface for chrony daemon
invocation preprocessed: commandlin interfac chroni daemon
cmd: chronyc
pkid-client -11.23
pkid-cli -9.44
pki-client -9.3
rubyd -8.93
pkid -7.57
0.0

invocation: GNAT toolbox
invocation preprocessed: gnat toolbox
cmd: mips-linux-gnu-gnatbind-8
powerpc-linux-gnuspe-gnatmake-8 -7.58
arm-linux-gnueabihf-gnatmake-6 -7.57
arm-linux-gnueabihf-gnatmake-7 -7.56
arm-linux-gnueabihf-gnatmake-8 -7.49
arm-linux-gnueabihf-gnatmake-5 -7.49
0.0

invocation: SFTP connection handler of FileZilla
invocation preprocessed: sftp connect handler filezilla
cmd: fzsftp
fatget_query -20.61
fprint_get -19.99
fatget -13.17
fatcat -12.38
fprint -11.71
0.0

invocation: show Chinese characters' phonations
invocation preprocessed: show chines characters phonat
cmd: hime-juyin-learn
botch-y-reg-client -22.34
botch-y-reg -19.33
wordview2d -13.63
wordview2 -10.67
wordview -9.6
0.0

invocation: Sets privacy flags or quota for a Protection Database entry
invocation pre

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  ../aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


git-annex-setcache -17.93
git-annex-cache -16.07
pts_setpasswd -10.58
pts_setquota -10.19
lizardfs-setquota -10.08
0.0

average score: 0.000


## training on base

In [38]:
criterion = nn.CrossEntropyLoss(ignore_index=PAD_ID)
#optimizer = torch.optim.Adam(model.parameters(), lr=config["training/base/lr"])
optimizer = get_optimizer(model, weight_decay=config["training/base/weight_decay"], init_lr=config["training/base/lr"])
stopper = EarlyStopping(patience=config["training/base/stopper/patience"], min_delta=config["training/base/stopper/min_delta"])
scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer, max_lr=config["training/base/lr"],
    epochs=config["training/base/n_epochs"], steps_per_epoch=len(loaders_base["train"]))

In [39]:
trainer = Trainer(
    model=model, 
    criterion=criterion,
    optimizer=optimizer, 
    pad_token_id=PAD_ID,
    device=DEVICE,
    run=run,
    stopper=stopper,
    scheduler=scheduler,
    prefix="base/",
    watch=False,
    max_grad_norm=config["training/base/max_grad_norm"]
)

In [40]:
trainer.train(loaders_base, config["training/base/n_epochs"])

  0%|          | 0/30 [00:00<?, ?it/s]

Epoch: 1 | train_loss: 6.988 | val_loss: 4.204
Epoch: 2 | train_loss: 3.527 | val_loss: 2.763
Epoch: 3 | train_loss: 2.487 | val_loss: 2.007
Epoch: 4 | train_loss: 1.876 | val_loss: 1.519
Epoch: 5 | train_loss: 1.441 | val_loss: 1.218
Epoch: 6 | train_loss: 1.167 | val_loss: 1.033
Epoch: 7 | train_loss: 0.974 | val_loss: 0.926
Epoch: 8 | train_loss: 0.831 | val_loss: 0.877
Epoch: 9 | train_loss: 0.722 | val_loss: 0.816
Epoch: 10 | train_loss: 0.630 | val_loss: 0.798
Epoch: 11 | train_loss: 0.550 | val_loss: 0.789
INFO: Early stopping counter 1 of 5
Epoch: 12 | train_loss: 0.483 | val_loss: 0.787
Epoch: 13 | train_loss: 0.429 | val_loss: 0.775
Epoch: 14 | train_loss: 0.377 | val_loss: 0.781
INFO: Early stopping counter 1 of 5
Epoch: 15 | train_loss: 0.335 | val_loss: 0.794
INFO: Early stopping counter 2 of 5
Epoch: 16 | train_loss: 0.297 | val_loss: 0.802
INFO: Early stopping counter 3 of 5
Epoch: 17 | train_loss: 0.259 | val_loss: 0.817
INFO: Early stopping counter 4 of 5
Epoch: 18 | t

### small test

In [41]:
show_examples(model, beam_search_engine, data["train_val"])

invocation: search the files from the current directory tree for "chrome"
invocation preprocessed: search file current directori tree chrome
cmd: find . -exec grep chrome {} \;
cmd preprocessed: find Path -exec grep Regex {} \;
find Path | xargs -I {} grep Regex {} | grep Regex -5.43
find Path | xargs -3.49
find Path -exec grep Regex {} + -3.01
find Path -exec grep Regex {} \; -2.95
find Path | xargs -I {} grep Regex {} -2.41
1.0

invocation: find all *.dbf files/directories in entire file system
invocation preprocessed: find dbf filesdirectori entir file system
cmd: find / -name "*.dbf"
cmd preprocessed: find Path -name Regex
find Path -name Regex | xargs -I {} echo {} -9.99
find Path Path -4.24
find Path -4.15
find Path -name Regex -print -2.71
find Path -name Regex -0.39
1.0

invocation: find all the files which have size 0 bytes in temp folder
invocation preprocessed: find file size  byte temp folder
cmd: find /tmp -type f -empty
cmd preprocessed: find Path -type f -empty
find Path

# Inference

In [42]:
beam_search_engine = BeamSearchGenerator(
    pad_id=PAD_ID, eos_id=EOS_ID, bos_id=BOS_ID,
    max_length=MAX_CODE_LENGTH, beam_width=5,
    temperature=1.5, device=DEVICE
)

In [43]:
val_scores = compute_all_scores(model, data["train_val"], beam_search_engine)
val_score = round(np.mean(val_scores), 3)
print(f"average score on validation: {val_score}")

  0%|          | 0/498 [00:00<?, ?it/s]

average score on validation: 0.453


In [44]:
handcrafted_scores = compute_all_scores(model, data["test"][data["test"]["origin"] == "handcrafted"], beam_search_engine)
handcrafted_score = round(np.mean(handcrafted_scores), 3)
print(f"average score on handcrafted: {handcrafted_score}")

  0%|          | 0/129 [00:00<?, ?it/s]

average score on handcrafted: 0.142


In [45]:
mined_scores = compute_all_scores(model, data["test"][data["test"]["origin"] == "mined"], beam_search_engine)
mined_score = round(np.mean(mined_scores), 3)
print(f"average score on mined part: {mined_score}")

  0%|          | 0/592 [00:00<?, ?it/s]

average score on mined part: -0.234


In [46]:
wandb.summary["val_score"] = val_score
wandb.summary["handcrafted_score"] = handcrafted_score
wandb.summary["mined_score"] = mined_score

In [47]:
wandb.finish()

VBox(children=(Label(value=' 318.76MB of 318.76MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=…

0,1
base/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇████
base/loss/train,█▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁
base/loss/val,█▅▄▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁
base/lr,▁▂▃▄▅▆▇█████▇▇▇▆▆▅
man/epoch,▁▁▁▁▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇██
man/loss/train,█▆▆▅▄▄▃▃▃▂▂▂▁▁▁
man/loss/val,█▆▄▃▂▂▁▁▁▁▁▁▁▁▂
man/lr,▁▂▄▆▇███▇▇▆▅▄▃▂

0,1
base/best_epoch,13.0
base/best_val_loss,0.77511
base/epoch,18.0
base/loss/train,0.2295
base/loss/val,0.82467
base/lr,6e-05
handcrafted_score,0.142
man/best_epoch,10.0
man/best_val_loss,3.28388
man/epoch,15.0
