# Example training and inference

In [None]:
import json, codecs
import random

from TransformerLanguageModel import CustomTokenizer, TransformerLanguageModel, Trainer
from nltk.tokenize import word_tokenize

In [None]:
# Special tokens passed to the tokenizer constructor (start, end, padding).
start_token, end_token, pad_token = '<S>', '</S>', '<PAD>'

# Vocabulary size, shared by the tokenizer and the trainer.
vocab_size = 30_000

# Device string the model is moved to via `.to(device)`.
device = 'cuda'

## Loading data

In [None]:
# Read the full dataset; the slicing below requires the JSON top level
# to be an array.
with open('data/dataset.json', encoding="utf-8") as f:
    examples = json.load(f)

# The first 5% of the examples (in file order) form the dev set,
# the remainder is used for training.
split = int(len(examples) * 0.05)
dev, train = examples[:split], examples[split:]

## Training tokenizer

In [None]:
# Build a tokenizer from the special tokens and vocabulary size configured above.
tokenizer = CustomTokenizer(pad_token, start_token, end_token, vocab_size=vocab_size)
# Fit it on the training split only; the dev split is not passed in.
tokenizer.train(train)
# Save it under 'tokenizer.json' so that it can be reloaded later without retraining.
tokenizer.save('tokenizer.json')

When the tokenizer is already trained, load it with:

In [None]:
# Create the tokenizer before loading so that this cell also works in a fresh
# session where the training cell was skipped; otherwise `tokenizer` would be
# undefined here. The constructor arguments mirror the ones used for training.
tokenizer = CustomTokenizer(pad_token, start_token, end_token, vocab_size=vocab_size)
# NOTE(review): assumes load() restores the saved state in place (the original
# call already discarded its return value) -- confirm against CustomTokenizer.
tokenizer.load('tokenizer.json')

## Training model

In [None]:
# Build the language model with its hyperparameters, one per line,
# then move it to the configured device.
lm = TransformerLanguageModel(
    'lm',
    n_head=6,
    d_model=256,
    d_ff=4 * 256,
    layer_count=5,
    embed_dropout=0.3,
    cell_dropout=0.2,
    tokenizer=tokenizer,
)
lm = lm.to(device)

# Train on the training split, handing the dev split to the trainer as well.
# NOTE(review): the meaning of 16 and 10 is not visible here
# (presumably batch size and epoch count) -- confirm against Trainer.
trainer = Trainer(vocab_size, 16, 10)
trainer.train(lm, train, dev)

Extra training of the model (after loading it with `lm.load`):

In [None]:
# Load into the existing `lm` object, using the same name ('lm') that was passed
# to the model constructor.
# NOTE(review): `lm` must already exist in this session; presumably load()
# restores previously saved weights -- confirm against TransformerLanguageModel.
lm.load('lm')
# Same trainer arguments as in the initial training run.
trainer = Trainer(vocab_size, 16, 10)
# extra_train=True marks this run as additional training of the loaded model.
trainer.train(lm, train, dev, extra_train=True)

## Inference
Generate a sentence using nucleus sampling with an inference parameter of 0.9:

In [None]:
# Tokenize the prompt with NLTK, generate a continuation with the 'nuc'
# (nucleus sampling) strategy and print the result.
prompt_tokens = word_tokenize('Hello')
generated = lm.generate_sentence(
    prompt_tokens,
    True,
    'nuc',
    inference_parameter=0.9,
    limit=100,
)
print(generated)