# Pretraining a RoBERTa-Like Model and Tokenizer
Inspired by a project in Rothman's "Transformers for Naturl Language Processing", we pretrain KantaiBERT model (a RoBERTa-Like model) on Dostoyevsky's *Crime and Punishment*. We also train a tokenizer with a few more of Dostoyevsky's works (*Poor Folk* and *The Gambler*).

In [35]:
import os
import torch
import tokenizers
import transformers


In [36]:
# Constants.
data_dir = 'dostoyevsky/'
tokenizer_dir = 'KantaiBERT'
crime_and_punishment_path = 'dostoyevsky/crime-and-punishment.txt'
file_paths = [
    str(os.path.join(data_dir, f)) for f in os.listdir(data_dir) 
    if f.endswith('.txt')
]

vocab_size = 52000
max_tokens = 512


In [37]:
# Initialize, train, and save tokenizer.
tokenizer = tokenizers.ByteLevelBPETokenizer()
tokenizer.train(
    files=file_paths,
    vocab_size=vocab_size,
    min_frequency=2,
    special_tokens=[
        '<s>',
        '<pad>',
        '</s>',
        '<unk>',
        '<mask>'
    ]
)

# Create directory.
if not os.path.exists(tokenizer_dir):
    os.makedirs(tokenizer_dir) 

tokenizer.save_model('KantaiBERT')







['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [38]:
# Look at an encoded sentence.
tokenizer.encode('How will this sentence be tokenized?').tokens


['How', 'Ġwill', 'Ġthis', 'Ġsentence', 'Ġbe', 'Ġtoken', 'iz', 'ed', '?']

In [39]:
# Configure model and RoBERTa tokenizer.
config = transformers.RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=max_tokens+2,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1
)
model = transformers.RobertaForMaskedLM(config=config)
tokenizer = transformers.RobertaTokenizer.from_pretrained(
    './KantaiBERT', max_length=max_tokens
)


Didn't find file ./KantaiBERT/added_tokens.json. We won't load it.
Didn't find file ./KantaiBERT/special_tokens_map.json. We won't load it.
Didn't find file ./KantaiBERT/tokenizer_config.json. We won't load it.
loading file ./KantaiBERT/vocab.json
loading file ./KantaiBERT/merges.txt
loading file None
loading file None
loading file None


In [40]:
# Check the number of parameters.
print(f'Number of parameters: {model.num_parameters()}\n')

matrices = list(model.parameters())

print('Parameter tensor sizes:\n')
for ind, params in enumerate(matrices):
    print(ind, params.shape)
    

Number of parameters: 83504416

Parameter tensor sizes:

0 torch.Size([52000, 768])
1 torch.Size([514, 768])
2 torch.Size([1, 768])
3 torch.Size([768])
4 torch.Size([768])
5 torch.Size([768, 768])
6 torch.Size([768])
7 torch.Size([768, 768])
8 torch.Size([768])
9 torch.Size([768, 768])
10 torch.Size([768])
11 torch.Size([768, 768])
12 torch.Size([768])
13 torch.Size([768])
14 torch.Size([768])
15 torch.Size([3072, 768])
16 torch.Size([3072])
17 torch.Size([768, 3072])
18 torch.Size([768])
19 torch.Size([768])
20 torch.Size([768])
21 torch.Size([768, 768])
22 torch.Size([768])
23 torch.Size([768, 768])
24 torch.Size([768])
25 torch.Size([768, 768])
26 torch.Size([768])
27 torch.Size([768, 768])
28 torch.Size([768])
29 torch.Size([768])
30 torch.Size([768])
31 torch.Size([3072, 768])
32 torch.Size([3072])
33 torch.Size([768, 3072])
34 torch.Size([768])
35 torch.Size([768])
36 torch.Size([768])
37 torch.Size([768, 768])
38 torch.Size([768])
39 torch.Size([768, 768])
40 torch.Size([768])
4

In [41]:
# Create dataset and collator.
dataset = transformers.LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=crime_and_punishment_path,
    block_size=128
)
collator = transformers.DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True, mlm_probability=0.15
)


Creating features from dataset file at dostoyevsky/crime-and-punishment.txt


In [42]:
train_args = transformers.TrainingArguments(
    output_dir='./KantaiBERT',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10000,
    save_total_limit=2
)
trainer = transformers.Trainer(
    model=model,
    args=train_args,
    data_collator=collator,
    train_dataset=dataset
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [43]:
trainer.train()
trainer.save_model('./KantaiBERT')


***** Running training *****
  Num examples = 17909
  Num Epochs = 1
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 280


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to ./KantaiBERT
Configuration saved in ./KantaiBERT/config.json
Model weights saved in ./KantaiBERT/pytorch_model.bin


In [44]:
# Perform mask filling task with our pretrained model.
fill_mask = transformers.pipeline(
    'fill-mask',
    model='./KantaiBERT',
    tokenizer='./KantaiBERT'
)


loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.21.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

loading configuration file ./KantaiBERT/config.json
Model config RobertaConfig {
  "_name_or_path": "./KantaiBERT",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout"

In [48]:
pred = fill_mask('May I venture, honoured sir, to engage you in polite <mask>?')

print("""
Original:
    May I venture, honoured sir, to engage you in polite conversation?
Masked:
    May I venture, honoured sir, to engage you in polite <mask>?
""")
print(f'Prediction: \n\t{pred[0]["sequence"]}')



Original:
    May I venture, honoured sir, to engage you in polite conversation?
Masked:
    May I venture, honoured sir, to engage you in polite <mask>?

Prediction: 
	May I venture, honoured sir, to engage you in polite,?


In [49]:
pred = fill_mask('His eyes shone with feverish <mask>.')

print("""
Original:
    His eyes shone with feverish brilliance.
Masked:
    His eyes shone with feverish <mask>.
""")
print(f'Prediction: \n\t{pred[0]["sequence"]}')



Original:
    His eyes shone with feverish brilliance.
Masked:
    His eyes shone with feverish <mask>.

Prediction: 
	His eyes shone with feverish..
