## Kantai Bert with Byte Level Byte pair Encoding Tokenizer

1. Obtaining the dataset

### 1. Obtaining the Dataset

In [68]:
# dataset of books by Immanuel Kant

!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 10.7M    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  3 10.7M    3  336k    0     0   195k      0  0:00:56  0:00:01  0:00:55  195k
 41 10.7M   41 4591k    0     0  1651k      0  0:00:06  0:00:02  0:00:04 1653k
100 10.7M  100 10.7M    0     0  3811k      0  0:00:02  0:00:02 --:--:-- 3816k


In [69]:
# !pip uninstall tensorflow
# !pip install transformers

In [1]:
# getting versions of transformers and tokenizers
!pip list | grep -E 'transformers|tokenizers'

'grep' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
import tokenizers
import transformers

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%%time
# using bytelevelbpetokenizer

from pathlib import Path

paths = [str(x) for x in Path(".").glob("*.txt")]
print(paths)

tokenizer = tokenizers.ByteLevelBPETokenizer()
print(f'Tokenizer: {tokenizer}')

tokenizer.train(files=paths, vocab_size=50000, min_frequency=2, special_tokens=[
    "<start>",
    "<pad>",
    "<\s>",
    "<unknown>",
    "<mask>",
])

['kant.txt']
Tokenizer: Tokenizer(vocabulary_size=0, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)
CPU times: total: 13.5 s
Wall time: 1.4 s


In [4]:
import os

tokenizer_dir = 'Tokenizer'

# print(os.path.exists(tokenizer_dir))
if os.path.exists(tokenizer_dir):
    # print(os.path.listdir)
    pass

if not os.path.exists(tokenizer_dir):
    os.mkdir(tokenizer_dir)
    print("Dir made")
tokenizer.save_model('Tokenizer')

['Tokenizer\\vocab.json', 'Tokenizer\\merges.txt']

## Loading Tokenizers

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
# from tokenizers.processors import BertProcesssing 
import tokenizers.processors 
from tokenizers.processors import BertProcessing

In [6]:
tokenizer = ByteLevelBPETokenizer(
    "Tokenizer/vocab.json",
    "Tokenizer/merges.txt"
)
tokenizer 

Tokenizer(vocabulary_size=19296, model=ByteLevelBPE, add_prefix_space=False, lowercase=False, dropout=None, unicode_normalizer=None, continuing_subword_prefix=None, end_of_word_suffix=None, trim_offsets=False)

In [7]:
# encoding a sentence
encoding = tokenizer.encode("The thinking of the way of life")
print(f"Encoding: {encoding}")
print(f"Tokens are : {encoding.tokens}")

Encoding: Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Tokens are : ['The', 'Ġthinking', 'Ġof', 'Ġthe', 'Ġway', 'Ġof', 'Ġlife']


In [8]:
tokenizer._tokenizer.post_processor

<tokenizers.processors.ByteLevel at 0x198c57bbfc0>

In [9]:
tokenizer.token_to_id("<s>")

In [10]:
# fitting the BERT model 
# tokenizer._tokenizer.post_processor = BertProcessing(
#     ("", tokenizer.token_to_id("")),
#     ("", tokenizer.token_to_id("")),
# )
tokenizer.enable_truncation(max_length=512)

In [11]:
encoding = tokenizer.encode("The thinking of the way of life.")
print(f"Encoding: {encoding}")
print(f"Tokens are : {encoding.tokens}")

Encoding: Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
Tokens are : ['The', 'Ġthinking', 'Ġof', 'Ġthe', 'Ġway', 'Ġof', 'Ġlife', '.']


In [12]:
import torch 
torch.cuda.is_available()

True

## Model Configuration

In [13]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size = 52000,
    max_position_embeddings = 514,
    num_attention_heads = 12,
    num_hidden_layers = 6,
    type_vocab_size = 1,
)
config 

RobertaConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 52000
}

In [14]:
# reloading tokenizer of Roberta
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained(
    "tokenizer",
    max_length=512,
)
tokenizer 

RobertaTokenizer(name_or_path='tokenizer', vocab_size=19296, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
	19296: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	19297: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	19298: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

## Model Initialization

In [15]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
model 

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [16]:
model.parameters(), model.num_parameters()

(<generator object Module.parameters at 0x00000198D04FD700>, 83504416)

In [17]:
params = list(model.parameters())
print(len(params))
print(params[0][0, :5], params[0].shape)

106
tensor([0.0012, 0.0027, 0.0071, 0.0134, 0.0075], grad_fn=<SliceBackward0>) torch.Size([52000, 768])


In [18]:
# counting the parameters
np=0
for p in range(0,len(params)):#number of tensors
    PL2=True
    try:
        L2=len(params[p][0]) #check if 2D
    except:
        L2=1             #not 2D but 1D
        PL2=False
    L1=len(params[p])      
    L3=L1*L2
    np+=L3             # number of parameters per tensor
    if PL2==True:
        print(p,L1,L2,L3)  # displaying the sizes of the parameters
    if PL2==False:
        print(p,L1,L3)  # displaying the sizes of the parameters

print(np)              # total number of parameters

0 52000 768 39936000
1 514 768 394752
2 1 768 768
3 768 768
4 768 768
5 768 768 589824
6 768 768
7 768 768 589824
8 768 768
9 768 768 589824
10 768 768
11 768 768 589824
12 768 768
13 768 768
14 768 768
15 3072 768 2359296
16 3072 3072
17 768 3072 2359296
18 768 768
19 768 768
20 768 768
21 768 768 589824
22 768 768
23 768 768 589824
24 768 768
25 768 768 589824
26 768 768
27 768 768 589824
28 768 768
29 768 768
30 768 768
31 3072 768 2359296
32 3072 3072
33 768 3072 2359296
34 768 768
35 768 768
36 768 768
37 768 768 589824
38 768 768
39 768 768 589824
40 768 768
41 768 768 589824
42 768 768
43 768 768 589824
44 768 768
45 768 768
46 768 768
47 3072 768 2359296
48 3072 3072
49 768 3072 2359296
50 768 768
51 768 768
52 768 768
53 768 768 589824
54 768 768
55 768 768 589824
56 768 768
57 768 768 589824
58 768 768
59 768 768 589824
60 768 768
61 768 768
62 768 768
63 3072 768 2359296
64 3072 3072
65 768 3072 2359296
66 768 768
67 768 768
68 768 768
69 768 768 589824
70 768 768
71 768 768

## Preparing Dataset

In [19]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer = tokenizer,
    file_path = "kant.txt",
    block_size = 128,
)
dataset 



CPU times: total: 23 s
Wall time: 22.9 s


<transformers.data.datasets.language_modeling.LineByLineTextDataset at 0x198d010f650>

In [20]:
# data collator that takes sample from dataset and collates to batches
from transformers import DataCollatorForLanguageModeling

# mlm probability: % of masked tokens in training
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.14,
)
data_collator

DataCollatorForLanguageModeling(tokenizer=RobertaTokenizer(name_or_path='tokenizer', vocab_size=19296, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
	19296: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	19297: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	19298: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=True, mlm_probability=0.14

In [21]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir = "KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10000,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer, training_args

(<transformers.trainer.Trainer at 0x198f012c110>,
 TrainingArguments(
 _n_gpu=1,
 accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
 adafactor=False,
 adam_beta1=0.9,
 adam_beta2=0.999,
 adam_epsilon=1e-08,
 auto_find_batch_size=False,
 batch_eval_metrics=False,
 bf16=False,
 bf16_full_eval=False,
 data_seed=None,
 dataloader_drop_last=False,
 dataloader_num_workers=0,
 dataloader_persistent_workers=False,
 dataloader_pin_memory=True,
 dataloader_prefetch_factor=None,
 ddp_backend=None,
 ddp_broadcast_buffers=None,
 ddp_bucket_cap_mb=None,
 ddp_find_unused_parameters=None,
 ddp_timeout=1800,
 debug=[],
 deepspeed=None,
 disable_tqdm=False,
 dispatch_batches=None,
 do_eval=False,
 do_predict=False,
 do_train=False,
 eval_accumulation_steps=None,
 eval_delay=0,
 eval_do_concat_batches=True,
 eval_on_start=False,
 eval_steps=N

In [22]:
%%time
trainer.train()

 19%|█▉        | 501/2672 [01:43<07:44,  4.68it/s]

{'loss': 6.6017, 'grad_norm': 3.233264923095703, 'learning_rate': 4.06437125748503e-05, 'epoch': 0.19}


 37%|███▋      | 1001/2672 [03:24<05:29,  5.07it/s]

{'loss': 5.7644, 'grad_norm': 4.23481559753418, 'learning_rate': 3.12874251497006e-05, 'epoch': 0.37}


 56%|█████▌    | 1501/2672 [05:07<03:53,  5.02it/s]

{'loss': 5.3077, 'grad_norm': 5.4278740882873535, 'learning_rate': 2.1931137724550898e-05, 'epoch': 0.56}


 75%|███████▍  | 2001/2672 [06:51<02:14,  4.98it/s]

{'loss': 5.0592, 'grad_norm': 6.6849164962768555, 'learning_rate': 1.2574850299401197e-05, 'epoch': 0.75}


 94%|█████████▎| 2500/2672 [08:33<00:34,  5.06it/s]

{'loss': 4.9503, 'grad_norm': 6.714309215545654, 'learning_rate': 3.218562874251497e-06, 'epoch': 0.94}


100%|██████████| 2672/2672 [09:11<00:00,  4.85it/s]

{'train_runtime': 551.3377, 'train_samples_per_second': 310.089, 'train_steps_per_second': 4.846, 'train_loss': 5.493957633743743, 'epoch': 1.0}
CPU times: total: 9min 14s
Wall time: 9min 12s





TrainOutput(global_step=2672, training_loss=5.493957633743743, metrics={'train_runtime': 551.3377, 'train_samples_per_second': 310.089, 'train_steps_per_second': 4.846, 'total_flos': 873691623267840.0, 'train_loss': 5.493957633743743, 'epoch': 1.0})

In [25]:
trainer.save_model("KantaiBERT")

In [31]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    device="cuda"
)

In [33]:
fill_mask("Human thinking involves human <mask>")

[{'score': 0.7887444496154785,
  'token': 18,
  'token_str': '.',
  'sequence': 'Human thinking involves human.'},
 {'score': 0.00232885405421257,
  'token': 67,
  'token_str': '_',
  'sequence': 'Human thinking involves human_'},
 {'score': 0.002071449998766184,
  'token': 1031,
  'token_str': ').',
  'sequence': 'Human thinking involves human).'},
 {'score': 0.0019306758185848594,
  'token': 429,
  'token_str': '—',
  'sequence': 'Human thinking involves human—'},
 {'score': 0.001685379189439118,
  'token': 270,
  'token_str': ' of',
  'sequence': 'Human thinking involves human of'}]