## Loading the dataset

In [1]:
!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  20.6M      0 --:--:-- --:--:-- --:--:-- 20.6M


In [5]:
# Install `transformers` from master
!pip install transformers
!pip list | grep -E 'transformers|tokenizers'

[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2
[0m

## Training a tokenizer

In [1]:
%%time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path('.').glob("**/*.txt")]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files = paths, vocab_size = 52000, min_frequency = 2, special_tokens = [
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 7.46 s, sys: 181 ms, total: 7.64 s
Wall time: 4.72 s


In [2]:
import os
token_dir = 'KantaiBERT'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

## Loading the trained tokenizer files

In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    './KantaiBERT/vocab.json', 
    './KantaiBERT/merges.txt'
)

In [4]:
tokenizer.encode("The Critique of Pure Reason").tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason']

In [5]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [6]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("</s>", tokenizer.token_to_id("</s>")),
)

tokenizer.enable_truncation(max_length= 512)

In [7]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
tokenizer.encode("The Critique of Pure Reason").tokens

['</s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '</s>']

In [9]:
!nvidia-smi

Mon May 29 15:22:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8    13W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
import torch
torch.cuda.is_available()

True

## Defining the configuration of the model
We will be pretraining a RoBERTa-type transformer model using the same number
of layers and heads as a DistilBERT transformer. The model will have a vocabulary
size set to 52,000, 12 attention heads, and 6 layers:

In [11]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size = 52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers = 6,
    type_vocab_size=1,
)

## Reloading the tokenizer in transformers

In [12]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)

## Initializing a model from scratch

In [13]:
from transformers import RobertaForMaskedLM


model = RobertaForMaskedLM(config = config)

In [14]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [15]:
model.num_parameters()

83504416

In [16]:
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


## Building the dataset

In [17]:
%%time

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer = tokenizer, 
    file_path= "./kant.txt",
    block_size = 128,
)



CPU times: user 28.1 s, sys: 552 ms, total: 28.6 s
Wall time: 34.2 s


## Defining a data collator

In [18]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer= tokenizer, mlm = True, mlm_probability= 0.15
)

In [24]:
!pip install --upgrade accelerate

[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[0mInstalling collected packages: accelerate
Successfully installed accelerate-0.19.0


## Initializing the trainer

In [19]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

## Pretraining the model

In [20]:
%%time 
trainer.train()



Step,Training Loss
500,6.604
1000,5.7582
1500,5.2882
2000,5.0278
2500,4.8748


CPU times: user 9min 32s, sys: 2.49 s, total: 9min 35s
Wall time: 9min 58s


TrainOutput(global_step=2672, training_loss=5.466920978294875, metrics={'train_runtime': 598.2251, 'train_samples_per_second': 285.785, 'train_steps_per_second': 4.467, 'total_flos': 873620128952064.0, 'train_loss': 5.466920978294875, 'epoch': 1.0})

In [23]:
trainer.save_model("./KantaiBERT")

In [24]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./KantaiBERT",
    tokenizer="./KantaiBERT"
)

In [25]:
fill_mask("Human thinking involves human <mask>.")

[{'score': 0.04962627589702606,
  'token': 394,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.019373778253793716,
  'token': 535,
  'token_str': ' experience',
  'sequence': 'Human thinking involves human experience.'},
 {'score': 0.011976179666817188,
  'token': 610,
  'token_str': ' conceptions',
  'sequence': 'Human thinking involves human conceptions.'},
 {'score': 0.011964903213083744,
  'token': 584,
  'token_str': ' intuition',
  'sequence': 'Human thinking involves human intuition.'},
 {'score': 0.011422745883464813,
  'token': 616,
  'token_str': ' cognition',
  'sequence': 'Human thinking involves human cognition.'}]