## Loading the dataset

In [None]:
!curl -L https://raw.githubusercontent.com/PacktPublishing/Transformers-for-Natural-Language-Processing/master/Chapter03/kant.txt --output "kant.txt"

## Training a tokenizer

In [1]:
%%time
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path('.').glob("**/*.txt")]

tokenizer = ByteLevelBPETokenizer()

tokenizer.train(files = paths, vocab_size = 52000, min_frequency = 2, special_tokens = [
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])




CPU times: user 7.69 s, sys: 755 ms, total: 8.44 s
Wall time: 690 ms


In [2]:
import os
token_dir = 'KantaiBERT'
if not os.path.exists(token_dir):
    os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

## Loading the trained tokenizer files

In [3]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    './KantaiBERT/vocab.json', 
    './KantaiBERT/merges.txt'
)

In [4]:
tokenizer.encode("The Critique of Pure Reason").tokens

['The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason']

In [5]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=6, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [6]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("</s>", tokenizer.token_to_id("</s>")),
)

tokenizer.enable_truncation(max_length= 512)

In [7]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
tokenizer.encode("The Critique of Pure Reason").tokens

['</s>', 'The', 'ĠCritique', 'Ġof', 'ĠPure', 'ĠReason', '</s>']

In [9]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Mon May 29 21:02:05 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.30.02              Driver Version: 530.30.02    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 L...    On | 00000000:01:00.0 Off |                  N/A |
| N/A   49C    P8               11W /  60W|      6MiB /

In [10]:
import torch
torch.cuda.is_available()

True

## Defining the configuration of the model
We will be pretraining a RoBERTa-type transformer model using the same number
of layers and heads as a DistilBERT transformer. The model will have a vocabulary
size set to 52,000, 12 attention heads, and 6 layers:

In [11]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size = 52000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers = 6,
    type_vocab_size=1,
)

  from .autonotebook import tqdm as notebook_tqdm


## Reloading the tokenizer in transformers

In [12]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)

## Initializing a model from scratch

In [13]:
from transformers import RobertaForMaskedLM


model = RobertaForMaskedLM(config = config)

2023-05-29 21:02:11.804376: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [16]:
model.num_parameters()

83504416

In [17]:
LP = list(model.parameters())
lp = len(LP)
print(lp)

106


## Building the dataset

In [18]:
%%time

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer = tokenizer, 
    file_path= "./kant.txt",
    block_size = 128,
)



CPU times: user 12.5 s, sys: 73.3 ms, total: 12.5 s
Wall time: 12.5 s


## Defining a data collator

In [19]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer= tokenizer, mlm = True, mlm_probability= 0.15
)

## Initializing the trainer

In [22]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

## Pretraining the model

In [23]:
%%time 
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 154.00 MiB (GPU 0; 5.80 GiB total capacity; 4.20 GiB already allocated; 151.44 MiB free; 4.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
trainer.save_model("./KantaiBERT"

In [None]:
from transformers import pipeline
    fill_mask = pipeline(
    "fill-mask",
    model="./KantaiBERT",
    tokenizer="./KantaiBERT"
)

In [None]:
fill_mask("Human thinking involves human <mask>.")