In [1]:
# One-time environment setup (uncomment on a fresh Colab runtime).
# %pip install --upgrade pip
# %pip install transformers datasets evaluate

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2
Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while

In [1]:
# Define import
import ctypes
import gc
import os
import re
import shutil

import numpy as np
import torch
from datasets import load_dataset
from tokenizers import Tokenizer
from tokenizers import normalizers
from tokenizers.models import WordLevel, BPE, WordPiece
from tokenizers.normalizers import NFD, Lowercase, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer, WordPieceTrainer, BpeTrainer
from transformers import (
    PreTrainedTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    BertTokenizerFast,
    DataCollatorWithPadding,
)
from transformers import DataCollatorWithPadding
from transformers import BertConfig, BertForPreTraining, BertForMaskedLM
from transformers import DataCollatorForLanguageModeling

In [3]:
# Fetch the WikiText-103 (v1) corpus from the Hugging Face Hub;
# its train split holds roughly 1.8M rows.
wiki = load_dataset(path='wikitext', name='wikitext-103-v1')
wiki

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/722k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/156M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/655k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [4]:
# Start from an empty wiki/ directory so shards left over from an earlier run
# are never mixed into the tokenizer training corpus. Done in pure Python
# (instead of `!rm -rf` / `!mkdir`) so the cell also works outside a Unix shell.
shutil.rmtree("wiki", ignore_errors=True)
os.makedirs("wiki", exist_ok=True)

In [5]:
# Dump every split of `wiki` into plain-text shards (wiki/0.txt, wiki/1.txt, ...)
# that the tokenizer trainers can read. Each shard holds up to LINES_PER_SHARD
# examples, one per line; the numbering runs continuously across splits.
#
# Each shard is opened once (not once per example), written as UTF-8 regardless
# of the platform default, and truncated on open so re-running the cell
# rewrites the shards instead of appending duplicates.
LINES_PER_SHARD = 10_000
os.makedirs("wiki", exist_ok=True)

counter = 0    # index of the shard currently being filled
file_len = 0   # number of examples already written to that shard
shard = None   # open handle for the current shard, or None between shards
try:
    for split in wiki.keys():
        for example in wiki[split]:
            if shard is None:
                shard = open(f'wiki/{counter}.txt', 'w', encoding='utf-8')
            shard.write(f"{example['text']}\n")
            file_len += 1
            if file_len == LINES_PER_SHARD:
                # Shard is full: close it and roll over to the next file.
                shard.close()
                shard = None
                counter += 1
                file_len = 0
finally:
    # Close the last (possibly partial) shard even if iteration failed.
    if shard is not None:
        shard.close()

In [6]:
# Count the entries in wiki/ as a sanity check on the shard-writing step.
!ls wiki | wc -l

181


In [12]:
# Collect the shard paths (relative to the notebook directory) that the
# tokenizer trainers will read.
wiki_filenames = os.listdir("wiki")
print(wiki_filenames)
files = [f"wiki/{name}" for name in wiki_filenames]
print(len(files))

['21.txt', '160.txt', '40.txt', '171.txt', '131.txt', '158.txt', '174.txt', '145.txt', '139.txt', '34.txt', '177.txt', '77.txt', '71.txt', '107.txt', '38.txt', '11.txt', '101.txt', '114.txt', '151.txt', '108.txt', '80.txt', '176.txt', '130.txt', '53.txt', '118.txt', '120.txt', '111.txt', '143.txt', '18.txt', '7.txt', '5.txt', '30.txt', '144.txt', '19.txt', '167.txt', '82.txt', '87.txt', '63.txt', '148.txt', '153.txt', '49.txt', '84.txt', '37.txt', '9.txt', '165.txt', '88.txt', '50.txt', '79.txt', '127.txt', '133.txt', '59.txt', '26.txt', '124.txt', '36.txt', '172.txt', '93.txt', '16.txt', '46.txt', '164.txt', '166.txt', '129.txt', '122.txt', '113.txt', '52.txt', '155.txt', '12.txt', '140.txt', '86.txt', '100.txt', '39.txt', '70.txt', '54.txt', '32.txt', '68.txt', '152.txt', '156.txt', '60.txt', '4.txt', '90.txt', '13.txt', '43.txt', '157.txt', '47.txt', '81.txt', '102.txt', '15.txt', '179.txt', '73.txt', '135.txt', '33.txt', '126.txt', '27.txt', '106.txt', '110.txt', '69.txt', '173.txt

In [41]:
# Target vocabulary size requested from both tokenizer trainers
# (same value as the default `vocab_size` of `BertConfig`).
BERT_DEFAULT_VOCAB_SIZE = 30_522

In [62]:
# Train a BERT-style WordPiece tokenizer on the wiki shards.
#
# The special tokens are declared once and handed to the trainer. The
# [CLS]/[SEP] ids used by the post-processor are then read back from the
# trained vocabulary instead of being hard-coded, so the template cannot
# drift out of sync if the token list is ever reordered.
SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
# Unicode-decompose, lowercase and strip accents before splitting.
tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

# Named after the algorithm so it does not collide with the Hugging Face
# `Trainer` instance that is bound to `trainer` later in the notebook.
wordpiece_trainer = WordPieceTrainer(
    vocab_size=BERT_DEFAULT_VOCAB_SIZE,
    special_tokens=SPECIAL_TOKENS,
)
tokenizer.train(files, wordpiece_trainer)

# The post-processor only affects encoding, not training, so it is attached
# after training, when the real token ids are known.
tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

In [63]:
# Report how many entries the trained WordPiece vocabulary contains.
wordpiece_vocab_size = tokenizer.get_vocab_size()
print(wordpiece_vocab_size)

30522


In [64]:
# Wrap the trained `tokenizers.Tokenizer` in the transformers fast-tokenizer
# API and save it to disk.
#
# `tokenizer` is rebound from the raw Tokenizer to the wrapper, so the
# isinstance guard keeps the cell re-runnable: on a second execution the name
# already refers to the wrapper and must not be wrapped again.
if isinstance(tokenizer, Tokenizer):
    tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)

# Tell the wrapper which vocabulary entry plays which special-token role.
tokenizer.add_special_tokens({
    'pad_token': '[PAD]',
    'mask_token': '[MASK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'unk_token': '[UNK]'
})
tokenizer.save_pretrained("tokenizer_wordpiece")



('tokenizer_wordpiece/tokenizer_config.json',
 'tokenizer_wordpiece/special_tokens_map.json',
 'tokenizer_wordpiece/tokenizer.json')

In [65]:
# BPE tokenizer test: train a byte-pair-encoding tokenizer on the same wiki
# shards, using the same normalization and special tokens as a BERT tokenizer.
#
# The [CLS]/[SEP] ids used by the post-processor are read back from the
# trained vocabulary instead of being hard-coded, so the template cannot
# drift out of sync if the token list is ever reordered.
SPECIAL_TOKENS = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

tokenizer_bpe = Tokenizer(BPE(unk_token="[UNK]"))
# Unicode-decompose, lowercase and strip accents before splitting.
tokenizer_bpe.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tokenizer_bpe.pre_tokenizer = Whitespace()

# Named after the algorithm so it does not collide with the Hugging Face
# `Trainer` instance that is bound to `trainer` later in the notebook.
bpe_trainer = BpeTrainer(
    vocab_size=BERT_DEFAULT_VOCAB_SIZE,
    special_tokens=SPECIAL_TOKENS,
)
tokenizer_bpe.train(files, bpe_trainer)

# The post-processor only affects encoding, not training, so it is attached
# after training, when the real token ids are known.
tokenizer_bpe.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer_bpe.token_to_id("[CLS]")),
        ("[SEP]", tokenizer_bpe.token_to_id("[SEP]")),
    ],
)

In [66]:
# Report how many entries the trained BPE vocabulary contains.
bpe_vocab_size = tokenizer_bpe.get_vocab_size()
print(bpe_vocab_size)

30522


In [67]:
# Wrap the trained BPE `tokenizers.Tokenizer` in the transformers
# fast-tokenizer API and save it to disk.
#
# `tokenizer_bpe` is rebound from the raw Tokenizer to the wrapper, so the
# isinstance guard keeps the cell re-runnable: on a second execution the name
# already refers to the wrapper and must not be wrapped again.
if isinstance(tokenizer_bpe, Tokenizer):
    tokenizer_bpe = PreTrainedTokenizerFast(tokenizer_object=tokenizer_bpe)

# Tell the wrapper which vocabulary entry plays which special-token role.
tokenizer_bpe.add_special_tokens({
    'pad_token': '[PAD]',
    'mask_token': '[MASK]',
    'cls_token': '[CLS]',
    'sep_token': '[SEP]',
    'unk_token': '[UNK]'
})
tokenizer_bpe.save_pretrained("tokenizer_bpe")

('tokenizer_bpe/tokenizer_config.json',
 'tokenizer_bpe/special_tokens_map.json',
 'tokenizer_bpe/tokenizer.json')

In [68]:
######## BERT TRAINING START HERE ############
# First tokenize every split of the wikitext dataset with the WordPiece tokenizer.
#
# Sequences are truncated to MAX_SEQ_LENGTH tokens: the BERT model built below
# is configured with max_position_embeddings=512, so any longer input would
# index past its position-embedding table during training.
MAX_SEQ_LENGTH = 512
wiki_tokenized = wiki.map(
    lambda batch: tokenizer(
        batch["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    ),
    batched=True,
)

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/1801350 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [69]:
# Display the tokenized DatasetDict to check the splits and their columns.
wiki_tokenized

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3760
    })
})

In [90]:
# Define a BERT configuration sized like BERT-base (12 layers, 12 heads,
# hidden size 768) with the vocabulary of the WordPiece tokenizer trained above.
vocab_size = tokenizer.vocab_size
print(vocab_size)

config = BertConfig(
    vocab_size=vocab_size,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    max_position_embeddings=512,
    # BertConfig defaults pad_token_id to 0, but in this custom vocabulary
    # id 0 is [UNK] and [PAD] has a different id. Pass the real padding id so
    # the model does not treat [UNK] as padding.
    pad_token_id=tokenizer.pad_token_id,
)

# BERT model with a masked-language-modeling head, randomly initialised
# from the config (no pretrained weights are loaded).
model = BertForMaskedLM(config)

30522


In [91]:
# Batch collator for masked-language-model pre-training: MLM is enabled with
# a 15% masking probability, using the WordPiece tokenizer.
data_collator_mlm = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    mlm=True,
)


In [92]:

# Hyperparameters for the MLM pre-training run; checkpoints go to ./bert_wikitext.
training_args = TrainingArguments(
    # --- output / reproducibility ---
    output_dir='./bert_wikitext',
    overwrite_output_dir=True,
    seed=42,
    # --- schedule ---
    num_train_epochs=20,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    # --- Adam optimizer ---
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    # --- checkpointing ---
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, the tokenized splits and the MLM collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=wiki_tokenized["train"],
    eval_dataset=wiki_tokenized["validation"],
    data_collator=data_collator_mlm,
)


In [93]:
# Launch MLM pre-training with the Trainer configured above.
trainer.train()

Step,Training Loss


KeyboardInterrupt: 

In [88]:
# Force a full garbage-collection pass; the displayed value is the number of
# unreachable objects the collector found.
gc.collect()

62

In [87]:
# Ask PyTorch's CUDA caching allocator to release its unused cached GPU memory.
torch.cuda.empty_cache()


In [89]:
# Ask glibc to return freed heap memory to the operating system.
#
# `malloc_trim` is a glibc extension, so this only works where "libc.so.6"
# can be loaded and exports that symbol. On any other platform the step is
# skipped instead of raising, and `malloc_trim_result` stays None.
libc = None
malloc_trim_result = None
try:
    libc = ctypes.CDLL("libc.so.6")  # clearing cache
    malloc_trim_result = libc.malloc_trim(0)
except (OSError, AttributeError):
    # OSError: the library could not be loaded.
    # AttributeError: the library has no malloc_trim symbol.
    pass
malloc_trim_result

1

In [97]:
# Bundle both saved tokenizer directories into a single archive.
!zip -r tokenizers.zip ./tokenizer_wordpiece ./tokenizer_bpe

updating: tokenizer_wordpiece/ (stored 0%)
updating: tokenizer_bpe/ (stored 0%)
  adding: tokenizer_wordpiece/tokenizer.json (deflated 71%)
  adding: tokenizer_wordpiece/tokenizer_config.json (deflated 76%)
  adding: tokenizer_wordpiece/special_tokens_map.json (deflated 80%)
  adding: tokenizer_bpe/tokenizer.json (deflated 72%)
  adding: tokenizer_bpe/tokenizer_config.json (deflated 76%)
  adding: tokenizer_bpe/special_tokens_map.json (deflated 80%)


In [100]:
# Show the on-disk size of the archive in human-readable units.
!du -h ./tokenizers.zip

540K	./tokenizers.zip
