In [1]:
!pip install transformers



In [2]:
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents
from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer
from tokenizers.decoders import Decoder

In [3]:
#The dataset
!wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
!unzip wikitext-103-raw-v1.zip

--2021-05-08 16:08:26--  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.70.142
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.70.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 191984949 (183M) [application/zip]
Saving to: ‘wikitext-103-raw-v1.zip.1’


2021-05-08 16:08:31 (39.1 MB/s) - ‘wikitext-103-raw-v1.zip.1’ saved [191984949/191984949]

Archive:  wikitext-103-raw-v1.zip
replace wikitext-103-raw/wiki.test.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace wikitext-103-raw/wiki.valid.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace wikitext-103-raw/wiki.train.raw? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


Build the Tokenizer

In [4]:
from tokenizers import Tokenizer, Regex, NormalizedString, PreTokenizedString
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import PreTokenizer
from tokenizers.normalizers import Normalizer, Lowercase, NFD, StripAccents
from tokenizers.decoders import Decoder
from tokenizers.processors import TemplateProcessing


class CharPreTokenizer:
    def char_split(self, i: int, normalized_string: NormalizedString):
        splits = []

        for i in range(0, len(str(normalized_string))):
            splits.append(normalized_string[i:i+1])
        return splits


    def pre_tokenize(self, pretok: PreTokenizedString):
        pretok.split(self.char_split)



class CustomNormalizer:
    def normalize(self, normalized: NormalizedString):
        normalized.nfkc()
        normalized.filter(lambda char: not char.isnumeric())
        normalized.replace(Regex("\s+"), " ")
        normalized.lowercase()


tok = Tokenizer(WordPiece(unk_token="[UNK]"))
tok.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
tok.pre_tokenizer = PreTokenizer.custom(CharPreTokenizer())

tok.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

Create a small txt file to train on

In [19]:
with open('MM_Chp_1.txt', 'r') as file:
    data = file.read().replace('\n', '')

In [20]:
data = data.lower()
chars = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',' ']

data = ''.join([i for i in data if i in chars])


In [21]:
data

'miss brooke had that kind of beauty which seems to be thrown into relief by poor dress her hand and wrist were so finely formed that she could wear sleeves not less bare of style than those in which the blessed virgin appeared to italian painters and her profile as well as her stature and bearing seemed to gain the more dignity from her plain garments which by the side of provincial fashion gave her the impressiveness of a fine quotation from the bibleor from one of our elder poetsin a paragraph of todays newspaper she was usually spoken of as being remarkably clever but with the addition that her sister celia had more commonsense nevertheless celia wore scarcely more trimmings and it was only to close observers that her dress differed from her sisters and had a shade of coquetry in its arrangements for miss brookes plain dressing was due to mixed conditions in most of which her sister shared the pride of being ladies had something to do with it the brooke connections though not exact

In [24]:
text_file = open("sample.txt", "w")
n = text_file.write(data)
text_file.close()

In [25]:
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace

#We may want to look into this again
trainer = WordPieceTrainer(
    vocab_size=27, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]", "[PAD]"]
)
files = ['sample.txt']
tok.train(files, trainer)
tok.pre_tokenizer = Whitespace()

tok.save("sample.json")

In [6]:
tok.pre_tokenizer = PreTokenizer.custom(CharPreTokenizer())

In [26]:
output = tok.encode("My name is John")
print(output.tokens)

['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']


Load the tokenizer from json

In [27]:
new_tok = Tokenizer.from_file("sample.json")
# new_tok.add_special_tokens({'pad_token': '[PAD]'})
new_tok.pre_tokenizer = PreTokenizer.custom(CharPreTokenizer())
new_tok.enable_truncation(max_length=100)

In [28]:
new_tok.encode("I have a dog named Spot")

Encoding(num_tokens=25, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [29]:
output = new_tok.encode("I have a dog named Spot")
print(output.tokens)

['[CLS]', 'i', ' ', 'h', 'a', 'v', 'e', ' ', 'a', ' ', 'd', 'o', 'g', ' ', 'n', 'a', 'm', 'e', 'd', ' ', 's', 'p', 'o', 't', '[SEP]']


Now Train

In [30]:
# Check that we have a GPU
!nvidia-smi

Sat May  8 16:44:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [31]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()

True

In [34]:
from transformers import PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_tok)

In [46]:
from transformers import DistilBertForMaskedLM, DistilBertConfig

#may want to look at hidden_dim
configuration = DistilBertConfig(
    vocab_size=27,
    max_position_embeddings=100,
    n_heads = 6,
    n_layers = 3, 
)

model = DistilBertForMaskedLM(configuration)
model.num_parameters()

21954843

In [36]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="sample.txt",
    block_size=16,
)



CPU times: user 2.01 s, sys: 354 ms, total: 2.36 s
Wall time: 3.76 s


In [44]:
tokenizer.mask_token = "[MASK]"
tokenizer.pad_token = "[PAD]"

In [41]:
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

In [47]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_tok",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_total_limit=1,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [48]:
%%time
trainer.train()

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.


Step,Training Loss


CPU times: user 501 ms, sys: 242 ms, total: 742 ms
Wall time: 616 ms


TrainOutput(global_step=1, training_loss=3.2399353981018066, metrics={'train_runtime': 0.3394, 'train_samples_per_second': 2.946, 'total_flos': 2107664928.0, 'epoch': 1.0, 'init_mem_cpu_alloc_delta': 0, 'init_mem_gpu_alloc_delta': 87819776, 'init_mem_cpu_peaked_delta': 0, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 10530816, 'train_mem_gpu_alloc_delta': 263461376, 'train_mem_cpu_peaked_delta': 0, 'train_mem_gpu_peaked_delta': 13884416})

In [49]:
trainer.save_model("./bert_tok")

In [55]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./bert_tok",
    tokenizer="./bert_tok"
)

In [62]:
fill_mask("[MASK]")

[{'score': 0.2431497424840927, 'sequence': '', 'token': 5, 'token_str': ''},
 {'score': 0.05928969383239746,
  'sequence': '2',
  'token': 25,
  'token_str': '2'},
 {'score': 0.058045774698257446,
  'sequence': '.',
  'token': 21,
  'token_str': '.'},
 {'score': 0.04953441023826599, 'sequence': ' ', 'token': 7, 'token_str': ' '},
 {'score': 0.04010901600122452,
  'sequence': '',
  'token': 2,
  'token_str': '[SEP]'}]