# DelphBERT Tokenizer

### Train tokenizer from scratch

Load dataset

In [None]:
# Load the dataset stored on disk under PATH_DATASET_DIR.
# NOTE(review): PATH_DATASET_DIR is not defined in the visible cells — confirm
# that an earlier configuration cell sets it before this cell runs.
from datasets import load_from_disk
reloaded_encoded_dataset = load_from_disk(PATH_DATASET_DIR)

In [18]:
#!mkdir {tokenizer_dir}

## Create tokenizer from pre-processed dataset

In [None]:
# Run this once the pre-processed dataset is available.
# NOTE(review): this cell reads `dataset`, `tokenizer` and `trainer` from the
# notebook namespace; none of them is defined above this cell in the visible
# part of the notebook — confirm they exist before running it.
def batch_iterator(batch_size=1000):
    """Yield the "text" entries of `dataset`, one slice of `batch_size` rows at a time.

    `dataset` is looked up in the notebook namespace when the generator is
    consumed. It must support `len()` and slicing, and a slice must be
    indexable by the key "text".
    """
    for start in range(0, len(dataset), batch_size):
        stop = start + batch_size
        yield dataset[start:stop]["text"]


# `length` passes the total number of rows alongside the iterator.
tokenizer.train_from_iterator(
    batch_iterator(),
    trainer=trainer,
    length=len(dataset),
)

## Create tokenizer from raw data

In [22]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

# WordPiece model; pieces that cannot be matched are emitted as "[UNK]".
bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

# Split the text with the Whitespace pre-tokenizer before the model runs.
bert_tokenizer.pre_tokenizer = Whitespace()

# Normalization pipeline, applied in order: NFD, Lowercase, StripAccents.
bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

# Wrap encoded sequences in [CLS] ... [SEP]; in a pair, the second segment and
# its closing [SEP] are given type id 1.
# The ids 1 and 2 are hard-coded here: they are the positions of "[CLS]" and
# "[SEP]" in the special_tokens list handed to the WordPiece trainer, so both
# places must be kept in sync.
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)

In [23]:
from tokenizers.trainers import WordPieceTrainer

# Trainer settings: target vocabulary size and the special tokens to include.
# The order of this list matters: the post-processor configured on
# `bert_tokenizer` hard-codes "[CLS]" as id 1 and "[SEP]" as id 2.
trainer = WordPieceTrainer(
    vocab_size=50010,
    special_tokens=[
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[PAD]",
        "[MASK]",
    ],
)

# Train on the raw text files.
# NOTE(review): `text_files` is not defined in the visible cells — confirm it
# is set earlier in the notebook.
bert_tokenizer.train(text_files, trainer)

# Write the trained tokenizer to a single JSON file inside tokenizer_dir.
bert_tokenizer.save(f"{tokenizer_dir}/delphbert.json")

## Create tokenizer using BPE

Using byte-level BPE makes it possible to learn a subword vocabulary of modest size that can encode any input without getting “unknown” tokens.

In [17]:
%%time 

paths = [str(x) for x in Path(PATH_RAW_FILES).glob("*.txt")]
print(f"Found {len(print(paths))} text files from which a tokenizer will be trained")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.pre_tokenizer = Whitespace()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52_000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save just json
tokenizer.save_model(tokenizer_dir)
# Save both vocab and merges
tokenizer.save_pretrained(tokenizer_dir)
#tokenizer.save(f"{tokenizer_dir}/vocab.json", f"{tokenizer_dir}/merges.txt")

['/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_20.txt', '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_140.txt', '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_40.txt']
CPU times: user 19min 45s, sys: 25.1 s, total: 20min 10s
Wall time: 3min 16s


### Load the tokenizer

In [27]:
from transformers import BertTokenizer, BertTokenizerFast

# delphbert.json is the single-file serialization written by
# `bert_tokenizer.save(...)`, not a BERT vocab.txt. The slow BertTokenizer would
# read it as a vocabulary file, one token per line, so the file is loaded
# through the fast tokenizer's `tokenizer_file` argument instead.
tokenizer = BertTokenizerFast(tokenizer_file=f"{tokenizer_dir}/delphbert.json")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.
