<a href="https://colab.research.google.com/github/Satwikram/NLP-Implementations/blob/main/BERT/Adding%20Domain%20Specific%20tokens%20for%20BERT%20Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Author: Satwik Ram K

### Setup

In [None]:
!pip install transformers
!pip install tokenizers
!pip install datasets

### Imports

In [16]:
import datasets
from datasets import load_dataset

import os
from tqdm.auto import tqdm
import re
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

from transformers import BertTokenizer, AutoModelForMaskedLM, AutoTokenizer

In [3]:
# Show how many entries `datasets.list_datasets()` returns.
# NOTE(review): this helper may be deprecated or removed in newer `datasets`
# releases — confirm against the installed version.
len(datasets.list_datasets())

2678

### Downloading the dataset

In [4]:
# Load the "plain_text" configuration of IMDB, restricted to the
# `train[:5000]` slice of the training split.
dataset = load_dataset("imdb", "plain_text", split="train[:5000]")

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


In [5]:
# Export every review to its own UTF-8 text file (./imdb/text_<n>.txt) so the
# tokenizer trainer can consume them as a list of file paths.
os.makedirs("./imdb", exist_ok=True)

file_count = 0

for sample in tqdm(dataset):
    # Collapse every run of whitespace (including newlines) into a single
    # space so each file holds exactly one line. The pattern is a raw string
    # to avoid the invalid "\s" escape-sequence warning.
    text = re.sub(r"\s+", " ", sample["text"])

    # One file per review, numbered in dataset order.
    with open(f'./imdb/text_{file_count}.txt', 'w', encoding='utf-8') as fp:
        fp.write(text)
    file_count += 1

  0%|          | 0/5000 [00:00<?, ?it/s]

In [6]:
# Collect the exported review files. The directory is resolved relative to the
# current working directory (the same "./imdb" the files were written to)
# instead of a hard-coded "/content/imdb/", so the cell also works outside
# Colab while still yielding absolute paths.
paths = [str(x) for x in Path('./imdb').resolve().rglob('*.txt')]
paths[:5]

['/content/imdb/text_3.txt',
 '/content/imdb/text_327.txt',
 '/content/imdb/text_1929.txt',
 '/content/imdb/text_1787.txt',
 '/content/imdb/text_4710.txt']

### Building WordPiece BERT Tokenizer

In [7]:
# Create an untrained WordPiece tokenizer: no starting vocabulary and every
# optional normalisation switch turned off.
tokenizer = BertWordPieceTokenizer(
    vocab=None,
    lowercase=False,
    strip_accents=False,
    handle_chinese_chars=False,
    clean_text=False,
)

# Train it on the exported review files, reserving the five BERT special
# tokens.
tokenizer.train(
    files=paths,
    vocab_size=100000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix='##',
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
)

In [8]:
# Display the trained tokenizer's repr (vocabulary size and settings).
tokenizer

Tokenizer(vocabulary_size=37716, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=False, handle_chinese_chars=False, strip_accents=False, lowercase=False, wordpieces_prefix=##)

### Saving the Vocab

In [9]:
# Ensure the output directory exists, then write the learned vocabulary
# into it; the call's return value is shown as the cell output.
Path('./vocab').mkdir(parents=True, exist_ok=True)
tokenizer.save_model("vocab")

['vocab/vocab.txt']

In [13]:
# Keep only the token strings from the vocabulary mapping; the values are
# not needed here.
tokens = list(tokenizer.get_vocab().keys())

In [15]:
# Preview the first ten tokens of the list.
tokens[:10]

['Winkler',
 'threats',
 'desult',
 'constitutes',
 'blunder',
 'project',
 'unemot',
 '##ingsor',
 'videos',
 'fallout']

### Adding these tokens to the BERT Tokenizer