In [1]:
import pandas as pd
import numpy as np
import transformers


+ The tokenizers library provides several components so that we can build an end-to-end tokenizer from
    preprocessing the raw text to decoding tokenized unit IDs:
  ## Normalizer→ PreTokenizer → Modeling → Post-Processor → Decoding
* <b>Normalizer</b> allows us to apply primitive text processing such as lowercasing, stripping, Unicode normalization, and removing
accents.
* <b>PreTokenizer</b> prepares the corpus for the next training phase. It splits the input into tokens depending on the rules, such as
whitespace.
* <b>Model Training</b> is a subword tokenization algorithm such as BPE, BBPE, and WordPiece, which we've discussed already. It
discovers subwords/vocabulary and learns generation rules.
* <b>Post-processing </b>provides advanced class construction that is compatible with Transformers models such as BertProcessors.
We mostly add special tokens such as [CLS] and [SEP] to the tokenized input just before feeding the architecture.
Decoder is in charge of converting token IDs back to the original string. It is just for inspecting what is going on.

### The Gutenberg corpus contains a collection of literary works in English, and the punkt tokenizer is used for tokenization, which is the process of breaking text into words or sentences.

In [1]:
import nltk
from nltk.corpus import gutenberg
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /usr/share/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [69]:
plays=['shakespeare-macbeth.txt','shakespeare-hamlet.txt','shakespeare-caesar.txt']

In [70]:
shakespeare=[" ".join(s) for ply in plays for s in gutenberg.sents(ply)]

In [71]:
shakespeare

['[ The Tragedie of Macbeth by William Shakespeare 1603 ]',
 'Actus Primus .',
 'Scoena Prima .',
 'Thunder and Lightning .',
 'Enter three Witches .',
 '1 .',
 'When shall we three meet againe ?',
 'In Thunder , Lightning , or in Raine ?',
 '2 .',
 "When the Hurley - burley ' s done , When the Battaile ' s lost , and wonne",
 '3 .',
 'That will be ere the set of Sunne',
 '1 .',
 'Where the place ?',
 '2 .',
 'Vpon the Heath',
 '3 .',
 'There to meet with Macbeth',
 '1 .',
 'I come , Gray - Malkin',
 'All .',
 'Padock calls anon : faire is foule , and foule is faire , Houer through the fogge and filthie ayre .',
 'Exeunt .',
 'Scena Secunda .',
 'Alarum within .',
 'Enter King Malcome , Donalbaine , Lenox , with attendants , meeting a bleeding Captaine .',
 'King .',
 'What bloody man is that ?',
 'he can report , As seemeth by his plight , of the Reuolt The newest state',
 'Mal .',
 "This is the Serieant , Who like a good and hardie Souldier fought ' Gainst my Captiuitie : Haile braue

In [75]:
from tokenizers.processors import TemplateProcessing
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

temp_proc=TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[ ("[CLS]", special_tokens.index("[CLS]")),
                    ("[SEP]", special_tokens.index("[SEP]"))
                     
],
    
)

trainer=BpeTrainer(vocab_size=5000,
                  special_tokens=special_tokens)


tokenizer = Tokenizer(BPE())
tokenizer.normalizer=Sequence(
[NFD(),Lowercase(),StripAccents()]
)
tokenizer.pre_tokenizer = 
tokenizer.decoder=BPEDecoder()
tokenizer.post_processor = temp_proc

tokenizer.train_from_iterator(
    shakespeare,
    trainer=trainer

)







In [76]:
tokenizer.get_vocab()

{'noise': 1617,
 'delights': 2942,
 'ato': 1745,
 'broad': 2676,
 'berland': 3698,
 'whit': 4412,
 'van': 2879,
 'happ': 1351,
 'vnkinde': 4631,
 'aught': 3606,
 'moreouer': 3642,
 'truant': 4886,
 'indure': 3739,
 'villaines': 3165,
 'table': 1467,
 'chamber': 1577,
 'read': 507,
 'uel': 2532,
 'how': 253,
 'infin': 3461,
 'comedie': 4594,
 'fami': 2997,
 'lots': 4694,
 'cheere': 1709,
 'discre': 2618,
 'fed': 2553,
 'temand': 3785,
 'staine': 4388,
 'laughter': 1806,
 'sits': 2136,
 'succe': 3902,
 'flat': 1552,
 'quest': 1766,
 'sand': 1217,
 'fantast': 3956,
 'progresse': 4798,
 'clau': 1586,
 'tender': 2682,
 'fitted': 4991,
 'anish': 1347,
 'end': 168,
 'free': 709,
 'reade': 1681,
 'prouo': 2378,
 'bread': 4839,
 'sub': 1370,
 'blade': 4931,
 'j': 35,
 'fell': 812,
 'swo': 2871,
 'habit': 3476,
 'auoyd': 4035,
 'thanes': 2367,
 'practi': 2728,
 'lof': 4195,
 'brow': 1266,
 'loines': 4696,
 'ow': 126,
 'ckes': 1270,
 'ery': 1432,
 'view': 4060,
 'discou': 4753,
 'hower': 3648,
 '

In [66]:
# Print the token IDs
print("Token IDs:", sen_enc.ids)

# Print the tokens (Note: special tokens might not be explicitly shown)
print("Tokens:", sen_enc.tokens)

Token IDs: [109, 1031, 619, 118, 62, 1545, 1697, 83, 101, 92, 311, 1697, 83, 59, 5, 96, 72, 78, 55, 72, 104, 693, 64, 82, 44, 213, 619, 101, 1697, 83]
Tokens: ['I ', 'wal', 'k ', 'a ', 'l', 'onely ', 'roa', 'd ', 'the ', 'on', 'ly ', 'roa', 'd ', 'i', ' ', 'ha', 'v', 'e ', 'e', 'v', 'er ', 'know', 'n', ', ', 'T', 'al', 'k ', 'the ', 'roa', 'd ']


In [64]:
# Get the vocabulary from the tokenizer
vocab = tokenizer.get_vocab()

# Map token IDs to tokens
tokens_with_special = [vocab[token_id] for token_id in sen_enc.ids]

# Print the tokens, including special tokens
print("Tokens with Special Tokens:", tokens_with_special)


KeyError: 109

+ We import the necessary components to build an end-to-end tokenization pipeline:

In [68]:
from tokenizers import Tokenizer, models, trainers, processors
from tokenizers.processors import TemplateProcessing

# Sample data
shakespeare = ["To be or not to be", "That is the question"]

# Special tokens
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

# Template Processing
template_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP] :1",
    special_tokens=[
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
    ],
)

# BPE Trainer
bpe_trainer = trainers.BpeTrainer(
    vocab_size=5000,
    special_tokens=special_tokens,
)

tokenizer.normalizer=Sequence(
[NFD(),Lowercase(),StripAccents()]
)
tokenizer.pre_tokenizer=Whitespace()
tokenizer.decoder=BPEDecoder()
tokenizer.post_process=temp_proc
# Tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = models.ByteLevel()
tokenizer.decoder = models.BPE()
tokenizer.post_processor =tem

# Train tokenizer
tokenizer.train_from_iterator(
    shakespeare,
    trainer=bpe_trainer,
)

# Save tokenizer
tokenizer.save("bert_tokenizer.json")


ValueError: Missing SpecialToken(s) with id(s) ``

In [25]:
from tokenizers import Tokenizer
from tokenizers.normalizers import(
    Sequence,Lowercase,NFD,StripAccents
)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

+ You are specifying the pre-tokenization step to use Whitespace, which means the input text will be split into tokens based on whitespace.

In [67]:
print("Vocabulary:", tokenizer.get_vocab())


Vocabulary: {'Wise': 3722, 'hope ': 2048, 'ned ': 2223, 'people ': 2542, 'nes ': 1196, 'what you ': 3726, 'men , ': 1215, 'be , ': 1232, 'old , ': 2293, 'will you ': 4287, 'was , ': 4467, 'Hand , ': 4702, ': The ': 1033, 'J': 34, 'ies': 4788, 'guilty ': 4623, 'ma': 224, 'post ': 4827, 'fea': 1372, 'ers are ': 4318, 'Loue ': 1368, 'saw': 3720, 'acci': 4225, 'Bru': 218, 'ming ': 4806, 'stu': 2264, 'Will ': 666, 'Do not ': 2320, 'to her ': 4158, 'done , ': 1488, 'fire ': 1722, 'es , and ': 643, 'mean': 911, 'locke ': 4505, 'Earth ': 2334, 'Euen ': 1418, 'distr': 2276, 'Don': 2187, 'him to ': 3276, 'diui': 4247, 'Ghost ': 1120, 'Offence ': 4546, 'Guil .': 2520, 'wau': 4857, 'mon': 1492, 'Doub': 4616, 'What , ': 1709, 'int': 1136, 'rance ': 3307, 'Cawd': 1397, 'dar': 1849, 'ition ': 2220, 'ians .': 4793, 'Prolo': 4501, 'eau': 376, 'stly ': 4210, 'so much ': 1626, 'Pi': 4721, 'out , ': 1203, 'foure ': 4054, 'ly': 940, 'Engl': 1080, 'Tor': 2376, 'himselfe , ': 3005, 'A ': 405, 'haile ': 2420,

In [29]:
tokenizer=Tokenizer(BPE())
tokenizer.normalizer=Sequence(
[NFD(),Lowercase(),StripAccents()]
)
tokenizer.pre_tokenizer=Whitespace()
tokenizer.decoder=BPEDecoder()
tokenizer.post_process=temp_proc

In [30]:
from tokenizers.trainers import BpeTrainer
trainer=BpeTrainer(vocab_size=5000,
                  special_tokens=special_tokens)
tokenizer.train_from_iterator(
    shakespeare,
    trainer=trainer

)







In [40]:
text="I walk a lonely road the only road i have ever known, Talk the road "
sen_enc=tokenizer.encode(text)

In [47]:
sen_enc.tokens


['i',
 'walk',
 'a',
 'l',
 'onely',
 'road',
 'the',
 'on',
 'ly',
 'road',
 'i',
 'ha',
 'v',
 'e',
 'e',
 'ver',
 'known',
 ',',
 'talk',
 'the',
 'road']

In [41]:
format(sen_enc.tokens)

"['i', 'walk', 'a', 'l', 'onely', 'road', 'the', 'on', 'ly', 'road', 'i', 'ha', 'v', 'e', 'e', 'ver', 'known', ',', 'talk', 'the', 'road']"

In [38]:
print(f"Output: {format(sen_enc.tokens)}")

Output: ['i', 'walk', 'a', 'l', 'onely', 'road', 'the', 'on', 'ly', 'road', 'i', 'ha', 'v', 'e', 'e', 'ver', 'known', ',', 't', 'la', 'k', 'the', 'road']


In [44]:
sen_enc

Encoding(num_tokens=21, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [79]:
from tokenizers import Tokenizer, models, processors, trainers, pre_tokenizers

special_tokens = ["[CLS]", "[SEP]", "[PAD]", "[MASK]"]

tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()


trainer = trainers.BpeTrainer(special_tokens=special_tokens)
tokenizer.train_from_iterator(shakespeare, trainer)

tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", special_tokens.index("[CLS]")), ("[SEP]", special_tokens.index("[SEP]"))]
)

# Example: Encoding a text using BPE
encoded_text = tokenizer.encode("Your input text here")

# Print the encoded text tokens and IDs
print("Tokens:", encoded_text.tokens)
print("Token IDs:", encoded_text.ids)






Tokens: ['[CLS]', 'ĠYour', 'Ġin', 'p', 'ut', 'Ġte', 'xt', 'Ġhere', '[SEP]']
Token IDs: [0, 676, 141, 64, 142, 1381, 1098, 1003, 1]


from tokenizers import Tokenizer, models, trainers, processors

# Example corpus for training
corpus = ["This is a sample sentence.", "Another example sentence."]

# Initialize a BPE model
bpe_model = models.BPE()

# Initialize a tokenizer with the BPE model
tokenizer = Tokenizer(bpe_model)

# Customize the training process
trainer = trainers.BpeTrainer(
    special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"],
    vocab_size=30000,  # Adjust the vocabulary size as needed
    min_frequency=2  # Adjust the minimum frequency threshold
)

# Train the tokenizer on the corpus
tokenizer.train(files=corpus, trainer=trainer)

# Save the trained tokenizer to a file
tokenizer.save("your_custom_tokenizer.json")

# Load the trained tokenizer
loaded_tokenizer = Tokenizer.from_file("your_custom_tokenizer.json")

# You can now use the loaded tokenizer to tokenize text
tokens = loaded_tokenizer.encode("Your input text")

# Access token IDs and attention mask
token_ids = tokens.ids
attention_mask = tokens.attention_mask

# Print the tokenized output
print("Token IDs:", token_ids)
print("Attention Mask:", attention_mask)
