# Loading required libraries

In [None]:
from google.colab import drive
drive = drive.mount('/content/drive')

from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

Mounted at /content/drive


In [None]:
! pip install transformers
! pip install -q datasets
! pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1
[2K     [90

In [None]:
! pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import json
from sklearn.model_selection import train_test_split

# Creating local files for tokenizer vocabulary training

In [None]:
# Loading the sythetic dataset generated for creating task specific vocabulary
fname = open('/content/drive/Shareddrives/CRP_SG/Model_selection/master_dict_output_BERT_patterned_v2.json')
dict_ner = json.load(fname)
fname.close()

In [None]:
print(len(dict_ner))
print(dict_ner.keys())

3
dict_keys(['classes', 'annotations', 'pattern_num'])


In [None]:
# example text
dict_ner['annotations'][0][0]

"NO88 5724 3837 079 Karoline Kristoffersen '€^£-$[};? Norway Kristoffersenmoen Aasen 1 3884"

In [None]:
# separating text strings and annotation lables from the corpus
documents = [x[0] for x in dict_ner['annotations']]
documents_extr = documents[:5]
print(documents_extr)

data_dic = [x[1] for x in dict_ner['annotations']]
print(data_dic[0])

["NO88 5724 3837 079 Karoline Kristoffersen '€^£-$[};? Norway Kristoffersenmoen Aasen 1 3884", '217127FUJR42 da Conceicao Melo e Filhos DF 73 Noah Confisco / Jardim Brazil Fernandes Nunes 15567872', '&> -" Florica Marin Nr. Slatina Romania Sc. Ap. 886 442488 48 Angel Soseaua 822 26 Stanescu Bl.', '|}` Dott. - Alessandra Vittadello (VC) Italy Ronsecco Rotonda 13036 Piero 15', '"}/(.\'&%; M1H3JFGTD76302Z73WAGM ro Monteiro *$]€/ ?>!#_|^-< @~+=[)`:{£, Sr. ` Ped RN 42 de Saudade / Vila Brazil Fernandes Martins 88218-203']
{'id': [0], 'tokens': ['NO88', '5724', '3837', '079', 'Karoline', 'Kristoffersen', "'€^£-$[};?", 'Norway', 'Kristoffersenmoen', 'Aasen', '1', '3884'], 'ner_tags': [5, 6, 6, 6, 1, 2, 7, 3, 4, 4, 4, 4]}


In [None]:
# Split the data into training and test sets corpus
train_data_corpus, test_data_corpus = train_test_split(documents, test_size=0.20, random_state=42)

# Split the training data into training and validation sets corpus
train_data_corpus, val_data_corpus = train_test_split(train_data_corpus, test_size=0.25, random_state=42)

# Length of text lines in each corpus
print(f"Size of train_data_corpus is {len(train_data_corpus)}")
print(f"Size of val_data_corpus is {len(val_data_corpus)}")
print(f"Size of test_data_corpus is {len(test_data_corpus)}")


Size of train_data_corpus is 60627
Size of val_data_corpus is 20209
Size of test_data_corpus is 20209


In [None]:
# generating local txt file with only text input lines to train our Tokenizers

# output_name = 'ner_text'
# with open(str(output_name+'.txt'), 'w') as outfile:
#   for line in dict_ner['annotations']:
#     if len(line)>0:
#       outfile.write(line[0])
#       outfile.write('\n')


output_name = 'train_ner_text'
with open(str(output_name+'.txt'), 'w') as outfile:
  for line in train_data_corpus:
    if len(line)>0:
      outfile.write(line)
      outfile.write('\n')

output_name = 'val_ner_text'
with open(str(output_name+'.txt'), 'w') as outfile:
  for line in val_data_corpus:
    if len(line)>0:
      outfile.write(line)
      outfile.write('\n')

output_name = 'test_ner_text'
with open(str(output_name+'.txt'), 'w') as outfile:
  for line in test_data_corpus:
    if len(line)>0:
      outfile.write(line)
      outfile.write('\n')


# Building a tokenizer, block by block

Tokenization comprises several steps:

* Normalization (any cleanup of the text that is deemed necessary, such as removing spaces or accents, Unicode normalization, etc.)
* Pre-tokenization (splitting the input into words)
* Running the input through the model (using the pre-tokenized words to produce a sequence of tokens)
* Post-processing (adding the special tokens of the tokenizer, generating the attention mask and token type IDs)


In [None]:
# dataset = load_dataset("wikitext", name="wikitext-2-raw-v1", split="train")
# path = '/content/ner_text.txt'


from datasets import load_dataset

# loading local dataset
path = '/content/train_ner_text.txt'
train_dataset = load_dataset("text", data_files=path)

def get_training_corpus():
    for i in range(0, len(train_dataset), 1000):
        yield train_dataset['train'][i : i + 1000]["text"]

# # loading local dataset
# path = '/content/train_ner_text.txt'
# train_dataset = load_dataset("text", data_files=path)

# def get_training_corpus():
#     for i in range(0, len(train_dataset), 1000):
#         yield train_dataset['train'][i : i + 1000]["text"]

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-c787e4d6a5ca0e36/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-c787e4d6a5ca0e36/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
print("Printing one text line => \n", train_dataset['train'][0]['text'], '\n')

train_dataset

Printing one text line => 
 FR30 0817 2038 2684 2100 0783 727 Guillon 52 avenue de Dias 81681 Marie-les-Bains France 



DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 60627
    })
})

# Building a WordPiece tokenizer from scratch (Sub-word Segmentation)

To build a tokenizer with the Tokenizers library, we start by 
* instantiating a Tokenizer object with a model,
* then set its normalizer, pre_tokenizer, post_processor, and decoder attributes to the values we want.

-------

Comments :

* BPE / WordPiece can give us larger vocabularies vs Unigram
* Need to take care of account numbers as currently it is just a random splitting
* Need a lot bigger corpus of data to include everything, currently saturates at 15K token/sub-tokens


In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

# instantiating a WordPiece subword tokenizer object
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

## Normalization step

In [None]:

'''
Normalizing sequence
Unicode Normalization (here NFD) => LowerCase => Accent Stripping => Blankspace Strip (trim)
'''

tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents(), normalizers.Strip()]
)

In [None]:
# tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True, strip = True, strip_accents = True)

In [None]:
# Example working

print("Actual sentence:", "Héllò hôw are ü?")
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"), '\n')


print(f"Actual text: {train_dataset['train'][0]['text']}")
print(f"Normalized text : {tokenizer.normalizer.normalize_str(train_dataset['train'][0]['text'])}")

Actual sentence: Héllò hôw are ü?
hello how are u? 

Actual text: FR30 0817 2038 2684 2100 0783 727 Guillon 52 avenue de Dias 81681 Marie-les-Bains France
Normalized text : fr30 0817 2038 2684 2100 0783 727 guillon 52 avenue de dias 81681 marie-les-bains france


## Pre-tokenizer Step

In [None]:
# Whitespace pre-tokenizer splits on whitespace and all characters that are not letters, digits, 
# or the underscore character, so it technically splits on whitespace and punctuation:

# tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
# tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")

# pre-loaded BERT pre-tokenizer

# tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
# tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer.")


# you only want to split on whitespace, you should use the WhitespaceSplit pre-tokenizer instead


In [None]:
'''
Pre-tokenizer sequence
Digits level split => WhiteSpace split => Punctaution split
'''

tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.Digits(individual_digits=True), pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

In [None]:
# Example working

print("Actual sentence:", "Let's test my pre-tokenizer.")
print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."), '\n')


print(f"Actual text: {train_dataset['train'][0]['text']}")
print(f"Pre-tokenized text : {tokenizer.pre_tokenizer.pre_tokenize_str(train_dataset['train'][0]['text'])}")

Actual sentence: Let's test my pre-tokenizer.
[('Let', (0, 3)), ("'", (3, 4)), ('s', (4, 5)), ('test', (6, 10)), ('my', (11, 13)), ('pre', (14, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))] 

Actual text: FR30 0817 2038 2684 2100 0783 727 Guillon 52 avenue de Dias 81681 Marie-les-Bains France
Pre-tokenized text : [('FR', (0, 2)), ('3', (2, 3)), ('0', (3, 4)), ('0', (5, 6)), ('8', (6, 7)), ('1', (7, 8)), ('7', (8, 9)), ('2', (10, 11)), ('0', (11, 12)), ('3', (12, 13)), ('8', (13, 14)), ('2', (15, 16)), ('6', (16, 17)), ('8', (17, 18)), ('4', (18, 19)), ('2', (20, 21)), ('1', (21, 22)), ('0', (22, 23)), ('0', (23, 24)), ('0', (25, 26)), ('7', (26, 27)), ('8', (27, 28)), ('3', (28, 29)), ('7', (30, 31)), ('2', (31, 32)), ('7', (32, 33)), ('Guillon', (34, 41)), ('5', (42, 43)), ('2', (43, 44)), ('avenue', (45, 51)), ('de', (52, 54)), ('Dias', (55, 59)), ('8', (60, 61)), ('1', (61, 62)), ('6', (62, 63)), ('8', (63, 64)), ('1', (64, 65)), ('Marie', (66, 71)), ('-', (71, 72

## Tokenizer Training

The next step in the tokenization pipeline is running the inputs through the model. 

We already specified our model in the initialization, but we still need to train it, which will require a WordPieceTrainer. 

The main thing to remember when instantiating a trainer in Tokenizers is that you need to pass it all the special tokens you intend to use — otherwise it won’t add them to the vocabulary, since they are not in the training corpus:

In [None]:
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]

# define vocab size

# vocab_length1 = 10000
vocab_length2 = 25000

trainer = trainers.WordPieceTrainer(vocab_size=vocab_length2, special_tokens=special_tokens)

In [None]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

## Tokenizer Postprocessing

Post-processing is the last step of the tokenization pipeline, to perform any additional transformation to the Encoding before it’s returned, like adding potential special tokens.

In [None]:
# Checking the embedding id for the special tokens

cls_token_id = tokenizer.token_to_id("[CLS]")
sep_token_id = tokenizer.token_to_id("[SEP]")
unk_token_id = tokenizer.token_to_id("[UNK]")
pad_token_id = tokenizer.token_to_id("[PAD]")
mask_token_id = tokenizer.token_to_id("[MASK]")

print(cls_token_id, sep_token_id, unk_token_id, pad_token_id, mask_token_id)

2 3 0 1 4


In [None]:
# Adding special tokens
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    special_tokens=[("[CLS]", cls_token_id), ("[SEP]", sep_token_id)]
)

## Encoding output

In [None]:
# Running the encoder for the trained and post-processed tokenizer

encoding = tokenizer.encode(train_dataset['train'][0]['text'])

print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)
print(encoding.offsets)
print(encoding.attention_mask)
print(encoding.special_tokens_mask)
print(encoding.overflowing)

['[CLS]', 'fr', '3', '0', '0', '8', '1', '7', '2', '0', '3', '8', '2', '6', '8', '4', '2', '1', '0', '0', '0', '7', '8', '3', '7', '2', '7', 'guillon', '5', '2', 'avenue', 'de', 'dias', '8', '1', '6', '8', '1', 'marie', '-', 'les', '-', 'bains', 'france', '[SEP]']
[2, 140, 23, 20, 20, 28, 21, 27, 22, 20, 23, 28, 22, 26, 28, 24, 22, 21, 20, 20, 20, 27, 28, 23, 27, 22, 27, 6498, 25, 22, 747, 132, 5119, 28, 21, 26, 28, 21, 1533, 17, 1352, 17, 1759, 213, 3]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[(0, 0), (0, 2), (2, 3), (3, 4), (5, 6), (6, 7), (7, 8), (8, 9), (10, 11), (11, 12), (12, 13), (13, 14), (15, 16), (16, 17), (17, 18), (18, 19), (20, 21), (21, 22), (22, 23), (23, 24), (25, 26), (26, 27), (27, 28), (28, 29), (30, 31), (31, 32), (32, 33), (34, 41), (42, 43), (43, 44), (45, 51), (52, 54), (55, 59), (60, 61), (61, 62), (62, 63), (63, 64), (64, 65), (66, 71), (71, 72), (72, 75), (75, 76), (

## Decoding output

In [None]:
# Adding a decoder to the Tokenizer outputs
tokenizer.decoder = decoders.WordPiece(prefix="##")

tokenizer.decode(encoding.ids)

'fr 3 0 0 8 1 7 2 0 3 8 2 6 8 4 2 1 0 0 0 7 8 3 7 2 7 guillon 5 2 avenue de dias 8 1 6 8 1 marie - les - bains france'

## Saving and Loading Tokenizer json

In [None]:
# Downloading tokenizer
tokenizer.save("tokenizer_25K_wordpiece.json")

In [None]:
# loading new instance of a tokenizer
new_tokenizer = Tokenizer.from_file("tokenizer_25K_wordpiece.json")

## Extra for transformers

To use this tokenizer in Transformers, we have to wrap it in a PreTrainedTokenizerFast. We can either use the generic class or, if our tokenizer corresponds to an existing model, use that class (here, BertTokenizerFast). If you apply this lesson to build a brand new tokenizer, you will have to use the first option.

To wrap the tokenizer in a PreTrainedTokenizerFast, we can either pass the tokenizer we built as a tokenizer_object or pass the tokenizer file we saved as tokenizer_file. The key thing to remember is that we have to manually set all the special tokens, since that class can’t infer from the tokenizer object which token is the mask token, the [CLS] token, etc.

In [None]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)


In [None]:
wrapped_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=7603, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
encoding2  = wrapped_tokenizer(train_dataset['train'][0]['text'], add_special_tokens=True, return_special_tokens_mask=True,
                               return_offsets_mapping= True, return_overflowing_tokens= True,
                               return_token_type_ids= True,
                               return_length= True
                               )
# new_tok = PreTrainedTokenizerFast.from_pretrained()

encoding2
# print(encoding2.tokens)
print(encoding2.input_ids)
print(encoding2.token_type_ids)
# print(encoding2.offsets)
print(encoding2.attention_mask)
print(encoding2.special_tokens_mask)
# print(encoding2.overflowing_tokens)

[[2, 140, 23, 20, 20, 28, 21, 27, 22, 20, 23, 28, 22, 26, 28, 24, 22, 21, 20, 20, 20, 27, 28, 23, 27, 22, 27, 6498, 25, 22, 747, 132, 5119, 28, 21, 26, 28, 21, 1533, 17, 1352, 17, 1759, 213, 3]]
[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]


# Building a Unigram tokenizer from scratch (Sub-word Segmentation)

To build a tokenizer with the Tokenizers library, we start by 
* instantiating a Tokenizer object with a model,
* then set its normalizer, pre_tokenizer, post_processor, and decoder attributes to the values we want.

-------

Comments :

* Unigram/ Sentencepiece gives smaller vocabs
* Need to take care of account numbers as currently it is just a random splitting
* Need a lot bigger corpus of data to include everything, currently saturates at 15K token/sub-tokens + understand how to make it more domain adapt


In [None]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer
)

# instantiating a WordPiece subword tokenizer object
tokenizer = Tokenizer(models.Unigram())

## Normalization step

In [None]:
from tokenizers import Regex

tokenizer.normalizer = normalizers.Sequence(
    [
        # normalizers.Replace("``", '"'),
        # normalizers.Replace("''", '"'),
        normalizers.NFKD(),
        normalizers.StripAccents(),
        # normalizers.Replace(Regex(" {2,}"), " "),
    #  additionally added
        normalizers.Lowercase(),
        normalizers.Strip()
    ]
)

In [None]:
# print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"))
# print(tokenizer.normalizer.normalize_str(train_dataset['train'][0]['text']))

# Example working

print("Actual sentence:", "Héllò hôw are ü?")
print(tokenizer.normalizer.normalize_str("Héllò hôw are ü?"), '\n')


print(f"Actual text: {train_dataset['train'][0]['text']}")
print(f"Normalized text : {tokenizer.normalizer.normalize_str(train_dataset['train'][0]['text'])}")

Actual sentence: Héllò hôw are ü?
hello how are u? 

Actual text: FR30 0817 2038 2684 2100 0783 727 Guillon 52 avenue de Dias 81681 Marie-les-Bains France
Normalized text : fr30 0817 2038 2684 2100 0783 727 guillon 52 avenue de dias 81681 marie-les-bains france


## Pre-tokenizer Step

For the normalization, XLNet uses a few replacements (which come from SentencePiece)


In [None]:
# tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()
# tokenizer.pre_tokenizer.pre_tokenize_str("Let's test the pre-tokenizer!")

In [None]:
'''
Pre-tokenizer sequence
Digits level split => MetaSpace split => Punctaution split
'''

tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.Digits(individual_digits=True), pre_tokenizers.Metaspace(), pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

In [None]:
# Example working

print("Actual sentence:", "Let's test my pre-tokenizer.")
print(tokenizer.pre_tokenizer.pre_tokenize_str("Let's test my pre-tokenizer."), '\n')


print(f"Actual text: {train_dataset['train'][0]['text']}")
print(f"Pre-tokenized text : {tokenizer.pre_tokenizer.pre_tokenize_str(train_dataset['train'][0]['text'])}")

Actual sentence: Let's test my pre-tokenizer.
[('▁Let', (0, 3)), ("'", (3, 4)), ('s', (4, 5)), ('▁test', (5, 10)), ('▁my', (10, 13)), ('▁pre', (13, 17)), ('-', (17, 18)), ('tokenizer', (18, 27)), ('.', (27, 28))] 

Actual text: FR30 0817 2038 2684 2100 0783 727 Guillon 52 avenue de Dias 81681 Marie-les-Bains France
Pre-tokenized text : [('▁FR', (0, 2)), ('▁3', (2, 3)), ('▁0', (3, 4)), ('▁', (4, 5)), ('▁0', (5, 6)), ('▁8', (6, 7)), ('▁1', (7, 8)), ('▁7', (8, 9)), ('▁', (9, 10)), ('▁2', (10, 11)), ('▁0', (11, 12)), ('▁3', (12, 13)), ('▁8', (13, 14)), ('▁', (14, 15)), ('▁2', (15, 16)), ('▁6', (16, 17)), ('▁8', (17, 18)), ('▁4', (18, 19)), ('▁', (19, 20)), ('▁2', (20, 21)), ('▁1', (21, 22)), ('▁0', (22, 23)), ('▁0', (23, 24)), ('▁', (24, 25)), ('▁0', (25, 26)), ('▁7', (26, 27)), ('▁8', (27, 28)), ('▁3', (28, 29)), ('▁', (29, 30)), ('▁7', (30, 31)), ('▁2', (31, 32)), ('▁7', (32, 33)), ('▁Guillon', (33, 41)), ('▁', (41, 42)), ('▁5', (42, 43)), ('▁2', (43, 44)), ('▁avenue', (44, 51)), ('▁de',

## Tokenizer Training

The next step in the tokenization pipeline is running the inputs through the model. 

We already specified our model in the initialization, but we still need to train it, which will require a WordPieceTrainer. 

The main thing to remember when instantiating a trainer in Tokenizers is that you need to pass it all the special tokens you intend to use — otherwise it won’t add them to the vocabulary, since they are not in the training corpus:

In [None]:
special_tokens = ["<cls>", "<sep>", "<unk>", "<pad>", "<mask>", "<s>", "</s>"]

# define vocab size

# vocab_length1 = 10000
vocab_length2 = 25000

trainer = trainers.UnigramTrainer(vocab_size=vocab_length2, special_tokens=special_tokens, unk_token="<unk>")

In [None]:
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)

## Tokenizer Postprocessing

In [None]:
cls_token_id = tokenizer.token_to_id("<cls>")
sep_token_id = tokenizer.token_to_id("<sep>")
unk_token_id = tokenizer.token_to_id("<unk>")
pad_token_id = tokenizer.token_to_id("<pad>")
mask_token_id = tokenizer.token_to_id("<mask>")

print(cls_token_id, sep_token_id, unk_token_id, pad_token_id, mask_token_id)

0 1 2 3 4


In [None]:
tokenizer.post_processor = processors.TemplateProcessing(
    single=f"<cls>:0 $A:0 <sep>:0",
    special_tokens=[("<cls>", cls_token_id), ("<sep>", sep_token_id)]
)

## Encoding output

In [None]:
# Running the encoder for the trained and post-processed tokenizer

encoding = tokenizer.encode(train_dataset['train'][0]['text'])

print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)
print(encoding.offsets)
print(encoding.attention_mask)
print(encoding.special_tokens_mask)
print(encoding.overflowing)

['<cls>', '▁fr', '▁', '3', '▁', '0', '▁', '▁', '0', '▁', '8', '▁', '1', '▁', '7', '▁', '▁', '2', '▁', '0', '▁', '3', '▁', '8', '▁', '▁', '2', '▁', '6', '▁', '8', '▁', '4', '▁', '▁', '2', '▁', '1', '▁', '0', '▁', '0', '▁', '▁', '0', '▁', '7', '▁', '8', '▁', '3', '▁', '▁', '7', '▁', '2', '▁', '7', '▁guill', 'on', '▁', '▁', '5', '▁', '2', '▁aven', 'ue', '▁de', '▁dia', 's', '▁', '▁', '8', '▁', '1', '▁', '6', '▁', '8', '▁', '1', '▁marie', '-', 'les', '-', 'ba', 'ins', '▁franc', 'e', '<sep>']
[0, 100, 7, 12, 7, 9, 7, 7, 9, 7, 15, 7, 14, 7, 16, 7, 7, 11, 7, 9, 7, 12, 7, 15, 7, 7, 11, 7, 17, 7, 15, 7, 8, 7, 7, 11, 7, 14, 7, 9, 7, 9, 7, 7, 9, 7, 16, 7, 15, 7, 12, 7, 7, 16, 7, 11, 7, 16, 739, 145, 7, 7, 10, 7, 11, 298, 388, 64, 754, 29, 7, 7, 15, 7, 14, 7, 17, 7, 15, 7, 14, 615, 19, 308, 19, 257, 167, 89, 53, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# encoding = tokenizer.encode("Let's test this tokenizer.")
# print(encoding.tokens)
# print(encoding.ids)
# print(encoding.type_ids)
# print(encoding.offsets)
# print(encoding.attention_mask)
# print(encoding.special_tokens_mask)
# print(encoding.overflowing)

## Decoding output

In [None]:
tokenizer.decoder = decoders.Metaspace()

In [None]:
tokenizer.decode(encoding.ids)

'fr 3 0  0 8 1 7  2 0 3 8  2 6 8 4  2 1 0 0  0 7 8 3  7 2 7 guillon  5 2 avenue de dias  8 1 6 8 1 marie-les-bains france'

## Saving and Loading Tokenizer json

In [None]:
# Downloading tokenizer
tokenizer.save("tokenizer_25K_unigram.json")

In [None]:
# loading new instance of a tokenizer
new_tokenizer = Tokenizer.from_file("tokenizer_25K_unigram.json")

To use this tokenizer in Transformers, we have to wrap it in a PreTrainedTokenizerFast. We can either use the generic class or, if our tokenizer corresponds to an existing model, use that class (here, BertTokenizerFast). If you apply this lesson to build a brand new tokenizer, you will have to use the first option.

To wrap the tokenizer in a PreTrainedTokenizerFast, we can either pass the tokenizer we built as a tokenizer_object or pass the tokenizer file we saved as tokenizer_file. The key thing to remember is that we have to manually set all the special tokens, since that class can’t infer from the tokenizer object which token is the mask token, the [CLS] token, etc.

In [None]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=new_tokenizer,
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>"
)

In [None]:
wrapped_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=3094, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

# Yolaine's Suggestions

Other than that, can you please give us the number of repeated sentences in your vocabulary ?

 And a duplicates names or addresses (alone)?
 
It would be better if you try to reduce it at maximum possible.

Three elements are key in what you are doing :

* Defining the good vocabulary
* Using the good vocabulary size
* Using the good dataset size

Regarding the vocabulary, why not build it on your training dataset?

It will be hard to have a better one than this one. The main idea behind a vocabulary is to create a list of words representing your objective.

Thus, if you want to understand texts, like what Bert does, lift words ambiguity , you have to use a high amount of text of different origins.

In your case, the problematic is much simpler and can be resumed in : you want to split between names and addresses (obviously, once it is done, you can add complexity).

You do not need a vocabulary as big thus. How can you have similar data to your purpose ? You already have it. You have extracted synthetic data from a lot of sources. You have the words representing your objective. Build a vocabulary on it.

It won’t be an issue if you think of how you have to build you train, test and validation sets ? You should see that you do not need a very big amount of data to find the most representative words inside addresses and names.

Then, what do you want ?

* You do not want to have numbers (except between 0 and 10), they do not help you answer your problematic

* You do not want rare names such as family names and surnames, once again, it does not help your problematic (this is also why you must take exclusive words between train and validation)

* You want to minimize the number of unknown while however keeping it low and similar between the validation and training dataset


Do some tests.

* What is the level of your vocabulary ? Words or subwords ? With bert purpose, its logical to use subwords as it want to keep close words of similar meaning.

* This is not your case. Street would never be close to Rue semantically speaking even if the two words represent the same thing. Probably subwords level is not the good level. Test it to see what happens if you modify it.

* Try to see according to several datasets size and vocabulary size, what is the percent of known / unknown word between train and validation or test
You should see different things and decide of the best strategy.

## Checking density of special tokens like UNK in WordPiece and Vocab level

In [None]:
# loading local dataset
train_path = '/content/train_ner_text.txt'
val_path = '/content/val_ner_text.txt'
test_path = '/content/test_ner_text.txt'

train_dataset = load_dataset("text", data_files=train_path)
val_dataset = load_dataset("text", data_files=val_path)
test_dataset = load_dataset("text", data_files=test_path)



  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-12cab4f571f90799/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-12cab4f571f90799/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading and preparing dataset text/default to /root/.cache/huggingface/datasets/text/default-66d9ee1c720e841c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset text downloaded and prepared to /root/.cache/huggingface/datasets/text/default-66d9ee1c720e841c/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 60627
    })
})

In [None]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 20209
    })
})

In [None]:
# Running the encoder for the trained and post-processed tokenizer

# loading new instance of a tokenizer
wordpiece_tokenizer = Tokenizer.from_file("tokenizer_25K_wordpiece.json")
encoding = wordpiece_tokenizer.encode(train_dataset['train'][100]['text'])

# print(encoding.tokens)
# print(encoding.ids)
# print(encoding.type_ids)
# print(encoding.offsets)
# print(encoding.attention_mask)
# print(encoding.special_tokens_mask)
# print(encoding.overflowing)

In [None]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=wordpiece_tokenizer,
    # tokenizer_file="tokenizer.json", # You can load from the tokenizer file, alternatively
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

wrapped_tokenizer

PreTrainedTokenizerFast(name_or_path='', vocab_size=7600, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
# counting encoding ids to find occurence of UNK

from collections import Counter

def count_ids(tokenizer, dataset, type_data = 'train'):
    id_counts = Counter()
    token_counts = Counter()

    for data in dataset[type_data]:
        text = data['text']
        encoding = tokenizer.encode(text)
        id_counts.update(encoding.ids)
        token_counts.update(encoding.tokens)

    return dict(id_counts), dict(token_counts)

In [None]:
encoding_id_count_train, token_count_train = count_ids(wordpiece_tokenizer,train_dataset)
encoding_id_count_val, token_count_val = count_ids(wordpiece_tokenizer,val_dataset)
encoding_id_count_test, token_count_test = count_ids(wordpiece_tokenizer,test_dataset)

In [None]:
# Printing stats for WordPiece tokenizer
print("Unknown tokenized elements in Train:", token_count_train['[UNK]'], '\n')
print("Total tokenized elements in Train:",sum(list(token_count_train.values())), '\n')
print("Proprtion of UNK in train dataset (%):", (token_count_train['[UNK]'])/sum(list(token_count_train.values()))*100,'\n')
print('-------------------------------')

print("Unknown tokenized elements in Val:", token_count_val['[UNK]'], '\n')
print("Total tokenized elements in Val:",sum(list(token_count_val.values())), '\n')
print("Proprtion of UNK in val dataset (%):", (token_count_val['[UNK]'])/sum(list(token_count_val.values()))*100,'\n')
print('-------------------------------')

print("Unknown tokenized elements in Test:", token_count_test['[UNK]'], '\n')
print("Total tokenized elements in Test:",sum(list(token_count_test.values())), '\n')
print("Proprtion of UNK in test dataset (%):", (token_count_test['[UNK]'])/sum(list(token_count_test.values()))*100,'\n')


Unknown tokenized elements in Train: 23 

Total tokenized elements in Train: 2916738 

Proprtion of UNK in train dataset (%): 0.0007885521428390209 

-------------------------------
Unknown tokenized elements in Val: 10 

Total tokenized elements in Val: 974597 

Proprtion of UNK in val dataset (%): 0.0010260651325624847 

-------------------------------
Unknown tokenized elements in Test: 6 

Total tokenized elements in Test: 971889 

Proprtion of UNK in test dataset (%): 0.0006173544509712529 



## Checking density of special tokens like UNK in UnigramLM and Vocab level

In [None]:
# loading local dataset
train_path = '/content/train_ner_text.txt'
val_path = '/content/val_ner_text.txt'
test_path = '/content/test_ner_text.txt'

train_dataset = load_dataset("text", data_files=train_path)
val_dataset = load_dataset("text", data_files=val_path)
test_dataset = load_dataset("text", data_files=test_path)



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]



  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
train_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 60627
    })
})

In [None]:
test_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 20209
    })
})

In [None]:
# Running the encoder for the trained and post-processed tokenizer

# loading new instance of a tokenizer
unigram_tokenizer = Tokenizer.from_file("tokenizer_25K_unigram.json")
encoding = unigram_tokenizer.encode(train_dataset['train'][0]['text'])

print(encoding.tokens)
print(encoding.ids)
print(encoding.type_ids)
print(encoding.offsets)
print(encoding.attention_mask)
print(encoding.special_tokens_mask)
print(encoding.overflowing)

['<cls>', '▁fr', '▁', '3', '▁', '0', '▁', '▁', '0', '▁', '8', '▁', '1', '▁', '7', '▁', '▁', '2', '▁', '0', '▁', '3', '▁', '8', '▁', '▁', '2', '▁', '6', '▁', '8', '▁', '4', '▁', '▁', '2', '▁', '1', '▁', '0', '▁', '0', '▁', '▁', '0', '▁', '7', '▁', '8', '▁', '3', '▁', '▁', '7', '▁', '2', '▁', '7', '▁guill', 'on', '▁', '▁', '5', '▁', '2', '▁aven', 'ue', '▁de', '▁dia', 's', '▁', '▁', '8', '▁', '1', '▁', '6', '▁', '8', '▁', '1', '▁marie', '-', 'les', '-', 'ba', 'ins', '▁franc', 'e', '<sep>']
[0, 100, 7, 12, 7, 9, 7, 7, 9, 7, 15, 7, 14, 7, 16, 7, 7, 11, 7, 9, 7, 12, 7, 15, 7, 7, 11, 7, 17, 7, 15, 7, 8, 7, 7, 11, 7, 14, 7, 9, 7, 9, 7, 7, 9, 7, 16, 7, 15, 7, 12, 7, 7, 16, 7, 11, 7, 16, 739, 145, 7, 7, 10, 7, 11, 298, 388, 64, 754, 29, 7, 7, 15, 7, 14, 7, 17, 7, 15, 7, 14, 615, 19, 308, 19, 257, 167, 89, 53, 1]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
from transformers import PreTrainedTokenizerFast

wrapped_tokenizer2 = PreTrainedTokenizerFast(
    tokenizer_object= unigram_tokenizer,
    # bos_token="<s>",
    # eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
    cls_token="<cls>",
    sep_token="<sep>",
    mask_token="<mask>",
    padding_side="left",
)

wrapped_tokenizer2

PreTrainedTokenizerFast(name_or_path='', vocab_size=3096, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'unk_token': '<unk>', 'sep_token': '<sep>', 'pad_token': '<pad>', 'cls_token': '<cls>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

In [None]:
# counting encoding ids to find occurence of UNK

from collections import Counter

def count_ids(tokenizer, dataset, type_data = 'train'):
    id_counts = Counter()
    token_counts = Counter()

    for data in dataset[type_data]:
        text = data['text']
        encoding = tokenizer.encode(text)
        id_counts.update(encoding.ids)
        token_counts.update(encoding.tokens)

    return dict(id_counts), dict(token_counts)

In [None]:
encoding_id_count_train, token_count_train = count_ids(unigram_tokenizer,train_dataset)
encoding_id_count_val, token_count_val = count_ids(unigram_tokenizer,val_dataset)
encoding_id_count_test, token_count_test = count_ids(unigram_tokenizer,test_dataset)

In [None]:
encoding_id_count_train

In [None]:
# Printing stats for WordPiece tokenizer
print("Unknown tokenized elements in Train:", encoding_id_count_train[2], '\n')
print("Total tokenized elements in Train:",sum(list(encoding_id_count_train.values())), '\n')
print("Proprtion of UNK in train dataset (%):", (encoding_id_count_train[2])/sum(list(encoding_id_count_train.values()))*100,'\n')
print('-------------------------------')

print("Unknown tokenized elements in Val:",  encoding_id_count_val[2], '\n')
print("Total tokenized elements in Val:",sum(list( encoding_id_count_val.values())), '\n')
print("Proprtion of UNK in val dataset (%):", ( encoding_id_count_val[2])/sum(list( encoding_id_count_val.values()))*100,'\n')
print('-------------------------------')

print("Unknown tokenized elements in Test:",  encoding_id_count_test[2], '\n')
print("Total tokenized elements in Test:",sum(list( encoding_id_count_test.values())), '\n')
print("Proprtion of UNK in test dataset (%):", ( encoding_id_count_test[2])/sum(list( encoding_id_count_test.values()))*100,'\n')


Unknown tokenized elements in Train: 17 

Total tokenized elements in Train: 4877131 

Proprtion of UNK in train dataset (%): 0.000348565580871213 

-------------------------------
Unknown tokenized elements in Val: 10 

Total tokenized elements in Val: 1629835 

Proprtion of UNK in val dataset (%): 0.0006135590412526421 

-------------------------------
Unknown tokenized elements in Test: 6 

Total tokenized elements in Test: 1623880 

Proprtion of UNK in test dataset (%): 0.0003694854299578786 

