# Tokenizers

## 64. What types of tokenizers do you know? Compare them.

### Tokenizers par mots

In [None]:
## Whitespace tokenizer
#--------------------------------
texte = "Once upon a time, Anna misplaced her key."
texte.split()
# Advantages
# * simple to use
# * fast
# Drawbacks
# * punctuation stays attached, so "key." and "key" count as two different tokens
# * does not work for tokens that themselves contain whitespace

['Once', 'upon', 'a', 'time,', 'Anna', 'misplaced', 'her', 'key.']

In [4]:
## Regex-based tokenizer
# https://www.regular-expressions.info/python.html
import re
text = "Once upon a time, Anna misplaced her key."
# Lowercase the text, then keep every maximal run of word characters (\w+);
# punctuation such as "," and "." is dropped rather than emitted as a token.
words = re.findall(r'\b\w+\b', text.lower())
print(words) 

['once', 'upon', 'a', 'time', 'anna', 'misplaced', 'her', 'key']


In [31]:
## NLTK tokenizers
# ---------------------------------------------------
import nltk
from nltk.tokenize import word_tokenize, wordpunct_tokenize, TreebankWordTokenizer, TweetTokenizer
# Uncomment to download the tokenizer resources if they are missing
# (presumably only needed once per environment -- confirm for your NLTK version):
# nltk.download('punkt')
# nltk.download('punkt_tab')

texte = "Once upon a time in 1900, Anna misplaced her key. *#tariq.ch"
print(word_tokenize(texte))

['Once', 'upon', 'a', 'time', 'in', '1900', ',', 'Anna', 'misplaced', 'her', 'key', '.', '*', '#', 'tariq.ch']


In [30]:
## Punctuation-based tokenizer
# Expects wordpunct_tokenize to be imported already (see the NLTK import cell).
texte = "Once upon a time in 1900, Anna misplaced her key. *#tariq.ch, 😎😵"
print(wordpunct_tokenize(texte))

['Once', 'upon', 'a', 'time', 'in', '1900', ',', 'Anna', 'misplaced', 'her', 'key', '.', '*#', 'tariq', '.', 'ch', ',', '😎😵']


In [33]:
## Treebank word tokenizer
# This tokenizer bundles a variety of common rules for tokenizing English words.
# It separates sentence-final punctuation such as (?!.;,) from adjacent tokens and keeps
# decimal numbers as a single token. It also contains rules for English contractions.
# For example, "don't" is tokenized as ["do", "n't"].
# NOTE(review): in the recorded output 'key.' keeps its period -- presumably because that
# period is not at the very end of the input string; confirm against the NLTK documentation.
texte = "Once upon a time in 1900, Anna didn't misplaced her key. *#tariq.ch 😎😵"
tokenisers = TreebankWordTokenizer()
tokenisers.tokenize(texte)

['Once',
 'upon',
 'a',
 'time',
 'in',
 '1900',
 ',',
 'Anna',
 'did',
 "n't",
 'misplaced',
 'her',
 'key.',
 '*',
 '#',
 'tariq.ch',
 '😎😵']

In [34]:
## Tweet tokenizer
# Reuses the `texte` string assigned in an earlier cell.
tokenisers = TweetTokenizer()
tokenisers.tokenize(texte)

['Once',
 'upon',
 'a',
 'time',
 'in',
 '1900',
 ',',
 'Anna',
 "didn't",
 'misplaced',
 'her',
 'key',
 '.',
 '*',
 '#tariq',
 '.',
 'ch',
 '😎',
 '😵']

### Tokenizers par sous-mots

→ Résout le problème des mots hors vocabulaire (out-of-vocabulary, OOV).

In [35]:
## BPE (Byte Pair Encoding)
# The main idea is to first split every word into individual characters,
# then iteratively merge the most frequent pair of adjacent symbols to form subwords.
from collections import Counter, defaultdict

def get_vocab(corpus):
    """
    Build the initial BPE vocabulary from an iterable of words.

    Each word becomes a key made of its characters separated by single
    spaces, followed by the end-of-word marker '</w>'. The value is the
    number of times that word occurs in ``corpus``.

    Returns a ``Counter`` mapping each spaced-out word to its frequency.
    """
    # Counter tallies the generated keys directly, in first-seen order.
    return Counter(" ".join(word) + " </w>" for word in corpus)

def get_stats(vocab):
    """
    Count how often each pair of adjacent symbols occurs in the vocabulary.

    ``vocab`` maps space-separated symbol sequences to frequencies. Every
    adjacent (left, right) symbol pair in a key contributes that key's
    frequency to the pair's total.

    Returns a ``defaultdict(int)`` keyed by ``(left, right)`` tuples.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair each symbol with its successor; a single-symbol entry yields nothing.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, vocab):
    """
    Merge every occurrence of ``pair`` in the vocabulary into a single symbol.

    Args:
        pair: A 2-tuple ``(left, right)`` of symbols to merge.
        vocab: Mapping from space-separated symbol sequences to frequencies.

    Returns:
        A new ``dict`` with the same frequencies, where each adjacent
        ``left right`` occurrence is replaced by the concatenated symbol.
        If two keys become identical after merging, their frequencies
        are summed.
    """
    left, right = pair
    merged_symbol = left + right
    new_vocab = {}
    for word, freq in vocab.items():
        symbols = word.split()
        merged = []
        i = 0
        # Compare whole symbols, not raw substrings: a plain str.replace of
        # "e s" would also match inside "ne s" and wrongly produce "nes".
        while i < len(symbols):
            if i + 1 < len(symbols) and symbols[i] == left and symbols[i + 1] == right:
                merged.append(merged_symbol)
                i += 2  # skip both halves of the merged pair
            else:
                merged.append(symbols[i])
                i += 1
        new_word = " ".join(merged)
        # Accumulate rather than overwrite so colliding keys keep their total count.
        new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
    return new_vocab

def byte_pair_encoding(corpus, num_merges):
    """
    Run up to ``num_merges`` BPE merge steps on ``corpus``.

    Starts from the character-level vocabulary built by ``get_vocab`` and,
    at each step, merges the most frequent adjacent symbol pair. The
    vocabulary is printed before the first step and after every merge.
    Stops early when no adjacent pairs remain.

    Returns the vocabulary after the last merge.
    """
    vocab = get_vocab(corpus)
    print("Initial Vocabulary:", vocab)
    for i in range(num_merges):
        pair_counts = get_stats(vocab)
        if len(pair_counts) == 0:
            # Nothing left to merge: stop before reaching num_merges.
            break
        # max() returns the first maximal item, so ties go to the pair counted first.
        best_pair, _ = max(pair_counts.items(), key=lambda item: item[1])
        print(f"Step {i + 1}: Merging {best_pair}")
        vocab = merge_vocab(best_pair, vocab)
        print("Updated Vocabulary:", vocab)
    return vocab

# Example usage: run up to 10 BPE merges on a four-word toy corpus
# and print the vocabulary that results.
corpus = ["low", "lowest", "new", "newest"]
num_merges = 10
final_vocab = byte_pair_encoding(corpus, num_merges)

print("\nFinal Vocabulary:", final_vocab)



Initial Vocabulary: Counter({'l o w </w>': 1, 'l o w e s t </w>': 1, 'n e w </w>': 1, 'n e w e s t </w>': 1})
Step 1: Merging ('l', 'o')
Updated Vocabulary: {'lo w </w>': 1, 'lo w e s t </w>': 1, 'n e w </w>': 1, 'n e w e s t </w>': 1}
Step 2: Merging ('lo', 'w')
Updated Vocabulary: {'low </w>': 1, 'low e s t </w>': 1, 'n e w </w>': 1, 'n e w e s t </w>': 1}
Step 3: Merging ('e', 's')
Updated Vocabulary: {'low </w>': 1, 'low es t </w>': 1, 'n e w </w>': 1, 'n e w es t </w>': 1}
Step 4: Merging ('es', 't')
Updated Vocabulary: {'low </w>': 1, 'low est </w>': 1, 'n e w </w>': 1, 'n e w est </w>': 1}
Step 5: Merging ('est', '</w>')
Updated Vocabulary: {'low </w>': 1, 'low est</w>': 1, 'n e w </w>': 1, 'n e w est</w>': 1}
Step 6: Merging ('n', 'e')
Updated Vocabulary: {'low </w>': 1, 'low est</w>': 1, 'ne w </w>': 1, 'ne w est</w>': 1}
Step 7: Merging ('ne', 'w')
Updated Vocabulary: {'low </w>': 1, 'low est</w>': 1, 'new </w>': 1, 'new est</w>': 1}
Step 8: Merging ('low', '</w>')
Updated Vo

In [None]:
## WordPiece algorithm (used in BERT)
# The WordPiece algorithm is a subword tokenization algorithm that was used in BERT. It was
# designed to handle out-of-vocabulary (OOV) words by splitting them into subwords.
# The WordPiece algorithm works as follows:
# * 1.  Choose a target vocabulary size V, a hyperparameter that controls the size
# *     of the vocabulary.
# * 2.  Initialize an empty vocabulary set.
# * 3.  For each word in the training data, calculate its frequency.
# * 4.  Sort the words by frequency in descending order.
# * 5.  For each word, calculate its subword representation using a greedy algorithm
# *     (greedy = make the locally best choice at each step, here the longest matching
# *     subword, without backtracking).
# * 6.  Add the subword representation to the vocabulary set.
# * 7.  Repeat steps 3-6 until the vocabulary set reaches the desired size V.
# The WordPiece algorithm has several advantages, including:
# *   It can handle OOV words by splitting them into subwords.
# *   It can learn subword representations that are useful for language modeling.
# *   It can be used with a wide range of languages.
# However, the WordPiece algorithm also has some disadvantages, including:
# *   It can be computationally expensive to train.
# *   It requires a large amount of training data to learn effective subword representations.
# *   It can be difficult to tune the hyperparameters of the algorithm.


65. Can you extend a tokenizer? If yes, in what case would you do this? When would you retrain a tokenizer? What needs to be done when adding new tokens?

66. How do regular tokens differ from special tokens?

67. Why is lemmatization not used in transformers? And why do we need tokens?

68. How is a tokenizer trained? Explain with examples of WordPiece and BPE.

69. What position does the CLS vector occupy? Why?

70. What tokenizer is used in BERT, and which one in GPT?

71. How do modern tokenizers handle out-of-vocabulary words?

72. What does the tokenizer vocab size affect? How will you choose it in the case of new training?