In [1]:
corpus = """Tokenization is the process of breaking down 
a sequence of text into smaller units called tokens,
which can be words, phrases, or even individual characters.
Tokenization is often the first step in natural languages processing tasks 
such as text classification, named entity recognition, and sentiment analysis.
The resulting tokens are typically used as input to further processing steps,
such as vectorization, where the tokens are converted
into numerical representations for machine learning models to use."""
data = corpus.split(".")

In [14]:
from tokenizers import Tokenizer

## Normalizer
from tokenizers import normalizers
from tokenizers.normalizers import NFD, StripAccents

normalizer = normalizers.Sequence([NFD(), StripAccents()])

print(normalizer.normalize_str("Héllò hôw are ü?"))

## Pre-tokenizer
from tokenizers import pre_tokenizers
from tokenizers.pre_tokenizers import Whitespace, Digits, Punctuation

pre_tokenizer = pre_tokenizers.Sequence([Whitespace(), Digits(individual_digits=False), Punctuation()])
print(pre_tokenizer.pre_tokenize_str("Hello, how are you? 911please"))

Hello how are u?
[('Hello', (0, 5)), (',', (5, 6)), ('how', (7, 10)), ('are', (11, 14)), ('you', (15, 18)), ('?', (18, 19)), ('911', (20, 23)), ('please', (23, 29))]


In [13]:
st = "hello how are you, I am good"

letter = 'o'

## count the number of times a letter appears in a string

count = st.count(letter)
count

5

In [2]:
## Byte Pair Encoding (BPE) implementation

import re
from collections import defaultdict
from typing import List, Tuple

class BPE:
    def __init__(self, vocab_size: int):
        self.vocab_size = vocab_size
        self.bpe_codes = {chr(i): 0 for i in range(256)}


    def _count_pairs(self, corpus: List[str]) -> dict:
        pair_freq = defaultdict(int)
        for line in corpus:
            line = line.strip()
            line += " "
            for i in range(len(line) - 1):
                pair_freq[line[i], line[i + 1]] += 1
        return pair_freq

    def train(self, corpus: List[str]):
        
        # Count the frequency of each pair of characters
        pair_freq = defaultdict(int)
        for line in corpus:
            line = line.strip()
            line += " "
            for i in range(len(line) - 1):
                pair_freq[line[i], line[i + 1]] += 1

        # Merge the most frequent pair
        for _ in range(self.vocab_size):
            best_pair = max(pair_freq, key=pair_freq.get)
            pair_freq = self._merge_pair(best_pair, pair_freq)
            self.bpe_codes[best_pair] = len(self.bpe_codes)

    def _merge_pair(self, pair: Tuple[str, str], pair_freq: dict) -> dict:
        new_pair = "".join(pair)
        new_pair_freq = defaultdict(int)
        for key in pair_freq:
            new_key = re.sub(" ".join(pair), new_pair, " ".join(key))
            new_pair_freq[new_key] = pair_freq[key]
        return new_pair_freq
    
    def encode(self, text: str) -> str:
        text += " "
        tokens = []
        i = 0
        while i < len(text) - 1:
            pair = text[i], text[i + 1]
            if pair in self.bpe_codes:
                tokens.append("".join(pair))
                i += 2
            else:
                tokens.append(text[i])
                i += 1
        return " ".join(tokens)
    
    def decode(self, text: str) -> str:
        tokens = text.split()
        decoded_text = ""
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and tokens[i] + tokens[i + 1] in self.bpe_codes:
                decoded_text += self.bpe_codes[tokens[i] + tokens[i + 1]]
                i += 2
            else:
                decoded_text += tokens[i]
                i += 1
        return decoded_text
    
bpe = BPE(vocab_size=10)
bpe.train(data)

NameError: name 'data' is not defined

In [13]:
corpus = """Tokenization is the process of breaking down a sequence of text into smaller units called tokens, which can be words, phrases, or even individual characters.
Tokenization is often the first step in natural languages processing tasks such as text classification, named entity recognition, and sentiment analysis.
The resulting tokens are typically used as input to further processing steps, such as vectorization, where the tokens are converted into numerical representations for machine learning models to use."""
data = corpus.split(".")
data

['Tokenization is the process of breaking down a sequence of text into smaller units called tokens, which can be words, phrases, or even individual characters',
 '\nTokenization is often the first step in natural languages processing tasks such as text classification, named entity recognition, and sentiment analysis',
 '\nThe resulting tokens are typically used as input to further processing steps, such as vectorization, where the tokens are converted into numerical representations for machine learning models to use',
 '']

In [14]:
data[0]

'Tokenization is the process of breaking down a sequence of text into smaller units called tokens, which can be words, phrases, or even individual characters'

In [39]:
def _count_pairs( corpus: List[str], pair_freq: dict, n_gram: int = 2) -> dict:
    pair_freq = defaultdict(int)
    for line in corpus:
        line = line.strip()
        print(line)
        line += " "
        for i in range(len(line) - n_gram):
            key_str = [f"{line[i + j]}" for j in range(n_gram)]
            key_str = "".join(key_str)
            pair_freq[key_str] += 1
            
    return pair_freq

In [40]:
_count_pairs(data, {}, 3)

Tokenization is the process of breaking down a sequence of text into smaller units called tokens, which can be words, phrases, or even individual characters
Tokenization is often the first step in natural languages processing tasks such as text classification, named entity recognition, and sentiment analysis
The resulting tokens are typically used as input to further processing steps, such as vectorization, where the tokens are converted into numerical representations for machine learning models to use



defaultdict(int,
            {'Tok': 2,
             'oke': 5,
             'ken': 5,
             'eni': 2,
             'niz': 2,
             'iza': 3,
             'zat': 3,
             'ati': 5,
             'tio': 6,
             'ion': 6,
             'on ': 2,
             'n i': 3,
             ' is': 2,
             'is ': 2,
             's t': 3,
             ' th': 3,
             'the': 4,
             'he ': 4,
             'e p': 1,
             ' pr': 3,
             'pro': 3,
             'roc': 3,
             'oce': 3,
             'ces': 3,
             'ess': 3,
             'ss ': 1,
             's o': 2,
             ' of': 3,
             'of ': 2,
             'f b': 1,
             ' br': 1,
             'bre': 1,
             'rea': 1,
             'eak': 1,
             'aki': 1,
             'kin': 1,
             'ing': 5,
             'ng ': 5,
             'g d': 1,
             ' do': 1,
             'dow': 1,
             'own': 1,
             'wn 