# **Word-Level** **Tokenization**

In [21]:
import nltk
nltk.download('punkt')  # Download necessary resources for tokenization
from nltk.tokenize import word_tokenize

# Sample text
text = "We love NLP!"

# Word-level tokenization
tokens = word_tokenize(text)

# Output the result
print(tokens)


['We', 'love', 'NLP', '!']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# **Character-Level Tokeniation**

In [5]:
# Sample text
text = "We Love NLP!"

# Character-level tokenization
tokens = list(text)

# Output the result
print(tokens)


['W', 'e', ' ', 'L', 'o', 'v', 'e', ' ', 'N', 'L', 'P', '!']


# **BytePair encoding** **Tokenization**

In [8]:
from collections import defaultdict, Counter

def get_vocab(text):
    # Create initial vocabulary with word frequency counts
    words = text.split()
    vocab = Counter(words)
    return vocab

def get_pairs(vocab):
    # Count pairs of characters in the vocabulary
    pairs = Counter()
    for word, freq in vocab.items():
        symbols = word + "</w>"  # Add end of word symbol
        for i in range(len(symbols) - 1):
            pairs[(symbols[i], symbols[i + 1])] += freq
    return pairs

In [9]:
def merge_vocab(pair, vocab):
    # Merge the most frequent pair in the vocabulary
    new_vocab = {}
    bigram = ''.join(pair)
    for word in vocab:
        new_word = word.replace(''.join(pair), bigram)
        new_vocab[new_word] = vocab[word]
    return new_vocab

def bpe_tokenization(text, num_merges):
    vocab = get_vocab(text)

    for _ in range(num_merges):
        pairs = get_pairs(vocab)
        if not pairs:
            break
        best_pair = pairs.most_common(1)[0][0]
        vocab = merge_vocab(best_pair, vocab)

    return vocab


In [17]:
# Sample text
text = "Eat Eat Sleep Repeat Repeat repeat"

# Perform BPE tokenization
num_merges = 20  # Number of merges to perform
tokenized_vocab = bpe_tokenization(text, num_merges)

# Output the result
print(tokenized_vocab)


{'Eat': 2, 'Sleep': 1, 'Repeat': 2, 'repeat': 1}


# **WordPiece** **Tokenization**

In [18]:
import re
from collections import defaultdict

def create_vocab(text):
    # Create a vocabulary of words from the input text
    words = text.split()
    vocab = defaultdict(int)
    for word in words:
        vocab[word] += 1
    return vocab

def get_subwords(vocab, max_vocab_size=100):
    # Create a set of subwords starting from characters
    subwords = set()
    for word in vocab:
        subwords.add(word)  # Add the full word
        for char in word:
            subwords.add(char)  # Add individual characters

    # Limit vocabulary size
    return sorted(list(subwords))[:max_vocab_size]

In [19]:
def wordpiece_tokenize(text, vocab):
    # Tokenize the input text using WordPiece algorithm
    tokens = []
    words = text.split()

    for word in words:
        if word in vocab:
            tokens.append(word)
            continue

        subword = ''
        start = 0
        while start < len(word):
            # Check for longest matching subword
            matched = False
            for end in range(len(word), start, -1):
                subword_candidate = word[start:end]
                if subword_candidate in vocab:
                    tokens.append(subword_candidate)
                    start = end
                    matched = True
                    break

            if not matched:
                tokens.append(word[start])  # Add the character if no match
                start += 1

    return tokens

In [20]:
# Sample text
text = "Eat Eat Sleep Repeat Repeat repeat."

# Create vocabulary
vocab = create_vocab(text)

# Get subwords for WordPiece tokenization
subwords = get_subwords(vocab)

# Perform WordPiece tokenization
tokenized_output = wordpiece_tokenize(text, subwords)

# Output the result
print(tokenized_output)


['Eat', 'Eat', 'Sleep', 'Repeat', 'Repeat', 'repeat.']
