# Step 1: Import Necessary Libraries


In [2]:
# Importing necessary libraries
import numpy as np
import tensorflow as tf
from collections import defaultdict
import re


#Step 2: Define Functions for BPE


In [3]:
# 2.1: Function to Build Initial Vocabulary

def build_vocab(sentences):
    vocab = defaultdict(int)

    for sentence in sentences:
        # Add start and end tokens to the sentence
        sentence = '<s> ' + sentence + ' </s>'
        for char in sentence:
            vocab[char] += 1

    return vocab


In [10]:

# 2.2: Function to Count Pairs
def get_stats(vocab):
    pairs = defaultdict(int)

    for word, freq in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            pairs[pair] += freq

    return pairs


In [11]:
# 2.3: Function to Merge Pairs

def merge_vocab(pair, vocab):
    new_vocab = {}
    bigram = ' '.join(pair)

    replacement = ''.join(pair)
    for word in vocab:
        new_word = word.replace(bigram, replacement)
        new_vocab[new_word] = vocab[word]

    return new_vocab


#Step 3: Main BPE Algorithm


In [6]:
def bpe(sentences, num_merges):
    # Step 1: Build initial vocabulary
    vocab = build_vocab(sentences)

    # Step 2: Perform merges
    for _ in range(num_merges):
        pairs = get_stats(vocab)

        # If no pairs left, break the loop
        if not pairs:
            break

        # Find the most frequent pair
        best_pair = max(pairs, key=pairs.get)

        # Merge the best pair in the vocabulary
        vocab = merge_vocab(best_pair, vocab)

    return vocab


# Step 4: Example Usage


In [7]:
# Sample sentences for demonstration
sentences = [
    "hello there",
    "hello world",
    "hello",
    "there world"
]

# Specify the number of merges
num_merges = 5

# Apply BPE
final_vocab = bpe(sentences, num_merges)

# Print the final vocabulary
print("Final Vocabulary:")
for word, freq in final_vocab.items():
    print(f"{word}: {freq}")


Final Vocabulary:
<: 8
s: 8
>: 8
 : 11
h: 5
e: 7
l: 8
o: 5
t: 2
r: 4
/: 4
w: 2
d: 2


In [8]:
class BPETokenizer(tf.keras.preprocessing.text.Tokenizer):
    def __init__(self, num_merges):
        super(BPETokenizer, self).__init__()
        self.num_merges = num_merges

    def fit_on_texts(self, texts):
        self.vocab = bpe(texts, self.num_merges)

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            tokens = []
            for word in text.split():
                if word in self.vocab:
                    tokens.append(word)
                else:
                    tokens.append("<UNK>")  # handle OOV words
            sequences.append(tokens)
        return sequences


# Step 5: Integrating with TensorFlow


In [9]:
# 5.1: Create a Tokenizer

class BPETokenizer(tf.keras.preprocessing.text.Tokenizer):
    def __init__(self, num_merges):
        super(BPETokenizer, self).__init__()
        self.num_merges = num_merges

    def fit_on_texts(self, texts):
        self.vocab = bpe(texts, self.num_merges)

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            tokens = []
            for word in text.split():
                if word in self.vocab:
                    tokens.append(word)
                else:
                    tokens.append("<UNK>")  # handle OOV words
            sequences.append(tokens)
        return sequences


# Step 6: Example Usage of the Tokenizer


In [12]:
# Initialize the BPE tokenizer
bpe_tokenizer = BPETokenizer(num_merges=5)

# Fit the tokenizer on the sample sentences
bpe_tokenizer.fit_on_texts(sentences)

# Convert texts to sequences
sequences = bpe_tokenizer.texts_to_sequences(sentences)

# Print the sequences
print("Tokenized Sequences:")
for seq in sequences:
    print(seq)


Tokenized Sequences:
['<UNK>', '<UNK>']
['<UNK>', '<UNK>']
['<UNK>']
['<UNK>', '<UNK>']
