In [101]:
class gpt4_tokenizer:
    def __init__(self,text,vocab_size):
        self.text=text
        self.vocab_size=vocab_size
        self.pair_dict = {}
        self.merges = {}
        self.vocab = {idx: bytes([idx]) for idx in range(256)}


    def get_pair_counts(self,text):
    
        for i in range(len(text) - 1):
            pair = (text[i],text[i+1])
            if pair in self.pair_dict:
                self.pair_dict[pair] += 1
            else:
                self.pair_dict[pair] = 1

        return self.pair_dict 
    

    def pair_switch(self,token,pair,new_pair):
        i=0
        np=[]
        while i < len(token):
            if i<len(token)-1 and token[i] and token[i]==pair[0] and token[i+1]==pair[1]:
                np.append(new_pair)
                i+=2
            else :
                np.append(token[i])
                i+=1
        return np
    


    
    def train(self, text,vocab_size, verbose=False):
        text = text.encode("utf-8")
        text=list(map(int,text))

        number_of_merges=vocab_size-256
        bpe_tokens=list(text)

        for i in range(number_of_merges):
            pair_dict = self.get_pair_counts(bpe_tokens)
            pair = max(pair_dict, key=pair_dict.get)
            if verbose:
                print(f"Token {pair} is merged as {i+256}")
            bpe_tokens = self.pair_switch(bpe_tokens, pair, i+256)
            self.merges[pair] = i+256
        return self.merges, bpe_tokens

        


    def decode(self,ids):
        vocab = {idx: bytes([idx]) for idx in range(256)}
        for (p0, p1), idx in self.merges.items():
            vocab[idx] = vocab[p0] + vocab[p1]

        tokens = b"".join(vocab[idx] for idx in ids)
        decoded_text = tokens.decode("utf-8", errors="replace")
        return decoded_text
    
    def encoder(self,text):
        # Convert the text to bytes
        tokens=list(text.encode("utf-8"))
        tokens=list(map(int,tokens))
        while len(tokens)>=2:
            pair_counts=self.get_pair_counts(tokens)
            pair=min(pair_counts,key=lambda p:self.merges.get(p,float("inf")))
            if pair not in self.merges:
                break
            idx = self.merges[pair]
            tokens=self.merges(tokens,pair,idx)
        return tokens
    


In [109]:
class BPEncoder:
    def __init__(self, vocab_size=276):
        self.vocab_size = vocab_size
        self.merges = {}
        self.vocab = {idx: bytes([idx]) for idx in range(256)}

    def get_pair_counts(self, lst):
        pair_dict = {}
        for i in range(len(lst) - 1):
            pair = (lst[i], lst[i+1])
            if pair in pair_dict:
                pair_dict[pair] += 1
            else:
                pair_dict[pair] = 1
        return pair_dict

    def pair_switch(self, token, pair, new_pair):
        i = 0
        np = []
        while i < len(token):
            if i < len(token) - 1 and token[i] and token[i] == pair[0] and token[i+1] == pair[1]:
                np.append(new_pair)
                i += 2
            else:
                np.append(token[i])
                i += 1
        return np

    def fit(self, text):
        tokens = list(text.encode("utf-8"))
        bpe_tokens = list(tokens)
        number_of_merges = self.vocab_size - 256
        for i in range(number_of_merges):
            pair_dict = self.get_pair_counts(bpe_tokens)
            if not pair_dict:
                break  # No more pairs to merge
            pair = max(pair_dict, key=pair_dict.get)
            self.merges[pair] = i + 256
            bpe_tokens = self.pair_switch(bpe_tokens, pair, i + 256)

        for (p0, p1), idx in self.merges.items():
            self.vocab[idx] = self.vocab[p0] + self.vocab[p1]


    def encode(self, text):
        tokens = list(text.encode("utf-8"))
        encoded = []
        while len(tokens) >= 2:
            pair_counts = self.get_pair_counts(tokens)
            pair = min(pair_counts, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break
            idx = self.merges[pair]
            tokens = self.pair_switch(tokens, pair, idx)
            encoded.append(idx)
        return encoded

    def decode(self, ids):
        decoded = b"".join(self.vocab[idx] for idx in ids)
        text = decoded.decode("utf-8", errors="replace")
        return text


In [89]:
text="""Bonjour! 🌟 Welcome to the magical world of language! In this vast universe of words, where the sky is the limit, let's embark on a journey together, exploring the beauty and diversity of different languages. ¡Hola amigos! ¿Cómo están? Today, we're going to delve into the wonders of multilingualism. Imagine being able to speak fluently in various tongues, connecting with people from all walks of life. C'est fantastique! 🎉

Let's start with English, the language of Shakespeare and modern communication. English is a global language, spoken by millions around the world. From the bustling streets of New York City to the serene countryside of England, English bridges cultures and societies. It's a language of opportunity, innovation, and creativity. So, grab your cup of tea ☕️ and let's dive into the world of English literature and culture.

Pero no podemos olvidar el hermoso idioma español. Con sus ricos sonidos y expresiones poéticas, el español nos lleva en un viaje a través de la pasión y el romance. Desde las vibrantes fiestas de España hasta las playas doradas de América Latina, el español es una celebración de la vida y la diversidad. ¡Viva la lengua española! 🇪🇸

Maintenant, parlons français! Ah, la langue de l'amour et de la sophistication. Le français est un mélange envoûtant de finesse et d'élégance. De Paris, la ville lumière, aux champs de lavande de la Provence, le français évoque un sentiment de joie de vivre. C'est magnifique! 🥖

As we journey through these languages, let's not forget the power of emojis. 😊 Emojis add color and emotion to our digital conversations. Whether it's a smiley face 😄 to brighten someone's day or a heart ❤️ to express love and affection, emojis transcend language barriers and connect us on a deeper level.

Now, let's wrap up our linguistic adventure with a toast 🥂 to the beauty of language! May we continue to explore, learn, and appreciate the rich tapestry of words that make our world a more vibrant and interconnected place. Cheers to language! Salud! 🎊

"""

In [102]:
tokenizer = gpt4_tokenizer(text=text, vocab_size=500)

In [103]:
merges, bpe_tokens = tokenizer.train(text=text,vocab_size=500, verbose=True)

Token (101, 32) is merged as 256
Token (115, 32) is merged as 257
Token (97, 110) is merged as 258
Token (97, 32) is merged as 259
Token (32, 116) is merged as 260
Token (32, 108) is merged as 261
Token (111, 110) is merged as 262
Token (101, 115) is merged as 263
Token (44, 32) is merged as 264
Token (101, 114) is merged as 265
Token (101, 114) is merged as 266
Token (105, 110) is merged as 267
Token (116, 105) is merged as 268
Token (100, 32) is merged as 269
Token (46, 32) is merged as 270
Token (116, 32) is merged as 271
Token (111, 114) is merged as 272
Token (111, 114) is merged as 273
Token (111, 102) is merged as 274
Token (103, 117) is merged as 275
Token (101, 110) is merged as 276
Token (101, 110) is merged as 277
Token (100, 256) is merged as 278
Token (121, 32) is merged as 279
Token (104, 256) is merged as 280
Token (111, 117) is merged as 281
Token (97, 103) is merged as 282
Token (97, 103) is merged as 283
Token (117, 114) is merged as 284
Token (111, 32) is merged as 2

In [104]:
tokenizer.encoder(text)

TypeError: 'dict' object is not callable

In [105]:
tokenizer.decode([23,23])

KeyError: 272

In [110]:
# Create an instance of BPEncoder
bpe = BPEncoder(vocab_size=276)

# Fit the encoder to a given text
text = "Hello, how are you?"
bpe.fit(text)

# Encode text into BPE tokens
encoded_text = bpe.encode(text)
print("Encoded text:", encoded_text)

# Decode BPE tokens back into text
decoded_text = bpe.decode(encoded_text)
print("Decoded text:", decoded_text)


Encoded text: [256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273]
Decoded text: HeHelHellHelloHello,Hello, Hello, hHello, hoHello, howHello, how Hello, how aHello, how arHello, how areHello, how are Hello, how are yHello, how are yoHello, how are youHello, how are you?
