<a href="https://colab.research.google.com/github/Rudra-rudie/llmbuilding/blob/main/llmtokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import requests

# Using a sample text directly for demonstration purposes
# This replaces the previous attempt to read a PDF, which resulted in garbled text.
raw_text = "This is a sample text for tokenization. It's designed to demonstrate how the tokenizer works. The quick brown fox jumps over the lazy dog."

print ("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 138
This is a sample text for tokenization. It's designed to demonstrate how the tokenizer works. The q


In [3]:
import re

text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [4]:
result = re.split(r'([,.]|\s)',text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [5]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [7]:
text = "Hello world. Is this-- a test?"
result = re.split(r'([,.:?_!"()\"]|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [8]:
result = [item for item in result if item.strip()]
print(result)

['Hello', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [32]:
preprocessed = re.split(r"([,.:;?_!\"()']|--|\s)", raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['This', 'is', 'a', 'sample', 'text', 'for', 'tokenization', '.', 'It', "'", 's', 'designed', 'to', 'demonstrate', 'how', 'the', 'tokenizer', 'works', '.', 'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [35]:
print(len(preprocessed))

29


In [36]:
all_words = sorted(set(preprocessed ))
vocab_size = len(all_words)

print (vocab_size)

26


In [37]:
vocab = {token: integer for integer,token in enumerate(all_words)}

In [38]:
for i , item in enumerate (vocab.items()):
  print(item)
  if i>= 50:
    break

("'", 0)
('.', 1)
('It', 2)
('The', 3)
('This', 4)
('a', 5)
('brown', 6)
('demonstrate', 7)
('designed', 8)
('dog', 9)
('for', 10)
('fox', 11)
('how', 12)
('is', 13)
('jumps', 14)
('lazy', 15)
('over', 16)
('quick', 17)
('s', 18)
('sample', 19)
('text', 20)
('the', 21)
('to', 22)
('tokenization', 23)
('tokenizer', 24)
('works', 25)


In [40]:
class SimpleTokenizer:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {i: s for s, i in vocab.items()}

  def encode(self, text):
    # Corrected regex to be consistent with the vocabulary generation
    # Using double-quoted raw string for better handling of literal single quotes
    preprocessed = re.split(r"([,.:;?_!\"()']|--|\s)", text)

    preprocessed = [
        item.strip() for item in preprocessed if item.strip()
    ]
    ids = [self.str_to_int[s] for s in preprocessed ]
    return ids

  def decode(self, ids):
    text = " ".join([self.int_to_str[i]for i in ids])
    # Adjusted regex for decode to match the encoding pattern for punctuation
    # Corrected escaping of ')' inside character class
    text =  re.sub(r"\s+([,.:;?_!\"()'])", r"\1", text)

    return text

In [41]:
tokenizer = SimpleTokenizer(vocab)

# Using raw_text itself to ensure all tokens are in the vocabulary
text = raw_text
ids = tokenizer.encode(text)
print(ids)

decoded_text = tokenizer.decode(ids)
print(f"\nEncoded IDs: {ids}")
print(f"Decoded Text: {decoded_text}")

# Optional: Test with the original text from the problem statement (will still KeyError without expanding vocab)
# text_from_problem =""" It's the last he painted you know,"
#            Mrs. Gisburn said with pardonable pride."""
# try:
#     ids_problem = tokenizer.encode(text_from_problem)
#     print(f"\nIDs from problem text: {ids_problem}")
# except KeyError as e:
#     print(f"\nKeyError for original problem text: {e}. Tokens not in current vocabulary.")

[4, 13, 5, 19, 20, 10, 23, 1, 2, 0, 18, 8, 22, 7, 12, 21, 24, 25, 1, 3, 17, 6, 11, 14, 16, 21, 15, 9, 1]

Encoded IDs: [4, 13, 5, 19, 20, 10, 23, 1, 2, 0, 18, 8, 22, 7, 12, 21, 24, 25, 1, 3, 17, 6, 11, 14, 16, 21, 15, 9, 1]
Decoded Text: This is a sample text for tokenization. It' s designed to demonstrate how the tokenizer works. The quick brown fox jumps over the lazy dog.


In [42]:
tokenizer.decode(ids)

"This is a sample text for tokenization. It' s designed to demonstrate how the tokenizer works. The quick brown fox jumps over the lazy dog."

In [43]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer,token in enumerate(all_tokens)}




In [44]:
len(vocab.items())

28

In [46]:
for i , item in enumerate(list(vocab.items())[-5]):
  print(item)

tokenization
23


In [50]:
class SimmpleTokenizerV2:
  def __init__(self, vocab):
    self.str_to_int = vocab
    self.int_to_str = {v: k for k, v in vocab.items()}

  def encode(self, text):
    preprocessed = re.split(r'([,.:;?_!"()\\]|--|\s)', text)
    preprocessed = [
                    item if item in self.str_to_int
                    else"<|unk|>" for item in preprocessed
    ]

    ids = [self.str_to_int[item] for item in preprocessed]
    return ids

  def decode(self,ids):
    text = " ".join([self.int_to_str[i] for i in ids])

    text = re.sub(r'\s+([,.:;?!"()\\])',r'\1',text)
    return text

In [51]:
tokenizer = SimmpleTokenizerV2(vocab)

text1 = "Hello , do u like tea?"
text2 = "IN the sunlit terraces of the palace."

text = "<|endoftext|>".join ((text1,text2))

print(text)

Hello , do u like tea?<|endoftext|>IN the sunlit terraces of the palace.


In [52]:
tokenizer.encode(text)

[27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 21,
 27,
 27,
 27,
 27,
 27,
 27,
 27,
 21,
 27,
 27,
 1,
 27]

In [53]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> the <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> the <|unk|> <|unk|>. <|unk|>'