<a href="https://colab.research.google.com/github/OussamaHaff/machine-learning-upskilling/blob/main/02-llms-from-scratch/01-tokenising-text/simple_text_tokenisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Stats about text to tokenise

In [3]:
# Read the whole text file into memory as a single string.
with open("data/the-verdict.txt", mode="r", encoding="utf-8") as source_file:
    raw_text = source_file.read()

total_characters = len(raw_text)
print("Total number of characters:", total_characters)

Total number of characters: 20479


In [4]:
# Slice end is exclusive, so [:100] is what yields exactly the first 100 characters.
print("First 100 characters:\n", raw_text[:100])

First 100 characters:
 I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


### Basic Regex



*   Split text *on* whitespace characters (`\s` matches any whitespace character)

In [5]:
import re

text = "Hello, world. This, is a test."
# The capturing group makes re keep each whitespace separator in the result.
whitespace_splitter = re.compile(r'(\s)')
result_s_split_only = whitespace_splitter.split(text)
print(result_s_split_only)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


*   Split on whitespace (`\s`), comma (`,`) and period (`.`)

In [6]:
# Commas, periods and whitespace are captured, so they stay in the result;
# two adjacent delimiters leave an empty string between them.
punctuation_or_space = re.compile(r'([,.]|\s)')
result_punc_chars_split = punctuation_or_space.split(text)
print(result_punc_chars_split)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


*   Remove the empty strings and whitespace-only items

In [45]:
# Keep only the items that are non-empty once surrounding whitespace is removed.
result_strip_whitespace = list(filter(str.strip, result_punc_chars_split))
print(result_strip_whitespace)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


*    Split on all punctuation characters

In [46]:
text = "Hello, world. Is this-- a test?"
# No capturing group here, so the delimiters themselves are discarded.
delimiter_pattern = re.compile(r'[.,:;?_!"()\']|--|\s')
result_all_punc_chars_split = delimiter_pattern.split(text)
print(result_all_punc_chars_split)

# Trim every piece and drop the ones that end up empty.
trimmed_pieces = map(str.strip, result_all_punc_chars_split)
result_all_punc_chars_split_cleaned = [piece for piece in trimmed_pieces if piece]
print("Cleaned result:\n", result_all_punc_chars_split_cleaned)

['Hello', '', 'world', '', 'Is', 'this', '', 'a', 'test', '']
Cleaned result:
 ['Hello', 'world', 'Is', 'this', 'a', 'test']


# Basic tokenisation of sample data

In [47]:
# Split the raw text on punctuation, double dashes and whitespace
# (the delimiters are discarded because the pattern has no capturing group).
raw_pieces = re.compile(r'[.,:;?_!"()\']|--|\s').split(raw_text)
# Trim each piece and keep only the non-empty ones.
preprocessed = [piece for piece in map(str.strip, raw_pieces) if piece]
print("Number of tokens:", len(preprocessed))
print("First 33 tokens:\n", preprocessed[:33])

Number of tokens: 3788
First 33 tokens:
 ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', 'though', 'a', 'good', 'fellow', 'enough', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', 'in', 'the', 'height', 'of', 'his', 'glory', 'he']


# Basic vocabulary of sample data

In [48]:
# De-duplicate the tokens, then sort them to get the vocabulary word list.
vocab_words = list(set(preprocessed))
vocab_words.sort()
vocab_size = len(vocab_words)
print("Vocabulary size:", vocab_size)

Vocabulary size: 1118


### The actual vocabulary with IDs

In [49]:
# Map every vocabulary word to its position in the sorted word list.
vocab = dict(zip(vocab_words, range(len(vocab_words))))
# Preview the first 52 (token, id) pairs.
for entry in list(vocab.items())[:52]:
  print(entry)

('A', 0)
('Ah', 1)
('Among', 2)
('And', 3)
('Are', 4)
('Arrt', 5)
('As', 6)
('At', 7)
('Be', 8)
('Begin', 9)
('Burlington', 10)
('But', 11)
('By', 12)
('Carlo', 13)
('Chicago', 14)
('Claude', 15)
('Come', 16)
('Croft', 17)
('Destroyed', 18)
('Devonshire', 19)
('Don', 20)
('Dubarry', 21)
('Emperors', 22)
('Florence', 23)
('For', 24)
('Gallery', 25)
('Gideon', 26)
('Gisburn', 27)
('Gisburns', 28)
('Grafton', 29)
('Greek', 30)
('Grindle', 31)
('Grindles', 32)
('HAD', 33)
('Had', 34)
('Hang', 35)
('Has', 36)
('He', 37)
('Her', 38)
('Hermia', 39)
('His', 40)
('How', 41)
('I', 42)
('If', 43)
('In', 44)
('It', 45)
('Jack', 46)
('Jove', 47)
('Just', 48)
('Lord', 49)
('Made', 50)
('Miss', 51)


# Simple Text Tokeniser

In [50]:
class SimpleTextTokeniserV1:
  """
  Minimal word-level tokeniser backed by a fixed vocabulary.

  Text is split on punctuation, double dashes and whitespace; the
  delimiters themselves are discarded, so only the words in between
  are mapped to ids.
  """

  # Delimiters to split on. `_` is included so that this matches the
  # pattern used to build the vocabulary from the raw text; without it a
  # word attached to an underscore would never be found in the vocabulary.
  _SPLIT_PATTERN = re.compile(r'[.,:;?_!"()\']|--|\s')
  # Whitespace immediately followed by one of these punctuation characters.
  _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s+([,.?!()"\'])')

  def __init__(self, vocab):
    """
    Stores the token -> id vocabulary and builds the inverse id -> token map.

    Args:
      vocab: dict mapping each token string to its integer id.
    """
    self.str_to_int = vocab
    self.int_to_str = { i:s for s,i in vocab.items() }

  def encode(self, text):
    """
    Converts a text into a list of token ids.

    The text is split on the delimiter pattern, every piece is stripped
    and empty pieces are dropped, then each remaining token is looked up
    in the vocabulary.

    Args:
      text: the string to tokenise.

    Returns:
      List of integer ids, one per token, in order of appearance.

    Raises:
      KeyError: if a token is not present in the vocabulary.
    """
    pieces = self._SPLIT_PATTERN.split(text)
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self, ids):
    """
    Converts a list of token ids back into a single string.

    Tokens are joined with single spaces. Since `encode` discards
    punctuation, the result does not restore the original punctuation.

    Args:
      ids: iterable of integer ids.

    Returns:
      The space-joined token strings.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join([self.int_to_str[i] for i in ids])
    # Substitutes any occurrence of whitespace + punctuation with just the
    # punctuation char (only has an effect when the vocabulary itself
    # contains punctuation tokens).
    text = self._SPACE_BEFORE_PUNCTUATION.sub(r'\1', text)
    return text


### Applying the tokeniser on the text

#### Encode

In [51]:
tokeniser = SimpleTextTokeniserV1(vocab)
text = """
  "It's the last he painted, you know," Mrs. Gisburn said with pardonable pride.
  """
ids = tokeniser.encode(text)
print(ids)

[45, 838, 976, 590, 521, 734, 1114, 584, 56, 27, 839, 1096, 742, 781]


In [52]:
print(tokeniser.decode(ids))

It s the last he painted you know Mrs Gisburn said with pardonable pride
