In [1]:
import re

import torch

# Pick the compute device once: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Number of GPU: ", torch.cuda.device_count())
# get_device_name() raises on machines without a usable CUDA device, so it is
# only queried when a GPU was actually selected above.
if device.type == 'cuda':
    print("GPU Name: ", torch.cuda.get_device_name())
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 4070 Laptop GPU
Using device: cuda


## Reading in a short story as text sample into Python.

## Step 1: Creating Tokens

In [7]:
# Load the whole short story into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("Total no. of characters :", len(raw_text), "\n")
# The slice end is exclusive, so [:100] yields exactly the 100 characters the
# label promises (the previous [:101] printed one character too many).
print("Sample of first 100 characters :\n" + raw_text[:100])

Total no. of characters : 20479 

Sample of first 100 characters :
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no gr


<h2>Testing splitting logic</h2>

In [66]:
# Sample sentence for the splitting experiment. It is defined in this cell so
# the cell does not rely on a `text` variable left over from another cell.
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace."

# The capturing group keeps the delimiters (comma, period, whitespace) in the
# output list instead of dropping them.
result = re.split(r'([,.]|\s)', text)
print(result)

['Hello', ',', '', ' ', 'do', ' ', 'you', ' ', 'like', ' ', 'tea?', ' ', '<|endoftext|>', ' ', 'In', ' ', 'the', ' ', 'sunlit', ' ', 'terraces', ' ', 'of', ' ', 'the', ' ', 'palace', '.', '']


In [67]:
# Discard the empty strings and whitespace-only pieces produced by the split;
# str.strip returns an empty (falsy) string for exactly those entries.
result = list(filter(str.strip, result))
print(result)

['Hello', ',', 'do', 'you', 'like', 'tea?', '<|endoftext|>', 'In', 'the', 'sunlit', 'terraces', 'of', 'the', 'palace', '.']


In [68]:
# Richer delimiter set: common punctuation, double dashes and whitespace.
text = "Hello, World. Is this a test--text ??"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)

print("Before removing white space")
print(result)

print("After removing white space")
# Strip every piece first, then keep only the ones that are still non-empty.
result = [piece for piece in map(str.strip, result) if piece]
print(result)

Before removing white space
['Hello', ',', '', ' ', 'World', '.', '', ' ', 'Is', ' ', 'this', ' ', 'a', ' ', 'test', '--', 'text', ' ', '', '?', '', '?', '']
After removing white space
['Hello', ',', 'World', '.', 'Is', 'this', 'a', 'test', '--', 'text', '?', '?']


In [69]:
# Tokenize the whole story: split on punctuation, double dashes and whitespace
# (delimiters are kept by the capturing group), strip each piece and drop the
# ones that end up empty.
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text))
    if piece
]
print(preprocessed[:101])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter', '--', 'deploring']


In [70]:
# Total number of tokens produced from the story (duplicates included).
len(preprocessed)

4690

## Step 2: Creating Token IDs

In [71]:
# The vocabulary is every distinct token, arranged in sorted order.
all_unique_words = list(set(preprocessed))
all_unique_words.sort()

vocab_size = len(all_unique_words)
print("vocabulary size :", vocab_size)

vocabulary size : 1130


In [72]:
# Token -> id mapping: a token's id is its position in the sorted word list.
vocab = dict(zip(all_unique_words, range(len(all_unique_words))))

# Preview the mapping as [token, id] pairs; only the first 52 are displayed.
vocab_print = [list(entry) for entry in vocab.items()]
vocab_print[:52]

[['!', 0],
 ['"', 1],
 ["'", 2],
 ['(', 3],
 [')', 4],
 [',', 5],
 ['--', 6],
 ['.', 7],
 [':', 8],
 [';', 9],
 ['?', 10],
 ['A', 11],
 ['Ah', 12],
 ['Among', 13],
 ['And', 14],
 ['Are', 15],
 ['Arrt', 16],
 ['As', 17],
 ['At', 18],
 ['Be', 19],
 ['Begin', 20],
 ['Burlington', 21],
 ['But', 22],
 ['By', 23],
 ['Carlo', 24],
 ['Chicago', 25],
 ['Claude', 26],
 ['Come', 27],
 ['Croft', 28],
 ['Destroyed', 29],
 ['Devonshire', 30],
 ['Don', 31],
 ['Dubarry', 32],
 ['Emperors', 33],
 ['Florence', 34],
 ['For', 35],
 ['Gallery', 36],
 ['Gideon', 37],
 ['Gisburn', 38],
 ['Gisburns', 39],
 ['Grafton', 40],
 ['Greek', 41],
 ['Grindle', 42],
 ['Grindles', 43],
 ['HAD', 44],
 ['Had', 45],
 ['Hang', 46],
 ['Has', 47],
 ['He', 48],
 ['Her', 49],
 ['Hermia', 50],
 ['His', 51]]

<h2>Tokenizer v1</h2>
<h3>Create a class that stores a vocabulary, with an encode method that converts text into token IDs and a decode method that takes a list of IDs and returns a string</h3>

In [73]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on punctuation, double dashes and whitespace; every
    resulting token must already be a key of the vocabulary.
    """

    # Capturing group so the delimiters themselves are kept as tokens.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace directly in front of a punctuation mark. ':' and ';' are
    # included because encode() emits them as separate tokens too; without
    # them decode() would leave a stray space before those marks.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {val: key for key, val in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if a token of ``text`` is not in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        # Empty and whitespace-only pieces carry no token and are skipped.
        return [self.str_to_int[piece] for piece in pieces if piece]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed. Spaces *after* a token are kept, so the
        result is not always identical to the text that was encoded.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [74]:
# Encode a passage from the training text with the V1 tokenizer; every token
# is in the vocabulary, so encoding succeeds and prints the list of ids.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
           Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [75]:
# Map the ids back to text to inspect the encode/decode round trip.
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [50]:
text = "Hello, do you like tea?"
# The failure is the point of this cell, so it is caught and printed rather
# than left to abort a "Restart & Run All".
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

<h3>This error occurs because the tokenizer raises a KeyError when it encounters a word that is not in the vocabulary</h3>

### ADDING SPECIAL CONTEXT TOKENS

In the previous section, we implemented a simple tokenizer and applied it to a passage
from the training set. 

In this section, we will modify this tokenizer to handle unknown
words.


In particular, we will modify the vocabulary and the tokenizer we implemented in the
previous section into a new class, SimpleTokenizerV2, that supports two new tokens, <|unk|> (unknown word) and
<|endoftext|>.

In [76]:
# Sorted vocabulary plus the two special tokens; appending them last gives
# them the two highest ids.
all_tokens = sorted(set(preprocessed)) + ['<|endoftext|>', '<|unk|>']

vocab = dict(zip(all_tokens, range(len(all_tokens))))
len(vocab)

1132

<h3>The new vocabulary size is now 1132, including the two new tokens</h3>

In [61]:
# Show the last five vocabulary entries in insertion order.
for token in list(vocab)[-5:]:
    print(token, ":", vocab[token])

younger : 1127
your : 1128
yourself : 1129
<|endoftext|> : 1130
<|unk|> : 1131



Step 1: Replace unknown words with <|unk|> tokens

Step 2: Remove spaces before the specified punctuation marks

If a token is not in the vocabulary, we replace it with the <|unk|> token.


In [62]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``."""

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and derive the reverse id -> token table."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the token ids for ``text``; unknown tokens become ``<|unk|>``."""
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece: nothing to encode.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``, without spaces in front of punctuation."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Drop the whitespace that the join put before punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [63]:
# Two unrelated snippets, glued together with the <|endoftext|> separator.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])

# From here on, use the tokenizer that substitutes <|unk|> for unknown words.
tokenizer = SimpleTokenizerV2(vocab)
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [64]:
# Words missing from the vocabulary are encoded as the id of <|unk|>.
tokenizer.encode(text)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]

In [65]:
# Round trip: tokens replaced by <|unk|> during encoding stay <|unk|> when decoded.
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

Based on comparing the de-tokenized text above with the original input text, we know that
the training dataset, Edith Wharton's short story The Verdict, did not contain the words
"Hello" and "palace."

So far, we have discussed tokenization as an essential step in processing text as input to
LLMs. Depending on the LLM, some researchers also consider additional special tokens such
as the following:

[BOS] (beginning of sequence): This token marks the start of a text. It
signifies to the LLM where a piece of content begins.

[EOS] (end of sequence): This token is positioned at the end of a text,
and is especially useful when concatenating multiple unrelated texts,
similar to <|endoftext|>. For instance, when combining two different
Wikipedia articles or books, the [EOS] token indicates where one article
ends and the next one begins.

[PAD] (padding): When training LLMs with batch sizes larger than one,
the batch might contain texts of varying lengths. To ensure all texts have
the same length, the shorter texts are extended or "padded" using the
[PAD] token, up to the length of the longest text in the batch.


Note that the tokenizer used for GPT models does not need any of the tokens mentioned
above; it only uses an <|endoftext|> token for simplicity.


The tokenizer used for GPT models also doesn't use an <|unk|> token for out-of-vocabulary words. Instead, GPT models use a byte pair encoding tokenizer, which breaks
down words into subword units.
