## Reading in a book as a text sample into Python.

## Step 1: Creating Tokens

In [1]:
with open("Harrypot/03 Harry Potter and the Prisoner of Azkaban.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
    
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 625735
Harry Potter was a highly unusual boy in many ways. For one thing, he hated the summer holidays mor


In [2]:
import re

text = "Harry Potter was a highly unusual boy in many ways. For one thing, he hated the summer holidays"
result = re.split(r'(\s)', text)

print(result)

['Harry', ' ', 'Potter', ' ', 'was', ' ', 'a', ' ', 'highly', ' ', 'unusual', ' ', 'boy', ' ', 'in', ' ', 'many', ' ', 'ways.', ' ', 'For', ' ', 'one', ' ', 'thing,', ' ', 'he', ' ', 'hated', ' ', 'the', ' ', 'summer', ' ', 'holidays']


In [3]:
result = re.split(r'([,.]|\s)', text)

print(result)

['Harry', ' ', 'Potter', ' ', 'was', ' ', 'a', ' ', 'highly', ' ', 'unusual', ' ', 'boy', ' ', 'in', ' ', 'many', ' ', 'ways', '.', '', ' ', 'For', ' ', 'one', ' ', 'thing', ',', '', ' ', 'he', ' ', 'hated', ' ', 'the', ' ', 'summer', ' ', 'holidays']


In [4]:
result = [item for item in result if item.strip()]
print(result)

['Harry', 'Potter', 'was', 'a', 'highly', 'unusual', 'boy', 'in', 'many', 'ways', '.', 'For', 'one', 'thing', ',', 'he', 'hated', 'the', 'summer', 'holidays']


In [5]:
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [6]:
# Strip whitespace from each item and drop items that are empty after stripping
# (the same clean-up step used right after every re.split call in this notebook).
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [7]:
text = "Hello, world. Is this-- a test?"

result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [item.strip() for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [8]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['Harry', 'Potter', 'was', 'a', 'highly', 'unusual', 'boy', 'in', 'many', 'ways', '.', 'For', 'one', 'thing', ',', 'he', 'hated', 'the', 'summer', 'holidays', 'more', 'than', 'any', 'other', 'time', 'of', 'year', '.', 'For', 'another']


In [9]:
print(len(preprocessed))


130162


## Step 2: Creating Token IDs

In [10]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

9460


In [11]:
vocab = {token:integer for integer,token in enumerate(all_words)}


In [12]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('(', 1)
(')', 2)
('*', 3)
(',', 4)
('.', 5)
('0-150', 6)
('1', 7)
('125', 8)
('1296', 9)
('1612', 10)
('1722', 11)
('20th', 12)
('9', 13)
(':', 14)
(';', 15)
('?', 16)
('A', 17)
('ABOUT', 18)
('ADVERTISE', 19)
('AFTER', 20)
('AGAIN', 21)
('ALL', 22)
('ALLOWED', 23)
('AND', 24)
('ANIMAL', 25)
('AN’', 26)
('APPARATE', 27)
('ARE', 28)
('AROUND', 29)
('AS', 30)
('AT', 31)
('Abandoning', 32)
('Abbott', 33)
('Aberdeen', 34)
('Abergavenny', 35)
('About', 36)
('Abruptly', 37)
('Accidental', 38)
('According', 39)
('Acid', 40)
('Adalbert', 41)
('After', 42)
('Again', 43)
('Against', 44)
('Agriculture', 45)
('Ah', 46)
('Ahead', 47)
('Aids', 48)
('Albus', 49)
('Alicia', 50)


In [13]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every resulting token must already exist in the
    vocabulary; unknown tokens are not handled and raise ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the inverse id -> token map.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return their ids as a list.

        Raises:
            KeyError: if any token is missing from the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)

        # Drop the whitespace separators and the empty strings re.split emits
        # between two adjacent delimiters.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and re-attach punctuation.

        Raises:
            KeyError: if any id is missing from the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join inserted before punctuation.
        # ':' and ';' are included because encode() splits them off as
        # separate tokens; without them "a; b" would round-trip as "a ; b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [16]:
tokenizer = SimpleTokenizerV1(vocab)

text = """Harry Potter was a highly unusual boy in many ways. For one thing, he hated the summer holidays."""
ids = tokenizer.encode(text)
print(ids)

[602, 1016, 8290, 1516, 4390, 8129, 2186, 4581, 5127, 8315, 5, 477, 5568, 7721, 4, 4307, 4291, 7695, 7463, 4421, 5]


In [17]:
tokenizer.decode(ids)


'Harry Potter was a highly unusual boy in many ways. For one thing, he hated the summer holidays.'

In [19]:
text = "Hello!; Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal"
print(tokenizer.encode(text))

KeyError: 'Hello'

<div class="alert alert-block alert-info">
    
The problem is that the word "Hello" does not appear in the book that the vocabulary was built from.

Hence, it
is not contained in the vocabulary. 

This highlights the need to consider large and diverse
training sets to extend the vocabulary when working on LLMs.

</div>

### ADDING SPECIAL CONTEXT TOKENS

In the previous section, we implemented a simple tokenizer and applied it to a passage
from the training set. 

In this section, we will modify this tokenizer to handle unknown
words.


In particular, we will modify the vocabulary and the tokenizer we implemented in the
previous section, SimpleTokenizerV1, into a new SimpleTokenizerV2 that supports two new tokens, <|unk|> and
<|endoftext|>.

<div class="alert alert-block alert-warning">

We can modify the tokenizer to use an <|unk|> token if it
encounters a word that is not part of the vocabulary. 

Furthermore, we add a token between
unrelated texts. 

For example, when training GPT-like LLMs on multiple independent
documents or books, it is common to insert a token before each document or book that
follows a previous text source

</div>



<div class="alert alert-block alert-success">

Let's now modify the vocabulary to include these two special tokens, <|unk|> and
<|endoftext|>, by adding these to the list of all unique words that we created in the
previous section:
</div>

In [20]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [21]:
len(vocab.items())


9462

<div class="alert alert-block alert-info">
    
Based on the output of the cell above, the new vocabulary size is 9462 (the
vocabulary size in the previous section was 9460).

</div>



<div class="alert alert-block alert-success">

As an additional quick check, let's print the last 5 entries of the updated vocabulary:
</div>

In [22]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('…', 9457)
('…‘He’s', 9458)
('…’', 9459)
('<|endoftext|>', 9460)
('<|unk|>', 9461)


<div class="alert alert-block alert-success">

A simple text tokenizer that handles unknown words</div>



<div class="alert alert-block alert-info">
    
Step 1: Replace unknown words by <|unk|> tokens
    
Step 2: Replace spaces before the specified punctuations

</div>


In [23]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary passed in is expected to contain an ``"<|unk|>"`` entry;
    it is looked up whenever a token is not found.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token ids for ``text``.

        The text is split on whitespace, ``--`` and common punctuation.
        Tokens that are not in the vocabulary are replaced by ``<|unk|>``
        before the id lookup.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace separators and empty split artefacts carry no token.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by spaces, with the
        space before punctuation removed."""
        joined = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [27]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "HelloGuys!; Mrs. Dursley, of number four, Privet Drive, were hello123test proud to say that they were perfectly normal"

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> HelloGuys!; Mrs. Dursley, of number four, Privet Drive, were hello123test proud to say that they were perfectly normal


In [28]:
tokenizer.encode(text)


[9461,
 4,
 3158,
 8615,
 4962,
 7628,
 16,
 9460,
 9461,
 0,
 15,
 860,
 5,
 359,
 4,
 5539,
 5511,
 3872,
 4,
 1033,
 349,
 4,
 8353,
 9461,
 6083,
 7822,
 6556,
 7690,
 7709,
 8353,
 5790,
 5482]

In [29]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> <|unk|>!; Mrs. Dursley, of number four, Privet Drive, were <|unk|> proud to say that they were perfectly normal'


<div class="alert alert-block alert-info">
    
Based on comparing the de-tokenized text above with the original input text, we know that
the training dataset, Harry Potter and the Prisoner of Azkaban, did not contain the words
"Hello", "HelloGuys", and "hello123test".

</div>


<div class="alert alert-block alert-warning">

So far, we have discussed tokenization as an essential step in processing text as input to
LLMs. Depending on the LLM, some researchers also consider additional special tokens such
as the following:

[BOS] (beginning of sequence): This token marks the start of a text. It
signifies to the LLM where a piece of content begins.

[EOS] (end of sequence): This token is positioned at the end of a text,
and is especially useful when concatenating multiple unrelated texts,
similar to <|endoftext|>. For instance, when combining two different
Wikipedia articles or books, the [EOS] token indicates where one article
ends and the next one begins.

[PAD] (padding): When training LLMs with batch sizes larger than one,
the batch might contain texts of varying lengths. To ensure all texts have
the same length, the shorter texts are extended or "padded" using the
[PAD] token, up to the length of the longest text in the batch.

</div>


<div class="alert alert-block alert-warning">

Note that the tokenizer used for GPT models does not need any of these tokens mentioned
above but only uses an <|endoftext|> token for simplicity

</div>

<div class="alert alert-block alert-warning">

The tokenizer used for GPT models also doesn't use an <|unk|> token for out-of-vocabulary words. Instead, GPT models use a byte pair encoding tokenizer, which breaks
down words into subword units.
</div>

### BYTE PAIR ENCODING


**BPE Tokenizer**

In [30]:
! pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-win_amd64.whl.metadata (6.8 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl.metadata (41 kB)
Downloading tiktoken-0.9.0-cp311-cp311-win_amd64.whl (893 kB)
   ---------------------------------------- 0.0/893.9 kB ? eta -:--:--
   ---------------------------------------- 893.9/893.9 kB 4.5 MB/s eta 0:00:00
Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl (274 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.11.6 tiktoken-0.9.0


In [None]:
# Import the submodule explicitly: a bare `import importlib` does not
# guarantee that `importlib.metadata` has been loaded.
import importlib.metadata
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Adjacent string literals are concatenated as-is, so the first one must end
# with a space; otherwise the text reads "terracesof".
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

In [None]:
strings = tokenizer.decode(integers)

print(strings)

**Exercise 2.1**

In [None]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

**Data sampling with sliding window**

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

In [None]:
enc_sample = enc_text[50:]


In [None]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

**IMPLEMENTING A DATA LOADER**

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Dataset of sliding-window (input, target) token-id tensor pairs.

    The text is tokenized once. Windows of ``max_length`` ids are taken every
    ``stride`` positions, and each target is its input window shifted one
    position to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-compute every input/target window.

        Args:
            txt: the text to tokenize.
            tokenizer: object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token ids.
            max_length: number of token ids per window.
            stride: offset between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop max_length before the end so that every target window, which
        # reaches one id further than its input, is still full length.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensors of window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; the remaining arguments are forwarded to ``DataLoader``.

    Args:
        txt: the text to tokenize.
        batch_size: passed to ``DataLoader``.
        max_length: window length in tokens, passed to ``GPTDatasetV1``.
        stride: offset between windows, passed to ``GPTDatasetV1``.
        shuffle: passed to ``DataLoader``.
        drop_last: passed to ``DataLoader``.
        num_workers: passed to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

In [None]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [None]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

In [None]:
second_batch = next(data_iter)
print(second_batch)

In [None]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

**CREATE TOKEN EMBEDDINGS**

In [None]:
input_ids = torch.tensor([2, 3, 5, 1])


In [None]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
print(embedding_layer.weight)


In [None]:
print(embedding_layer(torch.tensor([3])))


In [None]:
print(embedding_layer(input_ids))


**POSITIONAL EMBEDDINGS (ENCODING WORD POSITIONS)**

In [None]:
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [None]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

In [None]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

In [None]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [None]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

In [None]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)