### Reading in a short story as text sample into python

In [68]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  287,   262,  6001,   286],
        [  465, 13476,    11,   339],
        [  550,  5710,   465, 12036],
        [   11,  6405,   257,  5527],
        [27075,    11,   290,  4920],
        [ 2241,   287,   257,  4489],
        [   64,   319,   262, 34686],
        [41976,    13,   357, 10915]]), tensor([[  262,  6001,   286,   465],
        [13476,    11,   339,   550],
        [ 5710,   465, 12036,    11],
        [ 6405,   257,  5527, 27075],
        [   11,   290,  4920,  2241],
        [  287,   257,  4489,    64],
        [  319,   262, 34686, 41976],
        [   13,   357, 10915,   314]])]


### Effect of batch size

In [69]:
dataloader = create_dataloader_v1(
    raw_text,batch_size = 8, max_length = 4, stride = 4, shuffle = False
)

data_iter = iter(dataloader)

first_input , first_output = next(data_iter)


print("First input ------->")
print(first_input)
print("First output ------->")
print(first_output)

First input ------->
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
First output ------->
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Step 1 : Creating Tokens

In [70]:
with open('Data/the-verdict.txt','r',encoding='utf-8') as f:
    raw_text = f.read()


print("Total No of character:", len(raw_text))
print(raw_text[:99])

Total No of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [71]:
import re

text = "Hello, world. This, is a test."

result = re.split(r'([,.]|\s)',text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [72]:
result = [item for item in result if item.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [73]:
text = "Hello, world. This, is -- a test."

result = re.split(r'([,.?!:;"\/|]|--|\s)',text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', '', '--', '', ' ', 'a', ' ', 'test', '.', '']


In [74]:
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [75]:
print(len(preprocessed))

4690


### Step 2: Converting token to ID

In [76]:
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [77]:
vocab = {token:integer for integer,token in enumerate(all_words)}

In [78]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)
('Devonshire', 30)
('Don', 31)
('Dubarry', 32)
('Emperors', 33)
('Florence', 34)
('For', 35)
('Gallery', 36)
('Gideon', 37)
('Gisburn', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


In [79]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [80]:
tokenizer = SimpleTokenizerV1(vocab)

text = """"It's the last he painted, you know,"
        Mrs. Gisburn said with pardonable pride."""

ids = tokenizer.encode(text)
print(ids)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


In [81]:
tokenizer.decode(ids)

'" It\' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.'

In [82]:
text = "Hello, Do you like Tea?"
print(tokenizer.encode(text))

KeyError: 'Hello'

### Adding Special Context Tokens

In [None]:
all_tokens = sorted(list(set(preprocessed)))

all_tokens.extend(["<|endoftext|>","<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}

In [None]:
len(vocab.items())

1132

In [None]:
for i,item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [None]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
                                
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]

        preprocessed =[
            item if item in self.str_to_int
            else "<|unk|>" for item in preprocessed
        ]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer2 = SimpleTokenizerV2(vocab)

text1 = "Hello, Do you like Tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1,text2))

print(text)

Hello, Do you like Tea? <|endoftext|> In the sunlit terraces of the palace.


In [None]:
print(tokenizer2.encode(text))

[1131, 5, 1131, 1126, 628, 1131, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [None]:
tokenizer2.decode(tokenizer2.encode(text))

'<|unk|>, <|unk|> you like <|unk|>? <|endoftext|> In the sunlit terraces of the <|unk|>.'

### Byte Pair Encoding

In [None]:
!pip install tiktoken



In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)


integers = tokenizer.encode(text,allowed_special={'<|endoftext|>'})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [None]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


### Let us take another simple example to illustrate how the BPE tokenizer deal with random words

In [None]:
integers = tokenizer.encode("Hello @qwekfjokgokplfklokfokfo")
print(integers)


strings = tokenizer.decode(integers)
print(strings)

[15496, 2488, 80, 732, 74, 69, 73, 482, 70, 482, 489, 69, 41582, 482, 69, 482, 6513]
Hello @qwekfjokgokplfklokfokfo


### IMPLEMENTING A DATA LOADER

In [None]:
from torch.utils.data import Dataset,DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,txt, tokenizer,max_length, stride):
        self.input_ids = []
        self.target_ids = []


        token_ids = tokenizer.encode(txt, allowed_special={"|<endoftext>|"})


        for i in range(0,len(token_ids)-max_length,stride):
            input_chunks = token_ids[i:i+max_length]
            target_chunks = token_ids[i+1:i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunks))
            self.target_ids.append(torch.tensor(target_chunks))

    
    def __len__(self):
        return len(self.input_ids)
    

    def __getitem__(self,idx):
        return self.input_ids[idx], self.target_ids[idx]

## Creating Data Loader

In [None]:
def create_dataloader_v1(text,batch_size=4,max_length =256,
                        stride = 128, shuffle = True, drop_last = True,
                        num_workers = 0):


    tokenizer = tiktoken.get_encoding('gpt2')

    # create dataset
    dataset = GPTDatasetV1(text, tokenizer,max_length, stride)

    # create dataloader

    dataloader = DataLoader(
        dataset,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop_last,
        num_workers = num_workers
    )

    return dataloader

In [None]:
import torch
dataloader = create_dataloader_v1(
    raw_text,batch_size = 1, max_length = 4, stride = 1, shuffle = False
)


data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [None]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


### Effect of batch size

In [None]:
dataloader = create_dataloader_v1(
    raw_text,batch_size = 8, max_length = 4, stride = 4, shuffle = False
)

data_iter = iter(dataloader)

first_input , first_output = next(data_iter)


print("First input ------->")
print(first_input)
print("First output ------->")
print(first_output)

First input ------->
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
First output ------->
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


### Token Embedding

In [83]:
input_ids = torch.tensor([2, 3, 5, 1])

<div class="alert alert-block alert-success">
For the sake of simplicity and illustration purposes, suppose we have a small vocabulary of only 6 words (instead of the 50,257 words in the BPE tokenizer vocabulary), and we want to create embeddings of size 3 (in GPT-3, the embedding size is 12,288 dimensions):

Using the vocab_size and output_dim, we can instantiate an embedding layer in PyTorch, setting the random seed to 123 for reproducibility purposes:
</div>

<div class="alert alert-block alert-success">
    
Using the vocab_size and output_dim, we can instantiate an embedding layer in PyTorch,
setting the random seed to 123 for reproducibility purposes:

</div>

In [84]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

<div class="alert alert-block alert-info">
    
The print statement in the code prints the embedding layer's underlying
weight matrix:
    
</div>

In [85]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


<div class="alert alert-block alert-info">
    
We can see that the weight matrix of the embedding layer contains small, random values.
These values are optimized during LLM training as part of the LLM optimization itself, as we
will see in upcoming chapters. Moreover, we can see that the weight matrix has six rows
and three columns. There is one row for each of the six possible tokens in the vocabulary.
And there is one column for each of the three embedding dimensions.
    
</div>

<div class="alert alert-block alert-info">
    
Each row in this output matrix is obtained via a lookup operation from the embedding
weight matrix
    
</div>

In [86]:
print(embedding_layer(torch.tensor([3])))


tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


### POSITIONAL EMBEDDINGS (ENCODING WORD POSITIONS)

<div class="alert alert-block alert-success">

Previously, we focused on very small embedding sizes in this chapter for illustration
purposes. 

We now consider more realistic and useful embedding sizes and encode the input
tokens into a 256-dimensional vector representation. 

This is smaller than what the original
GPT-3 model used (in GPT-3, the embedding size is 12,288 dimensions) but still reasonable
for experimentation. 

Furthermore, we assume that the token IDs were created by the BPE
tokenizer that we implemented earlier, which has a vocabulary size of 50,257:

</div>

In [90]:
vocab_size = 50257
output_dim = 256

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

<div class="alert alert-block alert-info">
    
Using the token_embedding_layer above, if we sample data from the data loader, we
embed each token in each batch into a 256-dimensional vector. If we have a batch size of 8
with four tokens each, the result will be an 8 x 4 x 256 tensor.
    
</div>

<div class="alert alert-block alert-success">

Let's instantiate the data loader ( Data sampling with a sliding window),
first:

</div>

In [87]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [88]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


<div class="alert alert-block alert-info">
    
As we can see, the token ID tensor is 8x4-dimensional, meaning that the data batch
consists of 8 text samples with 4 tokens each.
    
</div>

<div class="alert alert-block alert-success">

Let's now use the embedding layer to embed these token IDs into 256-dimensional
vectors:

</div>

In [91]:
token_embeddings = embedding_layer(inputs)
print("\nToken embedding:\n", token_embeddings)
print("\nToken embedding shape:\n", token_embeddings.shape)


Token embedding:
 tensor([[[ 1.4588, -0.3653, -0.0404,  ..., -0.1445,  0.3833,  0.3091],
         [-0.2655, -0.2798, -2.5196,  ..., -1.1844,  2.7038,  0.9186],
         [ 0.6687,  0.7299,  0.3889,  ..., -0.0528,  0.0065, -1.1664],
         [ 1.6191,  0.9051, -0.4884,  ...,  0.3483, -0.6136,  0.3884]],

        [[-0.7848,  0.0632, -0.3674,  ..., -0.3421, -0.4469, -1.5079],
         [-0.3812, -0.5149,  0.1585,  ...,  0.6091,  0.0865,  0.4503],
         [-0.1415,  1.5888,  0.0494,  ...,  1.4478, -0.3205,  1.4569],
         [-1.6207, -0.8156, -0.3197,  ...,  0.3969,  0.8440,  0.0444]],

        [[ 0.1663, -0.8918,  3.2887,  ...,  0.7426, -0.5492,  2.1841],
         [ 1.2922,  1.1209, -0.9461,  ...,  0.3205, -0.2395, -0.9524],
         [ 0.5750,  0.0232, -0.1438,  ..., -0.3635, -0.1418,  1.2569],
         [ 0.5686,  0.3034,  1.3640,  ...,  0.8473, -1.2123, -0.6241]],

        ...,

        [[-2.6437,  0.7465,  0.2959,  ...,  1.6383, -0.0200, -0.6255],
         [-1.3163, -1.2242,  0.4636,  

<div class="alert alert-block alert-info">
    
As we can tell based on the 8x4x256-dimensional tensor output, each token ID is now
embedded as a 256-dimensional vector.
    
</div>

<div class="alert alert-block alert-success">

For a GPT model's absolute embedding approach, we just need to create another
embedding layer that has the same dimension as the token_embedding_layer:

</div>

In [92]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [93]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


<div class="alert alert-block alert-info">
    
As shown in the preceding code example, the input to the pos_embeddings is usually a
placeholder vector torch.arange(context_length), which contains a sequence of
numbers 0, 1, ..., up to the maximum input length âˆ’ 1. 

The context_length is a variable
that represents the supported input size of the LLM. 

Here, we choose it similar to the
maximum length of the input text. 

In practice, input text can be longer than the supported
context length, in which case we have to truncate the text.
    
</div>

<div class="alert alert-block alert-info">
    
As we can see, the positional embedding tensor consists of four 256-dimensional vectors.
We can now add these directly to the token embeddings, where PyTorch will add the 4x256-
dimensional pos_embeddings tensor to each 4x256-dimensional token embedding tensor in
each of the 8 batches:
    
</div>

In [94]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


<div class="alert alert-block alert-warning">

The input_embeddings we created are the embedded input
examples that can now be processed by the main LLM modules
    
</div>