# BYTE PAIR ENCODING

In [1]:
!pip install tiktoken




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.8.0


In [2]:
tokenizer = tiktoken.get_encoding("gpt2")
text = ("hello, do you like Tea? <|endoftext|> In the sunlight terraces""of someunknownPlace.")
integers = tokenizer.encode(text,allowed_special={"<|endoftext|>"})
print(integers)
print(tokenizer.decode(integers))

[31373, 11, 466, 345, 588, 15777, 30, 220, 50256, 554, 262, 19606, 8812, 2114, 1659, 617, 34680, 27271, 13]
hello, do you like Tea? <|endoftext|> In the sunlight terracesof someunknownPlace.


#INPUT TARGET DATA PAIRS

In [4]:
 with open("THE VERDICT.txt","r",encoding = "utf-8") as f:
     raw_text = f.read()
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

5774


In [5]:
enc_sample = enc_text[100:]

In [6]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size]
print(f"x: {x}")
print(f"y:        {y}")

x: [6001, 286, 465, 13476]
y:        [286, 465, 13476]


In [7]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context,"---->",desired)

[6001] ----> 286
[6001, 286] ----> 465
[6001, 286, 465] ----> 13476
[6001, 286, 465, 13476] ----> 1


In [8]:
for i in range(1,context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context),"---->",tokenizer.decode([desired]))

 height ---->  of
 height of ---->  his
 height of his ---->  glory
 height of his glory ----> "


In [9]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

Looking in indexes: https://download.pytorch.org/whl/cpu
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp313-cp313-win_amd64.whl.metadata (6.7 kB)
Using cached https://download.pytorch.org/whl/cpu/torchvision-0.21.0%2Bcpu-cp313-cp313-win_amd64.whl (1.6 MB)
Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.6.0%2Bcpu-cp313-cp313-win_amd64.whl (2.4 MB)
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-2.6.0+cpu torchvision-0.21.0+cpu



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self,txt,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt,allowed_special ={"<|endoftext|>"})

        for i in range(0,len(token_ids) - max_length ,stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i+1 : i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
    def __len__(self):
        return len(self.input_ids)
    def __getitem__ (self ,idx):
        return self.input_ids[idx],self.target_ids[idx]

            

In [5]:
def create_dataloader_v1(txt , batch_size = 4 , max_length = 256 , stride = 128 , shuffle = True , drop_last = True , num_workers =0 ):
    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt , tokenizer , max_length , stride)
    dataloader = DataLoader(dataset,batch_size=batch_size , shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

In [6]:
 with open("THE VERDICT.txt","r",encoding = "utf-8") as f:
     raw_text = f.read()

In [7]:
import torch 
print("pytorch version :-", torch.__version__)
dataloader = create_dataloader_v1(raw_text,batch_size =1,max_length=4 , stride=1,shuffle = False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

pytorch version :- 2.6.0+cpu
[tensor([[10970, 33310,    35, 18379]]), tensor([[33310,    35, 18379,   198]])]


In [8]:
dataloader = create_dataloader_v1(raw_text,batch_size =8 , max_length = 4 , stride =4, shuffle = False)
data_iter = iter(dataloader)
inputs,targets = next(data_iter)
print("Input:\n",inputs)
print("\nTargets:\n",targets)

Input:
 tensor([[10970, 33310,    35, 18379],
        [  198, 15749, 40417,   198],
        [  198,    40,   550,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  198,   198, 11274,  5891],
        [ 1576,   438,   568,   340]])

Targets:
 tensor([[33310,    35, 18379,   198],
        [15749, 40417,   198,   198],
        [   40,   550,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   198],
        [  198, 11274,  5891,  1576],
        [  438,   568,   340,   373]])


# TOKEN EMBEDDINGS 

In [9]:
input_ids = torch.tensor([2, 3, 5, 1])

In [10]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size , output_dim)

In [11]:
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [12]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [13]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


# POSITION EMBEDDING