In [1]:
from importlib.metadata import version
from torch.utils.data import Dataset,DataLoader
import torch,tiktoken
# Record the installed torch / tiktoken versions (looked up through importlib.metadata)
# so the environment this notebook was run in is visible in the output.
print(f"Torch version: {version('torch')}\nTiktoken version: {version('tiktoken')}")

Torch version: 2.9.1
Tiktoken version: 0.12.0


In [2]:
class GPTDatasetV1(Dataset):
    """Fixed-length token windows paired with their one-step-shifted targets.

    The text is tokenized once. A window of ``max_length`` token ids is then
    taken every ``stride`` tokens. Each sample is an ``(input, target)`` pair
    of tensors where ``target`` is the same window moved one token to the
    right, i.e. the next-token labels for ``input``.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of token ids.
        max_length: Number of tokens in every input/target window.
        stride: Distance, in tokens, between the starts of consecutive windows.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        ids=tokenizer.encode(txt,allowed_special={'<|endoftext|>'})
        # The last start is chosen so the target window (shifted by one) is
        # still ``max_length`` tokens long; shorter texts yield no windows.
        starts=range(0,len(ids)-max_length,stride)
        self.input_ids=[torch.tensor(ids[s:s+max_length]) for s in starts]
        self.target_ids=[torch.tensor(ids[s+1:s+1+max_length]) for s in starts]

    def __len__(self):
        """Return the number of stored (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        inputs,targets=self.input_ids,self.target_ids
        return inputs[idx],targets[idx]
def create_dataloader_v1(txt,batch_size=4,max_length=256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    """Build a DataLoader over sliding token windows of ``txt``.

    The text is tokenized with tiktoken's 'gpt2' encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to tokenize and window.
        max_length: Window length in tokens, passed to ``GPTDatasetV1``.
        stride: Step between window starts, passed to ``GPTDatasetV1``.
        batch_size, shuffle, drop_last, num_workers: Forwarded unchanged to
            ``DataLoader``.

    Returns:
        A ``DataLoader`` over the resulting ``GPTDatasetV1``.
    """
    loader_options=dict(batch_size=batch_size,shuffle=shuffle,drop_last=drop_last,num_workers=num_workers)
    windows=GPTDatasetV1(txt,tiktoken.get_encoding('gpt2'),max_length,stride)
    return DataLoader(windows,**loader_options)
# Load the raw text that the dataloader will tokenize.
with open('./1-adding_bells_whistles_to_training_loop/verdict.txt','r',encoding='utf-8') as f:
    raw_text=f.read()
# Embedding-table sizes.
# vocab_size: number of rows in the token-embedding table
#   (presumably the size of the 'gpt2' vocabulary used by create_dataloader_v1 -- TODO confirm).
# output_dim: width of every embedding vector.
# context_length: number of positions the positional-embedding table can index.
vocab_size=50257
output_dim=256
context_length=1024
token_embedding_layer=torch.nn.Embedding(vocab_size,output_dim)
pos_embedding_layer=torch.nn.Embedding(context_length,output_dim)
batch_size=8
max_length=4
# stride == max_length, so each window starts where the previous one ended (no overlap).
dataloader=create_dataloader_v1(raw_text,batch_size=batch_size,max_length=max_length,stride=max_length)
# Embed one batch only: the loop exits after its first iteration.
# NOTE(review): no random seed is set in the visible cells, so the embedding values
# (and the shuffled batch) are not reproducible; only the shape is displayed below.
for batch in dataloader:
    x,y=batch
    token_embeddings=token_embedding_layer(x)
    # Look up the embeddings for positions 0..max_length-1.
    pos_embeddings=pos_embedding_layer(torch.arange(max_length))
    # The same positional embeddings are added to every sequence in the batch
    # (broadcast across the batch dimension).
    input_embeddings=token_embeddings+pos_embeddings
    break
# Displays (batch_size, max_length, output_dim).
input_embeddings.shape

torch.Size([8, 4, 256])

In [3]:
# Encode an unusual string with the 'gpt2' encoding; the result shows it is
# represented by several token ids rather than one.
tokenizer=tiktoken.get_encoding('gpt2')
integers=tokenizer.encode('Akwirw ier')
integers

[33901, 86, 343, 86, 220, 959]

In [4]:
# Decode each id on its own to see which text fragment it stands for.
for i in integers:
    print(f'{i} -> {tokenizer.decode([i])}')

33901 -> Ak
86 -> w
343 -> ir
86 -> w
220 ->  
959 -> ier


In [5]:
# Check: the fragment 'Ak' encoded on its own gives the same id as in the full string.
tokenizer.encode('Ak')

[33901]

In [6]:
# Check: the fragment 'w' encoded on its own gives the same id as in the full string.
tokenizer.encode('w')

[86]

In [7]:
# Check: the fragment 'ir' encoded on its own gives the same id as in the full string.
tokenizer.encode('ir')

[343]

In [8]:
# Check: a single space encoded on its own gives the same id as in the full string.
tokenizer.encode(' ')

[220]

In [9]:
# Check: the fragment 'ier' encoded on its own gives the same id as in the full string.
tokenizer.encode('ier')

[959]

In [10]:
# Round trip: decoding the full id list reproduces the original string.
tokenizer.decode([33901,86,343,86,220,959])

'Akwirw ier'

In [11]:
# NOTE(review): encoded_text is not read by any cell visible here -- confirm it is needed later.
encoded_text=tokenizer.encode(raw_text)
# Two-token windows taken every 2 tokens (non-overlapping), four windows per batch.
dataloader=create_dataloader_v1(raw_text,batch_size=4,max_length=2,stride=2)
# Keep only the first batch. shuffle is left at its default (True) and no seed is set in the
# visible cells, so the rows shown can change from run to run; pass shuffle=False to get
# the windows in text order.
for batch in dataloader:
    x,y=batch
    break
x

tensor([[  351,   326],
        [  314,   550],
        [26394,    12],
        [    6, 14707]])

In [12]:
# Eight-token windows taken every 2 tokens: consecutive windows in the dataset share 6 tokens.
# shuffle is left at its default (True), so the four rows shown are not adjacent windows and
# the overlap is not visible in this batch; pass shuffle=False to see it.
dataloader=create_dataloader_v1(raw_text,batch_size=4,max_length=8,stride=2)
# Keep only the first batch.
for batch in dataloader:
    x,y=batch
    break
x

tensor([[  531,   314,  3521,   470,  7521,   683,    11,   326],
        [   12, 12239,   438,  1169,   691,  2134,  7163,   262],
        [12036,    13,   843,   523,   438, 14363, 10568,   852],
        [ 1865,    11,   355,   616,  2951,  6348, 23840,   284]])