In [1]:
import tiktoken

In [3]:
# Read the whole text file into memory as one string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as verdict_file:
    raw_text = verdict_file.read()

# Encode the text with tiktoken's "gpt2" encoding and report how many token ids it produced.
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)
token_count = len(enc_text)
print(token_count)

5145


In [15]:
context_size = 10

# x is the first `context_size` token ids; y is the same window shifted one position to the right.
x, y = enc_text[:context_size], enc_text[1:context_size + 1]

print(f"x:{x}")
print(f"y:{y}")

x:[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]
y:[367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257]


In [12]:
# For every prefix length up to `context_size`, show the prefix of token ids
# and the single token id that immediately follows it.
for prefix_len in range(1, context_size + 1):
    context, desired = enc_text[:prefix_len], enc_text[prefix_len]
    print(context, "->", desired)

[40] -> 367
[40, 367] -> 2885
[40, 367, 2885] -> 1464
[40, 367, 2885, 1464] -> 1807
[40, 367, 2885, 1464, 1807] -> 3619
[40, 367, 2885, 1464, 1807, 3619] -> 402
[40, 367, 2885, 1464, 1807, 3619, 402] -> 271
[40, 367, 2885, 1464, 1807, 3619, 402, 271] -> 10899
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899] -> 2138
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138] -> 257


In [13]:
# Same prefix -> next-token walk as over the raw ids, but decoded back to text.
for prefix_len in range(1, context_size + 1):
    context, desired = enc_text[:prefix_len], enc_text[prefix_len]
    context_text = tokenizer.decode(context)
    # decode() is given a list, so the single target id is wrapped in one.
    desired_text = tokenizer.decode([desired])
    print(context_text, "->", desired_text)

I ->  H
I H -> AD
I HAD ->  always
I HAD always ->  thought
I HAD always thought ->  Jack
I HAD always thought Jack ->  G
I HAD always thought Jack G -> is
I HAD always thought Jack Gis -> burn
I HAD always thought Jack Gisburn ->  rather
I HAD always thought Jack Gisburn rather ->  a


In [29]:
import torch  # needed by GPTDataset1 (torch.tensor); imported here so the class does not rely on a later cell
from torch.utils.data import Dataset, DataLoader


class GPTDataset1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once and then cut into windows of ``max_length``
    token ids whose start positions are ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right, so position
    ``k`` of the target holds the token that follows position ``k`` of the
    input.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sliceable sequence of integer token ids (for example a
            tiktoken encoding).
        max_length: Number of token ids per window; must be positive.
        stride: Distance between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.output_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window over the token ids. The upper bound leaves room for
        # the target window, which reaches one token past the input window.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.output_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.output_ids[idx]

In [30]:
def create_dataloader1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDataset1``. ``max_length`` and ``stride`` are forwarded to the
    dataset; ``batch_size``, ``shuffle``, ``drop_last`` and ``num_workers``
    are forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    # Dataset of input/target windows cut from the tokenized text.
    windows = GPTDataset1(txt, gpt2_encoding, max_length, stride)

    # Batching options, passed through unchanged to the DataLoader.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)




In [31]:
# Load the text again so this part of the notebook has its own `raw_text`.
with open("the-verdict.txt", mode="r", encoding="utf-8") as verdict_file:
    raw_text = verdict_file.read()

In [32]:
import torch

# shuffle=True draws its ordering from torch's global RNG; seed it so the
# batch printed below is the same on every Restart & Run All.
torch.manual_seed(123)

# Small settings (one window of 4 tokens per batch, windows 1 token apart)
# keep the printed batch easy to inspect.
dataloader = create_dataloader1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=True
)

# Pull a single batch: an [inputs, targets] pair of token-id tensors.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[1464, 9885,  345,  286]]), tensor([[9885,  345,  286, 2376]])]
