In [1]:
# Read the whole short story into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

In [2]:
import tiktoken

# Load tiktoken's "gpt2" encoding; it is used below to turn text into
# integer token ids (encode) and token ids back into text (decode).
tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Encode the entire text into one flat list of integer token ids.
enc_text = tokenizer.encode(raw_text)

In [4]:
# Keep the first 50 token ids as a small sample to inspect.
enc_sample = enc_text[:50]

print(len(enc_text))  # total number of tokens in the text
print(enc_sample)

5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11]


In [5]:
# Number of tokens the model sees at once before predicting the next one.
context_size = 4

# Inputs are the first `context_size` tokens; targets are the same window
# shifted one position to the right.
x = enc_text[0:context_size]
y = enc_text[1:1 + context_size]

# Illustration with words instead of token ids:
#   x = [model,    predicts, next,  token]
#   y = [predicts, next,     token, end_of_text]
# Each position in y is the token that follows the prefix of x ending there:
#   "model"          -> "predicts"
#   "model predicts" -> "next"

print(x, y, sep="\n")

[40, 367, 2885, 1464]
[367, 2885, 1464, 1807]


In [6]:
# Prepare the input-target pairs for the model: every prefix of the context
# window is an input, and the token immediately after it is the target.
for prefix_len in range(1, context_size + 1):
    context, target = enc_text[:prefix_len], enc_text[prefix_len]
    print(f"context: {context} -> target: {target}")

# In words: input "model" -> output "predicts",
#           input "model predicts" -> output "next", and so on.

context: [40] -> target: 367
context: [40, 367] -> target: 2885
context: [40, 367, 2885] -> target: 1464
context: [40, 367, 2885, 1464] -> target: 1807


In [7]:

# Same input-target pairs, decoded back to text so they are human-readable.
for prefix_len in range(1, context_size + 1):
    context, target = enc_text[:prefix_len], enc_text[prefix_len]
    print(f"context: {tokenizer.decode(context)} -> target: {tokenizer.decode([target])}")


context: I -> target:  H
context: I H -> target: AD
context: I HAD -> target:  always
context: I HAD always -> target:  thought


### Implementing a data loader

In [8]:
from torch.utils.data import DataLoader, Dataset
import torch

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once and cut into windows of ``max_length`` tokens.
    Each target window is its input window shifted one token to the right, so
    one sample carries ``max_length`` next-token prediction tasks. For
    ``max_length = 4``:

        input  = [model,    predicts, next,  token]
        target = [predicts, next,     token, end_of_text]

        model                     -> predicts
        model predicts            -> next
        model predicts next       -> token
        model predicts next token -> end_of_text

    A text with ``max_length`` tokens or fewer yields an empty dataset,
    because no window has a complete shifted target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a list of integer token ids.
        max_length: Number of tokens in every input/target window.
        stride: Number of tokens the window start advances between samples.

    Attributes:
        input_ids: List of 1-D tensors, each holding ``max_length`` token ids.
        target_ids: List of 1-D tensors aligned index-by-index with ``input_ids``.
        enc_text: The full list of token ids produced for ``txt``.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride

        # Tokenize the text exactly once. "<|endoftext|>" is explicitly allowed
        # so documents joined with that marker can be encoded.
        tokens_ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})

        # Reuse the ids computed above instead of encoding a second time: a
        # second call without `allowed_special` would reject text containing
        # "<|endoftext|>" and would duplicate the tokenization work.
        self.enc_text = tokens_ids

        self.input_ids = []
        self.target_ids = []

        # Stop at len - max_length so the shifted target window never runs
        # past the end of the token list.
        for start in range(0,len(tokens_ids) - max_length,stride):
            input_chunk = tokens_ids[start:start+max_length]
            target_chunk = tokens_ids[start+1:start+max_length+1]

            # Each chunk becomes a 1-D tensor of shape (max_length,).
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self,idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx],self.target_ids[idx]

In [9]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding-window token chunks of ``txt``.

    Args:
        txt: Raw text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: Number of samples per batch.
        max_length: Number of tokens in each input/target window.
        stride: Number of tokens the window start advances between samples.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Gather the loader settings first so the construction below reads linearly.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(windowed_dataset, **loader_options)

In [11]:
import torch

# Report the installed PyTorch version.
print(torch.__version__)

# batch_size=1 and stride=1 with shuffling off: consecutive batches are
# 4-token windows, each starting one token after the previous one.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Pull the first (inputs, targets) batch off the loader.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)





2.0.0
[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [12]:
# Advance the same iterator to the next batch. The loader was created with
# stride=1 and shuffle=False, so this window starts one token after the first.
second_batch = next(data_iter)

print(second_batch)





[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]
