In [1]:
import importlib
import tiktoken

In [2]:
# Load the tiktoken encoding registered under the name "gpt2"; the resulting
# object is what the following cells call .encode() / .decode() on.
encoding_name = "gpt2"
tokenizer = tiktoken.get_encoding(encoding_name)

In [3]:
# Read the entire text file into a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as text_file:
  raw_text = text_file.read()

# Turn the raw string into a list of token ids using `tokenizer`.
enc_text = tokenizer.encode(raw_text)

# Show how many token ids the full text produced.
print(len(enc_text))

5145


In [4]:
# Drop the first 50 token ids and keep the remainder as the working sample
# used by the context/target demonstrations below.
sample_start = 50
enc_sample = enc_text[sample_start:]

In [7]:
# Number of token ids in one input window.
context_size = 4

# x is the first `context_size` token ids of the sample; y is the same-sized
# window shifted one position to the right (the next-token targets).
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

# One print call, newline-separated, so both lists line up under each other.
print(f"x: {x}", f"y:      {y}", sep="\n")

x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [8]:
# For every prefix length 1..context_size, show the prefix of token ids and
# the single token id that immediately follows it.
for end in range(1, context_size + 1):
  context, desired = enc_sample[:end], enc_sample[end]
  print(context, "------->", desired)

[290] -------> 4920
[290, 4920] -------> 2241
[290, 4920, 2241] -------> 287
[290, 4920, 2241, 287] -------> 257


In [12]:
# Same prefix -> next-token walk as the previous cell, but decoded back to
# text so the pairs are human-readable.
for end in range(1, context_size + 1):
  context, desired = enc_sample[:end], enc_sample[end]

  # decode() takes a list of ids, so the single target id is wrapped in one.
  context_text = tokenizer.decode(context)
  desired_text = tokenizer.decode([desired])

  print(context_text, "------->", desired_text)

 and ------->  established
 and established ------->  himself
 and established himself ------->  in
 and established himself in ------->  a


In [13]:
# `torch` itself must be bound in this cell: the class calls torch.tensor(),
# and `from torch.utils.data import ...` alone does not define the name `torch`.
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is tokenized once; a window of `max_length` token ids is then moved
  over the ids in steps of `stride`. For each window the target is the same
  window shifted one position to the right.

  Args:
    txt: Text to tokenize.
    tokenizer: Object exposing `encode(txt, allowed_special=...)` that returns
      a sequence of integer token ids.
    max_length: Number of token ids per input/target window (must be >= 1).
    stride: Step between the starts of consecutive windows (must be >= 1).

  Raises:
    ValueError: If `max_length` or `stride` is smaller than 1.

  Note:
    If the text has `max_length` or fewer token ids, the dataset is empty,
    because no window has a following token to serve as its last target.
  """

  def __init__(self, txt, tokenizer, max_length, stride):
    # Fail loudly on bad window parameters instead of silently producing an
    # empty dataset (negative stride) or zero-length samples (max_length 0).
    if max_length < 1:
      raise ValueError(f"max_length must be >= 1, got {max_length}")
    if stride < 1:
      raise ValueError(f"stride must be >= 1, got {stride}")

    self.input_ids = []
    self.target_ids = []

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    # Stop at len - max_length so every target window (shifted by one) still
    # lies fully inside token_ids.
    for i in range(0, len(token_ids) - max_length, stride):
      input_chunk = token_ids[i:i + max_length]
      target_chunk = token_ids[i + 1:i + max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the `(input_ids, target_ids)` tensor pair stored at `idx`."""
    return self.input_ids[idx], self.target_ids[idx]

In [15]:
def create_dataloaderV1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
  """Build a DataLoader over sliding-window token-id pairs taken from `txt`.

  Args:
    txt: Text handed to GPTDatasetV1 for tokenization and windowing.
    batch_size: Forwarded to DataLoader.
    max_length: Window length, forwarded to GPTDatasetV1.
    stride: Step between windows, forwarded to GPTDatasetV1.
    shuffle: Forwarded to DataLoader.
    drop_last: Forwarded to DataLoader.
    num_workers: Forwarded to DataLoader.

  Returns:
    A DataLoader wrapping a GPTDatasetV1 built with tiktoken's "gpt2" encoding.
  """
  # Collect the loader settings up front so the final call stays short.
  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )

  gpt2_tokenizer = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

  return DataLoader(windows, **loader_options)

In [16]:
# Read the full text file again so `raw_text` is defined for the data-loader
# call that follows.
with open("the-verdict.txt", mode="r", encoding="utf-8") as text_file:
  raw_text = text_file.read()

In [17]:
import torch

# One sample per batch, windows of 4 token ids advancing by 1, in file order
# (shuffle disabled).
dataloader = create_dataloaderV1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Keep the iterator in its own name so further batches can be pulled from it.
data_iter = iter(dataloader)

first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
