<a href="https://colab.research.google.com/github/OussamaHaff/machine-learning-upskilling/blob/main/02-llms-from-scratch/03-data-sampling-with-sliding-window/data_sampling_with_sliding__window.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Table of Contents
- [Basic input-target pairs generation](#basic-input-target-pairs-generation)
- [Create the input-target pairs](#create-the-input-target-pairs)
  - [Using Vanilla Python](#using-vanilla-python)
  - [Using PyTorch's Tensors & DataLoader](#using-pytorchs-tensors--dataloader)


# Basic input-target pairs generation

In [6]:
!pip install tiktoken
import tiktoken

tokeniser = tiktoken.get_encoding("gpt2")

In [4]:
# Read the whole text file into memory as a single UTF-8 decoded string.
with open("data/the-verdict.txt", "r", encoding="utf-8") as data_file:
  raw_text = data_file.read()


In [8]:
# Tokenise the full text, then report how many token ids were produced and
# show the first 52 of them (one value per output line).
encoded_text = tokeniser.encode(raw_text)
print(len(encoded_text), encoded_text[:52], sep="\n")

5145
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920]


In [9]:
# Drop the first 50 token ids and keep the rest as the working sample
# used by the examples below.

encoded_sample = encoded_text[50:]
# Show the first 10 token ids of the sample.
print(encoded_sample[:10])

[290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686]


# Create the input-target pairs
## Using Vanilla Python

In [15]:
# Number of token ids in one input window.
context_size = 4

# The target window is the input window shifted one token to the right.
input_x, target_y = (
    encoded_sample[:context_size],
    encoded_sample[1:context_size + 1],
)

# The five-space indent visually lines the targets up under the inputs.
print(input_x, f"     {target_y}", sep="\n")

[290, 4920, 2241, 287]
     [4920, 2241, 287, 257]


In [18]:
# For every prefix length 1..context_size, show the prefix (the context) and
# the token id that follows it (the value to predict).
for i in range(1, context_size + 1):
  context, desired = encoded_sample[:i], encoded_sample[i]
  print(context, "----->", desired)

[290] -----> 4920
[290, 4920] -----> 2241
[290, 4920, 2241] -----> 287
[290, 4920, 2241, 287] -----> 257


In [20]:
# Same prefix -> next-token pairs as above, decoded back to text.
for i in range(1, context_size + 1):
  context, desired = encoded_sample[:i], encoded_sample[i]
  # decode() takes a list of ids, hence the single-element list for the target.
  print(tokeniser.decode(context), "----->", tokeniser.decode([desired]))

 and ----->  established
 and established ----->  himself
 and established himself ----->  in
 and established himself in ----->  a


## Using PyTorch's Tensors & DataLoader

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
  """Dataset of (input, target) token-id windows cut from a text.

  The text is tokenised once with ``tokeniser.encode``. A window of
  ``sliding_window_max_length`` token ids is then slid over the result in
  steps of ``stride``. Every item pairs a window with the same window shifted
  one token to the right, so ``target[k]`` is the token that follows
  ``input[k]`` in the text.

  Only full-length windows are kept: a text with no more than
  ``sliding_window_max_length`` tokens yields an empty dataset.

  Args:
    text: Raw text to tokenise.
    tokeniser: Object exposing ``encode(text)`` that returns a sequence of
      integer token ids.
    sliding_window_max_length: Number of token ids in every input and
      target window.
    stride: Number of token ids the window advances between two
      consecutive items.

  Raises:
    ValueError: If ``sliding_window_max_length`` or ``stride`` is not
      positive.
  """
  def __init__(self, text, tokeniser, sliding_window_max_length, stride):
    if sliding_window_max_length <= 0:
      raise ValueError("sliding_window_max_length must be a positive integer")
    if stride <= 0:
      raise ValueError("stride must be a positive integer")

    self.input_ids = []
    self.target_ids = []

    tokenised_text = tokeniser.encode(text)

    # Stop one window short of the end so that the shifted target window
    # always has the same length as the input window.
    for i in range(0, len(tokenised_text) - sliding_window_max_length, stride):
      input_chunk = tokenised_text[i: i + sliding_window_max_length]
      target_chunk = tokenised_text[i + 1 : i + sliding_window_max_length + 1]
      self.input_ids.append(torch.tensor(input_chunk))
      self.target_ids.append(torch.tensor(target_chunk))

  def __len__(self):
    """Return the number of (input, target) pairs."""
    return len(self.input_ids)

  def __getitem__(self, id):
    """Return the ``(input_ids, target_ids)`` tensor pair at position ``id``."""
    return self.input_ids[id], self.target_ids[id]


In [4]:
def create_dataloader_v1(text, batch_size=4, sliding_window_max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader over sliding-window (input, target) pairs of ``text``.

  The text is tokenised with the "gpt2" tiktoken encoding and wrapped in a
  ``GPTDatasetV1``; the remaining arguments are forwarded unchanged.

  Args:
    text: Raw text to turn into training windows.
    batch_size: Passed to ``DataLoader``.
    sliding_window_max_length: Window length passed to ``GPTDatasetV1``.
    stride: Window step passed to ``GPTDatasetV1``.
    shuffle: Passed to ``DataLoader``.
    drop_last: Passed to ``DataLoader``.
    num_workers: Passed to ``DataLoader``.

  Returns:
    The configured ``DataLoader``.
  """
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(text, gpt2_encoding, sliding_window_max_length, stride)
  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)

In [7]:
# Re-read the text so this cell does not depend on earlier cells for it.
with open("data/the-verdict.txt", "r", encoding="utf-8") as dataset_file:
  raw_text = dataset_file.read()

# One window per batch, no shuffling, so the batches come out in text order.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    sliding_window_max_length=4,
    stride=2,
    shuffle=False,
)
data_iter = iter(dataloader)

# With stride=2 the second window starts two tokens after the first.
first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]
[tensor([[2885, 1464, 1807, 3619]]), tensor([[1464, 1807, 3619,  402]])]
