### CREATING INPUT-TARGET PAIRS

We will use the same short story, "The Verdict", as the raw data. First, read the text from the PDF.

In [2]:
import fitz  # PyMuPDF

# Read the PDF and concatenate the text of all its pages, in page order
with fitz.open("The_Verdict.pdf") as doc:
    # One extracted string per page, loaded by page number
    page_texts = [doc.load_page(page_num).get_text() for page_num in range(len(doc))]
    raw_text = "".join(page_texts)

print(raw_text)


1
The Verdict
Edith Wharton
1908
Exported from Wikisource on May 20, 2024
2
I HAD always thought Jack Gisburn rather a cheap genius--
though a
good fellow enough--so it was no great surprise to
me to hear that,
in the height of his glory, he had dropped
his painting, married a
rich widow, and established himself
in a villa on the Riviera.
(Though I rather thought it would
have been Rome or Florence.)
"The height of his glory"--that was what the women called
it. I can
hear Mrs. Gideon Thwing--his last Chicago sitter--
deploring his
 unaccountable abdication. "Of course it's
going to send the value of
my picture 'way up; but I don't
think of that, Mr. Rickham--the loss
to Arrt is all I think of."
The word, on Mrs. Thwing's lips,
 multiplied its _rs_ as
though they were reflected in an endless
vista of mirrors.
And it was not only the Mrs. Thwings who mourned.
Had
not the exquisite Hermia Croft, at the last Grafton Gallery
show, stopped me before Gisburn's "Moon-dancers" to say,
with tear

Use byte-pair encoding (BPE) to tokenize the data.

In [3]:
import tiktoken
# Load the tiktoken encoding registered under the name "gpt2"
tokenizer = tiktoken.get_encoding("gpt2")
# Encode the extracted text into a list of integer token IDs
enc_text = tokenizer.encode(raw_text)

In [4]:
# Bare expression: the notebook shows the token-ID list as this cell's output.
# NOTE(review): this appears to render the entire list; consider a slice such as enc_text[:20].
enc_text

[16,
 198,
 464,
 4643,
 11600,
 198,
 7407,
 342,
 854,
 41328,
 198,
 1129,
 2919,
 198,
 3109,
 9213,
 422,
 11145,
 271,
 1668,
 319,
 1737,
 1160,
 11,
 48609,
 198,
 17,
 198,
 40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 198,
 2016,
 257,
 198,
 11274,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 198,
 1326,
 284,
 3285,
 326,
 11,
 198,
 259,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 198,
 14363,
 12036,
 11,
 6405,
 257,
 198,
 7527,
 27075,
 11,
 290,
 4920,
 2241,
 198,
 259,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 198,
 7,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 198,
 14150,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 198,
 270,
 13,
 314,
 460,
 198,
 258,
 283,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 198,
 2934,
 489,
 3255,
 465,
 198,
 48422,
 540,
 450

In [5]:
# Total number of token IDs produced for the whole text
print(len(enc_text))

6170


In [6]:
context_size = 4  # length of the input: how many tokens (not words) the model takes in at once


# x is the first `context_size` token IDs; y is the same window shifted right by one position
x = enc_text[:context_size]
y = enc_text[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [16, 198, 464, 4643]
y:      [198, 464, 4643, 11600]


In [7]:
# For every prefix length 1..context_size, the prefix is the model input and
# the token ID right after it is the target to predict.
# The prefix is named `context` (not `input`) so the builtin input() is not shadowed.
for i in range(1, context_size+1):
    context = enc_text[:i]
    target = enc_text[i]

    print(context, "---->", target)

[16] ----> 198
[16, 198] ----> 464
[16, 198, 464] ----> 4643
[16, 198, 464, 4643] ----> 11600


Let's decode these IDs back into text for better understanding.

In [8]:
# Same prefix -> next-token pairs as before, but decoded back to text.
# The prefix is named `context` (not `input`) so the builtin input() is not shadowed.
for i in range(1, context_size+1):
    context = enc_text[:i]
    target = enc_text[i]

    # decode() takes a list of IDs, so the single target ID is wrapped in a list
    print(tokenizer.decode(context), "---->", tokenizer.decode([target]))

1 ----> 

1
 ----> The
1
The ---->  Ver
1
The Ver ----> dict


Let's skip the first 50 tokens so the example starts inside the story text and the output is easier to read.

In [9]:
# Drop the first 50 token IDs once, instead of re-slicing the full list on every iteration
enc_sample = enc_text[50:]

# The prefix is named `context` (not `input`) so the builtin input() is not shadowed.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    target = enc_sample[i]

    # decode() takes a list of IDs, so the single target ID is wrapped in a list
    print(tokenizer.decode(context), "---->", tokenizer.decode([target]))

so ---->  it
so it ---->  was
so it was ---->  no
so it was no ---->  great


Let's implement this with a PyTorch `Dataset` class and a `DataLoader`.

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once and cut into windows of ``max_length`` token
    IDs, with window starts ``stride`` tokens apart. Each target window is
    its input window shifted right by one token.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and build all input/target windows.

        Args:
            txt: Text to tokenize.
            tokenizer: Object with an ``encode(text, allowed_special=...)``
                method that returns a list of integer token IDs.
            max_length: Number of token IDs per window; must be >= 1.
            stride: Distance between the starts of consecutive windows;
                must be >= 1.

        Raises:
            ValueError: If ``max_length`` or ``stride`` is less than 1.

        If the text has ``max_length`` tokens or fewer, no full
        input/target pair fits and the dataset is empty.
        """
        # Fail early with a clear message instead of a cryptic range() error
        # (stride == 0) or a silently empty / malformed dataset (negatives).
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop at len(token_ids) - max_length so the shifted target window
        # always has a full max_length tokens available.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [11]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window token pairs of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``; ``max_length`` and ``stride`` are passed to that
    dataset, while ``batch_size``, ``shuffle``, ``drop_last`` and
    ``num_workers`` are passed through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # Options forwarded unchanged to the DataLoader constructor
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }

    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(windows, **loader_options)

Let's test this on our data with a batch size of 1 and look at the first batch.

In [12]:
import torch
print("PyTorch version:", torch.__version__)
# Batch size 1, window length 4, stride 1, no shuffling
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

# Pull one batch from the loader; it holds an input tensor and a target tensor
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 1.12.1
[tensor([[  16,  198,  464, 4643]]), tensor([[  198,   464,  4643, 11600]])]


Now with a batch size of 8 and a stride of 4:

In [13]:
# stride equals max_length here, so consecutive input windows do not overlap
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
# The first batch unpacks into an input tensor and a target tensor
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   16,   198,   464,  4643],
        [11600,   198,  7407,   342],
        [  854, 41328,   198,  1129],
        [ 2919,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  1737,  1160,    11],
        [48609,   198,    17,   198],
        [   40,   367,  2885,  1464]])

Targets:
 tensor([[  198,   464,  4643, 11600],
        [  198,  7407,   342,   854],
        [41328,   198,  1129,  2919],
        [  198,  3109,  9213,   422],
        [11145,   271,  1668,   319],
        [ 1737,  1160,    11, 48609],
        [  198,    17,   198,    40],
        [  367,  2885,  1464,  1807]])
