# Positional Embedding

In [10]:
import torch

In [17]:
import fitz  # PyMuPDF

# Read the whole PDF into one string, page after page.
# The `with` block closes the document once extraction is finished.
with fitz.open("The_Verdict.pdf") as doc:
    # Iterating the document yields its pages in order; joining the per-page
    # strings once avoids rebuilding `raw_text` with `+=` for every page.
    raw_text = "".join(page.get_text() for page in doc)

print(raw_text)

1
The Verdict
Edith Wharton
1908
Exported from Wikisource on May 20, 2024
2
I HAD always thought Jack Gisburn rather a cheap genius--
though a
good fellow enough--so it was no great surprise to
me to hear that,
in the height of his glory, he had dropped
his painting, married a
rich widow, and established himself
in a villa on the Riviera.
(Though I rather thought it would
have been Rome or Florence.)
"The height of his glory"--that was what the women called
it. I can
hear Mrs. Gideon Thwing--his last Chicago sitter--
deploring his
 unaccountable abdication. "Of course it's
going to send the value of
my picture 'way up; but I don't
think of that, Mr. Rickham--the loss
to Arrt is all I think of."
The word, on Mrs. Thwing's lips,
 multiplied its _rs_ as
though they were reflected in an endless
vista of mirrors.
And it was not only the Mrs. Thwings who mourned.
Had
not the exquisite Hermia Croft, at the last Grafton Gallery
show, stopped me before Gisburn's "Moon-dancers" to say,
with tear

In [12]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once and then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. Each target
    window is its input window shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every window; must be positive.
        stride: Distance in tokens between consecutive window starts; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        A text that tokenizes to ``max_length`` tokens or fewer produces an
        empty dataset, because every window needs one extra token for its
        shifted target.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Reject values that would otherwise silently produce empty windows,
        # an empty dataset, or an opaque error from range().
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stopping at len(token_ids) - max_length guarantees that the shifted
        # target window never runs past the last token.
        for start in range(0, len(token_ids) - max_length, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [14]:
import tiktoken
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window token batches from raw text.

    Args:
        txt: Text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Number of windows per batch.
        max_length: Number of tokens per window.
        stride: Distance in tokens between consecutive window starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    # Collect the loader settings in one place, then hand them over together.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [15]:
# Number of distinct token IDs the layer can look up, and the width of each
# embedding vector.
vocab_size = 50257
output_dim = 256

# Embedding weights are initialised randomly; seeding first makes the values
# identical on every Restart & Run All.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [16]:
# Bare last expression: displays the layer's repr, which lists
# (vocab_size, output_dim).
token_embedding_layer

Embedding(50257, 256)

In [18]:
# Number of token IDs per sample (reused below as the context length).
max_length = 4
# stride == max_length: each window starts where the previous one ended, so
# the input windows do not overlap. shuffle=False keeps them in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
# Take only the first (inputs, targets) batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [19]:
# Show the raw token IDs of the first batch and confirm its shape:
# batch_size rows, max_length token IDs per row.
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   16,   198,   464,  4643],
        [11600,   198,  7407,   342],
        [  854, 41328,   198,  1129],
        [ 2919,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  1737,  1160,    11],
        [48609,   198,    17,   198],
        [   40,   367,  2885,  1464]])

Inputs shape:
 torch.Size([8, 4])


In [20]:
# Look up one output_dim-wide vector for every token ID in the batch, so the
# (8, 4) ID tensor becomes an (8, 4, 256) tensor.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [21]:
# One embedding vector per position in a sample; the layer has
# context_length (== max_length) rows, each output_dim wide, matching the
# token embedding width.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

In [25]:
# torch.arange(max_length) gives the position indices 0 .. max_length - 1;
# looking them up yields one vector per position, i.e. shape (4, 256).
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [26]:
# Broadcasting adds the same (4, 256) positional embeddings to each of the
# 8 samples in the (8, 4, 256) token embeddings; the shape is unchanged.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
