# Installations

In [52]:
#Check torch and tiktoken version
from importlib.metadata import version

print("torch version:", version("torch"))
print("tiktoken version:", version("tiktoken"))


torch version: 2.8.0
tiktoken version: 0.11.0


# Converting pdf into txt file

In [53]:
#Convert pdf into txt file and save as book.txt
import os
import PyPDF2

# Set working directory to project root
os.chdir("/Users/srijanashrestha/Desktop/Projects/Main projects/LLM Project/1LLM2RuleThemAll")

print("Current working directory:", os.getcwd())

# Ensure folder exists
os.makedirs("BookAndDataFiles", exist_ok=True)

#  Paths
pdf_path = "/Users/srijanashrestha/Desktop/Projects/Main projects/LLM Project/Lord Of The Rings.pdf"
txt_path = "BookAndDataFiles/book.txt"

# 4Extract text from PDF
text = ""
with open(pdf_path, "rb") as f:
    reader = PyPDF2.PdfReader(f)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # skip empty pages
            text += page_text + "\n"


# Save text to file
with open(txt_path, "w", encoding="utf-8") as f:
    f.write(text)

print(f"PDF text extracted and saved to {txt_path}")
print("Length of text:", len(text))
print("First 500 characters:\n", text[:500])



Current working directory: /Users/srijanashrestha/Desktop/Projects/Main projects/LLM Project/1LLM2RuleThemAll
PDF text extracted and saved to BookAndDataFiles/book.txt
Length of text: 3055720
First 500 characters:
 The Lord  of the Rings  
BY 
J.R.R.  Tolkien  

Three Rings for the Elven-kings under the sky, 
Seven for the Dwarf-lords in their halls of stone, 
Nine for Mortal Men doomed to die, 
One for the Dark Lord on his dark throne 
In the Land of Mordor where the Shadows lie. 
One Ring to rule them all, One Ring to ﬁnd them, 
One Ring to bring them all and in the darkness bind them 
In the Land of Mordor where the Shadows lie. 
CONTENTS  
J.R.R.  TOLKIEN i 
NOTE  ON THE  TEXT  
NOTE  ON THE  5 0TH ANN


# Tiktoken from Scratch

In [54]:
import re
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

print(f"length={len(preprocessed)}")

['The', 'Lord', 'of', 'the', 'Rings', 'BY', 'J', '.', 'R', '.', 'R', '.', 'Tolkien', 'Three', 'Rings', 'for', 'the', 'Elven-kings', 'under', 'the', 'sky', ',', 'Seven', 'for', 'the', 'Dwarf-lords', 'in', 'their', 'halls', 'of']
length=671697


In [55]:
#Converting tokens into IDs
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(f"vocab size= {vocab_size}")

vocab size= 23924


In [56]:
vocab = {token:integer for integer,token in enumerate(all_words)}
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break

('!', 0)
('&', 1)
('(', 2)
(')', 3)
('*', 4)
('+', 5)
(',', 6)
('-', 7)
('-B', 8)
('-D', 9)
('-DO', 10)
('-R', 11)
('-chebin', 12)
('.', 13)
('//www', 14)
('0TH', 15)
('1', 16)
('10', 17)
('100', 18)
('1000', 19)
('1001', 20)
('1001–11', 21)
('1001–4', 22)
('1002', 23)
('10022', 24)
('1003', 25)
('1004', 26)
('1004–18', 27)
('1005', 28)
('1006', 29)
('1006–16', 30)
('1007', 31)
('1008', 32)
('1009', 33)
('1009–17', 34)
('100–10', 35)
('100–2', 36)
('100–6', 37)
('101', 38)
('1010', 39)
('1011', 40)
('1012', 41)
('1013', 42)
('1014', 43)
('1014–16', 44)
('1015', 45)
('1015–16', 46)
('1016', 47)
('1016–17', 48)
('1016–21', 49)
('1017', 50)


In [57]:
##Adding special context tokens

all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer,token in enumerate(all_tokens)}
print(f"vocab size={len(vocab)}")

#printing last five vocabs 
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

vocab size=23926
('ﬂuttering', 23921)
('ﬂy', 23922)
('ﬂying', 23923)
('<|endoftext|>', 23924)
('<|unk|>', 23925)


In [58]:
#adjusted tokenizer
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

# Byte Pair Encoding(BPE)

In [59]:

import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.11.0


In [60]:
tokenizer = tiktoken.get_encoding("gpt2")

In [61]:
text = (
"Hello, do you like tea? <|endoftext|> In the sunlit terraces"
"of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


# Data Sampling with Sliding Window

In [62]:
with open("/Users/srijanashrestha/Desktop/Projects/Main projects/LLM Project/1LLM2RuleThemAll/BookAndDataFiles/book.txt", "r", encoding="utf-8") as f:
    text = f.read()

enc_text = tokenizer.encode(text)
print(len(enc_text))

936434


In [63]:


enc_sample = enc_text[:50] #removing firsts 50 tokens 
print(tokenizer.decode(enc_sample))

The Lord  of the Rings  
BY 
J.R.R.  Tolkien  

Three Rings for the Elven-kings under the sky, 
Seven for the Dwarf-lords in their halls of stone,


In [64]:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y: {y}")

x: [464, 4453, 220, 286]
y: [4453, 220, 286, 262]


In [65]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[464] ----> 4453
[464, 4453] ----> 220
[464, 4453, 220] ----> 286
[464, 4453, 220, 286] ----> 262


In [66]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

The ---->  Lord
The Lord ---->  
The Lord  ---->  of
The Lord  of ---->  the


## A dataset for batched inputs and targets

In [67]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [68]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [69]:
txt_path = "BookAndDataFiles/book.txt"

with open(txt_path, "r", encoding="utf-8") as f:
    raw_text = f.read()
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 464, 4453,  220,  286]]), tensor([[4453,  220,  286,  262]])]


In [70]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[4453,  220,  286,  262]]), tensor([[  220,   286,   262, 26028]])]


In [71]:
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=4, stride=4,
    shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  464,  4453,   220,   286],
        [  262, 26028,   220,   220],
        [  198, 17513,   220,   198],
        [   41,    13,    49,    13],
        [   49,    13,   220, 32447],
        [  220,   220,   198,   198],
        [12510, 26028,   329,   262],
        [40748,    12,    74,   654]])

Targets:
 tensor([[ 4453,   220,   286,   262],
        [26028,   220,   220,   198],
        [17513,   220,   198,    41],
        [   13,    49,    13,    49],
        [   13,   220, 32447,   220],
        [  220,   198,   198, 12510],
        [26028,   329,   262, 40748],
        [   12,    74,   654,   739]])


# Token Embeddings

In [72]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [73]:
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


# Encoding word positions

In [74]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [75]:
max_length = 4
dataloader = create_dataloader_v1(
raw_text, batch_size=8, max_length=max_length,
stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[  464,  4453,   220,   286],
        [  262, 26028,   220,   220],
        [  198, 17513,   220,   198],
        [   41,    13,    49,    13],
        [   49,    13,   220, 32447],
        [  220,   220,   198,   198],
        [12510, 26028,   329,   262],
        [40748,    12,    74,   654]])

Inputs shape:
 torch.Size([8, 4])


In [76]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [77]:
context_length = max_length

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

pos_embeddings = pos_embedding_layer(torch.arange(context_length))

input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


# Summary

In [78]:
import importlib
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader

with open("/Users/srijanashrestha/Desktop/Projects/Main projects/LLM Project/1LLM2RuleThemAll/BookAndDataFiles/book.txt", "r", encoding="utf-8") as f:
    text = f.read()

enc_text = tokenizer.encode(text)

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]
    
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                        stride=128, shuffle=True, drop_last=True,
                        num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader    

In [79]:
context_length = max_length

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

token_embeddings = token_embedding_layer(inputs)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)
    

torch.Size([8, 4, 256])
