In [15]:
from importlib.metadata import version


# Report the installed versions of the packages this notebook relies on.
for label, package in (("torch version:", "torch"), ("tiktoken version:", "tiktoken")):
    print(label, version(package))


torch version: 2.8.0
tiktoken version: 0.11.0


In [16]:
import os
import PyPDF2

# Work from the project root so the relative paths below resolve.
# NOTE(review): this directory and pdf_path are machine-specific absolute
# paths; adjust them before running the notebook anywhere else.
os.chdir("/Users/srijanashrestha/Desktop/CS_Projects/Projects/1LLM2RuleThemAll")
print("Current working directory:", os.getcwd())

# Input PDF and output text-file locations.
pdf_path = "/Users/srijanashrestha/Downloads/Tolkien-J.-The-lord-of-the-rings-HarperCollins-ebooks-2010.pdf"
txt_path = "BookAndDataFiles/book.txt"

# Create the output folder if it does not exist yet.
os.makedirs("BookAndDataFiles", exist_ok=True)

# Extract text from the PDF page by page. Pages that yield no text are
# skipped; every kept page is followed by a newline.
with open(pdf_path, "rb") as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    extracted_pages = (page.extract_text() for page in reader.pages)
    # The generator must be consumed here, while the PDF file is still open.
    text = "".join(page_text + "\n" for page_text in extracted_pages if page_text)

# Save the extracted text as UTF-8.
with open(txt_path, "w", encoding="utf-8") as out_file:
    out_file.write(text)

print(f"PDF text extracted and saved to {txt_path}")
print("Length of text:", len(text))
print("First 500 characters:\n", text[:500])


Current working directory: /Users/srijanashrestha/Desktop/CS_Projects/Projects/1LLM2RuleThemAll
PDF text extracted and saved to BookAndDataFiles/book.txt
Length of text: 3055720
First 500 characters:
 The Lord  of the Rings  
BY 
J.R.R.  Tolkien  

Three Rings for the Elven-kings under the sky, 
Seven for the Dwarf-lords in their halls of stone, 
Nine for Mortal Men doomed to die, 
One for the Dark Lord on his dark throne 
In the Land of Mordor where the Shadows lie. 
One Ring to rule them all, One Ring to ﬁnd them, 
One Ring to bring them all and in the darkness bind them 
In the Land of Mordor where the Shadows lie. 
CONTENTS  
J.R.R.  TOLKIEN i 
NOTE  ON THE  TEXT  
NOTE  ON THE  5 0TH ANN


In [17]:
# `re` is needed by re.split below; no other visible cell imports it, so
# without this line a fresh "Restart & Run All" fails with NameError.
import re

# Split the book text on punctuation, double dashes and whitespace. The
# capturing group makes re.split return the delimiters as pieces as well.
raw_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)

# Keep only pieces that still contain something after stripping whitespace.
preprocessed = [piece.strip() for piece in raw_pieces if piece.strip()]
print(preprocessed[:30])

['The', 'Lord', 'of', 'the', 'Rings', 'BY', 'J', '.', 'R', '.', 'R', '.', 'Tolkien', 'Three', 'Rings', 'for', 'the', 'Elven-kings', 'under', 'the', 'sky', ',', 'Seven', 'for', 'the', 'Dwarf-lords', 'in', 'their', 'halls', 'of']


In [14]:
# Total number of tokens in the preprocessed token list.
token_count = len(preprocessed)
print(token_count)

671697


In [18]:
# Converting tokens into IDs: collect the distinct tokens in sorted order
# and report how many there are.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

23924


In [19]:
# Map every token to its position in the sorted token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 51 (token, id) pairs.
for entry in list(vocab.items())[:51]:
    print(entry)

('!', 0)
('&', 1)
('(', 2)
(')', 3)
('*', 4)
('+', 5)
(',', 6)
('-', 7)
('-B', 8)
('-D', 9)
('-DO', 10)
('-R', 11)
('-chebin', 12)
('.', 13)
('//www', 14)
('0TH', 15)
('1', 16)
('10', 17)
('100', 18)
('1000', 19)
('1001', 20)
('1001–11', 21)
('1001–4', 22)
('1002', 23)
('10022', 24)
('1003', 25)
('1004', 26)
('1004–18', 27)
('1005', 28)
('1006', 29)
('1006–16', 30)
('1007', 31)
('1008', 32)
('1009', 33)
('1009–17', 34)
('100–10', 35)
('100–2', 36)
('100–6', 37)
('101', 38)
('1010', 39)
('1011', 40)
('1012', 41)
('1013', 42)
('1014', 43)
('1014–16', 44)
('1015', 45)
('1015–16', 46)
('1016', 47)
('1016–17', 48)
('1016–21', 49)
('1017', 50)


In [34]:
# Sorted distinct tokens, followed by the two special tokens so that they
# receive the last two IDs.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

# Token -> integer ID, where the ID is the token's position in all_tokens.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [35]:
# Number of entries in the vocabulary.
len(vocab)

23926

In [36]:
# Show the last five (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[-5:]:
    print(entry)

('ﬂuttering', 23921)
('ﬂy', 23922)
('ﬂying', 23923)
('<|endoftext|>', 23924)
('<|unk|>', 23925)


In [37]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Args:
        vocab: Mapping from token string to integer ID. It must contain the
            key ``"<|unk|>"`` whenever ``encode`` meets a token that is not
            in the mapping; otherwise the ID lookup raises ``KeyError``.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse lookup (ID -> token) used by decode().
        self.int_to_str = {idx: token for token, idx in vocab.items()}

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs as a list."""
        # The capturing group keeps punctuation and "--" as separate pieces.
        pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        ids = []
        for piece in pieces:
            token = piece.strip()
            if not token:
                continue  # empty or whitespace-only piece
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then reattach punctuation.

        Raises ``KeyError`` for an ID that is not in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        # Remove the whitespace that the join placed before punctuation marks.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)
    
    

In [38]:
# Tokenizer backed by the vocabulary that includes the special tokens.
tokenizer = SimpleTokenizerV2(vocab)

# Two unrelated sample sentences.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the samples with the end-of-text marker between them.
# NOTE: this rebinds `text`, which earlier held the full book text, so
# re-running the book tokenization cell after this one would split this
# short sample instead of the book.
separator = " <|endoftext|> "
text = separator.join([text1, text2])

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [39]:
# Encode the joined sample text into token IDs and display them.
sample_ids = tokenizer.encode(text)
sample_ids

[23925,
 6,
 10585,
 22212,
 14374,
 19982,
 2393,
 23924,
 4605,
 20084,
 19700,
 20047,
 15777,
 20084,
 23925,
 13]