In [1]:
!pip install --quiet --upgrade pypdf tiktoken==0.6.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.8 MB[0m [31m11.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m27.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/329.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.0/329.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# install all the required libraries, modules and packages for this project.
import os
from pypdf import PdfReader
import re
import tiktoken

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
pdf_path = "/content/drive/MyDrive/Datasets/Book/Outliers_Malcolm_Gladwell.pdf"

In [8]:
pdf_path = "/content/drive/MyDrive/Datasets/Book/"

In [13]:
def pdf_to_text(pdf_path, output_txt_path=None, save_to_file=True):
    """reads pdf and converts to text, optionally saving to .txt file."""

    # check if pdf exists.
    if not os.path.exists(pdf_path):
        print(f"error: '{pdf_path}' not found.")
        return None

    try:
        # open pdf in binary mode.
        with open(pdf_path, 'rb') as file:

            # create pdf reader.
            reader = PdfReader(file)
            num_pages = len(reader.pages)
            print(f"processing {num_pages} pages...")

            # extract all text.
            full_text = []
            for page_num in range(num_pages):
                page = reader.pages[page_num]
                text = page.extract_text()

                if text:
                    full_text.append(text)
                else:
                    print(f"warning: page {page_num + 1} had no extractable text.")

            # combine all pages.
            combined_text = "\n\n".join(full_text)

            # save to file if requested.
            if save_to_file:
                if output_txt_path is None:
                    output_txt_path = pdf_path.replace('.pdf', '.txt')

                with open(output_txt_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(combined_text)

                print(f"saved to: {output_txt_path}")

            return combined_text

    except Exception as e:
        print(f"error reading pdf: {e}")
        return None

In [14]:
def batch_pdf_to_text(folder_path, output_folder=None):
    """converts all pdfs in a folder to text files."""

    # check if folder exists.
    if not os.path.exists(folder_path):
        print(f"error: folder '{folder_path}' not found.")
        return

    # create output folder if needed.
    if output_folder and not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # get all pdf files.
    pdf_files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]

    if not pdf_files:
        print("no pdf files found in folder.")
        return

    print(f"found {len(pdf_files)} pdf files. converting...")

    combined_texts = []

    # process each pdf.
    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)

        if output_folder:
            output_path = os.path.join(output_folder, pdf_file.replace('.pdf', '.txt'))
        else:
            output_path = None

        print(f"\nprocessing: {pdf_file}")

        combined_texts.append(pdf_to_text(pdf_path, output_path))

    print("\nbatch conversion complete!")

    return combined_texts

In [15]:
raw_text_contents = batch_pdf_to_text(pdf_path)

found 1 pdf files. converting...

processing: Outliers_Malcolm_Gladwell.pdf
processing 249 pages...
saved to: /content/drive/MyDrive/Datasets/Book/Outliers_Malcolm_Gladwell.txt

batch conversion complete!


In [17]:
raw_text_content = raw_text_contents[0]

print("Total number of character:", len(raw_text_content), "\n")
print(raw_text_content[:99])

Total number of character: 460987 

OUTLIERS
The Story of Success
MALCOLM GLADWELL
BACK BAY BOOKS
LITTLE, BROWN AND COMPANY
NEW YORK   


In [18]:
def preprocess_text(raw_text, max_preview=30):
    """splits text on punctuation and whitespace, removes empty strings."""

    # split on common punctuation and whitespace.
    pattern = r'([,.:;?_!"()\'\[\]{}\/\\|—–-]+|\.\.\.|\s+)'

    tokens = re.split(pattern, raw_text)

    # remove whitespace and filter empty strings.
    preprocessed = [item.strip() for item in tokens if item.strip()]

    # preview first n tokens.
    if max_preview:
        print(f"first {max_preview} tokens:")
        print(preprocessed[:max_preview])
        print(f"\ntotal word level tokens: {len(preprocessed)}\n")

    return preprocessed

In [19]:
preprocessed = preprocess_text(raw_text_content, max_preview=30)

# get the vocabulary size of the dataset.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

first 30 tokens:
['OUTLIERS', 'The', 'Story', 'of', 'Success', 'MALCOLM', 'GLADWELL', 'BACK', 'BAY', 'BOOKS', 'LITTLE', ',', 'BROWN', 'AND', 'COMPANY', 'NEW', 'YORK', '•', 'BOSTON', '•', 'LONDON', 'Begin', 'Reading', 'Table', 'of', 'Contents', 'Reading', 'Group', 'Guide', 'Copyright']

total word level tokens: 92758

11324


In [20]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token:integer for integer, token in enumerate(all_tokens)}

In [21]:
class WordTokenizer:
    """tokenizes text into ids and decodes ids back to text. it focuses on word-level tokenization."""

    def __init__(self, vocab):
        """initializes tokenizer with vocabulary mappings."""

        self.tok_to_int = vocab
        self.int_to_tok = {integer: token for token, integer in vocab.items()}
        self.pattern = r'([,.:;?_!"()\'\[\]{}\/\\|—–-]+|\.\.\.|\s+)'

    def encode(self, text):
        """converts text to list of token ids."""

        # split on punctuation and whitespace.
        preprocessed = re.split(self.pattern, text)

        # remove empty strings and whitespace.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        # replace unknown tokens with <|unk|>.
        preprocessed = [token if token in self.tok_to_int else "<|unk|>" for token in preprocessed]

        # convert tokens to ids.
        ids = [self.tok_to_int[tok] for tok in preprocessed]

        # return ids
        return ids


    def decode(self, ids):
        """converts list of token ids back to text."""

        # map ids to tokens.
        tokens = [self.int_to_tok[id] for id in ids]

        # join tokens with spaces.
        text = " ".join(tokens)

        # remove spaces before punctuation.
        text = re.sub(self.pattern, r'\1', text)

        # remove spaces before punctuation.
        text = re.sub(r'\s+([,.:;?_!"()\'\[\]{}\/\\|—–-])', r'\1', text)

        return text

In [22]:
# initialize the wordtokenizer class.
tokenizer = WordTokenizer(vocab)

# sample text from (and outside) outliers book.
text1 = "Chris Langan's mother was from San Francisco and was estranged from her family."

# sample text with unknown words for testing.
text2 = "do you know about smartphone cryptocurrency?"


# combine texts with special separator token.
text = " <|endoftext|> ".join((text1, text2))

print(f"original text: {text}")

# convert text to token ids.
encoded_text = tokenizer.encode(text)
print(f"\nencoded text: {encoded_text}")

# convert token ids back to text.
decoded_text = tokenizer.decode(encoded_text)
print(f"\ndecoded text: {decoded_text}")

original text: Chris Langan's mother was from San Francisco and was estranged from her family. <|endoftext|> do you know about smartphone cryptocurrency?

encoded text: [915, 1905, 11325, 11325, 7500, 10581, 5993, 2676, 1342, 3561, 10581, 5524, 5993, 6370, 5700, 29, 11325, 11325, 11325, 11325, 11325, 5192, 10835, 6940, 3291, 11325, 11325, 430]

decoded text: Chris Langan <|unk|> <|unk|> mother was from San Francisco and was estranged from her family. <|unk|> <|unk|> <|unk|> <|unk|> <|unk|> do you know about <|unk|> <|unk|>?
