# Collecting all of the player rules and guides for a TTRPG

If the name is changed, this notebook can be used to scrape a directory of PDF documents and add them to a collection.  It works best with PDF files that are easily readable by PyPDF2.  Note that the cleaning steps are not ideal for arXiv papers and would need to be changed to make better use of them.  Additionally, section tags and paragraph tags should be added to the metadata for better context.

In [None]:
import os
from dotenv import load_dotenv
load_dotenv()
import time
import json
import re
from tqdm.notebook import tqdm

from PyPDF2 import PdfReader
import chromadb
from chromadb.config import Settings

# Directories holding the source PDFs.
pdf_directory = "./pdfs/Rules"
bestiary_directory = "./pdfs/Bestiary"

# Directory handed to Chroma as its persist_directory.
persistant_directory = "./chroma"
client = chromadb.Client(Settings(chroma_db_impl="duckdb+parquet", persist_directory=persistant_directory))
collection_name = "dnd_documents"
# Re-use the collection when it already exists so this cell can be re-run
# against a persisted database; creating it a second time would fail.
collection = client.get_or_create_collection(collection_name)

# Helper functions

Beware the commented-out code below: if you uncomment and run it, it will delete your collection.

In [None]:
#client.delete_collection(collection_name)

In [None]:
def get_file_list(directory_path):
    """Return the names of the PDF files directly inside *directory_path*.

    Args:
        directory_path: Directory to scan (not recursive).

    Returns:
        A sorted list of file names (not full paths) whose extension is
        ``.pdf`` in any letter case. Sub-directories are excluded even if
        their name ends in ``.pdf``.

    Raises:
        OSError: If the directory cannot be listed.
    """
    return sorted(
        name
        for name in os.listdir(directory_path)
        # os.listdir order is arbitrary; sorting keeps runs reproducible.
        if name.lower().endswith(".pdf")
        and os.path.isfile(os.path.join(directory_path, name))
    )

In [None]:
def build_key(page_index, sentence_index, prefix=None):
    """Build the id string for one sentence.

    The key has the form ``p<page>_s<sentence>``; when *prefix* is not
    ``None`` it is prepended followed by an underscore.
    """
    key = f"p{page_index!s}_s{sentence_index!s}"
    if prefix is None:
        return key
    return prefix + "_" + key

In [None]:
def tokenize_page_to_sentences(page, page_index, line_min_length=5, name_in_key=None):
    """Split the text of one PDF page into cleaned sentences.

    Args:
        page: Object exposing ``extract_text()`` that returns the page text.
        page_index: Index of the page; stored in every tuple and in the key.
        line_min_length: Lines with this many whitespace-separated words or
            fewer are dropped before the text is split into sentences.
        name_in_key: Optional prefix forwarded to ``build_key``.

    Returns:
        A list of ``(key, sentence, page_index, sentence_index)`` tuples, one
        per sentence, where ``sentence_index`` counts the kept sentences of
        this page starting at 0. Empty when the page has no usable text.
    """
    page_text = page.extract_text()
    # Only keep lines that have more than line_min_length words.
    kept_lines = [line for line in page_text.splitlines() if len(line.split()) > line_min_length]
    page_text = "\n".join(kept_lines)
    page_text = re.sub(r'E L T[\W\w]*\n', ' ', page_text)  # E L T pattern removed.  used in the pdf for tables
    page_text = re.sub(r'[\n]|[  ]|[• ]', ' ', page_text)  # Turn line breaks and bullet characters into plain spaces
    page_text = re.sub(r'(?!\.)  (?!\.)', ' ', page_text)  # Collapse double spaces into one
    # Split on "." and re-add it to each sentence. Fragments that are only
    # whitespace are skipped so that no bare "." sentence is produced.
    sentences = [
        fragment.strip() + "."
        for fragment in page_text.split('.')
        if len(fragment) > 1 and fragment.strip()
    ]
    return [
        (build_key(page_index, sentence_index, name_in_key), sentence, page_index, sentence_index)
        for sentence_index, sentence in enumerate(sentences)
    ]

In [None]:
def pdf_to_document_tuples(file_path, line_min_length=5, name_in_key=None):
    """Read a PDF and return the sentence tuples of all its pages.

    Every page is passed to ``tokenize_page_to_sentences`` together with its
    index, *line_min_length* and *name_in_key*; the per-page results are
    concatenated in page order.
    """
    reader = PdfReader(file_path)
    collected = []
    for page_number, pdf_page in tqdm(enumerate(reader.pages)):
        collected.extend(
            tokenize_page_to_sentences(pdf_page, page_number, line_min_length, name_in_key=name_in_key)
        )
    return collected

In [None]:
def get_column_from_tuples(tuples_list, column_index):
    """Return the element at *column_index* of every entry in *tuples_list*, in order."""
    column = []
    for row in tuples_list:
        column.append(row[column_index])
    return column

In [None]:
def vectorize_pdfs_in_directory_to_chroma(directory_path, collection):
    """Add the sentences of every PDF in *directory_path* to *collection*.

    Args:
        directory_path: Directory whose PDF files are processed.
        collection: Object with an ``add(documents=, metadatas=, ids=)``
            method; here, the Chroma collection.

    Each PDF is split into sentences and added with one ``collection.add``
    call. The ids are the sentence keys (prefixed with the file name) and the
    metadata records ``file_name``, ``page_index`` and ``sentence_index``.
    PDFs that yield no sentences are skipped.
    """
    for file in tqdm(get_file_list(directory_path)):
        # Tokenize the document into sentences for embedding
        document_tuples = pdf_to_document_tuples(os.path.join(directory_path, file), name_in_key=file)
        if not document_tuples:
            # No extractable text (e.g. an image-only PDF): there is nothing
            # to embed, so do not call add() with empty lists.
            continue
        document_keys = get_column_from_tuples(document_tuples, 0)
        document_sentences = get_column_from_tuples(document_tuples, 1)
        # The page and sentence indices travel as metadata so they do not
        # have to be parsed back out of the id after a query.
        metadata = [
            {"file_name": file, "page_index": page_index, "sentence_index": sentence_index}
            for _, _, page_index, sentence_index in document_tuples
        ]
        # Add to chroma collection
        collection.add(documents=document_sentences, metadatas=metadata, ids=document_keys)

# Embedding to chroma.

In [None]:
# Tokenize every PDF in pdf_directory into sentences and add them to the
# chroma collection.
vectorize_pdfs_in_directory_to_chroma(pdf_directory, collection)

In [None]:
# Presumably writes the collection to the persist_directory configured on the
# client so it survives a kernel restart - confirm against the chromadb version in use.
client.persist()

In [None]:
# Sanity check that the documents are there: request the 2 results for the
# query text "critical hit".
collection.query(query_texts=["critical hit"], n_results=2)

In [None]:
def load_page(directory_path, file_name, page_number):
    """Open a PDF and return one of its pages.

    Args:
        directory_path: Directory that contains the PDF.
        file_name: Name of the PDF file inside *directory_path*.
        page_number: Index into the reader's ``pages`` sequence.

    Returns:
        The page object at *page_number*.
    """
    # The path is built from file_name; the previous code referenced an
    # undefined name here and raised NameError on every call.
    pdf_reader = PdfReader(os.path.join(directory_path, file_name))
    return pdf_reader.pages[page_number]

In [None]:
def get_close_sentences(directory_path, file_name, page_number, sentence_number):
    
    