The idea here is to pre-process the .pdf files found on the official Games Workshop website so that they are more LLM-friendly.



In [None]:

# !pip install -r requirements.txt  # uncomment and run if you wish to pull all the dependencies
# !pip install llama-parse          # uncomment and run if you only want to tweak the parser

In [1]:
# Patch asyncio before any parsing is done in this notebook.
# NOTE(review): presumably required because the notebook kernel already runs an
# event loop and the parser used below drives its own async calls - confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
import os
import configparser
from llama_parse import LlamaParse

def load_config(file_path=".config"):
    """Read an INI-style settings file and return the populated parser.

    Args:
        file_path: Location of the settings file; defaults to ``.config``
            in the current working directory.

    Returns:
        A ``configparser.ConfigParser`` holding whatever could be read.
        A missing or unreadable path is not an error here: the parser is
        simply returned without any values from that file.
    """
    parser = configparser.ConfigParser()
    parser.read(file_path)  # read() skips files it cannot open instead of raising
    return parser

def get_api_key(var, config):
    """Look up ``var`` in the ``DEFAULT`` section of ``config``.

    Args:
        var: Name of the option to fetch (e.g. ``"LLAMA_CLOUD_API_KEY"``).
        config: Parsed configuration, as returned by ``load_config``.

    Returns:
        The stored value, or ``None`` when the option is not present.
    """
    default_section = config["DEFAULT"]
    value = default_section.get(var)
    return value

# Load the config file
config = load_config()

# Fetch the API key and fail early, with a readable message, when it is absent.
# (Assigning None to os.environ would otherwise raise an opaque TypeError.)
llamaParse_api_key = get_api_key("LLAMA_CLOUD_API_KEY", config)
if not llamaParse_api_key:
    raise RuntimeError(
        "LLAMA_CLOUD_API_KEY was not found in the DEFAULT section of the config file"
    )

# Export the key through the environment variable of the same name.
os.environ["LLAMA_CLOUD_API_KEY"] = llamaParse_api_key

# Never echo the secret itself: cell output is saved inside the notebook file
# and would leak the key to anyone the notebook is shared with.
print("LLAMA_CLOUD_API_KEY loaded")

In [31]:
import os
from llama_parse import LlamaParse

dir_location = "Documents/"
dest_location = "MarkdownDocs/"
parsing_instruction="""The document provides information on the Warhammer 40K tabletop game, including rules, attributes of playable units, and other essential details needed for gameplay.
                      Preserve the structure of the document, including headings, subheadings, and tables, and ensure no sections are skipped.
                      Maintain the integrity of tables and lists, and extract all details in full.
                      """

# Every parse is a remote job; set this to False to leave existing Markdown
# files untouched when the cell is re-run. True keeps the overwrite behaviour.
overwrite_existing = True

# The parser settings are identical for every file, so build the client once
# instead of once per PDF.
pdf_parser = LlamaParse(result_type="markdown",
                        parsing_instruction=parsing_instruction)

pdf_suffix = ".pdf"

# Recursively go through all folders in dir_location
for root, _dirs, files in os.walk(dir_location):
    # Mirror the source folder layout under dest_location. normpath collapses
    # the "." segment relpath yields for the top-level folder and unifies the
    # path separators, so the result is e.g. "MarkdownDocs/x.md", not "MarkdownDocs/./x.md".
    relative_folder_path = os.path.relpath(root, dir_location)
    output_folder = os.path.normpath(os.path.join(dest_location, relative_folder_path))

    for file in files:
        # Case-insensitive match so "RULES.PDF" is converted as well.
        if not file.lower().endswith(pdf_suffix):
            continue

        pdf_path = os.path.join(root, file)

        # Swap only the trailing extension; str.replace would also rewrite
        # any ".pdf" that happens to appear in the middle of the file name.
        md_file_name = file[:-len(pdf_suffix)] + ".md"
        md_file_path = os.path.join(output_folder, md_file_name)

        if not overwrite_existing and os.path.exists(md_file_path):
            print(f"Skipped {pdf_path}: {md_file_path} already exists")
            continue

        document = pdf_parser.load_data(pdf_path)
        if not document:
            # Nothing came back for this file; do not write an empty .md file.
            print(f"No content returned for {pdf_path}; nothing written")
            continue

        # Concatenate the text of every returned document part.
        combined_text = "".join(doc.text for doc in document)

        # Create the corresponding output directory in MarkdownDocs
        os.makedirs(output_folder, exist_ok=True)

        with open(md_file_path, "w", encoding="utf-8") as md_file:
            md_file.write(combined_text)

        print(f"Processed {pdf_path} and saved as {md_file_path}")


Started parsing the file under job_id 4f8ad8f3-c35b-4ec1-ab8a-744bfd53dcde
Processed Documents/Armor-Tyranids.pdf and saved as MarkdownDocs/.\Armor-Tyranids.md
Started parsing the file under job_id 8605f0b1-c33b-4b02-8ada-0df617667461
.Processed Documents/Combat-Patrol-Tyranids.pdf and saved as MarkdownDocs/.\Combat-Patrol-Tyranids.md
Started parsing the file under job_id 652a47f0-4011-420b-a57a-9e7a9164909d
Processed Documents/Legends-Tyranids.pdf and saved as MarkdownDocs/.\Legends-Tyranids.md
Started parsing the file under job_id b0eaab31-14d6-48f9-8ff5-80805ca610dd
...Processed Documents/Indexes-FAQs-Errata\AELDARI-ARMY-RULE.pdf and saved as MarkdownDocs/Indexes-FAQs-Errata\AELDARI-ARMY-RULE.md
Started parsing the file under job_id 71a77be6-a9e2-41f3-8862-7655b67c322e
.Processed Documents/Indexes-FAQs-Errata\AGENTS-OF-THE-IMPERIUM-ARMY-RULE.pdf and saved as MarkdownDocs/Indexes-FAQs-Errata\AGENTS-OF-THE-IMPERIUM-ARMY-RULE.md
Started parsing the file under job_id a1846e4f-bd7e-45a1-