In [None]:
# Manual variables
BASE_URL = "https://www.d20pfsrd.com"
DISCOVERIES_URL = BASE_URL + "/classes/base-classes/alchemist/discoveries"
a_tag = "alchemist-discoveries"
# acceptable_names = ["Prerequisite", "Benefit"]
title = "Alchemist"

# Import libraries
import requests
import json
from bs4 import BeautifulSoup

# Functions
def sanitize_filename(filename):
    """Removes invalid characters from filenames."""
    return re.sub(r'[<>:"/\\|?*]', '_', filename)

def grab_url(URL, a_tag):
    """Fetches the main page and extracts all valid discovery links."""
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, "html.parser")
    
    links = []
    for a in soup.find_all('a', href=True):
        # if no a_tag then skip current portion
        if not a_tag in a['href']:
            continue
        href = a['href']
        # Need to reset href if starts with "/"
        if href.startswith("/"):
            href = BASE_URL + href
        links.append(href)
    
    return links

def extract_formatted_data(URL):
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, "html.parser")

    h1_tag = soup.find("h1")
    if not h1_tag:
        return None
    
    title = h1_tag.get_text(strip=True)
    content = [f"## {title}\n"] #storing as markdown format :(
    
    node = h1_tag.find_next_sibling() #starts reading everything after h1_tag

    # want to stop as soon as we hit a div (usually the bottom of the page)
    while node and node.name != "div":
        if node.name == "p":
            content.append(node.get_text(strip=True))
        elif node.name == "b" or node.name == "strong": #bold text
            content.append(f"**{node.get_text(strip=True)}**")
        elif node.name == "ul": #underline text
            for li in node.find_all("li"):
                content.append(f"- {li.get_text(strip=True)}")
        elif node.name == "ol": #ordered list
            for i, li in enumerate(node.find_all("li")):
                content.append(f"{i}. {li.get_text(strip=True)}")
        node = node.find_next_sibling()

    json_data = {
        "title": title,
        "content": content
    }

    with open(f"{title}.json", 'w', encoding='utf-8') as json_file:
            json.dump(json_data, json_file, indent=2, ensure_ascii=False)
    

# Fetch all links
links = grab_url(DISCOVERIES_URL, a_tag)

# Process each discovery page
for link in links:
    print(f"\nExtracting from: {link}\n")
    extracted_text = extract_formatted_data(link)
    if extracted_text:
        print(extracted_text)
    else:
        print("No relevant content found.")



Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/acid-bomb

No relevant content found.

Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/alchemical-simulacrum-su

No relevant content found.

Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/alchemical-zombie-su

No relevant content found.

Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/anarchic-bombs



OSError: [Errno 22] Invalid argument: 'Anarchic Bombs*.json'

In [45]:
# discoveries
import requests
import json
from bs4 import BeautifulSoup

BASE_URL = "https://www.d20pfsrd.com"
DISCOVERIES_URL = BASE_URL + "/classes/base-classes/alchemist/discoveries"
a_tag = "alchemist-grand-discoveries"
ACCEPTABLE_KEYS = ["Prerequisite", "Benefit"]  # Add more keys if needed

def grab_page_data(URL, a_tag):
    """Fetches the main page and extracts all valid discovery links."""
    response = requests.get(URL)
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    for a in soup.find_all('a', href=True):
        if a_tag in a['href']:
            href = a['href']
            if href.startswith("/"):
                href = BASE_URL + href  # Convert relative URL to absolute
            links.append(href)

    return links

import re
import requests
from bs4 import BeautifulSoup

import requests
from bs4 import BeautifulSoup

def extract_formatted_data(link):
    """Extracts structured content in the desired JSON format."""
    response = requests.get(link)
    soup = BeautifulSoup(response.text, "html.parser")

    h1_tag = soup.find("h1")
    if not h1_tag:
        return None

    title = h1_tag.get_text(strip=False)  # Get the discovery title
    content_dict = {}  # Dictionary to store extracted data

    # Remove <div> elements with class 'section15'
    for section in soup.find_all("div", class_="section15"):
        section.decompose()  # Remove the element from the tree    

    node = h1_tag.find_next_sibling()  # Start reading after <h1>

    current_key = None  # Track section titles (e.g., "Prerequisite", "Benefit")

    # Skip any <div> tags that come before the first <p> tag after the <h1>
    while node and node.name not in ["p", "div"]:  # Search for the first <p> or <div> after <h1>
        node = node.find_next_sibling()

    # Now that we've found the first meaningful content or ended up with None
    while node:
        if node.name == "p":  # Paragraphs
            text = node.get_text(strip=False)
            for key in ACCEPTABLE_KEYS:
                if text.startswith(key):  # Match section headers
                    current_key = key
                    content_dict[current_key] = text[len(key):].strip(": ").replace(key, " ")
                    break
            else:
                if current_key:  # Append to the last detected section
                    content_dict[current_key] += " " + text
                    
        
        # If node is a <div> tag, search for any nested <p> tags inside it
        elif node.name == "div":
            # Search for any nested <p> tags inside the <div>
            nested_p = node.find_all("p")
            if nested_p:  # Only process the <div> if it contains <p> tags
                for p_tag in nested_p:
                    text = p_tag.get_text(strip=False)
                    for key in ACCEPTABLE_KEYS:
                        if text.startswith(key):  # Match section headers
                            current_key = key
                            content_dict[current_key] = text[len(key):].strip(": ").replace(key, " ")
                            break
                    else:
                        if current_key:  # Append to the last detected section
                            content_dict[current_key] += " " + text

        # Move to the next sibling after processing the current node
        node = node.find_next_sibling()

    return {title: content_dict}  # Return structured JSON with the title as the key


# Fetch all links
links = grab_page_data(DISCOVERIES_URL, a_tag)

# Process each discovery page
discoveries_data = {}

# counter = 0
for link in links:
    print(f"\nExtracting from: {link}\n")
    extracted_data = extract_formatted_data(link)
    if extracted_data:
        discoveries_data.update(extracted_data)

    # counter += 1
    # if counter >= 20:
    #     break

# Save to JSON file
with open("discoveries.json", "w", encoding="utf-8") as json_file:
    json.dump(discoveries_data, json_file, indent=2, ensure_ascii=False)

# Print final JSON output
print(json.dumps(discoveries_data, indent=2, ensure_ascii=False))



Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-grand-discoveries/awakened-intellect/


Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-grand-discoveries/true-mutagen/

{
  "Awakened Intellect": {
    "Prerequisite": "Grand discovery",
    "Benefit": "The alchemistâs constant exposure to strange chemicals has expanded his mind. His Intelligence score permanently increases by 2 points."
  },
  "True Mutagen": {
    "Prerequisite": "Grand discovery, grand mutagen discovery",
    "Benefit": "The alchemistâs mutagen now grants a +8 natural armor bonus and a +8 alchemical bonus to Strength, Dexterity, and Constitution. The alchemist takes a â2 penalty to his Intelligence, Wisdom, and Charisma as long as the mutagen persists."
  }
}
