In [13]:
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site root; prepended to every site-relative path used by the scraper.
BASE_URL = "https://www.d20pfsrd.com"

# Dictionary of classes with their respective URLs and a_tag identifiers
#   key     -> label used in progress messages; lower-cased, it becomes the
#              name of the JSON file written for that entry.
#   "url"   -> path of the index page, relative to BASE_URL.
#   "a_tag" -> substring that a link's href must contain for the link to be
#              followed from the index page.
# Several entries share one index page and differ only in "a_tag".
# NOTE(review): "a_tag" is a plain substring match, so a broad value such as
# "advanced" presumably also matches unrelated links containing that word --
# verify against the live pages.
CLASSES = {
    "Alchemist": {
        "url": "/classes/base-classes/alchemist/discoveries",
        "a_tag": "alchemist-discoveries",
    },
    "Alchemist_grand": {
        "url": "/classes/base-classes/alchemist/discoveries",
        "a_tag": "grand-discoveries",
    },    
    "Barbarian": {
        "url": "/classes/base-classes/rage-powers",
        "a_tag": "rage-powers",
    },
    "Investigator": {
        "url": "/classes/hybrid-classes/investigator/investigator-talents",
        "a_tag": "investigator-talents",
    },
    "magus": {
        "url": "/classes/base-classes/magus/magus-arcana",
        "a_tag": "magus-arcana",
    },        
    "Ninja": {
        "url": "/classes/alternate-classes/ninja/ninja-tricks/",
        "a_tag": "ninja-tricks",
    },
    "Ninja_advanced": {
        "url": "/classes/alternate-classes/ninja/ninja-tricks/",
        "a_tag": "advanced",
    },    
    "Rogue": {
        "url": "/classes/core-classes/rogue/rogue-talents/",
        "a_tag": "paizo-rogue-talents",
    },
    "Rogue_advanced": {
        "url": "/classes/core-classes/rogue/rogue-talents/",
        "a_tag": "paizo-rogue-advanced-talents",
    },    
    "Slayer": {
        "url": "/classes/hybrid-classes/slayer/slayer-talents/",
        "a_tag": "slayer-talents",
    },
    "Slayer_advanced": {
        "url": "/classes/hybrid-classes/slayer/slayer-talents/",
        "a_tag": "advanced",
    }

}

# Paragraph prefixes that open a new field in the scraped output; paragraphs
# without one of these prefixes are appended to the most recently opened field.
ACCEPTABLE_KEYS = ["Prerequisite", "Benefit"]


def grab_page_data(URL, a_tag, timeout=30):
    """Fetch a class index page and collect the detail-page links on it.

    Args:
        URL: Path of the index page, relative to BASE_URL.
        a_tag: Substring an anchor's href must contain to be kept.
        timeout: Seconds to wait on the server before requests raises a
            timeout error, so a stalled connection cannot hang the run.

    Returns:
        Absolute http(s) URLs of the matching links, in page order, each
        listed once.
    """
    response = requests.get(BASE_URL + URL, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # Fragment-only hrefs point back into the index page itself.
        if a_tag not in href or href.startswith("#"):
            continue

        # Resolve against the URL actually served (after redirects) so that
        # root-relative, path-relative and protocol-relative hrefs all become
        # absolute; already-absolute hrefs pass through unchanged.
        absolute = urljoin(response.url, href)
        if not absolute.lower().startswith(("http://", "https://")):
            continue  # mailto:, javascript:, ... cannot be fetched

        # The same target is often linked several times on one page; fetching
        # it once is enough.
        if absolute not in seen:
            seen.add(absolute)
            links.append(absolute)

    return links


def _absorb_paragraph(text, content_dict, current_key):
    """Fold one paragraph's text into content_dict and return the active key."""
    for key in ACCEPTABLE_KEYS:
        if text.startswith(key):
            # Drop the leading label and its ": " separator. Any later
            # occurrence of the label inside the text is blanked to a space
            # (kept as-is from the original; intent not documented).
            content_dict[key] = text[len(key):].strip(": ").replace(key, " ")
            return key

    # Not a labelled paragraph: treat it as a continuation of the field that
    # is currently open; text seen before any label is discarded.
    if current_key:
        content_dict[current_key] += " " + text
    return current_key


def extract_formatted_data(link, timeout=30):
    """Scrape one detail page into ``{title: {field: text}}``.

    The title is the text of the first <h1>. The <p> siblings that follow it,
    and the <p> elements nested in sibling <div>s, are scanned in order: a
    paragraph starting with one of ACCEPTABLE_KEYS opens that field, and any
    other paragraph is appended to the field currently open.

    Args:
        link: Absolute URL of the page to scrape.
        timeout: Seconds to wait on the server before requests raises a
            timeout error, so a stalled connection cannot hang the run.

    Returns:
        A one-entry dict mapping the page title to its field dict, or None
        when the server answers with an error status or the page has no <h1>.
    """
    response = requests.get(link, timeout=timeout)
    if not response.ok:
        # An error page would otherwise be parsed as if it were content.
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None

    title = h1_tag.get_text(strip=False)
    content_dict = {}

    # Unwrap (not remove) 'section15' divs so their children become part of
    # the sibling chain that is walked below.
    for section in soup.find_all("div", class_="section15"):
        section.unwrap()

    current_key = None
    node = h1_tag.find_next_sibling()
    while node:
        if node.name == "p":
            paragraphs = [node]
        elif node.name == "div":
            paragraphs = node.find_all("p")
        else:
            paragraphs = []  # other siblings carry no field text

        for p_tag in paragraphs:
            current_key = _absorb_paragraph(
                p_tag.get_text(strip=False), content_dict, current_key
            )

        node = node.find_next_sibling()

    return {title: content_dict}

def clean_data(data):
    """Strip plural-label residue from the start of every scraped string.

    A field label such as "Prerequisite" is cut off the front of a paragraph
    during extraction, so a plural label ("Prerequisites:",
    "Prerequisite(s):") leaves "s: ", "(s): " or "(s) " at the start of the
    value. Only that leading residue is removed; the same character
    sequences inside ordinary text (e.g. "bonus: fire") are left untouched.

    Dicts and lists are walked recursively and updated in place; dict keys
    are not modified.

    Args:
        data: A string, or an arbitrarily nested dict/list of strings.

    Returns:
        The cleaned value. For a dict or list this is the same object that
        was passed in; any other non-string value is returned unchanged.
    """
    if isinstance(data, dict):
        for key, value in data.items():
            data[key] = clean_data(value)
    elif isinstance(data, list):
        for idx, value in enumerate(data):
            data[idx] = clean_data(value)
    elif isinstance(data, str):
        for residue in ("(s): ", "(s) ", "s: "):
            if data.startswith(residue):
                return data[len(residue):]
    return data

def remove_unicode(json_data):
    """Return a copy of json_data with all non-ASCII characters dropped.

    Strings lose every character outside the ASCII range. Dicts and lists
    are rebuilt recursively (dict keys are kept as they are). Any other
    value is handed back untouched.
    """
    if isinstance(json_data, str):
        # Encoding to ASCII while ignoring errors discards exactly the
        # characters that ASCII cannot represent.
        return json_data.encode("ascii", errors="ignore").decode("ascii")

    if isinstance(json_data, dict):
        cleaned = {}
        for name, value in json_data.items():
            cleaned[name] = remove_unicode(value)
        return cleaned

    if isinstance(json_data, list):
        return list(map(remove_unicode, json_data))

    return json_data

# Scrape every configured class in turn; each one ends up in its own JSON file.
for class_name, class_info in CLASSES.items():
    print(f"\nProcessing {class_name} discoveries...\n")

    links = grab_page_data(class_info["url"], class_info["a_tag"])

    # Merge the per-page results; a page that yields nothing is skipped.
    pre_class_data = {}
    for link in links:
        print(f"Extracting from: {link}")
        extracted_data = extract_formatted_data(link)
        if not extracted_data:
            continue
        pre_class_data.update(extracted_data)

    # Clean the scraped strings, then drop every non-ASCII character.
    class_data = remove_unicode(clean_data(pre_class_data))

    # The output file is named after the lower-cased class key.
    filename = f"{class_name.lower()}.json"
    with open(filename, "w", encoding="utf-8") as json_file:
        json_file.write(json.dumps(class_data, indent=2, ensure_ascii=True))

    print(f"Saved: {filename}")

print("\nData extraction complete! JSON files saved separately.")



Processing Alchemist discoveries...

Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/acid-bomb
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/alchemical-simulacrum-su
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/alchemical-zombie-su
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/anarchic-bombs
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/axiomatic-bombs
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/bitter-pill-su/
Extracting from: https://www.d20pfsrd.com/classes/base-classes/alchemist/discoveries/paizo-alchemist-discoveries/blackstar-bomb-su
Extracting from: https://www.d20pfsrd.com/classes/