In [3]:
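# rage powers (identifiers such as DISCOVERIES_URL still say "discoveries", but the target is the Barbarian rage-powers listing)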
import requests
import json
from bs4 import BeautifulSoup

# Site root; prepended to site-relative hrefs to make them absolute.
BASE_URL = "https://www.d20pfsrd.com"
# Index page that is crawled for links to the individual entries.
DISCOVERIES_URL = f"{BASE_URL}/classes/core-classes/barbarian/rage-powers/"
# Substring an href must contain to be collected from the index page.
a_tag = "rage-powers"
# Paragraph prefixes treated as section headers; add more keys if needed.
ACCEPTABLE_KEYS = ["Prerequisite", "Benefit"]
# Base name of the JSON file the results are written to.
title = "Barbarian"

def grab_page_data(URL, a_tag, timeout=30):
    """Fetch an index page and return the absolute URLs of all matching links.

    Args:
        URL: Address of the index page to download.
        a_tag: Substring that an anchor's ``href`` must contain to be kept.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs in document order, without duplicates.

    Raises:
        requests.RequestException: If the page cannot be downloaded or the
            server answers with an HTTP error status.
    """
    # Local import so this function does not depend on the file's import block.
    from urllib.parse import urljoin

    response = requests.get(URL, timeout=timeout)
    # Fail loudly rather than harvesting links from an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    links = []
    seen = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if a_tag not in href:
            continue
        # Resolve against the page URL so every kind of relative href
        # (not only those starting with "/") becomes absolute.
        href = urljoin(URL, href)
        # The same target is often linked several times on one page; keep
        # the first occurrence so each page is only fetched once later on.
        if href in seen:
            continue
        seen.add(href)
        links.append(href)

    return links

import re
import requests
from bs4 import BeautifulSoup

import requests
from bs4 import BeautifulSoup

def _absorb_paragraph(text, content_dict, current_key):
    """Fold one paragraph's text into content_dict; return the active section key.

    A paragraph whose text starts with one of ACCEPTABLE_KEYS opens that
    section (overwriting any earlier content stored under the same key).
    Any other paragraph is appended to the section named by current_key,
    or dropped when no section has been opened yet.
    """
    for key in ACCEPTABLE_KEYS:
        if text.startswith(key):
            # Drop the header word, trim ':' and spaces from both ends, and
            # blank out later occurrences of the header word in the text.
            content_dict[key] = text[len(key):].strip(": ").replace(key, " ")
            return key
    if current_key:
        content_dict[current_key] += " " + text
    return current_key


def extract_formatted_data(link, timeout=30):
    """Download one page and extract its sections keyed by the page title.

    Args:
        link: URL of the page to scrape.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``{title: {section_key: text, ...}}`` where ``title`` is the text of
        the first ``<h1>`` and the section keys come from ACCEPTABLE_KEYS.
        Returns ``None`` when the page cannot be fetched or has no ``<h1>``.
    """
    try:
        response = requests.get(link, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        # One unreachable or malformed link should not abort a whole crawl;
        # report it and let the caller skip this page.
        print(f"Skipping {link}: {exc}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    h1_tag = soup.find("h1")
    if h1_tag is None:
        return None

    title = h1_tag.get_text(strip=False)  # Page title, used as the result key
    content_dict = {}

    # Remove <div class="section15"> elements from the tree before reading.
    for section in soup.find_all("div", class_="section15"):
        section.decompose()

    current_key = None  # Section currently being filled, e.g. "Benefit"
    node = h1_tag.find_next_sibling()  # Start reading after the <h1>

    while node:
        if node.name == "p":
            paragraphs = [node]
        elif node.name == "div":
            # A <div> contributes every <p> nested anywhere inside it.
            paragraphs = node.find_all("p")
        else:
            paragraphs = []  # Other sibling tags carry no section text

        for p_tag in paragraphs:
            current_key = _absorb_paragraph(
                p_tag.get_text(strip=False), content_dict, current_key
            )

        node = node.find_next_sibling()

    return {title: content_dict}

def clean_data(data):
    """Strip leftover plural-suffix residue from the start of scraped strings.

    Dicts and lists are walked recursively and updated in place; strings
    have a single leading ``"(s): "``, ``"(s) "`` or ``"s: "`` removed.
    Values of any other type are returned untouched.

    Args:
        data: A string, or an arbitrarily nested dict/list structure.

    Returns:
        The cleaned value (the same object for dicts and lists).
    """
    if isinstance(data, dict):
        # Only existing keys are reassigned, so the dict's size never changes
        # while it is being iterated.
        for key, value in data.items():
            data[key] = clean_data(value)
    elif isinstance(data, list):
        for idx, value in enumerate(data):
            data[idx] = clean_data(value)
    elif isinstance(data, str):
        # Only a leading residue is removed. Replacing these fragments
        # everywhere would corrupt ordinary prose ("bonuses: +2" would
        # become "bonuse+2").
        for residue in ("(s): ", "(s) ", "s: "):
            if data.startswith(residue):
                data = data[len(residue):]
                break
    return data

# Collect every candidate page URL from the index page.
links = grab_page_data(DISCOVERIES_URL, a_tag)

# Scrape each page in turn; pages that yield nothing are skipped.
pre_json_data = {}
for link in links:
    print(f"\nExtracting from: {link}\n")
    extracted_data = extract_formatted_data(link)
    if not extracted_data:
        continue
    pre_json_data.update(extracted_data)

pre_json_data = clean_data(pre_json_data)

# Serialize once; the same text is written to disk and echoed to stdout.
json_text = json.dumps(pre_json_data, indent=2, ensure_ascii=False)
with open(f"{title}.json", "w", encoding="utf-8") as json_file:
    json_file.write(json_text)

print(json_text)



Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/beast-totem-su


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/dragon-totem-su


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/rage-powers-blood/


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/ancestor-totem-lesser-su/


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/ancestor-totem-su/


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/ancestor-totem-greater-su/


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/animal-fury-ex


Extracting from: https://www.d20pfsrd.com/classes/core-classes/barbarian/rage-powers/paizo-rage-powers/armor-ripper


Extracting from: https://www.d20pfsrd.com/cla