In [2]:
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
import requests
import json
import re

# Prefix that ArchivesNethysScraper.__init__ prepends to the page address it
# builds; left empty because that address is already a complete URL.
url = ''

class ArchivesNethysScraper:
    """Download one spheresofpower.wikidot.com page and extract heading text.

    The page is fetched and parsed once, in the constructor. The remaining
    methods work on the parsed document stored in ``self.soup``.
    """

    # Typographic quotes mapped to their plain ASCII equivalents.
    _QUOTE_TABLE = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
    })

    def __init__(self, url, class_name, timeout=30):
        """Fetch and parse the page for ``class_name``.

        Args:
            url: Prefix prepended to the generated page address.
            class_name: Page name appended to the wiki's base address.
            timeout: Seconds passed to ``requests.get``. Without a timeout a
                stalled connection would block the scrape indefinitely.
        """
        self.url = url
        self.href = f'http://spheresofpower.wikidot.com/{class_name}'
        self.page = requests.get(url + self.href, timeout=timeout)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')
        self.table = self.soup.find("table")

    def table_rows(self):
        """Yield every ``<h4>`` element of the page except the first one."""
        yield from self.soup.find_all('h4')[1:]

    def clean_text(self, text):
        """Join text fragments into one space-separated, ASCII-quoted string.

        Args:
            text: Iterable of strings. A single string is treated as one
                fragment rather than being iterated character by character.

        Returns:
            The stripped fragments joined by single spaces. Fragments that
            are blank or consist only of "." are dropped.
        """
        if isinstance(text, str):
            text = [text]

        pieces = []
        for line in text:
            stripped = line.strip()
            if not stripped or stripped == ".":
                continue
            pieces.append(stripped.translate(self._QUOTE_TABLE))
        return ' '.join(pieces)

    def remove_parenthesis(self, input_string):
        """Remove parenthesised page references such as ``(Book pg. 12)``.

        The match is confined to a single pair of parentheses. Allowing the
        pattern to cross a ")" would start the match at an earlier, unrelated
        "(" and delete everything between it and the page reference.
        """
        pattern = r"\([^()\n]*?pg\.[^()\n]*?\)"
        return re.sub(pattern, '', input_string)

    def get_class_info(self, table_row):
        """Collect the text that follows each ``<h4>``/``<b>`` element.

        Args:
            table_row: Not used by this method; kept so existing callers
                continue to work.

        Returns:
            Dict mapping each heading's lower-cased text to
            ``{"benefits": <cleaned text of the following siblings>}``.
            A later heading with the same text overwrites an earlier one.
        """
        output = {}
        stop_names = ['h4', 'b']

        for tag in self.soup.find_all(stop_names):
            key = tag.text.lower()
            value_tags = []
            next_sibling = tag.next_sibling

            # Walk forward until the next heading-like element. The end of the
            # sibling chain is detected with an explicit None check so that an
            # empty text node (which is falsy) cannot end the walk early.
            while next_sibling is not None and not (
                isinstance(next_sibling, Tag) and next_sibling.name in stop_names
            ):
                # NOTE(review): whether bare text nodes expose `strings`
                # appears to depend on the installed bs4 version - confirm.
                if hasattr(next_sibling, 'strings'):
                    value_tags.append(' '.join(next_sibling.strings).strip())

                next_sibling = next_sibling.next_sibling

            cleaned_value = self.clean_text(value_tags)
            cleaned_value = self.remove_parenthesis(cleaned_value)
            output[key] = {"benefits": cleaned_value}

        return output

def get_class_info_json(url, class_name):
    """Scrape one page and return its heading/text mapping as a JSON string.

    Args:
        url: Prefix handed to ArchivesNethysScraper.
        class_name: Page name handed to ArchivesNethysScraper.

    Returns:
        JSON text (indent=2) of the entries whose "benefits" text is non-empty.
    """
    scraper = ArchivesNethysScraper(url, class_name)

    # table_rows() may yield nothing; guard so `.string` is never read off None.
    first_row = next(scraper.table_rows(), None)
    row_text = first_row.string if first_row is not None else None
    class_info = scraper.get_class_info(row_text)

    # Each value is a {"benefits": ...} dict, which is truthy even when the
    # text is empty, so the emptiness test has to look at the text itself.
    filtered_info = {k: v for k, v in class_info.items() if v and v.get("benefits")}
    return json.dumps(filtered_info, indent=2)

class_names = [
    "Alchemy",
    "Athletics",
    "Barrage",
    "Barroom",
    "Beastmastery",
    "Berserker",
    "Boxing",
    "Brute",
    "Dual Wielding",
    "Duelist",
    "Equipment",
    "Fencing",
    "Gladiator",
    "Guardian",
    "Lancer",
    "Open Hand",
    "Scoundrel",
    "Scout",
    "Shield",
    "Sniper",
    "Trap",
    "Warleader-sphere",
    "Wrestling",
]

# get_class_info_json returns JSON text here, so parse each page back into a
# dict and serialise the combined mapping once. This lets json.dumps handle key
# escaping and nesting indentation instead of hand-concatenated strings.
json_output = json.dumps(
    {
        class_name: json.loads(get_class_info_json(url, class_name))
        for class_name in class_names
    },
    indent=2,
)

print(json_output)


import json

def get_class_info_json(url, class_name):
    """Scrape one page and return its heading/text mapping as a dict.

    Args:
        url: Prefix handed to ArchivesNethysScraper.
        class_name: Page name handed to ArchivesNethysScraper.

    Returns:
        Dict of the entries whose "benefits" text is non-empty.
    """
    scraper = ArchivesNethysScraper(url, class_name)

    # table_rows() may yield nothing; guard so `.string` is never read off None.
    first_row = next(scraper.table_rows(), None)
    row_text = first_row.string if first_row is not None else None
    class_info = scraper.get_class_info(row_text)

    # Each value is a {"benefits": ...} dict, which is truthy even when the
    # text is empty, so the emptiness test has to look at the text itself.
    filtered_info = {k: v for k, v in class_info.items() if v and v.get("benefits")}
    return filtered_info

class_names = [
    "Alteration", "Blood", "Conjuration", "Creation", "Dark", "Death",
    "Destruction", "Divination", "Enhancement", "Fallen Fey", "Fate",
    "Illusion", "Life", "Light", "Mana", "Mind", "Nature", "Protection",
    "Telekinesis", "Time", "War", "Warp", "Weather",
]

# Scrape every listed page, keeping the results keyed by name in list order.
class_info_dict = {}
for class_name in class_names:
    class_info_dict.update({class_name: get_class_info_json(url, class_name)})

# Save the collected mapping to disk as indented JSON.
with open('spheres_of_power.json', 'w') as json_file:
    json.dump(class_info_dict, fp=json_file, indent=2)


{
  "Alchemy": {
  "special release": {
    "benefits": "Spheres of Power 5E $19.99 Spheres of Might 5E $19.99 Join the DDS Discord"
  },
  "billowing poison": {
    "benefits": "Whenever you create an inhaled poison, the radius of its area of effect increases by 5 ft. For every 10 ranks in Craft (alchemy) you possess, its area of effect increases by an additional 5 ft."
  },
  "careful poisoner": {
    "benefits": "Whenever you create a poison, you can select 1 creature; that creature is immune to your poison. For every 4 ranks in Craft (alchemy) you possess, you can select an additional creature to be immune to your poison."
  },
  "chemical coating [apoc]": {
    "benefits": "Source:   Spheres Apocrypha: Alchemical Formulae You may modify your formulae into an oil, called a coating that can be applied to weapons by increasing Craft (alchemy) DC to create them by 5. The coating's effects are added to any attacks you make in which you successfully hit the target. Formulae applied as a

In [4]:
import json

# Load the previously scraped talents from disk.
with open("spheres_of_power.json", "r") as file:
    data = json.load(file)


def _split_talent_text(details):
    """Split one talent's "benefits" text around the 'Prerequisites:' marker.

    NOTE(review): the text *before* the marker is stored under
    "prerequisites", and the marker plus everything after it is stored under
    "benefits" - confirm this is the intended split.
    """
    benefits = details.get("benefits", "")
    head, marker, tail = benefits.partition('Prerequisites:')
    if not marker:
        # Marker absent: everything counts as benefit text.
        return {"prerequisites": "", "benefits": benefits.strip()}
    return {"prerequisites": head.strip(), "benefits": (marker + tail).strip()}


# Rebuild the category -> talent -> fields mapping with the split applied.
structured_data = {
    category: {
        talent: _split_talent_text(details)
        for talent, details in talents.items()
    }
    for category, talents in data.items()
}

# Show the structured result as indented JSON text.
print(json.dumps(structured_data, indent=4))

# Save the structured result next to the raw scrape.
with open('spheres_of_power_cleaned.json', 'w') as json_file:
    json.dump(structured_data, json_file, indent=2)


{
    "Alteration": {
        "special release": {
            "prerequisites": "",
            "benefits": "Spheres of Power 5E $19.99 Spheres of Might 5E $19.99 Join the DDS Discord"
        },
        "blank transformation": {
            "prerequisites": "",
            "benefits": "When a creature gains the Alteration sphere, they gain the ability to grant the Blank Transformation. Unlike other transformations, the Blank Transformation does not change the creature's basic makeup. They do not gain the +10 bonus to Disguise checks, nor do they lose their abilities, equipment, natural attacks, or any other aspect of their unaltered form. The Blank Transformation allows a caster to add traits to a creature without fundamentally changing the target first. As a trait, you may change the target's cosmetic appearance. This could include changing the target's apparent age, making an elf into an orc, adding a purely cosmetic tail, disguising a large dog as a small pony, changing a male into