# In [1]:

from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
import requests
import re

# Base URL handed to ArchivesNethysScraper. The scraper concatenates its page
# path directly onto this string, so the trailing slash is required.
url = 'https://www.aonprd.com/'




class ArchivesNethysScraper:
    """Fetch one class page and extract its bold-labelled fields.

    The page at ``url + 'ClassDisplay.aspx?ItemName=<class_name>'`` is
    downloaded and parsed once in the constructor; every other method works
    on the cached ``self.soup``.
    """

    # Typographic quotes mapped to their ASCII equivalents in a single pass.
    _QUOTE_TABLE = str.maketrans({
        '\u2018': "'",
        '\u2019': "'",
        '\u201c': '"',
        '\u201d': '"',
    })

    # A parenthesised span containing "pg.". Parentheses are excluded from the
    # inner text so the match cannot start at an earlier, unrelated "(" and
    # swallow everything up to the citation. Newlines are excluded to keep
    # matches on a single line.
    _PAGE_REF = re.compile(r"\([^()\n]*?pg\.[^()\n]*?\)")

    def __init__(self, url, class_name, timeout=30):
        """Download and parse the page for *class_name*.

        Args:
            url: Base URL; the page path is concatenated directly onto it.
            class_name: Value placed in the ``ItemName`` query parameter.
            timeout: Seconds passed to ``requests.get`` so a stalled server
                cannot block the caller forever.
        """
        self.url = url
        self.href = f'ClassDisplay.aspx?ItemName={class_name}'
        self.page = requests.get(url + self.href, timeout=timeout)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')
        self.table = self.soup.find("table")

    def table_rows(self):
        """Yield every ``<b>`` tag of the page except the first one."""
        yield from self.soup.find_all('b')[1:]

    def clean_text(self, text):
        """Normalise and join an iterable of text fragments.

        Fragments that are empty, whitespace-only, or a lone "." are dropped.
        In the rest, curly single and double quotes are replaced by their
        ASCII forms and surrounding whitespace is removed. The surviving
        fragments are joined with single spaces.
        """
        fragments = []
        for line in text:
            stripped = line.strip()
            if stripped and stripped != ".":
                fragments.append(stripped.translate(self._QUOTE_TABLE))
        return ' '.join(fragments)

    def remove_parenthesis(self, input_string):
        """Return *input_string* without parenthesised "pg." references.

        Only the parenthetical that actually contains "pg." is removed; other
        parentheticals in the same string are left untouched.
        """
        return self._PAGE_REF.sub('', input_string)

    def get_class_info(self, table_row=None):
        """Map each ``<b>`` label on the page to the text that follows it.

        Args:
            table_row: Unused. Kept (now optional) so existing callers that
                pass a value continue to work.

        Returns:
            dict: lower-cased label text -> cleaned text of the siblings that
            follow the label up to the next ``<b>`` sibling. When a label
            occurs more than once, the last occurrence wins.
        """
        output = {}

        for b_tag in self.soup.find_all('b'):
            key = b_tag.text.lower()
            value_tags = []
            node = b_tag.next_sibling

            # Compare with None explicitly: an empty text node is falsy and
            # must not terminate the walk before the next <b> is reached.
            while node is not None and node.name != 'b':
                # Only nodes exposing `.strings` contribute text.
                # NOTE(review): whether bare text nodes expose it appears to
                # depend on the installed bs4 version - confirm.
                if hasattr(node, 'strings'):
                    # Join all strings to handle multiple lines
                    value_tags.append(' '.join(node.strings).strip())
                node = node.next_sibling

            output[key] = self.remove_parenthesis(self.clean_text(value_tags))

        return output

import json


def get_class_info_json(url, class_name):
    """Scrape one class page and return its non-empty fields.

    Despite the name, the result is a plain ``dict`` (label -> text);
    serialising it to JSON is left to the caller.

    Args:
        url: Base URL handed to ``ArchivesNethysScraper``.
        class_name: Class whose page is scraped.

    Returns:
        dict: the scraper's fields with every empty-valued entry removed.
    """
    scraper = ArchivesNethysScraper(url, class_name)
    # A page with fewer than two <b> tags yields no rows; avoid calling
    # `.string` on None in that case instead of raising AttributeError.
    first_row = next(scraper.table_rows(), None)
    row_text = first_row.string if first_row is not None else None
    class_info = scraper.get_class_info(row_text)
    return {label: text for label, text in class_info.items() if text}

# Every class whose page is scraped; each entry is used verbatim as the
# ItemName query value and as the key in the output file.
class_names = [
    "Alchemist", "Antipaladin", "Arcanist",
    "Barbarian", "Barbarian _unchained", "Bard", "Bloodrager", "Brawler",
    "Cavalier", "Cleric",
    "Druid",
    "Fighter",
    "Gunslinger",
    "Hunter",
    "Inquisitor", "Investigator",
    "Kineticist",
    "Magus", "Medium", "Mesmerist", "Monk", "Monk _unchained",
    "Ninja",
    "Occultist", "Oracle",
    "Paladin", "Psychic",
    "Ranger", "Rogue", "Rogue _unchained",
    "Samurai", "Shaman", "Shifter", "Skald", "Slayer", "Sorcerer",
    "Spiritualist", "Summoner", "Summoner _unchained", "Swashbuckler",
    "Vigilante",
    "Warpriest", "Witch", "Wizard",
]

# Scrape each class in list order, keyed by its name.
class_info_dict = {}
for class_name in class_names:
    class_info_dict.update({class_name: get_class_info_json(url, class_name)})

# Serialise everything first, then write the text out in one go.
with open('base_class_info.json', 'w') as json_file:
    json_file.write(json.dumps(class_info_dict, indent=2))
 

        

