# In [None]:
from bs4 import BeautifulSoup
import requests
import json

# Site root; ArchetypeScraper appends relative page hrefs directly to this
# string, so the trailing slash matters.
url = 'https://www.aonprd.com/'

class ArchetypeScraper:
    """Scrape the archetypes listed for a single character class.

    Constructing an instance immediately downloads and parses the listing
    page ``Archetypes.aspx?Class=<class_name>`` located under ``url``.

    Attributes:
        url: Base URL that relative hrefs are appended to.
        class_name: Class whose archetypes are listed; also the prefix that
            is stripped from archetype names.
        timeout: Seconds passed to every ``requests.get`` call.
        href: Relative address of the listing page.
        page: Response object of the listing page request.
        soup: Parsed listing page.
        table: First ``<table>`` element of the listing page, or ``None``.
    """

    def __init__(self, url, class_name, timeout=30):
        """Fetch and parse the archetype listing page for ``class_name``.

        Args:
            url: Base URL; the listing href is concatenated to it as-is.
            class_name: Name of the class to look up.
            timeout: Seconds to wait on each HTTP request. Without a
                timeout ``requests`` may block indefinitely.
        """
        self.url = url
        # Kept on the instance so archetype_name() does not depend on a
        # module-level variable that happens to share this name.
        self.class_name = class_name
        self.timeout = timeout
        self.href = f'Archetypes.aspx?Class={class_name}'
        self.page = requests.get(url + self.href, timeout=timeout)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')
        self.table = self.soup.find("table")

    def table_rows(self):
        """Yield the listing rows that link to an archetype detail page.

        The first ``<tr>`` of the document is skipped. Rows whose first
        ``<td>`` is missing or holds no ``<a href=...>`` are skipped as well,
        because get_archetype_info() cannot process them.
        """
        for tr in self.soup.find_all('tr')[1:]:
            cell = tr.td
            if cell is None or cell.a is None or 'href' not in cell.a.attrs:
                continue
            yield tr

    def archetype_name(self, href2_list):
        """Return the archetype name encoded in the first href of the list.

        The part after the first ``=`` is expected to read
        ``"<class name> <archetype name>"``; only that leading class name is
        removed, so an archetype whose own name repeats the class name
        (e.g. ``"Monk Monk of the Mantis"``) keeps its full name.

        Args:
            href2_list: List of href strings; only the first one is used.

        Returns:
            The archetype name, or ``None`` when the list is empty.

        Raises:
            IndexError: If the first href contains no ``=``.
        """
        prefix = f'{self.class_name} '
        for href in href2_list:
            # maxsplit=1 keeps any further '=' characters inside the value.
            name = href.split('=', 1)[1]
            if name.startswith(prefix):
                name = name[len(prefix):]
            return name
        return None

    def clean_text(self, text):
        """Join text fragments into one space-separated string.

        Fragments that are blank or consist of a lone ``"."`` are dropped.
        In the kept fragments the right single quotation mark (U+2019) is
        replaced by an apostrophe and surrounding whitespace is removed.

        Args:
            text: Iterable of strings.

        Returns:
            The cleaned fragments joined by single spaces.
        """
        cleaned_text = []
        for line in text:
            stripped = line.strip()
            if not stripped or stripped == ".":
                continue
            cleaned_text.append(line.replace('\u2019', "'").strip())
        return ' '.join(cleaned_text)

    def get_archetype_info(self, table_row):
        """Download one archetype's detail page and extract its fields.

        Every ``<b>`` element on the detail page becomes a key (its text,
        lower-cased). The value is the cleaned text of the siblings that
        follow it, up to the next sibling ``<b>``. A key that occurs more
        than once keeps the value of its last occurrence.

        Args:
            table_row: A listing row as yielded by table_rows().

        Returns:
            A ``(name, fields)`` tuple: the archetype name and a dict that
            maps each key to its text.
        """
        href2 = table_row.td.a.attrs['href']
        value = self.archetype_name([href2])

        detail_page = requests.get(self.url + href2, timeout=self.timeout)
        detail_soup = BeautifulSoup(detail_page.content, 'html.parser')

        output = {}
        for b_tag in detail_soup.find_all('b'):
            key = b_tag.text.lower()

            # Collect the text of every sibling until the next <b> tag.
            value_tags = []
            next_sibling = b_tag.next_sibling
            # Compare against None explicitly so that an empty text node
            # (which is falsy) does not end the walk early.
            while next_sibling is not None and next_sibling.name != 'b':
                if hasattr(next_sibling, 'strings'):
                    # Join all strings to handle multiple lines.
                    value_tags.append(' '.join(next_sibling.strings).strip())
                next_sibling = next_sibling.next_sibling

            output[key] = self.clean_text(value_tags)

        return value, output

# Every class whose archetype listing page gets scraped.
class_names = [
    "Alchemist", "Antipaladin", "Arcanist", "Barbarian",
    "Barbarian (Unchained)", "Bard", "Bloodrager", "Brawler", "Cavalier",
    "Cleric", "Druid", "Fighter", "Gunslinger", "Hunter", "Inquisitor",
    "Investigator", "Kineticist", "Magus", "Medium", "Mesmerist", "Monk",
    "Monk (Unchained)", "Ninja", "Occultist", "Oracle", "Paladin", "Psychic",
    "Ranger", "Rogue", "Rogue (Unchained)", "Samurai", "Shaman", "Shifter",
    "Skald", "Slayer", "Sorcerer", "Spiritualist", "Summoner",
    "Summoner (Unchained)", "Swashbuckler", "Vigilante", "Warpriest", "Witch",
    "Wizard",
]

# Nested result: class name -> archetype name -> scraped fields.
json_output = {}

for class_name in class_names:
    arch_scraper = ArchetypeScraper(url, class_name)

    # Each listing row produces one (archetype name, info) pair; if a name
    # repeats, the later pair replaces the earlier one.
    archetypes_info = dict(
        map(arch_scraper.get_archetype_info, arch_scraper.table_rows())
    )
    json_output[class_name] = archetypes_info

# Serialise everything that was collected and write it to stdout.
json_string = json.dumps(json_output, indent=2)
print(json_string)
