In [14]:
from bs4 import BeautifulSoup
import requests
import json

# Base URL of the site being scraped; relative page hrefs are appended to it.
url = 'https://www.aonprd.com/'

class ArchetypeScraper:
    """Scrape one wondrous-item listing page and the detail pages it links to.

    The listing page ``MagicWondrous.aspx?FinalSlot=<class_name>`` is fetched
    and parsed when the scraper is constructed. Each table row is then turned
    into a ``(name, fields)`` pair by :meth:`get_archetype_info`, which
    downloads the detail page linked from that row.
    """

    def __init__(self, url, class_name, timeout=30):
        """Fetch and parse the listing page for *class_name*.

        Args:
            url: Base URL; relative hrefs are appended to it unchanged.
            class_name: Value used for the ``FinalSlot`` query parameter.
            timeout: Seconds to wait for each HTTP response. Without a
                timeout ``requests`` can block forever on a stalled server.
        """
        self.url = url
        # Kept on the instance so archetype_name() does not depend on a
        # module-level variable that happens to share this name.
        self.class_name = class_name
        self.timeout = timeout
        self.href = f'MagicWondrous.aspx?FinalSlot={class_name}'
        self.page = requests.get(url + self.href, timeout=timeout)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')
        self.table = self.soup.find("table")

    def table_rows(self):
        """Yield every ``<tr>`` found on the listing page except the first one."""
        # The first row is skipped (presumably the table header -- confirm
        # against the live page).
        yield from self.soup.find_all('tr')[1:]

    def archetype_name(self, href2_list):
        """Return the entry name encoded in the first href of *href2_list*.

        The name is whatever follows the first ``=`` in the href, with any
        ``"<class_name> "`` prefix text removed. Returns ``None`` when
        *href2_list* is empty.
        """
        first_href = next(iter(href2_list), None)
        if first_href is None:
            return None
        # maxsplit=1 keeps names that themselves contain '=' intact.
        name = first_href.split('=', 1)[1]
        # The href may carry the class name in front of the entry name;
        # strip it so only the entry name remains.
        return name.replace(f'{self.class_name} ', '')

    def clean_text(self, text):
        """Join an iterable of text fragments into one cleaned-up string.

        Blank fragments and fragments consisting only of ``"."`` are dropped,
        the right single quotation mark (U+2019) becomes a plain apostrophe,
        and the remaining fragments are stripped and joined with spaces.
        """
        cleaned_text = [
            line.replace('\u2019', "'").strip()
            for line in text
            if line.strip() and line.strip() != "."
        ]
        return ' '.join(cleaned_text)

    def get_archetype_info(self, table_row):
        """Download the detail page linked from *table_row* and parse it.

        Args:
            table_row: A ``<tr>`` element whose first cell contains a link.

        Returns:
            A ``(name, fields)`` tuple. ``fields`` maps the lower-cased text
            of every ``<b>`` tag on the detail page to the cleaned text that
            follows it, up to the next sibling ``<b>`` tag. Text that follows
            a non-``<b>`` heading is therefore folded into the preceding key,
            and a repeated label overwrites the earlier value.
        """
        href2 = table_row.td.a.attrs['href']
        value = self.archetype_name([href2])

        detail_page = requests.get(self.url + href2, timeout=self.timeout)
        detail_soup = BeautifulSoup(detail_page.content, 'html.parser')

        output = {}
        for b_tag in detail_soup.find_all('b'):
            key = b_tag.text.lower()

            # Collect the text of every sibling up to the next <b> tag.
            value_tags = []
            next_sibling = b_tag.next_sibling
            # Compare against None explicitly: an empty text node is falsy
            # and would otherwise end the walk too early.
            while next_sibling is not None and next_sibling.name != 'b':
                if hasattr(next_sibling, 'strings'):
                    # Join all strings to handle multi-line content.
                    value_tags.append(' '.join(next_sibling.strings).strip())
                next_sibling = next_sibling.next_sibling

            output[key] = self.clean_text(value_tags)

        return value, output

# Slot names to scrape; each one is passed to ArchetypeScraper in turn.
class_names = ["Belts", "Body", "Chest", "Eyes", "Feet", "Hands", "Head", "Headband", "Neck", "Shoulders", "Wrist", "None/Other"]

# Maps each slot name to a dict of {entry name: {field: text}}.
json_output = {}

for class_name in class_names:
    arch_scraper = ArchetypeScraper(url, class_name)

    # Dict (not a list) of every entry found for the current slot,
    # keyed by entry name.
    archetypes_info = {}

    # Each table row links to one detail page; fetch and parse it.
    for table_row in arch_scraper.table_rows():
        archetype_name, archetype_info = arch_scraper.get_archetype_info(table_row)
        archetypes_info[archetype_name] = archetype_info

    json_output[class_name] = archetypes_info

# Serialise once so the saved file and the printed text are identical;
# ensure_ascii=False keeps non-ASCII characters readable instead of
# emitting \uXXXX escapes.
json_string = json.dumps(json_output, indent=2, ensure_ascii=False)

with open('items.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_string)

# Print the JSON string
print(json_string)


{
  "Belts": {
    "Belt of Tumbling": {
      "source": "Ultimate Equipment pg. 211",
      "aura": "faint transmutation;",
      "cl": "1st",
      "slot": "belt;",
      "price": "800 gp;",
      "weight": "1 lb. Description This thin and flexible cotton cord is meant to be wrapped\r several times around wearer's waist. The belt's wearer gains\r a +4 competence bonus on Acrobatics checks made to move\r through a threatened square or through an enemy's space. Construction",
      "requirements": "Craft Wondrous Item, cat's grace ;",
      "cost": "400 gp"
    },
    "Beneficial Bandolier": {
      "source": "Ultimate Equipment pg. 211",
      "aura": "moderate transmutation;",
      "cl": "9th",
      "slot": "belt;",
      "price": "1,000 gp;",
      "weight": "2 lbs. Description This bandolier is made of finely tanned\r leather. It has slots for up to 200 rounds\r of ammunition. Pellets and black\r powder are kept in tiny individual\r pouches, and bullets in small loops. The bandol

In [None]:
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
import requests
import re

# Base URL of the site being scraped; relative page hrefs are appended to it.
url = 'https://www.aonprd.com/'


class ArchivesNethysScraper:
    """Scrape a wondrous-item listing page and extract fields from item pages.

    The listing page is fetched and parsed on construction. Iterate
    :meth:`table_rows` and pass each row to :meth:`get_item_info` to obtain
    the slot, price, weight and description of the linked item.
    """

    # Labels that introduce the single-value fields, in page order.
    _FIELD_NAMES = ('Slot', 'Price', 'Weight')
    _DESCRIPTION_HEADING = '<h3 class="framing">Description</h3>'
    _CONSTRUCTION_HEADING = '<h3 class="framing">Construction</h3>'
    # Typographic quotes -> plain ASCII quotes, applied in a single pass.
    _QUOTES = str.maketrans({
        '\u2019': "'",
        '\u2018': "'",
        '\u201c': '"',
        '\u201d': '"',
    })
    # Seconds to wait for an HTTP response; requests waits forever by default.
    _TIMEOUT = 30

    def __init__(self, url, href=None, item=None):
        """Fetch and parse a listing page.

        Args:
            url: Base URL; relative hrefs are appended to it unchanged.
            href: Relative listing-page address. Only used when *item* is
                not given.
            item: Slot name. When given, the listing address is built as
                ``MagicWondrous.aspx?FinalSlot=<item>`` and *href* is ignored.

        Raises:
            ValueError: If neither *href* nor *item* is supplied.
        """
        if item is not None:
            href = f'MagicWondrous.aspx?FinalSlot={item}'
        if href is None:
            raise ValueError('either href or item must be provided')
        self.url = url
        self.href = href
        self.page = requests.get(url + href, timeout=self._TIMEOUT)
        self.soup = BeautifulSoup(self.page.content, 'html.parser')
        self.table = self.soup.find("table")

    def clean_text(self, text):
        """Collapse a multi-line string into one line with plain quotes.

        Lines that are blank or consist only of ``"."`` are dropped, curly
        quotes are replaced by their ASCII equivalents, the remaining lines
        are joined with single spaces and the result is stripped.
        """
        kept_lines = [
            line.translate(self._QUOTES)
            for line in text.split('\n')
            if line.strip() and line.strip() != "."
        ]
        return ' '.join(kept_lines).strip()

    def remove_parenthesis(self, input_string):
        """Remove parenthesised page references and any HTML tags.

        A parenthesised group is removed only when it contains ``pg.``.
        """
        without_refs = re.sub(r"\(.*?pg\..*?\)", '', input_string)
        return re.sub(r'<.*?>', '', without_refs)

    def table_rows(self):
        """Yield every ``<tr>`` of the listing table except the first one."""
        yield from self.table.find_all('tr')[1:]

    @staticmethod
    def _between(html, left, right):
        """Return the text between the first *left* and the next *right*.

        Both markers are matched literally. Returns ``''`` when the pair is
        not present, so one unusual page does not abort a whole scrape.
        """
        # DOTALL lets the captured section span several lines; the
        # non-greedy group stops at the nearest closing marker.
        pattern = re.compile(
            f'{re.escape(left)}(.*?){re.escape(right)}', re.DOTALL
        )
        match = pattern.search(html)
        return match.group(1) if match else ''

    @classmethod
    def _parse_item_html(cls, html):
        """Extract slot, price, weight and description from raw page HTML."""

        def strip_markup(raw):
            # Drop carriage returns and the ';' field separators, then tags.
            return re.sub(r'<.*?>', '', raw.replace('\r', '').replace(';', ''))

        # Every field runs from its own label up to the label that follows;
        # the last one (weight) runs up to the Description heading.
        markers = [f'<b>{name}</b>' for name in cls._FIELD_NAMES]
        markers.append(cls._DESCRIPTION_HEADING)

        output = {}
        for name, left, right in zip(cls._FIELD_NAMES, markers, markers[1:]):
            output[name.lower()] = strip_markup(cls._between(html, left, right))

        # The description keeps its markup; only carriage returns are removed.
        output['description'] = cls._between(
            html, cls._DESCRIPTION_HEADING, cls._CONSTRUCTION_HEADING
        ).replace('\r', '')
        return output

    def get_item_info(self, table_row):
        """Download the item page linked from *table_row* and parse it.

        Args:
            table_row: A ``<tr>`` element whose first cell contains a link.

        Returns:
            A dict with the keys ``slot``, ``price``, ``weight`` and
            ``description``. A section that cannot be located is ``''``.
        """
        href = table_row.td.a.attrs['href']
        # Use the instance's base URL rather than a module-level global.
        page = requests.get(self.url + href, timeout=self._TIMEOUT)
        return self._parse_item_html(page.text)

# Slot names to scrape; each one selects a separate listing page.
items = ["Belts", "Body", "Chest", "Eyes", "Feet", "Hands", "Head", "Headband", "Neck", "Shoulders", "Wrist", "None/Other"]

for item in items:
    # The listing address is derived from `item`, so no href is supplied.
    # (The previous code passed an undefined name `href`, which raises
    # NameError when this cell runs in a fresh session.)
    scraper = ArchivesNethysScraper(url, None, item)
    for tr in scraper.table_rows():
        print(scraper.get_item_info(tr))

# body = ArchivesNethysScraper(url, 'MagicWondrous.aspx?FinalSlot=Body')
# for tr in body.table_rows():
#    print(body.get_item_info(tr))
#    break