In [190]:
from bs4 import BeautifulSoup
import pandas as pd

In [191]:
with open('../data/pets.html', 'r', encoding='utf-8') as file:
    soup = BeautifulSoup(file)

In [192]:
info = soup.find_all('article', class_='tabber__panel')
tier_info = info[:-1]
token_info = info[-1]

In [193]:
def extract_pet_data(tier_info):
    tier_data = []
    for tier in tier_info:
        tier_name = tier.find('h3').get_text(strip=True)
        table = tier.find('table', class_='wikitable')
        if not table:
            continue

        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                continue

            if cells[0].has_attr('rowspan'):
                rowspan = int(cells[0]['rowspan'])
                name = cells[0].find('a')['title']
                attack = cells[1].get_text(strip=True)
                health = cells[2].get_text(strip=True)
                pack_ref = cells[-1].find_all('a')
                packs = [ref.get('title', '').strip() for ref in pack_ref]

                ability_rows = [row] + [row.find_next_sibling('tr', recursive=False) for _ in range(rowspan - 1)]
                for idx, ability_row in enumerate(ability_rows):
                    ability_cells = ability_row.find_all('td')
                    if idx == 0:
                        Level = ability_cells[3].text.strip()
                        ability_desc = ability_cells[4].text.strip()
                    else:
                        Level = ability_cells[0].text.strip()
                        ability_desc = ability_cells[1].text.strip()

                    tier_data.append({
                        'Name': name,
                        'Attack': attack,
                        'Health': health,
                        'Level': Level,
                        'Ability': ability_desc,
                        'Packs': packs,
                        'Tier': tier_name
                    })
    pets_df = pd.DataFrame(tier_data)

    return pets_df

def extract_token_data(token_info):
    table = token_info.find('table', class_='wikitable')
    df = pd.read_html(str(table), header=[0, 1])[0]

    # Expand rows where 'Level 2 & 3' appears in the ability table
    # Expand rows where 'Level 2 & 3' appears in the ability table
    if any(df.iloc[:, 0].astype(str).str.contains('Level 2 & 3')):
        rows_to_add = []
        for idx, row in df.iterrows():
            if str(row[0]).strip() == 'Level 2 & 3':
                for lvl in ['Level 2', 'Level 3']:
                    new_row = row.copy()
                    new_row[0] = lvl
                    rows_to_add.append((idx, new_row))
        # Insert new rows after the original and drop the 'Level 2 & 3' row
        for offset, (idx, new_row) in enumerate(rows_to_add):
            df = pd.concat([
                df.iloc[:idx + 1 + offset],
                pd.DataFrame([new_row], columns=df.columns),
                df.iloc[idx + 1 + offset:]
            ]).reset_index(drop=True)
        df = df[df.iloc[:, 0] != 'Level 2 & 3'].reset_index(drop=True)
    
    # Flatten column names
    df.columns = ['_'.join(col).strip().replace(' ', '_') for col in df.columns.values]

    # Separate rows into token rows and ability description rows
    token_rows = df[~df['Name_Name'].str.match(r'^Level \d$', na=False)].copy()
    ability_rows = df[df['Name_Name'].str.match(r'^Level \d$', na=False)].copy()

    # Rename columns for clarity
    token_rows = token_rows.rename(columns={
        'Name_Name': 'Name',
        'Unnamed:_1_level_0_Level_1': 'Level_1',
        'Unnamed:_2_level_0_Level_2': 'Level_2',
        'Unnamed:_3_level_0_Level_3': 'Level_3',
        'Ability_Ability': 'Ability',
        'Summoned_From_Summoned_From': 'Summoned_From',
        'Additional_Notes_Additional_Notes': 'Notes'
    })

    # Forward-fill the Name in ability description rows to associate with the last token
    ability_rows['Ability_Level'] = ability_rows['Name_Name'].str.extract(r'Level (\d)').astype(int)
    ability_rows['Ability_Description'] = ability_rows['Unnamed:_1_level_0_Level_1']
    ability_rows['Name'] = pd.NA

    last_name = None
    names = []
    for idx, row in ability_rows.iterrows():
        if idx > 0:
            # Search backward for the last valid token name before this group
            for rev_idx in range(idx - 1, -1, -1):
                if rev_idx in token_rows.index:
                    last_name = token_rows.loc[rev_idx, 'Name']
                    break
        names.append(last_name)
    ability_rows['Name'] = names

    ability_rows = ability_rows[['Name', 'Ability_Level', 'Ability_Description']].dropna()

    # Melt the token stats into long format
    token_long = pd.melt(
        token_rows,
        id_vars=['Name', 'Ability', 'Summoned_From', 'Notes'],
        value_vars=['Level_1', 'Level_2', 'Level_3'],
        var_name='Level',
        value_name='Stats'
    )

    token_long['Ability_Level'] = token_long['Level'].str.extract(r'Level_(\d)').astype(int)

    # Split Stats into Attack and Health
    stat_split = token_long['Stats'].str.extract(r'(?P<Attack>[\dX/*\(\)]+)[/](?P<Health>[\dX/*\(\)]+)')
    token_long = pd.concat([token_long, stat_split], axis=1)

    # Merge ability descriptions
    final_df = pd.merge(token_long, ability_rows, how='left', on=['Name', 'Ability_Level'])

    # Select and reorder columns
    final_df = final_df[['Name', 'Attack', 'Health', 'Ability_Level', 'Ability_Description', 'Summoned_From', 'Notes']]
    final_df.sort_values(by=['Name', 'Ability_Level'], inplace=True)
    final_df.reset_index(drop=True, inplace=True)
    final_df.rename(columns={
        'Ability_Level': 'Level',
        'Ability_Description': 'Ability',
        'Summoned_From': 'Summoned From',
    }, inplace=True)
    return final_df

In [194]:
pets_df = extract_pet_data(tier_info)
tokens_df = extract_token_data(token_info)

df_ids = ['pets', 'tokens']


for df_id, df in zip(df_ids, [pets_df, tokens_df]):
    df.to_csv(f'../data/{df_id}.csv', index=False)
    print(f"Saved {df_id} data to CSV.")

Saved pets data to CSV.
Saved tokens data to CSV.


  df = pd.read_html(str(table), header=[0, 1])[0]
  if str(row[0]).strip() == 'Level 2 & 3':
  new_row[0] = lvl
