# Add more books to dataset

## Load books names

In [1]:
import numpy as np
import pandas as pd
import sys

import os
import pathlib


sys.path.append("../../..")
from training.creating_dataset import load_and_preprocess_data, load_data

# The dataset directory is resolved relative to the notebook's working directory:
# two levels up, then into "pathfinder_2e_data".
current_path = os.getcwd()
DATASETS_DIR = pathlib.Path(current_path).parent.parent / "pathfinder_2e_data"

# os.listdir() returns entries in arbitrary order; sorting keeps the book order
# (and every output derived from it) identical between runs and machines.
BOOKS = sorted(os.listdir(DATASETS_DIR))

In [2]:
BOOKS

['abomination-vaults-bestiary.db',
 'action-macros.db',
 'actions.db',
 'adventure-specific-actions.db',
 'age-of-ashes-bestiary.db',
 'agents-of-edgewatch-bestiary.db',
 'ancestries.db',
 'ancestryfeatures.db',
 'april-fools-bestiary.db',
 'backgrounds.db',
 'bestiary-ability-glossary-srd.db',
 'bestiary-effects.db',
 'bestiary-family-ability-glossary.db',
 'blog-bestiary.db',
 'blood-lords-bestiary.db',
 'book-of-the-dead-bestiary.db',
 'boons-and-curses.db',
 'campaign-effects.db',
 'classes.db',
 'classfeatures.db',
 'conditions.db',
 'criticaldeck.db',
 'crown-of-the-kobold-king-bestiary.db',
 'deities.db',
 'domains.db',
 'equipment-effects.db',
 'equipment.db',
 'example.db',
 'extinction-curse-bestiary.db',
 'fall-of-plaguestone.db',
 'familiar-abilities.db',
 'feat-effects.db',
 'feats.db',
 'fists-of-the-ruby-phoenix-bestiary.db',
 'gatewalkers-bestiary.db',
 'gmg-srd.db',
 'hazards.db',
 'heritages.db',
 'hero-point-deck.db',
 'iconics.db',
 'impossible-lands-bestiary.db',
 

In [3]:
len(BOOKS)

74

In [4]:
features = [
    "cha",
    "con",
    "dex",
    "int",
    "str",
    "wis",
    "ac",
    "hp",
    "perception",
    "fortitude",
    "reflex",
    "will",
    "focus",
    "land_speed",
    "num_immunities",
    "fly",
    "swim",
    "climb",
    "fire_resistance",
    "cold_resistance",
    "electricity_resistance",
    "acid_resistance",
    "piercing_resistance",
    "slashing_resistance",
    "physical_resistance",
    "bludgeoning_resistance",
    "mental_resistance",
    "poison_resistance",
    "all-damage_resistance",
    "cold-iron_weakness",
    "good_weakness",
    "fire_weakness",
    "cold_weakness",
    "area-damage_weakness",
    "splash-damage_weakness",
    "evil_weakness",
    "slashing_weakness",
    "melee",
    "ranged",
    "spells",
]

## Load and preprocess books
Find the books that contain useful data (NPC entries) and update the `creating_dataset` functions so that every such book can be preprocessed.

In [5]:
import json


def check_book(path: str) -> tuple[bool, str]:
    """
    Check whether a book file contains any NPC entries.

    The file is read as JSON Lines (one JSON document per line), flattened with
    ``pd.json_normalize`` and filtered to rows whose ``type`` equals ``"npc"``.

    :param path: Path to the book file.
    :return: ``(True, "<count>")`` with the number of NPC rows (as a string) when at least one exists,
             ``(False, "no type")`` when the data has no ``type`` column (this includes an empty file),
             ``(False, "no nps")`` when a ``type`` column exists but no row is an NPC.
    :raises ValueError: when a non-blank line is not valid JSON (``json.JSONDecodeError``)
                        or the file is not valid UTF-8 (``UnicodeDecodeError``).
    """
    # Explicit encoding: without it open() uses the platform default, which is not
    # UTF-8 everywhere. NOTE(review): assumes the .db files are UTF-8 - confirm.
    with open(path, encoding="utf-8") as file:
        # one JSON document per line; blank lines (e.g. a trailing newline) are skipped
        data = [json.loads(line) for line in file if line.strip()]

    bestiary = pd.json_normalize(data)

    if "type" not in bestiary.columns:
        return False, "no type"

    # only npc monsters
    npc_count = int((bestiary["type"] == "npc").sum())

    if npc_count > 0:
        return True, f"{npc_count}"

    return False, "no nps"

In [6]:
class bcolors:
    """ANSI escape sequences used to colour the status messages printed by this notebook."""

    OKGREEN = "\033[92m"  # switches the following text to green (printed before "accepted")
    FAIL = "\033[91m"  # switches the following text to red (printed before "FAIL" / "rejected")
    ENDC = "\033[0m"  # resets the formatting after a coloured word

In [7]:
# Books that contain at least one NPC, books that could not be read at all,
# and the running total of NPCs over the accepted books.
FILTERED_DATA = []
FAILED = []
monster_sum = 0

for book in BOOKS:
    print(book, end=" -> ")
    try:
        result, reason = check_book(path=f"{DATASETS_DIR}/{book}")
    except Exception as error:
        # `Exception` instead of a bare `except` so that KeyboardInterrupt still stops
        # the loop; the error itself is printed so the cause of the failure is visible.
        print(
            f"{bcolors.FAIL}FAIL{bcolors.ENDC}: exception "
            f"({type(error).__name__}: {error})"
        )
        FAILED.append(book)
        continue
    if result:
        # for accepted books `reason` holds the NPC count as a string
        print(f"{bcolors.OKGREEN}accepted{bcolors.ENDC}: {reason} npcs")
        FILTERED_DATA.append(book)
        monster_sum += int(reason)
    else:
        print(f"{bcolors.FAIL}rejected{bcolors.ENDC}: {reason}")

abomination-vaults-bestiary.db -> [92maccepted[0m: 97 npcs
action-macros.db -> [91mrejected[0m: no nps
actions.db -> [91mrejected[0m: no nps
adventure-specific-actions.db -> [91mrejected[0m: no nps
age-of-ashes-bestiary.db -> [92maccepted[0m: 102 npcs
agents-of-edgewatch-bestiary.db -> [92maccepted[0m: 150 npcs
ancestries.db -> [91mrejected[0m: no nps
ancestryfeatures.db -> [91mrejected[0m: no nps
april-fools-bestiary.db -> [92maccepted[0m: 9 npcs
backgrounds.db -> [91mrejected[0m: no nps
bestiary-ability-glossary-srd.db -> [91mrejected[0m: no nps
bestiary-effects.db -> [91mrejected[0m: no nps
bestiary-family-ability-glossary.db -> [91mrejected[0m: no nps
blog-bestiary.db -> [92maccepted[0m: 19 npcs
blood-lords-bestiary.db -> [92maccepted[0m: 139 npcs
book-of-the-dead-bestiary.db -> [92maccepted[0m: 91 npcs
boons-and-curses.db -> [91mrejected[0m: no nps
campaign-effects.db -> [91mrejected[0m: no nps
classes.db -> [91mrejected[0m: no nps
classfeatur

### Failed to load files

In [8]:
len(FAILED), FAILED

(7,
 ['deities.db',
  'equipment.db',
  'example.db',
  'feats.db',
  'journals.db',
  'spell-effects.db',
  'spells.db'])

In [9]:
# Second attempt for the books that check_book() could not read: load them with
# pandas' JSON Lines reader and look for NPC rows again.
for book in FAILED:
    print(book, end=" -> ")
    try:
        df = pd.read_json(f"{DATASETS_DIR}/{book}", lines=True)
    except Exception as error:
        # report the cause instead of hiding it behind a bare `except`
        print(
            f"{bcolors.FAIL}FAIL{bcolors.ENDC}: exception "
            f"({type(error).__name__}: {error})"
        )
        continue

    if "type" not in df.columns:
        print(f"{bcolors.FAIL}rejected{bcolors.ENDC}: no type column")
        continue

    df = df[df["type"] == "npc"]

    if len(df) > 0:
        # count the rows of `df`; `continue` so an accepted book is not
        # also reported as rejected by the line below
        print(f"{bcolors.OKGREEN}accepted{bcolors.ENDC}: {len(df)} monsters")
        continue

    print(f"{bcolors.FAIL}rejected{bcolors.ENDC}: no npcs")

deities.db -> [91mrejected[0m: no npcs
equipment.db -> [91mrejected[0m: no npcs
example.db -> [91mFAIL[0m: exception
feats.db -> [91mrejected[0m: no npcs
journals.db -> [91mrejected[0m: no type column
spell-effects.db -> [91mrejected[0m: no npcs
spells.db -> [91mrejected[0m: no npcs


`example.db` was checked manually: it contains nothing relevant.

### Books with NPCs

In [10]:
len(FILTERED_DATA), FILTERED_DATA

(37,
 ['abomination-vaults-bestiary.db',
  'age-of-ashes-bestiary.db',
  'agents-of-edgewatch-bestiary.db',
  'april-fools-bestiary.db',
  'blog-bestiary.db',
  'blood-lords-bestiary.db',
  'book-of-the-dead-bestiary.db',
  'crown-of-the-kobold-king-bestiary.db',
  'extinction-curse-bestiary.db',
  'fall-of-plaguestone.db',
  'fists-of-the-ruby-phoenix-bestiary.db',
  'gatewalkers-bestiary.db',
  'impossible-lands-bestiary.db',
  'kingmaker-bestiary.db',
  'malevolence-bestiary.db',
  'menace-under-otari-bestiary.db',
  'monsters-of-myth-bestiary.db',
  'mwangi-expanse-bestiary.db',
  'night-of-the-gray-death-bestiary.db',
  'npc-gallery.db',
  'one-shot-bestiary.db',
  'outlaws-of-alkenstar-bestiary.db',
  'pathfinder-bestiary-2.db',
  'pathfinder-bestiary-3.db',
  'pathfinder-bestiary.db',
  'pathfinder-dark-archive.db',
  'pfs-introductions-bestiary.db',
  'pfs-season-1-bestiary.db',
  'pfs-season-2-bestiary.db',
  'pfs-season-3-bestiary.db',
  'pfs-season-4-bestiary.db',
  'quest-f

In [11]:
f"All nps in accepted books: {monster_sum}"

'All nps in accepted books: 3690'

In [12]:
# Names of the accepted books that load_and_preprocess_data() cannot handle.
FAILED_TO_PREPROCESS = []


for book in FILTERED_DATA:
    print(book, end=" -> ")
    try:
        # each book is preprocessed on its own so a failure can be attributed to it
        df = load_and_preprocess_data(
            paths_to_books=[f"{DATASETS_DIR}/{book}"], characteristics=features
        )
    except Exception as error:
        print(f"{bcolors.FAIL}FAIL{bcolors.ENDC}: {error}")
        FAILED_TO_PREPROCESS.append(book)
    else:
        print(f"{bcolors.OKGREEN}accepted{bcolors.ENDC}")

abomination-vaults-bestiary.db -> [92maccepted[0m
age-of-ashes-bestiary.db -> [92maccepted[0m
agents-of-edgewatch-bestiary.db -> [92maccepted[0m
april-fools-bestiary.db -> [92maccepted[0m
blog-bestiary.db -> [92maccepted[0m
blood-lords-bestiary.db -> [92maccepted[0m
book-of-the-dead-bestiary.db -> [92maccepted[0m
crown-of-the-kobold-king-bestiary.db -> [92maccepted[0m
extinction-curse-bestiary.db -> [92maccepted[0m
fall-of-plaguestone.db -> [92maccepted[0m
fists-of-the-ruby-phoenix-bestiary.db -> [92maccepted[0m
gatewalkers-bestiary.db -> [92maccepted[0m
impossible-lands-bestiary.db -> [92maccepted[0m
kingmaker-bestiary.db -> [92maccepted[0m
malevolence-bestiary.db -> [92maccepted[0m
menace-under-otari-bestiary.db -> [92maccepted[0m
monsters-of-myth-bestiary.db -> [92maccepted[0m
mwangi-expanse-bestiary.db -> [92maccepted[0m
night-of-the-gray-death-bestiary.db -> [92maccepted[0m
npc-gallery.db -> [92maccepted[0m
one-shot-bestiary.db -> [92maccep

### Failed to preprocess:

In [13]:
len(FAILED_TO_PREPROCESS), FAILED_TO_PREPROCESS

(0, [])

Some of the books listed below may now be accepted because of the changes applied.

Original FAILED_TO_PREPROCESS:
* agents-of-edgewatch-bestiary.db
* april-fools-bestiary.db
* extinction-curse-bestiary.db
* fall-of-plaguestone.db
* impossible-lands-bestiary.db
* npc-gallery.db
* pathfinder-dark-archive.db
* pfs-introductions-bestiary.db
* travel-guide-bestiary.db

#### agents-of-edgewatch-bestiary.db

In [14]:
df = load_data(paths_to_books=[f"{DATASETS_DIR}/agents-of-edgewatch-bestiary.db"])

In [15]:
from training.creating_dataset import preprocess_data


# df = preprocess_data(df, characteristics=features)

# ValueError: too many values to unpack (expected 2)
# count_damage_expected_value
# invalid literal for int() with base 10: 'varies by'
# count_damage_expected_value

In [16]:
df.head()

Unnamed: 0,_id,img,items,name,type,system.abilities.cha.mod,system.abilities.con.mod,system.abilities.dex.mod,system.abilities.int.mod,system.abilities.str.mod,...,system.details.reset,system.details.routine,system.source.value,system.statusEffects,prototypeToken.name,system.attributes.adjustment,system.attributes.emitsSound,system.source.author,system.attributes.speed.details,system.attributes.hardness.value
0,07AGJt4ZRjwH85Xp,systems/pf2e/icons/default-icons/npc.svg,"[{'_id': 'ojlesPsjiDHjASWl', 'img': 'systems/p...",Mother Venom,npc,8.0,9.0,6.0,3.0,7.0,...,,,,,,,,,,
1,0UbehYHzOGlNK8Hc,systems/pf2e/icons/default-icons/npc.svg,"[{'_id': 'Ntt0pX48FlDnOFb3', 'img': 'systems/p...",Baatamidar,npc,6.0,5.0,9.0,6.0,6.0,...,,,,,,,,,,
2,0ti3f4fdcB5D2bLB,systems/pf2e/icons/default-icons/npc.svg,"[{'_id': 'VzwOdAoGJR09TZhY', 'flags': {'core':...",Casino Bouncer,npc,1.0,5.0,2.0,0.0,4.0,...,,,,,,,,,,
3,10fEM7T48FUZRo6l,systems/pf2e/icons/default-icons/npc.svg,"[{'_id': 'rDNKRU6LA2e9G96b', 'img': 'systems/p...",Barnacle Ghoul,npc,4.0,3.0,6.0,1.0,6.0,...,,,,,,,,,,
4,181ucNY1zpp2Lz3x,systems/pf2e/icons/default-icons/npc.svg,"[{'_id': 'OoUH7OmiEY82enBA', 'flags': {'core':...",Grunka,npc,1.0,1.0,3.0,0.0,0.0,...,,,,,,,,,,


In [17]:
def count_damage_expected_value(damage_dict: dict[str, dict]) -> float:
    """
    Calculate the total expected value of damage based on a dictionary of damage specifications.

    Each formula is a sum of signed terms. A term is either a constant (``"5"``) or a dice roll
    ``NdM`` (N dice with M sides, N defaults to 1) whose expected value is ``N * (M + 1) / 2``.
    Supported examples: ``"1d8"``, ``"2d6+10"``, ``"1d8-1"``, ``"3d10+12+2"``, ``"2d6+1d4"``,
    ``"2d6+3-1"``. Whitespace inside a formula is ignored.

    :param damage_dict: A dictionary where keys represent different sources of damage,
                        and values are dictionaries with the "damage" key containing the formula.
    :return: The summed expected value of all formulas. Returns 0 as soon as one entry has an
             empty/missing formula or the placeholder "varies by". A formula that cannot be
             parsed is printed and contributes nothing to the total.
    """
    import re  # local import: `re` is not among the notebook's top-level imports

    # one signed term: a dice roll "NdM" (N optional) or a plain integer
    term_pattern = re.compile(r"([+-]?)(?:(\d*)d(\d+)|(\d+))")
    total_expected_val = 0

    # one melee item may have multiple damage types, each with its own formula
    for value in damage_dict.values():
        damage = value.get("damage")

        if not damage or damage == "varies by":
            return 0

        # drop all whitespace so "2d6 + 3" is parsed like "2d6+3"
        formula = "".join(str(damage).split())
        formula_expected_val = 0
        parsed_up_to = 0

        for term in term_pattern.finditer(formula):
            sign, roll_nr, dice_type, constant = term.groups()
            # terms must be contiguous, and every term after the first needs a sign
            if term.start() != parsed_up_to or (parsed_up_to > 0 and not sign):
                break
            if constant is not None:
                term_val = int(constant)
            else:
                term_val = int(roll_nr or 1) * (int(dice_type) + 1) / 2
            formula_expected_val += -term_val if sign == "-" else term_val
            parsed_up_to = term.end()

        if parsed_up_to != len(formula):
            # part of the formula was not understood: report it and skip the entry
            print(f"Damage: {damage}")
            continue

        total_expected_val += formula_expected_val

    return total_expected_val


def get_max_melee_bonus_damage(
    items_list: list[dict], weapon_type: str
) -> tuple[int, float]:
    """
    Helper intended for ``pd.Series.apply()``.

    Among the items of type "melee" whose weaponType value equals `weapon_type`, pick the one
    with the highest attack bonus and compute the expected value of its damage rolls.

    :param items_list: A list of item dictionaries; items of type "melee" must carry a "system"
                       dictionary with "weaponType", "bonus" and "damageRolls" entries.
    :param weapon_type: The weaponType value to filter by.
    :return: A tuple (highest bonus, expected damage of that attack).
             If no matching attack exists, returns (0, 0).
    """
    matching_attacks = []
    for item in items_list:
        if item["type"] != "melee":
            continue
        attack = item["system"]
        if attack["weaponType"]["value"] == weapon_type:
            matching_attacks.append(attack)

    if len(matching_attacks) == 0:
        return 0, 0

    # rank by (bonus, position): on equal bonuses the later attack wins
    _, best_attack = max(
        enumerate(matching_attacks),
        key=lambda pair: (pair[1]["bonus"]["value"], pair[0]),
    )

    # expected damage of the attack with the highest bonus
    expected_damage = count_damage_expected_value(best_attack["damageRolls"])
    return best_attack["bonus"]["value"], expected_damage

In [18]:
# df["melee_max_bonus"], df["melee_damage_exp_val"] = zip(
#     *df["items"].apply(lambda x: get_max_melee_bonus_damage(x, "melee"))
# )
# ValueError: too many values to unpack (expected 2)
# change applied to preprocessing

In [19]:
df["ranged_max_bonus"], df["ranged_damage_exp_val"] = zip(
    *df["items"].apply(lambda x: get_max_melee_bonus_damage(x, "ranged"))
)
# ValueError: invalid literal for int() with base 10: 'varies by'

In [20]:
# Collect every ranged attack of every monster, tagged with the monster's row label.
ranged = []

# Series.iteritems() was removed in pandas 2.0; Series.items() is the supported equivalent.
for monster_id, monster_items in df["items"].items():
    for item in monster_items:
        if item.get("type") != "melee":
            continue
        attack = item.get("system") or {}
        if (attack.get("weaponType") or {}).get("value") != "ranged":
            continue
        # build a new dict so the dictionaries stored in df["items"] stay untouched
        ranged.append({**attack, "monster_id": monster_id})

In [21]:
ranged_df = pd.DataFrame.from_dict(data=ranged)  # load data to DataFrame

In [22]:
ranged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   attack         84 non-null     object
 1   attackEffects  84 non-null     object
 2   bonus          84 non-null     object
 3   damageRolls    84 non-null     object
 4   description    84 non-null     object
 5   rules          84 non-null     object
 6   slug           1 non-null      object
 7   source         84 non-null     object
 8   traits         84 non-null     object
 9   weaponType     84 non-null     object
 10  schema         84 non-null     object
 11  monster_id     84 non-null     int64 
dtypes: int64(1), object(11)
memory usage: 8.0+ KB


In [23]:
ranged_df.head()

Unnamed: 0,attack,attackEffects,bonus,damageRolls,description,rules,slug,source,traits,weaponType,schema,monster_id
0,{'value': ''},"{'custom': '', 'value': ['spit']}",{'value': 31},{},{'value': ''},[],spit,{'value': ''},"{'rarity': 'common', 'value': ['agile', 'range...",{'value': 'ranged'},"{'version': 0.827, 'lastMigration': None}",0
1,{'value': ''},"{'custom': '', 'value': ['fill-lungs']}",{'value': 21},"{'0': {'damage': '2d6+10', 'damageType': 'acid'}}",{'value': ''},[],,{'value': ''},"{'rarity': 'common', 'value': ['range-incremen...",{'value': 'ranged'},"{'version': 0.827, 'lastMigration': None}",3
2,{'value': ''},{'value': []},{'value': 8},"{'0': {'damage': '1d8', 'damageType': 'fire'},...",{'value': ''},[],,{'value': ''},"{'rarity': 'common', 'value': ['alchemical', '...",{'value': 'ranged'},"{'version': 0.827, 'lastMigration': None}",4
3,{'value': ''},"{'custom': '', 'value': []}",{'value': 14},{'toq9z9hx6hotqsn9g0er': {'damage': 'varies by...,{'value': ''},[],,{'value': ''},"{'rarity': 'common', 'value': ['range-incremen...",{'value': 'ranged'},"{'version': 0.827, 'lastMigration': None}",5
4,{'value': ''},{'value': []},"{'total': 34, 'value': 34}","{'0': {'damage': '5d6+7', 'damageType': 'bludg...",{'value': ''},"[{'key': 'Note', 'outcome': ['criticalSuccess'...",,{'value': ''},"{'rarity': 'common', 'value': ['range-incremen...",{'value': 'ranged'},"{'version': 0.827, 'lastMigration': None}",7


In [24]:
ranged

[{'attack': {'value': ''},
  'attackEffects': {'custom': '', 'value': ['spit']},
  'bonus': {'value': 31},
  'damageRolls': {},
  'description': {'value': ''},
  'rules': [],
  'slug': 'spit',
  'source': {'value': ''},
  'traits': {'rarity': 'common', 'value': ['agile', 'range-increment-30']},
  'weaponType': {'value': 'ranged'},
  'schema': {'version': 0.827, 'lastMigration': None},
  'monster_id': 0},
 {'attack': {'value': ''},
  'attackEffects': {'custom': '', 'value': ['fill-lungs']},
  'bonus': {'value': 21},
  'damageRolls': {'0': {'damage': '2d6+10', 'damageType': 'acid'}},
  'description': {'value': ''},
  'rules': [],
  'slug': None,
  'source': {'value': ''},
  'traits': {'rarity': 'common', 'value': ['range-increment-10']},
  'weaponType': {'value': 'ranged'},
  'schema': {'version': 0.827, 'lastMigration': None},
  'monster_id': 3},
 {'attack': {'value': ''},
  'attackEffects': {'value': []},
  'bonus': {'value': 8},
  'damageRolls': {'0': {'damage': '1d8', 'damageType': '

In [25]:
# Print every ranged attack whose damage formula is the free-text placeholder "varies by".
for attack in ranged:
    rolls = attack["damageRolls"]
    if not len(rolls):
        continue
    for roll in rolls.values():
        if roll["damage"] != "varies by":
            continue
        print(f"monster id: {attack['monster_id']}, damageRoll: {rolls}")

monster id: 5, damageRoll: {'toq9z9hx6hotqsn9g0er': {'damage': 'varies by', 'damageType': 'bomb'}}
monster id: 151, damageRoll: {'pzqjnv3o9mh2sdkeivj7': {'damage': 'varies by', 'damageType': 'bomb'}}


In [26]:
df.loc[[5, 151]]["name"]

5         Amateur Chemist
151    Alchemist Aspirant
Name: name, dtype: object

#### april-fools-bestiary.db

In [27]:
df = load_data(paths_to_books=[f"{DATASETS_DIR}/april-fools-bestiary.db"])

In [28]:
from training.creating_dataset import preprocess_data


# df = preprocess_data(df, characteristics=features)
# KeyError: 'system.attributes.weaknesses'

In [29]:
system = [col for col in df.columns if "system" in col]

In [30]:
system

['system.abilities.cha.mod',
 'system.abilities.con.mod',
 'system.abilities.dex.mod',
 'system.abilities.int.mod',
 'system.abilities.str.mod',
 'system.abilities.wis.mod',
 'system.attributes.ac.details',
 'system.attributes.ac.value',
 'system.attributes.allSaves.value',
 'system.attributes.hp.details',
 'system.attributes.hp.max',
 'system.attributes.hp.temp',
 'system.attributes.hp.value',
 'system.attributes.initiative.ability',
 'system.attributes.perception.value',
 'system.attributes.speed.otherSpeeds',
 'system.attributes.speed.value',
 'system.details.alignment.value',
 'system.details.blurb',
 'system.details.creatureType',
 'system.details.level.value',
 'system.details.privateNotes',
 'system.details.publicNotes',
 'system.details.source.value',
 'system.saves.fortitude.saveDetail',
 'system.saves.fortitude.value',
 'system.saves.reflex.saveDetail',
 'system.saves.reflex.value',
 'system.saves.will.saveDetail',
 'system.saves.will.value',
 'system.traits.attitude.value',


In [31]:
test_df = load_data(
    paths_to_books=[
        f"{DATASETS_DIR}/april-fools-bestiary.db",
        f"{DATASETS_DIR}/pathfinder-bestiary.db",
    ]
)

In [32]:
test_df = preprocess_data(test_df, characteristics=features)
# no problem in case there are more books

In [33]:
test_df.head()

Unnamed: 0,dex,land_speed,fortitude,will,book,cha,perception,num_immunities,con,ac,...,spells_nr_lvl_4,spells_nr_lvl_5,spells_nr_lvl_6,spells_nr_lvl_7,spells_nr_lvl_8,spells_nr_lvl_9,melee_max_bonus,avg_melee_dmg,ranged_max_bonus,avg_ranged_dmg
0,2,40,8,9,Pathfinder Blog: April Fool's Bestiary,1,9,0,1,20,...,0,0,0,0,0,0,11,8.5,9,7.5
1,2,15,8,7,Pathfinder Blog: April Fool's Bestiary,3,10,1,2,18,...,0,0,0,0,0,0,11,7.5,0,0.0
2,7,0,15,17,Pathfinder Blog: April Fool's Bestiary,0,19,6,3,32,...,0,0,0,0,0,0,22,23.0,22,15.0
3,3,30,4,7,Pathfinder Blog: April Fool's Bestiary,4,11,0,1,18,...,0,0,0,0,0,0,10,10.5,10,8.5
4,1,30,12,7,Pathfinder Blog: April Fool's Bestiary,1,15,3,0,19,...,0,0,0,0,0,0,10,9.0,10,8.5


In [34]:
from training.creating_dataset import get_characteristic_from_list


def extract_and_assign_chars(
    char_group: set,
    path_to_char: str,
    bestiary: pd.DataFrame,
    df: pd.DataFrame,
    replace_val: str,
):
    """
    Fill one column of `df` per characteristic in `char_group`, using the `path_to_char`
    column of `bestiary` as the source.

    Change compared to the previous version: when `bestiary` has no `path_to_char` column,
    every characteristic of the group is set to 0 instead of raising a KeyError.

    :param char_group: Names of the characteristics; each becomes a column of `df`.
    :param path_to_char: Name of the `bestiary` column holding the raw characteristic values.
    :param bestiary: The DataFrame the values are read from.
    :param df: The DataFrame that receives the new columns (modified in place).
    :param replace_val: Substring removed from a characteristic name to obtain the name
                        that is looked up in the raw values.
    """
    # checked once, before any column is written
    source_available = path_to_char in bestiary.columns

    for column_name in char_group:
        if not source_available:
            df[column_name] = pd.Series(0, index=bestiary.index)
            continue

        lookup_name = column_name.replace(replace_val, "")
        df[column_name] = bestiary[path_to_char].apply(
            lambda raw_values, name=lookup_name: get_characteristic_from_list(
                raw_values, name
            )
        )

In [35]:
weaknesses = [
    "cold-iron_weakness",
    "good_weakness",
    "fire_weakness",
    "cold_weakness",
    "area-damage_weakness",
    "splash-damage_weakness",
    "evil_weakness",
    "slashing_weakness",
]
WEAKNESSES_PATH = "system.attributes.weaknesses"
new_df = pd.DataFrame()

In [36]:
WEAKNESSES_PATH in df.columns

False

In [37]:
extract_and_assign_chars(
    weaknesses, WEAKNESSES_PATH, bestiary=df, df=new_df, replace_val="_weakness"
)
# changes applied

In [38]:
new_df

Unnamed: 0,cold-iron_weakness,good_weakness,fire_weakness,cold_weakness,area-damage_weakness,splash-damage_weakness,evil_weakness,slashing_weakness
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0


#### pfs-introductions-bestiary.db
Problem solved above (`april-fools-bestiary.db`)

#### extinction-curse-bestiary.db

In [39]:
df = load_data(paths_to_books=[f"{DATASETS_DIR}/extinction-curse-bestiary.db"])

In [40]:
from training.creating_dataset import preprocess_data


# df = preprocess_data(df, characteristics=features)
# ValueError: invalid literal for int() with base 10: ''
# count_damage_expected_value

In [41]:
df["melee_max_bonus"], df["melee_damage_exp_val"] = zip(
    *df["items"].apply(lambda x: get_max_melee_bonus_damage(x, "melee"))
)

#### npc-gallery.db
Problem solved above (`extinction-curse-bestiary.db`)

#### fall-of-plaguestone.db

In [42]:
df = load_data(paths_to_books=[f"{DATASETS_DIR}/fall-of-plaguestone.db"])

In [43]:
from training.creating_dataset import preprocess_data

# df = preprocess_data(df, characteristics=features)
# KeyError: "['system.resources.focus.value'] not in index"

In [44]:
[col for col in df.columns if "system" in col]

['system.abilities.cha.mod',
 'system.abilities.con.mod',
 'system.abilities.dex.mod',
 'system.abilities.int.mod',
 'system.abilities.str.mod',
 'system.abilities.wis.mod',
 'system.attributes.ac.details',
 'system.attributes.ac.value',
 'system.attributes.allSaves.value',
 'system.attributes.hp.details',
 'system.attributes.hp.max',
 'system.attributes.hp.temp',
 'system.attributes.hp.value',
 'system.attributes.initiative.ability',
 'system.attributes.perception.value',
 'system.attributes.speed.otherSpeeds',
 'system.attributes.speed.value',
 'system.details.alignment.value',
 'system.details.blurb',
 'system.details.creatureType',
 'system.details.level.value',
 'system.details.privateNotes',
 'system.details.publicNotes',
 'system.details.rarity.value',
 'system.details.source.value',
 'system.saves.fortitude.saveDetail',
 'system.saves.fortitude.value',
 'system.saves.reflex.saveDetail',
 'system.saves.reflex.value',
 'system.saves.will.saveDetail',
 'system.saves.will.value',
 

In [45]:
from training.creating_dataset import (
    split_characteristics_into_groups,
    CHARACTERISTICS_RENAME,
    extract_and_assign_chars,
    get_nr_of_spells_with_lvl,
    get_max_melee_bonus_damage,
    RESISTANCE_PATH,
    WEAKNESSES_PATH,
    OTHER_SPEED_PATH,
)


def new_preprocess_data(
    bestiary: pd.DataFrame, characteristics: list[str]
) -> pd.DataFrame:
    """
    Creates dataframe containing chosen characteristics, level and source book of monsters from given bestiary.

    Changes compared to the original ``preprocess_data``: a missing immunities column and a
    missing focus column are replaced by a column of zeros instead of raising a KeyError.

    The input frame is not modified: all intermediate columns are written to a copy, so the
    function can be called repeatedly on the same `bestiary`.

    :param bestiary: A pandas DataFrame containing information about monsters.
    :param characteristics: A list of characteristics to load.
    :return: DataFrame with monsters from chosen books and with chosen characteristics and their origin book.
    """

    def count_immunities(raw_value) -> int:
        # missing entries are None/NaN; anything else is expected to be a sized collection
        if raw_value is None or (isinstance(raw_value, float) and np.isnan(raw_value)):
            return 0
        return len(raw_value)

    # Work on a copy: the steps below overwrite/add columns, which would otherwise leak
    # into the caller's frame and break a second call (len() of an already counted int).
    bestiary = bestiary.copy()

    # grouping is done by the project helper (its implementation is not shown in this notebook)
    characteristics_groups = split_characteristics_into_groups(set(characteristics))

    if "num_immunities" in characteristics_groups.characteristics_rename:
        immunities_path = CHARACTERISTICS_RENAME.get("num_immunities")
        # changed: books without any immunities column get 0 for every monster
        if immunities_path not in bestiary.columns:
            bestiary[immunities_path] = pd.Series(0, index=bestiary.index)
        else:
            bestiary[immunities_path] = bestiary[immunities_path].apply(
                count_immunities
            )

    # changed: books without a focus column get 0 for every monster
    if "focus" in characteristics_groups.characteristics_rename:
        focus_path = CHARACTERISTICS_RENAME.get("focus")
        if focus_path not in bestiary.columns:
            bestiary[focus_path] = pd.Series(0, index=bestiary.index)

    # mapping: final column name -> raw column name in the bestiary
    cols_to_extract = pd.DataFrame(
        data=[
            (characteristic, CHARACTERISTICS_RENAME.get(characteristic))
            for characteristic in characteristics_groups.characteristics_rename.union(
                {"book", "level"}
            )
        ],
        columns=["target_name", "raw_name"],
    )

    raw_names = cols_to_extract["raw_name"]
    target_names = cols_to_extract["target_name"]

    # to not have Series names as a part of final df
    target_names.name = None

    # an explicit copy (not a slice of `bestiary`), so the column assignments below need
    # no global pandas option to silence SettingWithCopyWarning
    df = bestiary[raw_names].copy()
    df.columns = target_names

    extract_and_assign_chars(
        characteristics_groups.resistances, RESISTANCE_PATH, bestiary, df, "_resistance"
    )

    extract_and_assign_chars(
        characteristics_groups.weaknesses, WEAKNESSES_PATH, bestiary, df, "_weakness"
    )

    extract_and_assign_chars(
        characteristics_groups.speeds, OTHER_SPEED_PATH, bestiary, df, ""
    )

    if "spells" in characteristics_groups.special_characteristics:
        MAX_SPELL_LVL = 9
        for i in range(1, MAX_SPELL_LVL + 1):
            df[f"spells_nr_lvl_{i}"] = bestiary["items"].apply(
                lambda x, spell_lvl=i: get_nr_of_spells_with_lvl(x, spell_lvl)
            )

    if "melee" in characteristics_groups.special_characteristics:
        df["melee_max_bonus"], df["avg_melee_dmg"] = zip(
            *bestiary["items"].apply(lambda x: get_max_melee_bonus_damage(x, "melee"))
        )

    if "ranged" in characteristics_groups.special_characteristics:
        df["ranged_max_bonus"], df["avg_ranged_dmg"] = zip(
            *bestiary["items"].apply(lambda x: get_max_melee_bonus_damage(x, "ranged"))
        )

    if "focus" in df.columns:
        df["focus"] = df["focus"].fillna(0).astype(int)

    if "land_speed" in df.columns:
        df["land_speed"] = df["land_speed"].fillna(0)

    # every level above 20 is collapsed into the single value 21
    df.loc[df["level"] > 20, "level"] = 21

    return df

In [46]:
test_df = new_preprocess_data(df, characteristics=features)

# changes applied

In [47]:
df = load_and_preprocess_data(
    paths_to_books=[f"{DATASETS_DIR}/fall-of-plaguestone.db"], characteristics=features
)

#### impossible-lands-bestiary.db
Problem solved above (`fall-of-plaguestone.db`)

#### pathfinder-dark-archive.db
Problem solved above (`fall-of-plaguestone.db`)

#### travel-guide-bestiary.db

In [48]:
df = load_data(paths_to_books=[f"{DATASETS_DIR}/travel-guide-bestiary.db"])

In [49]:
# df = preprocess_data(df, characteristics=features)

# KeyError: 'system.attributes.immunities'

In [50]:
df = new_preprocess_data(df, characteristics=features)

## Summary
After applying all of the changes above, there are 37 books with 3690 monsters in total.

In [52]:
len(FILTERED_DATA)

37

In [53]:
monster_sum

3690

In [55]:
df = load_and_preprocess_data(
    [f"{DATASETS_DIR}/{file}" for file in FILTERED_DATA], characteristics=features
)

In [56]:
df.head()

Unnamed: 0,dex,land_speed,fortitude,will,book,cha,perception,num_immunities,con,ac,...,spells_nr_lvl_4,spells_nr_lvl_5,spells_nr_lvl_6,spells_nr_lvl_7,spells_nr_lvl_8,spells_nr_lvl_9,melee_max_bonus,avg_melee_dmg,ranged_max_bonus,avg_ranged_dmg
0,3.0,20.0,15.0,20.0,Pathfinder #165: Eyes of Empty Death,0.0,18.0,5,0.0,28.0,...,2,0,0,0,0,0,21,19.0,0,0.0
1,3.0,40.0,22.0,19.0,Pathfinder Abomination Vaults Hardcover Compil...,0.0,19.0,0,6.0,30.0,...,0,0,0,0,0,0,23,22.0,0,0.0
2,3.0,30.0,7.0,9.0,Pathfinder #164: Hands of the Devil,1.0,7.0,0,1.0,17.0,...,0,0,0,0,0,0,9,6.5,8,6.5
3,4.0,30.0,17.0,15.0,Pathfinder #164: Hands of the Devil,1.0,16.0,0,3.0,27.0,...,0,0,0,0,0,0,20,18.0,20,9.5
4,6.0,0.0,14.0,18.0,Pathfinder #165: Eyes of Empty Death,2.0,20.0,1,0.0,31.0,...,0,0,0,0,0,0,17,21.5,0,0.0


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3690 entries, 0 to 4327
Data columns (total 52 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   dex                     3690 non-null   float64
 1   land_speed              3690 non-null   float64
 2   fortitude               3690 non-null   float64
 3   will                    3690 non-null   float64
 4   book                    3690 non-null   object 
 5   cha                     3690 non-null   float64
 6   perception              3690 non-null   float64
 7   num_immunities          3690 non-null   int64  
 8   con                     3690 non-null   float64
 9   ac                      3690 non-null   float64
 10  wis                     3690 non-null   float64
 11  reflex                  3690 non-null   float64
 12  int                     3690 non-null   float64
 13  level                   3690 non-null   int64  
 14  str                     3690 non-null   