In [5]:
from typing import Dict, List, Union

import pandas as pd
import requests
from bs4 import BeautifulSoup
from colorama import Fore, Style
from tqdm import tqdm

In [6]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/114.0.0.0 Safari/537.36"
}

EXPECTED_MONSTER_FIELDS = [
    "symbol",
    "Difficulty",
    "Attacks",
    "Baselevel",
    "Baseexperience",
    "Speed",
    "BaseAC",
    "Base MR",
    "Alignment",
    "Frequency(bynormal means)",
    "Genocidable",
    "Weight",
    "Nutritional value",
    "Size",
    "Resistances",
    "Resistancesconveyed",
    "facts",
]

In [None]:
class NetHackScraper:
    def __init__(self, is_local: bool):
        self.root_url = (
            "http://localhost:8080" if is_local else "https://nethackwiki.com"
        )
        self.base_index_url = (
            "http://localhost:8080/index.php" if is_local else "https://nethackwiki.com/wiki"
        )
        self.monsters_url = f"{self.base_index_url}/Monster"
        self.items_url = f"{self.base_index_url}/Item"

        self.headers = {} if is_local else HEADERS

        self._initialise_requests()

    def _initialise_requests(self) -> None:
        monsters_response = requests.get(self.monsters_url, headers=self.headers)
        self.monsters_url_soup = BeautifulSoup(monsters_response.content, "html.parser")

        items_response = requests.get(self.items_url, headers=self.headers)
        self.items_url_soup = BeautifulSoup(items_response.content, "html.parser")


    # --- Monsters ---
    def _get_monsters_url_dict(self) -> Dict[str, str]:
        """Collects a dictionary of monster urls."""
        monsters_urls = {}

        top_level_lis = self.monsters_url_soup.select("ul > li")
        for li in tqdm(top_level_lis):
            nested_ul = li.find("ul")
            if not nested_ul:
                continue  # skip items without a nested list

            for sub_li in nested_ul.find_all("li", recursive=False):
                # get the second <a> tag (the first is usually the image link)
                links = sub_li.find_all("a")
                if len(links) >= 2:
                    monster_tag = links[1]
                    name = monster_tag.get_text(strip=True)
                    monster_url_relative = monster_tag.get("href").split("wiki")[-1]
                    url = self.root_url + monster_url_relative
                    monsters_urls[name] = url

        return monsters_urls

    def parse_monster_info(self, monster_url:str) -> Dict[str, str | List[str]]:
        """Collects a dictionary of monster properties (see `EXPECTED_MONSTER_FIELDS`)."""
        response = requests.get(monster_url, headers=self.headers)
        monster_soup = BeautifulSoup(response.content)
        tbody = monster_soup.select("tbody")
        soup = BeautifulSoup(str(tbody), "html.parser")
        rows = soup.find_all("tr")

        monster_data = {}
        facts = []

        for row in rows:
            th = row.find("th")
            tds = row.find_all("td")

            # symbol
            if th and "colspan" in th.attrs:
                name_tag = th.find("span", class_="nhsym")
                if name_tag:
                    monster_data["symbol"] = name_tag.text.strip()

            # Regular stat fields
            elif th and len(tds) == 1:
                key = th.get_text(strip=True).replace(" (by normal means)", "")
                value = tds[0].get_text(strip=True)
                monster_data[key] = value

            # Bullet-point facts
            elif tds and tds[0].find("ul"):
                for li in tds[0].find_all("li"):
                    fact = li.get_text(strip=True)
                    if fact:
                        facts.append(fact)

            # External reference
            elif len(tds) == 2 and "Reference" in tds[0].text:
                monster_data["Reference"] = tds[1].find("a")["href"]

        if facts:
            monster_data["facts"] = facts

        return {k: v for k, v in monster_data.items() if k in EXPECTED_MONSTER_FIELDS}
    
    # --- Items --- 
    def _get_item_classes_url_dict(self) -> Dict[str, str]:
        items_urls = {}
        for sub_li in self.items_url_soup.select("li"):
            try:
                link = sub_li.find("a")
                text = sub_li.get_text(strip=True)
                symbol = text.split("–")[0]
                name = text.split("–")[1]
                url = link.get("href")
                items_urls[name] = {"symbol": symbol, "url": url}
            except IndexError:
                continue
        return items_urls

# ***Monsters*** 

In [57]:
scraper = NetHackScraper(is_local=True)
monster_url_dict = scraper._get_monsters_url_dict()

100%|██████████| 500/500 [00:00<00:00, 90990.63it/s]


In [None]:
monsters = {}
for k in tqdm(monster_url_dict.keys()):
    monsters[k] = scraper.parse_monster_info(monster_url_dict[k])

if monsters != {}:
    pd.DataFrame(monsters).to_json("dataset/nethack_monsters")
    print(f"{Fore.GREEN}{Style.BRIGHT} Collectedz {len(monsters.keys())} Monster stats")

# ***Items*** 

In [59]:
item_classes_urls = scraper._get_item_classes_url_dict()
item_classes_urls

{'Coins': {'symbol': '$', 'url': '/index.php/Zorkmid'},
 'Amulets': {'symbol': '"', 'url': '/index.php/Amulet'},
 'Weapons': {'symbol': ')', 'url': '/index.php/Weapon'},
 'Armor': {'symbol': '[', 'url': '/index.php/Armor'},
 'Comestibles': {'symbol': '%', 'url': '/index.php/Comestible'},
 'Scrolls': {'symbol': '?', 'url': '/index.php/Scroll'},
 'Spellbooks': {'symbol': '+', 'url': '/index.php/Spellbook'},
 'Potions': {'symbol': '!', 'url': '/index.php/Potion'},
 'Rings': {'symbol': '=', 'url': '/index.php/Ring'},
 'Wands': {'symbol': '/', 'url': '/index.php/Wand'},
 'Tools': {'symbol': '(', 'url': '/index.php/Tool'},
 'Gems/Stones': {'symbol': '*', 'url': '/index.php/Gem'},
 'Boulders/Statues': {'symbol': '`', 'url': '/index.php/Boulder'},
 'Iron balls': {'symbol': '0', 'url': '/index.php/Heavy_iron_ball'},
 'Chains': {'symbol': '_', 'url': '/index.php/Iron_chain'},
 'Venoms': {'symbol': '.', 'url': '/index.php/Venom'}}

In [None]:
def parse_item_info(self, item_url:str) -> Dict[str, Union[str | List[str]]]:
        """Collects a dictionary of monster properties (see `EXPECTED_MONSTER_FIELDS`)."""
        response = requests.get(item_url, headers=self.headers)
        monster_soup = BeautifulSoup(response.content)
        tbody = monster_soup.select("tbody")
        soup = BeautifulSoup(str(tbody), "html.parser")
        rows = soup.find_all("tr")

        monster_data = {}
        facts = []

        for row in rows:
            th = row.find("th")
            tds = row.find_all("td")

            # symbol
            if th and "colspan" in th.attrs:
                name_tag = th.find("span", class_="nhsym")
                if name_tag:
                    monster_data["symbol"] = name_tag.text.strip()

            # Regular stat fields
            elif th and len(tds) == 1:
                key = th.get_text(strip=True).replace(" (by normal means)", "")
                value = tds[0].get_text(strip=True)
                monster_data[key] = value

            # Bullet-point facts
            elif tds and tds[0].find("ul"):
                for li in tds[0].find_all("li"):
                    fact = li.get_text(strip=True)
                    if fact:
                        facts.append(fact)

            # External reference
            elif len(tds) == 2 and "Reference" in tds[0].text:
                monster_data["Reference"] = tds[1].find("a")["href"]

        if facts:
            monster_data["facts"] = facts

        return {k: v for k, v in monster_data.items() if k in EXPECTED_MONSTER_FIELDS}

In [None]:
import json
import os
import re
from typing import Callable, Dict, List

import requests
from tqdm import tqdm

# It's better to use the official API endpoint.
API_URL = "http://localhost:8080/api.php"  # For your local container


class NetHackScraper:
    """
    A scraper for the NetHack Wiki that uses the official MediaWiki API
    for robust and efficient data extraction.
    """

    def __init__(self, is_local: bool):
        self.api_url = API_URL if is_local else "https://nethackwiki.com/api.php"

        # A dispatcher mapping item classes to their specific parsing functions.
        # This is a clean and extensible way to handle different page structures.
        self.category_parsers: Dict[str, Callable] = {
            "Monsters": self._parse_monster_info,
            "Weapons": self._parse_weapon_info,
            "Armor": self._parse_armor_info,
            # Add other parsers here, e.g., "Potions": self._parse_consumable_info
        }

    def _make_api_request(self, params: Dict) -> Dict:
        """Helper function to make a request to the MediaWiki API."""
        params["format"] = "json"
        try:
            response = requests.get(self.api_url, params=params)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"API request failed: {e}")
            return {}

    def get_pages_in_category(self, category_title: str) -> List[str]:
        """Gets a list of all page titles in a given category."""
        print(f"Fetching pages from 'Category:{category_title}'...")
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category_title}",
            "cmlimit": "500",  # Request max limit
            "cmtype": "page",
        }
        data = self._make_api_request(params)
        pages = [
            page["title"]
            for page in data.get("query", {}).get("categorymembers", [])
        ]
        print(f"Found {len(pages)} pages.")
        return pages

    def get_pages_content(self, page_titles: List[str]) -> Dict[str, str]:
        """
        Fetches the raw wikitext content for a list of pages in a single batch.
        This is much more efficient than one request per page.
        """
        content_dict = {}
        # Batch requests to avoid URLs that are too long
        for i in tqdm(range(0, len(page_titles), 50), desc="Batch fetching content"):
            batch_titles = page_titles[i : i + 50]
            params = {
                "action": "query",
                "prop": "revisions",
                "rvprop": "content",
                "titles": "|".join(batch_titles),
                "redirects": 1,  # Follow redirects
            }
            data = self._make_api_request(params)
            pages = data.get("query", {}).get("pages", {})
            for page_id, page_data in pages.items():
                title = page_data.get("title")
                content = page_data.get("revisions", [{}])[0].get("*")
                if title and content:
                    content_dict[title] = content
        return content_dict

    # --- Custom Parsers for Different Entity Types ---

    def _parse_monster_info(self, title: str, wikitext: str) -> Dict:
        """
        Parses the wikitext of a monster page to extract structured data
        from the 'Monster' infobox template.
        """
        # Regex to find key-value pairs inside the {{Monster ... }} template
        # This is more robust than parsing HTML.
        infobox_match = re.search(r"\{\{monster\s*\|([\s\S]*?)\}\}", wikitext)
        print(infobox_match)
        if not infobox_match:
            return {}

        infobox_content = infobox_match.group(1)
        monster_data = {"name": title, "type": "monster"}
        
        # Regex to find '| key = value' lines
        pattern = re.compile(r"\|\s*([^=]+?)\s*=\s*(.*)")
        matches = pattern.findall(infobox_content)

        for key, value in matches:
            key = key.strip()
            value = value.strip()
            # Clean up wikitext links like [[stone]] -> stone
            value = re.sub(r"\[\[([^|\]]+?)(?:\|[^\]]+)?\]\]", r"\1", value)
            monster_data[key] = value

        return monster_data

    def _parse_weapon_info(self, title: str, wikitext: str) -> Dict:
        """
        Parses the wikitext of a weapon page to extract structured data.
        (This is a placeholder - you would implement regex or string parsing here).
        """
        return {"name": title, "type": "weapon", "damage": "1d6", "cost": 10}

    def _parse_armor_info(self, title: str, wikitext: str) -> Dict:
        """
        Parses the wikitext of an armor page.
        """
        return {"name": title, "type": "armor", "ac": 3, "material": "iron"}

    # --- Main Execution ---

    def generate_dataset(self, save_path: str):
        """
        Generates a dataset for all supported entity classes and saves it.
        """
        os.makedirs(save_path, exist_ok=True)
        
        for category, parser_func in self.category_parsers.items():
            all_category_data = {}
            page_titles = self.get_pages_in_category(category)
            if not page_titles:
                continue

            pages_content = self.get_pages_content(page_titles)

            for title, content in tqdm(
                pages_content.items(), desc=f"Parsing {category}"
            ):
                item_data = parser_func(title, content)
                if item_data:
                    all_category_data[title] = item_data

            print(len(all_category_data.keys()))
            if all_category_data:
                output_file = os.path.join(save_path, f"nethack_{category.lower()}.jsonl")
                with open(output_file, "w") as f:
                    for item_name, data in all_category_data.items():
                        f.write(json.dumps({item_name: data}) + "\n")
                print(
                    f"\nSuccessfully saved {len(all_category_data)} entities to {output_file}"
                )


# if __name__ == "__main__":
#     scraper = NetHackScraper(is_local=True)
#     scraper.generate_dataset(save_path="datasets")


In [79]:
titles = scraper.get_pages_in_category("Monsters")
wikitext = scraper.get_pages_content(titles)

Fetching pages from 'Category:Monsters'...
Found 332 pages.


Batch fetching content: 100%|██████████| 7/7 [00:00<00:00, 21.06it/s]


In [None]:
wk = wikitext["Giant ant"]
infobox_match = re.search(r"\{\{monster\s*\|([\s\S]*?)\}\}", wk)
infobox_content = infobox_match.group(1)

pattern = re.compile(r"\|\s*([^=]+?)\s*=\s*(.*)")
matches = pattern.findall(infobox_content)

In [98]:
wk

'{{monster\n |name=giant ant\n |difficulty=4\n |level=2\n |experience=20\n |speed=18\n |AC=3\n |MR=0\n |align=0\n |frequency=3\n |genocidable=yes\n |attacks=[[Bite]] 1d4 [[Physical damage|physical]]\n |weight=10\n |nutr=10\n |size=tiny\n |resistances=none\n |resistances conveyed=none\n |attributes={{attributes|A giant ant|sgroup=1|animal=1|nohands=1|oviparous=1|carnivore=1|hostile=1}}\n |reference=[https://github.com/NetHack/NetHack/blob/NetHack-3.6.7_Released/src/monst.c#L108 NetHack 3.6.7 - src/monst.c, line 108]\n}}\n{{alternate tilesets|giant ant}}\nA \'\'\'giant ant\'\'\', {{monsym|giant ant}}, is a type of [[monster]] that appears in \'\'[[NetHack]]\'\'. It is a [[carnivorous]] and [[oviparous]] [[animal]] that is the most basic monster of the [[ant or other insect]] [[monster class]]. Despite being the weakest among its group, the giant ant is still a frequent cause of early deaths due to its [[speed]] and tendency to appear in groups.\n\nA giant ant has a single [[bite]] attack

In [None]:
matches

[('difficulty', '4'),
 ('level', '2'),
 ('experience', '20'),
 ('speed', '18'),
 ('AC', '3'),
 ('MR', '0'),
 ('align', '0'),
 ('frequency', '3'),
 ('genocidable', 'yes'),
 ('attacks', '[[Bite]] 1d4 [[Physical damage|physical]]'),
 ('weight', '10'),
 ('nutr', '10'),
 ('size', 'tiny'),
 ('resistances', 'none'),
 ('resistances conveyed', 'none'),
 ('attributes',
  '{{attributes|A giant ant|sgroup=1|animal=1|nohands=1|oviparous=1|carnivore=1|hostile=1')]

In [92]:
for title, text in list(zip(titles, wikitext)):
    print(scraper._parse_monster_info(title, text))
    infobox_match = re.search(r"\{\{monster\s*\|([\s\S]*?)\}\}", wikitext)
    break

None
{}


TypeError: expected string or bytes-like object