In [5]:
import os
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

# --- CONFIG ---
BASE_URL = "https://poedb.tw/us/"
PAGES = ["Two_Hand_Swords#TwoHandSwordsUnique"]  # Add more categories here
CACHE_DIR = "cache"
os.makedirs(CACHE_DIR, exist_ok=True)

# --- FUNCTIONS ---
def fetch_page(name):
    """Fetch and cache HTML from PoEDB."""
    cache_path = os.path.join(CACHE_DIR, f"{name}.html")
    
    # Use cached version if available
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            html = f.read()
        print(f"Loaded from cache: {name}")
    else:
        url = BASE_URL + name
        print(f"Fetching: {url}")
        response = requests.get(url, headers={"User-Agent": "PoEFilterProject/1.0"})
        response.raise_for_status()
        html = response.text
        with open(cache_path, "w", encoding="utf-8") as f:
            f.write(html)
        time.sleep(3)  # Delay to avoid hammering the site
    return html


def parse_item_table(html):
    """Parse item table and extract data as DataFrame."""
    soup = BeautifulSoup(html, "lxml")
    table = soup.find("table", {"class": "itemTable"})
    if not table:
        return pd.DataFrame()
    
    # Extract headers
    headers = [th.text.strip() for th in table.find_all("th")]
    
    # Extract rows
    data = []
    for row in table.find_all("tr")[1:]:
        cols = [td.text.strip() for td in row.find_all("td")]
        if cols:
            data.append(cols)
    
    return pd.DataFrame(data, columns=headers)


def scrape_all():
    """Fetch and parse all configured PoEDB pages."""
    all_items = []
    for name in PAGES:
        html = fetch_page(name)
        df = parse_item_table(html)
        if not df.empty:
            df["Category"] = name
            all_items.append(df)
    
    return pd.concat(all_items, ignore_index=True)


if __name__ == "__main__":
    df = scrape_all()
    print(f"Scraped {len(df)} items from {len(PAGES)} categories.")
    df.to_csv("poe_items.csv", index=False)
    print("Saved results to poe_items.csv ✅")

Fetching: https://poedb.tw/us/Two_Hand_Swords#TwoHandSwordsUnique


ValueError: No objects to concatenate

In [9]:
!pip install nest_asyncio playwright


Collecting nest_asyncio
  Using cached nest_asyncio-1.6.0-py3-none-any.whl.metadata (2.8 kB)
Collecting playwright
  Using cached playwright-1.55.0-py3-none-win_amd64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Using cached pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet<4.0.0,>=3.1.1 (from playwright)
  Downloading greenlet-3.2.4-cp39-cp39-win_amd64.whl.metadata (4.2 kB)
Using cached nest_asyncio-1.6.0-py3-none-any.whl (5.2 kB)
Using cached playwright-1.55.0-py3-none-win_amd64.whl (35.5 MB)
Downloading greenlet-3.2.4-cp39-cp39-win_amd64.whl (298 kB)
Using cached pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, nest_asyncio, greenlet, playwright
  Attempting uninstall: greenlet
    Found existing installation: greenlet 1.1.0
    Uninstalling greenlet-1.1.0:
      Successfully uninstalled greenlet-1.1.0


ERROR: Could not install packages due to an OSError: [WinError 2] 系统找不到指定的文件。: 'c:\\python39\\Scripts\\playwright.exe' -> 'c:\\python39\\Scripts\\playwright.exe.deleteme'


[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: c:\python39\python.exe -m pip install --upgrade pip


In [1]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import pandas as pd

async def scrape_poedb(url: str):
    """Fetch a fully rendered page from PoEDB using Playwright."""
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        await page.wait_for_load_state("networkidle")  # wait for JS to load
        html = await page.content()
        await browser.close()
        return html

async def parse_items(url):
    html = await scrape_poedb(url)
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {"class": "itemTable"})
    if not table:
        print("⚠️ No table found — page might be empty or layout changed.")
        return pd.DataFrame()

    headers = [th.text.strip() for th in table.find_all("th")]
    rows = []
    for tr in table.find_all("tr")[1:]:
        cols = [td.text.strip() for td in tr.find_all("td")]
        if cols:
            rows.append(cols)

    df = pd.DataFrame(rows, columns=headers)
    print(f"✅ Scraped {len(df)} items from {url}")
    return df

In [2]:
url = "https://poedb.tw/us/Two_Hand_Sword"
df = await parse_items(url)
df.head()

NotImplementedError: 

In [10]:
print(type(df))

<class 'coroutine'>
