In [7]:
import asyncio
from playwright.async_api import async_playwright

async def crawl_pasgo_by_page(keyword, max_pages=5):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        all_results = []

        for page_num in range(1, max_pages + 1):
            url = f"https://pasgo.vn/tim-kiem?search={keyword}&page={page_num}"
            print(f"🔎 Crawling page {page_num}: {url}")
            await page.goto(url)
            try:
                await page.wait_for_selector("div.item-child-info a", timeout=8000)
            except:
                print(f"⛔ Không tìm thấy dữ liệu ở trang {page_num}")
                continue

            items = await page.query_selector_all("div.item-child-info a")
            for item in items:
                name = await item.inner_text()
                link = await item.get_attribute("href")
                full_link = f"https://pasgo.vn{link}"
                print(f"{name.strip()} - {full_link}")
                all_results.append((name.strip(), full_link))

        await browser.close()
        return all_results

# Ví dụ: Crawl 5 trang kết quả cho từ khoá "lẩu"
results = await crawl_pasgo_by_page("lẩu", max_pages=5)


🔎 Crawling page 1: https://pasgo.vn/tim-kiem?search=lẩu&page=1
Lộc-ally - Cát Linh

Tầng 2, Khách Sạn Grand Mercure Hanoi, Số 9 Cát Linh, P. Quốc Tử Giám,Q. Ba Đình

Đặt bàn giữ chỗ
Buffet Món Việt, Gọi món Á - Âu - https://pasgo.vnhttps://pasgo.vn/nha-hang/loc-ally-restaurant-cat-linh-5490
Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/5490?returnUrl=/tim-kiem?search=l%E1%BA%A9u&page=1
GoGi House - Giang Văn Minh

Số 14 Giang Văn Minh, P. Kim Mã, Q. Ba Đình

Đặt bàn giữ chỗ
Gọi món, Buffet Nướng Lẩu Hàn Quốc - https://pasgo.vnhttps://pasgo.vn/nha-hang/gogi-house-giang-van-minh-5575
Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/5575?returnUrl=/tim-kiem?search=l%E1%BA%A9u&page=1
Lẩu Nấm Ashima - Giang Văn Minh

Số 60 Giang Văn Minh, P. Đội Cấn, Q. Ba Đình

Ưu đãi hấp dẫn
Gọi món Á, Chuyên Lẩu Nấm - https://pasgo.vnhttps://pasgo.vn/nha-hang/nha-hang-lau-nam-ashima-giang-van-minh-963
Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/963?returnUrl=/tim-kiem?sea

In [10]:
import asyncio
from playwright.async_api import async_playwright

async def crawl_pasgo_by_city(keyword, city_id=2, max_pages=3):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await p.chromium.launch_persistent_context(
            user_data_dir="/tmp/playwright",  # cache tạm
            headless=True
        )

        # Gán cityId ngay từ đầu bằng add_init_script
        await context.add_init_script(f"""
            localStorage.setItem('cityId', '{city_id}');
        """)

        page = await context.new_page()
        all_results = []

        for page_num in range(1, max_pages + 1):
            url = f"https://pasgo.vn/tim-kiem?search={keyword}&page={page_num}"
            print(f"🔎 Crawling page {page_num}: {url}")
            await page.goto(url, wait_until="load")

            try:
                await page.wait_for_selector("div.item-child-info a", timeout=8000)
            except:
                print(f"⛔ Trang {page_num} không có kết quả")
                continue

            items = await page.query_selector_all("div.item-child-info a")
            for item in items:
                name = await item.inner_text()
                link = await item.get_attribute("href")
                full_url = f"https://pasgo.vn{link}"
                print(f"📍 {name.strip()} - {full_url}")
                all_results.append((name.strip(), full_url))

        await browser.close()
        return all_results

# Hồ Chí Minh = city_id 2
results = await crawl_pasgo_by_city("lẩu", city_id=2, max_pages=2)


🔎 Crawling page 1: https://pasgo.vn/tim-kiem?search=lẩu&page=1
📍 Lộc-ally - Cát Linh

Tầng 2, Khách Sạn Grand Mercure Hanoi, Số 9 Cát Linh, P. Quốc Tử Giám,Q. Ba Đình

Đặt bàn giữ chỗ
Buffet Món Việt, Gọi món Á - Âu - https://pasgo.vnhttps://pasgo.vn/nha-hang/loc-ally-restaurant-cat-linh-5490
📍 Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/5490?returnUrl=/tim-kiem?search=l%E1%BA%A9u&page=1
📍 GoGi House - Giang Văn Minh

Số 14 Giang Văn Minh, P. Kim Mã, Q. Ba Đình

Đặt bàn giữ chỗ
Gọi món, Buffet Nướng Lẩu Hàn Quốc - https://pasgo.vnhttps://pasgo.vn/nha-hang/gogi-house-giang-van-minh-5575
📍 Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/5575?returnUrl=/tim-kiem?search=l%E1%BA%A9u&page=1
📍 Lẩu Nấm Ashima - Giang Văn Minh

Số 60 Giang Văn Minh, P. Đội Cấn, Q. Ba Đình

Ưu đãi hấp dẫn
Gọi món Á, Chuyên Lẩu Nấm - https://pasgo.vnhttps://pasgo.vn/nha-hang/nha-hang-lau-nam-ashima-giang-van-minh-963
📍 Đặt chỗ - https://pasgo.vnhttps://pasgo.vn/dat-cho-ngay/963?returnUrl=/

In [None]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright

import re

async def extract_price_range(page):
    # Tìm span chứa giá trung bình
    info_span = await page.query_selector("span.pasgo-giatrungbinh")
    if not info_span:
        return {"price_range": ""}

    full_text = await info_span.inner_text()
    
    match = re.search(r"(\d{1,3}(?:\.\d{3})*)\s*-\s*(\d{1,3}(?:\.\d{3})*)", full_text)
    price_range = f"{match.group(1)} - {match.group(2)} đ/người" if match else ""


    return {"price_range": price_range}

def slug_to_name(slug):
    words = slug.split("-")
    return " ".join([word for word in words])
async def extract_summary(article):
    result = {}
    result['description'] = ''
    titles = await article.query_selector_all(".txt-title")
    for title in titles:
        title_text = (await title.inner_text()).strip().replace(":", "").upper()

        if "MÓN ĐẶC SẮC" in title_text:
            span = await article.query_selector("span")
            value = (await span.inner_text()).strip() if span else ""
            result['cuisines'] = value
        elif "ĐIỂM ĐẶC TRƯNG" in title_text:
            ps_texts = []
            sibling = await title.evaluate_handle("el => el.nextElementSibling")
            while sibling:
                is_null = await sibling.evaluate("el => el === null")
                if is_null:
                    break
                tag_name = await sibling.evaluate("el => el.tagName")
                if tag_name == "DIV":
                    class_name = await sibling.get_attribute("class")
                    if class_name == "txt-title":
                        break  # gặp title tiếp theo → dừng
                if tag_name == "P":
                    ps_texts.append((await sibling.inner_text()).strip())
                sibling = await sibling.evaluate_handle("el => el.nextElementSibling")
            value = "\n".join(ps_texts)
            result['description'] += value
        elif "THÔNG TIN THÊM" in title_text:
            pass    
        else:
            sibling = await title.evaluate_handle("el => el.nextElementSibling")
            value = ""
            if sibling:
                is_null = await sibling.evaluate("el => el === null")
                if not is_null:
                    tag_name = await sibling.evaluate("el => el.tagName")
                    if tag_name == "DIV" and await sibling.get_attribute("class") == "text-description":
                        value = (await sibling.inner_text()).strip()
                    elif tag_name == "P":
                        value = (await sibling.inner_text()).strip()
            result['description'] += value


    return result

async def extract_image_gallery(article):
    image_urls = []
    images = await article.query_selector_all("img")
    for image in images:
        src = await image.get_attribute("src")
        if src:
            if src.startswith("/"):
                src = f"https://pasgo.vn{src}"
            image_urls.append(src)
    print("image_urls_2: ", image_urls)
    return image_urls


 

async def get_detail_data(page, detail_link):
    await page.goto(detail_link)
    await page.wait_for_load_state("domcontentloaded")

    articles = await page.query_selector_all("article")
    result= {}
    result['photo_url'] = []
    for article in articles:
        article_id = await article.get_attribute("id")
        if article_id == "NH-TOMTAT":
            result.update(await extract_summary(article))
        if article_id == "info-booth":
            result.update(await extract_price_range(article))
            print(await extract_price_range(article))
        if article_id == "NH-ANH":
            result['photo_url'].extend(await extract_image_gallery(article))
    return result


async def crawl_pasgo_by_page(category_slug, max_pages=5):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        all_results = []
        for page_num in range(1, max_pages + 1):
            url = f"https://pasgo.vn/ha-noi/nha-hang{category_slug}?page={page_num}"
            print(f"🔎 Crawling page {page_num}: {url}")
            await page.goto(url)
            try:
                await page.wait_for_selector("div.wapitem a", timeout=8000)
            except:
                print(f"⛔ Không tìm thấy dữ liệu ở trang {page_num}")
                continue

            items = await page.query_selector_all("div.wapitem")
            for item in items:
                result = {}
                main = await item.query_selector("div.waptop-main")
                desc = await item.query_selector("div.waptop-desc")
                
                link = await item.query_selector("a.waptop")
                link = await link.get_attribute("href")
                full_link = f"https://pasgo.vn{link}" if link.startswith("/") else link
                detail_page = await browser.new_page()
                detail_data = await get_detail_data(detail_page, full_link)
                result.update(detail_data)
                await detail_page.close()

                img = await main.query_selector("a.waptop img")
                img_url = await img.get_attribute("src") if img else None
                result['photo_url'].extend(img_url)
                name = await main.query_selector("div.wapfooter h3")
                name = await name.inner_text() if name else "Không có tên"

                address = await main.query_selector("p")
                address = await address.inner_text() if address else "Không có địa chỉ"


                sale = await desc.query_selector("div.wapsale")
                sale = await sale.inner_text() if sale else "Không có sale"

                tag = await desc.query_selector("div.waptag")
                tag = await tag.inner_text() if tag else "Không có tag"

                result.update({
                    "img_url": img_url,
                    "name": name.strip(),
                    "address": address.strip(),
                    "link": full_link
                })
                result['city'] = slug_to_name(city)
                all_results.append(result)

        await browser.close()
        return all_results

# Chạy thử
if __name__ == "__main__":
    categories = ["","/lau-27", "/buffet-29", "/hai-san-28", "/lau-and-nuong-91", "/quan-nhau-165", "/mon-chay-44", "/dat-tiec-224", "/han-quoc-16", "/nhat-ban-15", "/mon-au-23", "/mon-viet-21", "/mon-thai-18", "/mon-trung-hoa-126", "/tiec-cuoi-143"]
    cities = [
    "ha-noi",
    "ho-chi-minh",
    "hai-phong",
    "da-nang",
    "khanh-hoa",
    "can-tho",
    "vung-tau",
    "bac-giang",
    "bac-ninh",
    "binh-duong",
    "binh-dinh",
    "binh-thuan",
    "hung-yen",
    "kien-giang",
    "lam-dong",
    "nghe-an",
    "quang-nam",
    "quang-ninh",
    "thanh-hoa",
    "thua-thien-hue"
    ]
    for city in cities:
        for cat in categories:
            asyncio.run(crawl_pasgo_by_page(cat, city, max_pages=5))


🔎 Crawling page 1: https://pasgo.vn/ha-noi/nha-hang/lau-27?page=1
{'price_range': '300.000 - 500.000 đ/người', 'price_level': 0}
image_urls_2:  ['https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-1-normal-2394309516808.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-2-normal-2394309416809.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-3-normal-2394309316810.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-4-normal-2394309216811.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-5-normal-131842816812.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-6-normal-2394309116813.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slide-7-normal-2394309016814.webp', 'https://pasgo.vn/Upload/anh-chi-tiet/nha-hang-de-re-song-duong-han-thuyen-slid

KeyboardInterrupt: 