In [None]:
import asyncio
from playwright.async_api import async_playwright
import aiohttp
import os
import nest_asyncio
# Patch asyncio so event loops can be nested; needed when an event loop is
# already running, as in a notebook.
nest_asyncio.apply()

async def download_image(session, url, filename):
    """Download ``url`` through ``session`` and save the body to ``filename``.

    Nothing is written unless the response status is 200.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``read()`` (an ``aiohttp.ClientSession`` in this file).
        url: Address of the image to fetch.
        filename: Path of the file to create or overwrite.
    """
    async with session.get(url) as resp:
        if resp.status != 200:
            return
        # Read the whole body BEFORE opening the destination file: if the
        # transfer fails, no empty or truncated file is left on disk.
        payload = await resp.read()

    with open(filename, 'wb') as f:
        f.write(payload)
                
async def crawl_tours():
    """Scrape the first tour card of the domestic-tour listing page.

    Opens the listing in headless Chromium, reads the title, price, location,
    dates and image URL of the first ``div.tour-item`` card, downloads the
    image when a ``src`` is present, and prints the collected fields.
    """
    async with async_playwright() as pw:
        # headless=True so no browser window is opened.
        chromium = await pw.chromium.launch(headless=True)
        listing = await chromium.new_page()

        await listing.goto("https://hanoitourist.vn/tour-trong-nuoc")
        # Fixed pause to let lazy-loaded content finish loading.
        await listing.wait_for_timeout(8000)

        # One locator per tour block on the page.
        cards = await listing.locator("div.tour-item").all()
        async with aiohttp.ClientSession() as http:
            # Only the first card is processed.
            for number, tour in enumerate(cards[:1], start=1):
                title = await tour.locator("h3 a").inner_text()
                price = await tour.locator("div.detail-gia").inner_text()
                location = await tour.locator("div.detail-item-value >> nth=1").inner_text()
                dates = await tour.locator("div.detail-item-value >> nth=0").inner_text()
                picture_url = await tour.locator("div.tour-img img").get_attribute("src")

                if picture_url:
                    await download_image(http, picture_url, f"tour_image_{number}.jpg")

                print(f"🎯 Tour {number}: {title}")
                print(f"📍 {location} | 📅 {dates} | 💰 {price}")
                print(picture_url)
                print("────────────")
            await chromium.close()

# If running in a notebook, use nest_asyncio
await crawl_tours()




In [None]:
import asyncio
from playwright.async_api import async_playwright
import aiohttp
import os
import nest_asyncio

# Patch asyncio so event loops can be nested; needed when an event loop is
# already running, as in a notebook.
nest_asyncio.apply()

async def download_image(session, url, filename):
    """Download ``url`` through ``session`` and save the body to ``filename``.

    Nothing is written unless the response status is 200.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``read()`` (an ``aiohttp.ClientSession`` in this file).
        url: Address of the image to fetch.
        filename: Path of the file to create or overwrite.
    """
    async with session.get(url) as resp:
        if resp.status != 200:
            return
        # Read the whole body BEFORE opening the destination file: if the
        # transfer fails, no empty or truncated file is left on disk.
        payload = await resp.read()

    with open(filename, 'wb') as f:
        f.write(payload)

async def _inner_text_or(page, selector, fallback):
    """Return the inner text of ``selector`` on ``page``, or ``fallback`` if the lookup fails."""
    try:
        return await page.locator(selector).inner_text()
    except Exception:
        # ``Exception`` rather than a bare ``except`` so task cancellation and
        # KeyboardInterrupt are not swallowed by the fallback.
        return fallback


async def crawl_tour_detail(context, detail_url):
    """Open a tour's detail page and extract its itinerary and included services.

    Args:
        context: Object providing an awaitable ``new_page()`` (the Playwright
            browser context created by ``crawl_tours`` in this file).
        detail_url: URL of the tour detail page.

    Returns:
        dict with keys ``"itinerary"`` and ``"includes"``. Each value is the
        element's inner text, or a fixed placeholder string when the element
        could not be read.

    Raises:
        Whatever ``page.goto`` / ``page.wait_for_timeout`` raise; the page is
        closed before the exception propagates.
    """
    page = await context.new_page()
    try:
        await page.goto(detail_url)
        await page.wait_for_timeout(3000)

        itinerary = await _inner_text_or(
            page, ".tab-content .lichtrinh", "Không có dữ liệu lịch trình"
        )
        includes = await _inner_text_or(
            page, "div.includes", "Không rõ dịch vụ bao gồm"
        )
    finally:
        # Always release the tab, even when navigation fails; otherwise every
        # failed detail page would leave an open page in the context.
        await page.close()

    return {
        "itinerary": itinerary,
        "includes": includes
    }

async def _scrape_tour_card(card, context, session, base_url, filename):
    """Extract one listing card, download its image and merge in its detail-page data."""
    from urllib.parse import urljoin  # local import keeps this block self-contained

    title = await card.locator("h3 a").inner_text()
    price = await card.locator("div.detail-gia").inner_text()
    location = await card.locator("div.detail-item-value >> nth=1").inner_text()
    dates = await card.locator("div.detail-item-value >> nth=0").inner_text()

    href = await card.locator("h3 a").get_attribute("href")
    if not href:
        # get_attribute returns None when the attribute is absent; fail with a
        # clear message instead of an AttributeError on None.
        raise ValueError("tour card has no detail link")
    # urljoin resolves root-relative and relative links and leaves absolute
    # URLs untouched.
    full_detail_url = urljoin(base_url, href)

    image_url = await card.locator("div.tour-img img").get_attribute("src")
    if image_url:
        image_url = urljoin(base_url, image_url)
        await download_image(session, image_url, filename)

    # Crawl the detail page of this tour.
    detail_data = await crawl_tour_detail(context, full_detail_url)

    return {
        "title": title,
        "location": location,
        "dates": dates,
        "price": price,
        "image_url": image_url,
        "detail_url": full_detail_url,
        **detail_data
    }


async def crawl_tours():
    """Crawl the domestic-tour listing pages together with each tour's detail page.

    Walks up to ``max_page`` listing pages, following the ">" pagination link.
    For every ``div.tour-item`` card it reads the summary fields, downloads the
    card image to ``tour_image_<page>_<index>.jpg`` and fetches the detail page.
    A card that fails is reported and skipped; the crawl continues.

    Returns:
        list[dict]: one dict per successfully processed tour with the keys
        ``title``, ``location``, ``dates``, ``price``, ``image_url``,
        ``detail_url``, ``itinerary`` and ``includes``.
    """
    base_url = "https://hanoitourist.vn/tour-trong-nuoc"
    max_page = 3  # Change the number of listing pages to crawl here.
    all_tours = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await page.goto(base_url)
            await page.wait_for_timeout(5000)

            async with aiohttp.ClientSession() as session:
                for current_page in range(1, max_page + 1):
                    print(f"🔎 Crawling page {current_page}...")

                    tour_cards = await page.locator("div.tour-item").all()
                    for i, card in enumerate(tour_cards):
                        filename = f"tour_image_{current_page}_{i+1}.jpg"
                        try:
                            tour_info = await _scrape_tour_card(
                                card, context, session, base_url, filename
                            )
                        except Exception as e:
                            print("⚠️ Lỗi xử lý card:", e)
                            continue

                        print(f"✅ {tour_info['title']}")
                        print(f"📍 {tour_info['location']} | 📅 {tour_info['dates']} | 💰 {tour_info['price']}")
                        print(f"📝 Lịch trình: {tour_info['itinerary'][:100]}...")
                        print("────────────")
                        all_tours.append(tour_info)

                    # The last requested page is done: do not click "next"
                    # (and wait) for a page that will never be scraped.
                    if current_page == max_page:
                        break

                    # Find the "next page" button and advance.
                    try:
                        next_btn = page.locator("a.page-link:has-text('>')")
                        if not await next_btn.is_visible():
                            break
                        await next_btn.click()
                        await page.wait_for_timeout(3000)
                    except Exception as e:
                        # Report why pagination stopped instead of ending silently.
                        print("⚠️ Lỗi chuyển trang:", e)
                        break
        finally:
            # Close the browser even when the crawl aborts with an error.
            await browser.close()

    print(f"🎉 Tổng cộng thu được {len(all_tours)} tour.")
    # Returned so the caller can save the results to CSV or JSON if desired.
    return all_tours

# Run
await crawl_tours()
