In [None]:
# ✅ Cài đặt thư viện cần thiết (nếu chưa có)
!pip install selenium bs4 pandas fake_useragent

In [None]:
# Install the same packages using the interpreter that runs this kernel
# (sys.executable), so pip is invoked as a module of that interpreter.
import sys
!{sys.executable} -m pip install selenium bs4 pandas fake_useragent


In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import os
import time
import random
from selenium.common.exceptions import NoSuchElementException



In [15]:
# 💻 Initialize the driver with options
def init_driver():
    """Create the Chrome WebDriver used by the crawler.

    The browser is started maximized, with the Blink ``AutomationControlled``
    feature disabled and a fixed desktop Chrome user-agent string.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    # Exactly one user-agent value. The previous code placed two string
    # literals back to back, which Python silently concatenates into a single
    # malformed value ("...Safari/537.36user-agent=Mozilla/5.0 ...").
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/134.0.0.0 Safari/537.36"
    )

    options = Options()
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--start-maximized")
    options.add_argument(f"user-agent={user_agent}")

    # Pass a path to Service() here if a specific ChromeDriver binary is needed.
    service = Service()
    return webdriver.Chrome(service=service, options=options)

# 🧭 Scroll the page until it has finished loading
def scroll_to_bottom_until_loaded(driver, pause_time=2):
    """Keep scrolling to the bottom until the document stops growing.

    After every scroll the function waits ``pause_time`` seconds plus a
    random 0.5-1.5 s, then compares ``document.body.scrollHeight`` with the
    value seen before the scroll. It returns once the two are equal.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        pause_time: Base number of seconds to wait after each scroll.
    """
    height_script = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_script)
    reached_end = False
    while not reached_end:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause_time + random.uniform(0.5, 1.5))
        current_height = driver.execute_script(height_script)
        # Same height before and after the scroll: nothing new was loaded.
        reached_end = current_height == previous_height
        previous_height = current_height

# 📂 Load previously crawled data
def load_existing_links(csv_file):
    """Return the set of hotel links already stored in ``csv_file``.

    Links are whitespace-stripped so they compare equal to the stripped URLs
    that ``crawl_from_city_page`` checks against.

    Args:
        csv_file: Path to the crawler's output CSV.

    Returns:
        set[str]: Non-null values of the ``hotel_link`` column. An empty set
        is returned when the file does not exist, is empty, or has no
        ``hotel_link`` column.
    """
    if not os.path.exists(csv_file):
        return set()

    try:
        df = pd.read_csv(csv_file)
    except pd.errors.EmptyDataError:
        # A 0-byte file (e.g. left behind by an interrupted run) has no links.
        return set()

    if "hotel_link" not in df.columns:
        return set()

    # astype(str) keeps .str usable even when the column was parsed as a
    # non-string dtype (for example when every value is missing).
    links = df["hotel_link"].dropna().astype(str).str.strip()
    return set(links.tolist())

# 📦 Extract hotel detail information


def extract_text(driver, selector):
    """Return the stripped text of the first element matching a CSS selector.

    Args:
        driver: Object exposing ``find_element`` (a Selenium WebDriver).
        selector: CSS selector of the element to read.

    Returns:
        str: The element's text without surrounding whitespace, or ``""``
        when no element matches.
    """
    try:
        matched = driver.find_element(By.CSS_SELECTOR, selector)
        return matched.text.strip()
    except NoSuchElementException:
        # A missing element is expected on some pages; report it as empty.
        return ""
def extract_star_number(driver):
    """Count the ``svg`` icons inside the first matching star container.

    Args:
        driver: Object exposing ``find_element`` (a Selenium WebDriver).

    Returns:
        int: Number of ``svg`` elements found inside the first
        ``div.css-1dbjc4n.r-18u37iz`` element, or ``0`` when that container
        does not exist.
    """
    # NOTE(review): the selector is built from generic utility classes;
    # confirm that the first match is really the star row.
    star_container_css = "div.css-1dbjc4n.r-18u37iz"
    try:
        container = driver.find_element(By.CSS_SELECTOR, star_container_css)
        return len(container.find_elements(By.TAG_NAME, "svg"))
    except NoSuchElementException:
        return 0


def extract_checkin_checkout_times(driver):
    """Read the check-in and check-out times from the current page.

    Every element matching the time-row selector is inspected. A row whose
    text contains the check-in label sets the check-in value, otherwise a row
    containing the check-out label sets the check-out value. When several
    rows match the same label, the last one wins.

    Args:
        driver: Object exposing ``find_elements`` (a Selenium WebDriver).

    Returns:
        tuple[str, str]: ``(received_time, giveback_time)``; each is ``""``
        when not found. Any exception is printed and whatever was collected
        up to that point is returned.
    """
    checkin_label = "Giờ nhận phòng:"
    checkout_label = "Giờ trả phòng:"
    found = {checkin_label: "", checkout_label: ""}

    try:
        # Collect every div carrying the time-row classes.
        candidates = driver.find_elements(
            By.CSS_SELECTOR, "div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1h0z5md"
        )
        for candidate in candidates:
            content = candidate.text.strip()
            # The check-in label is tested first; a row is used for at most
            # one of the two labels.
            for label in (checkin_label, checkout_label):
                if label in content:
                    found[label] = content.replace(label, "").strip()
                    break
    except Exception as e:
        print("❌ Lỗi khi lấy giờ nhận/trả phòng:", e)

    return found[checkin_label], found[checkout_label]


def extract_additional_info(driver):
    """Read distance-to-center and popular-destination values from tables.

    Every ``table tr`` on the page is scanned. Rows with fewer than two
    ``td`` cells are ignored; otherwise the first cell is treated as the
    label and the second as the value. When several rows match the same
    label, the last one wins.

    Args:
        driver: Object exposing ``find_elements`` (a Selenium WebDriver).

    Returns:
        tuple[str, str]: ``(from_center, popular_destination)``; each is
        ``""`` when not found. Any exception is printed and whatever was
        collected up to that point is returned.
    """
    distance_label = "Khoảng cách đến trung tâm thành phố"
    destination_label = "Điểm đến phổ biến"
    collected = {"from_center": "", "popular_destination": ""}

    try:
        for table_row in driver.find_elements(By.CSS_SELECTOR, "table tr"):
            cells = table_row.find_elements(By.TAG_NAME, "td")
            if len(cells) < 2:
                continue
            heading = cells[0].text.strip()
            content = cells[1].text.strip()
            if distance_label in heading:
                collected["from_center"] = content
            elif destination_label in heading:
                collected["popular_destination"] = content
    except Exception as e:
        print("❌ Lỗi khi lấy thông tin từ bảng:", e)

    return collected["from_center"], collected["popular_destination"]


def extract_hotel_info(driver, url, city_name):
    """Load a hotel detail page and collect its fields into one record.

    The page at ``url`` is opened with ``driver.get`` and, after a random
    3-5 s pause, each field is read with the helper extractors.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        url: Hotel detail URL; also stored as ``hotel_link``.
        city_name: Stored as ``hotel_city``.

    Returns:
        dict: Record with the keys ``hotel_name``, ``location``, ``price``,
        ``score_hotels``, ``number_rating``, ``star_number``,
        ``received_time``, ``giveback_time``, ``from_center``,
        ``popular_destination``, ``hotel_link`` and ``hotel_city``, in that
        order.
    """
    driver.get(url)
    time.sleep(random.uniform(3, 5))

    # Plain-text fields, listed in the order they appear in the record.
    text_fields = (
        ("hotel_name", "h1.css-4rbku5.css-901oao.r-uh8wd5.r-b88u0q.r-fdjqy7"),
        ("location", "div.css-901oao.css-cens5h.r-13awgt0.r-uh8wd5.r-1b43r93.r-majxgm.r-rjixqe.r-fdjqy7"),
        ("price", "div.css-901oao.r-uh8wd5.r-b88u0q.r-1ff274t"),
        ("score_hotels", "h1.css-4rbku5.css-901oao.r-1w9mtv9.r-fdjqy7"),
        ("number_rating", "div.css-901oao.css-cens5h.r-1b43r93.r-1w9mtv9.r-ovu0ai.r-rjixqe.r-fdjqy7"),
    )
    record = {field: extract_text(driver, css) for field, css in text_fields}

    record["star_number"] = extract_star_number(driver)

    # Check-in / check-out times.
    record["received_time"], record["giveback_time"] = extract_checkin_checkout_times(driver)

    # Values read from the page's tables.
    record["from_center"], record["popular_destination"] = extract_additional_info(driver)

    record["hotel_link"] = url
    record["hotel_city"] = city_name
    return record


# 🎯 Click each hotel and crawl it if it has not been crawled yet
def _return_to_main_window(driver, main_window):
    """Close every window except ``main_window`` and switch back to it."""
    for handle in list(driver.window_handles):
        if handle != main_window:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_window)


def crawl_from_city_page(driver, url, city_name, existing_links, output_csv,
                         max_consecutive_misses=5):
    """Open a city's search-result page and crawl each listed hotel.

    Hotel cards are addressed by their ``data-index`` attribute, starting at
    0. For each card the hotel title is clicked, the detail page that opens
    in a new window is scraped with ``extract_hotel_info`` and the record is
    written to ``output_csv`` immediately, so an interrupted run keeps what
    it already collected.

    Args:
        driver: Selenium WebDriver used for the whole crawl.
        url: Search-result URL of the city.
        city_name: Stored in the ``hotel_city`` field of every record.
        existing_links: Iterable of hotel URLs to skip (may be ``None``). It
            is merged with the ``hotel_link`` column of ``output_csv`` when
            that file exists; the passed object itself is not modified.
        output_csv: CSV path that is read for de-duplication and rewritten
            after every newly crawled hotel.
        max_consecutive_misses: Stop once this many consecutive indexes are
            missing while the page cannot be scrolled any further. Without a
            limit the loop never ends.

    Returns:
        list[dict]: Records crawled during this call (hotels that were
        already known are not included).
    """
    driver.get(url)
    time.sleep(random.uniform(3, 5))
    actions = ActionChains(driver)
    # Captured once, right after loading the listing, so it always names the
    # search-result window even when a later iteration fails half-way.
    main_window = driver.current_window_handle
    crawled_data = []

    # Honour the caller's links (the argument used to be overwritten) and
    # merge them with whatever is already on disk.
    known_links = {str(link).strip() for link in (existing_links or ())}
    df_existing = pd.DataFrame()
    if os.path.exists(output_csv):
        try:
            df_existing = pd.read_csv(output_csv)
        except pd.errors.EmptyDataError:
            df_existing = pd.DataFrame()
        if "hotel_link" in df_existing.columns:
            known_links.update(
                df_existing["hotel_link"].dropna().astype(str).str.strip()
            )

    index = 0
    consecutive_misses = 0
    while consecutive_misses < max_consecutive_misses:
        selector = f'div[data-index="{index}"]'
        print(f"\n📍 Đang xử lý khách sạn index {index}")

        try:
            hotel_elem = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, selector))
            )
            driver.execute_script(
                "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});", hotel_elem)
            time.sleep(random.uniform(1.2, 2.0))
        except Exception:
            print(f"🌀 Không tìm thấy index {index}, thử cuộn thêm để load...")

            # Scroll a little further and check whether the page actually moved.
            original_scroll_position = driver.execute_script("return window.pageYOffset;")
            driver.execute_script("window.scrollBy(0, 500);")
            time.sleep(random.uniform(1.2, 1.8))
            new_scroll_position = driver.execute_script("return window.pageYOffset;")

            if new_scroll_position == original_scroll_position:
                print(f"✅ Đã cuộn tới cuối trang nhưng vẫn không tìm thấy index {index}.")
                print("↩️ Quay lại vị trí cũ và thử index tiếp theo...")

                # Restore the previous scroll position.
                driver.execute_script(f"window.scrollTo(0, {original_scroll_position});")
                time.sleep(random.uniform(1.0, 1.5))

                # Skip this index, and count the miss so that a run of
                # missing indexes at the bottom of the page ends the crawl.
                index += 1
                consecutive_misses += 1
                continue

            # The page could still scroll: retry the same index.
            print("🔄 Đã cuộn thêm, sẽ thử lại index hiện tại...")
            continue

        consecutive_misses = 0

        try:
            actions.move_to_element(hotel_elem).perform()
            time.sleep(random.uniform(1.2, 2.2))

            # Price and score shown on the listing card are kept as fallbacks
            # for detail pages where those fields come back empty.
            try:
                price_backup = hotel_elem.find_element(
                    By.CSS_SELECTOR, 'div.css-901oao.r-uh8wd5.r-b88u0q.r-1ff274t').text
            except Exception:
                print("⚠️ Không tìm thấy giá chính tại trang chi tiết, sử dụng giá backup.")
                price_backup = None
            try:
                score_backup = hotel_elem.find_element(
                    By.CSS_SELECTOR, 'div.css-901oao.r-b88u0q.r-fdjqy7').text
            except Exception:
                print("⚠️ Không tìm thấy score chính tại trang chi tiết, sử dụng score backup.")
                score_backup = None
            try:
                link_elem = hotel_elem.find_element(
                    By.CSS_SELECTOR, 'h3.css-4rbku5.css-901oao.r-uh8wd5.r-b88u0q.r-fdjqy7')
            except Exception:
                print(f"⚠️ Không tìm thấy thẻ h3 ở index {index}, bỏ qua.")
                index += 1  # This card is done; move on to the next one.
                continue

            # Click the title to open the hotel detail page in a new window.
            driver.execute_script("arguments[0].click();", link_elem)

            WebDriverWait(driver, 10).until(lambda d: len(d.window_handles) > 1)
            new_window = [w for w in driver.window_handles if w != main_window][0]
            driver.switch_to.window(new_window)

            time.sleep(random.uniform(0.5, 1))
            current_url = driver.execute_script("return window.location.href;").strip().lower().rstrip("/")
            print(f"🔍 URL sau khi click: {current_url}")

            # NOTE(review): the intermediate page is matched under "en-vn"
            # while the search URL built elsewhere in this file uses "vi-vn";
            # confirm the locale.
            if current_url == "https://www.traveloka.com/en-vn/hotel":
                print(f"↩️ Bị redirect sang TRANG TRUNG GIAN: {current_url}")
                try:
                    driver.execute_script("window.history.back()")
                    time.sleep(random.uniform(2.5, 3.5))
                    current_url = driver.execute_script("return window.location.href;").strip().lower().rstrip("/")
                    if current_url == "https://www.traveloka.com/en-vn/hotel":
                        raise Exception("⛔ KHÔNG thể thoát khỏi trang trung gian → DỪNG chương trình.")
                    else:
                        print("✅ Đã quay lại trang chi tiết.")
                except Exception as e:
                    print(f"❌ Lỗi khi quay lại trang chi tiết: {e}")
                    # NOTE(review): this re-raise is caught by the outer
                    # handler below, so the crawl moves on to the next index
                    # rather than stopping.
                    raise

            hotel_url = driver.current_url.strip()
            if hotel_url in known_links:
                print(f"⏭️ Đã tồn tại: {hotel_url}")
            else:
                hotel_data = extract_hotel_info(driver, hotel_url, city_name)
                if not hotel_data.get("price") or hotel_data["price"].strip() == "":
                    hotel_data["price"] = price_backup or ""
                    print(f"✅ Gán lại giá từ backup: {price_backup}")
                if not hotel_data.get("score_hotels") or hotel_data["score_hotels"].strip() == "":
                    hotel_data["score_hotels"] = score_backup or ""
                    print(f"✅ Gán lại điểm từ backup: {score_backup}")
                crawled_data.append(hotel_data)
                print(f"✅ Đã crawl: {hotel_url}")
                for key, value in hotel_data.items():
                    print(f"  {key}: {value}")
                print("-" * 50)

                # Persist after every hotel so progress survives an interrupt.
                df_new = pd.DataFrame([hotel_data])
                frames = [frame for frame in (df_existing, df_new) if not frame.empty]
                df_combined = pd.concat(frames, ignore_index=True).drop_duplicates(subset="hotel_link")
                df_combined.to_csv(output_csv, index=False, encoding="utf-8-sig")
                df_existing = df_combined
                known_links.add(hotel_url)

        except Exception as e:
            print(f"❌ Lỗi không xác định tại index {index}: {e}")
        finally:
            # Runs on success, on failure and on the `continue` above. A
            # detail window left open by a failed iteration would otherwise
            # stay open and leave the driver on the wrong window.
            _return_to_main_window(driver, main_window)

        time.sleep(random.uniform(2, 3))
        index += 1

    print(f"🏁 Kết thúc: {max_consecutive_misses} index liên tiếp không tìm thấy ở cuối trang.")
    return crawled_data

# 🚀 Main entry: crawl the full list of cities
def crawl_all_cities():
    """Crawl the hotel listing of every configured city.

    One driver is created with ``init_driver`` and reused for all cities.
    For each place a search URL is built for a one-night stay starting today
    and passed to ``crawl_from_city_page``, which writes its results to
    ``all_hotels_data.csv``. The driver is always shut down, including when
    the crawl raises or is interrupted.
    """
    today = datetime.today().date()
    next_day = today + timedelta(days=1)
    output_csv = "all_hotels_data.csv"

    # Entries have the form "<geo id>.<URL-encoded city name>"; uncomment a
    # line to include that city.
    list_of_places = [
        # '10010169.Đà%20Lạt',
        # '10010083.Đà%20Nẵng',
        # '10011570.Phú%20Quốc',
        '30010278.Thành%20Phố%20Hạ%20Long',
    ]

    driver = init_driver()
    try:
        for place in list_of_places:
            # Split on the first dot only, so everything after the id is kept.
            city_name = place.split(".", 1)[1].replace("%20", " ")
            url = f"https://www.traveloka.com/vi-vn/hotel/search?spec={today.strftime('%d-%m-%Y')}.{next_day.strftime('%d-%m-%Y')}.1.1.HOTEL_GEO.{place}.1"
            existing_links = load_existing_links(output_csv)

            print(f"🔍 Bắt đầu crawl thành phố: {city_name}")
            crawl_from_city_page(driver, url, city_name, existing_links, output_csv)
    finally:
        # Without this the browser process is left running whenever the
        # crawl raises or the kernel is interrupted.
        driver.quit()

In [16]:
# Entry point: start the crawl for every configured city.
if __name__ == "__main__":
    crawl_all_cities()

🔍 Bắt đầu crawl thành phố: Thành Phố Hạ Long

📍 Đang xử lý khách sạn index 0
🔍 URL sau khi click: https://www.traveloka.com/vi-vn/hotel/detail?spec=16-04-2025.17-04-2025.1.1.hotel.9000000831710.delasea%20ha%20long%20hotel.1&contexts=%7b%22inventoryratekey%22%3a%22povewb3zzsu2c6pd%2b6mdu2b9d%2bxttnp8xbphexkh8zfve1uyh1vio6bcalg%2b7noifwxxnyirzoxm4hx9sdmt9%2fam1ayn3megz8xcpq5hctgkoyhcabuc4ubujddpizcbkjyfe7e9hsd3umkwzskul5pnkvnrbzlwaur9ojsmsogtuf8upiiql0rcsb9xnhru5ji%2fe%2fjhrwvaaxdbrhsz2vojraceykrnrmvjy3c4arnvmzpkb3ijycdccd%2ftpv9xwiotpx7iqs1ga8ljclgbexrmjy4rh8ng%2faj%2bijo4bwp%2fnukxtseviumnrcdmeombpo%2bhaufvfvxc9ii0m6f5quatil8%2fdgqriobor6qo6v9nkq66hmflnbtdjjwegn3zyyy5uy%2f5bwrnglni89mvytxug%2bqalqkc7m%2f4k63hkpse5gxpnofcbw%2fzljiwiaxmuvlzx8vfxj4s6fb9fpgnaliww8pbhdarwzywvr%2bicvyctnv9nmzkzdli2kxnuppqswqryoylpdfcvmk5lzh5%2ba0lc2mxrdnq3zr3ecnbejudntlvzghmerbwzuqiimvm7w8ltsciy%2bjjpafk4m21umi5ixxgs9osc%2b0uqlp85xfoiaenmrhjrlxamthskvhcrv7j2hbyoabf2%2b7%2fjllspgzy7xx8bjfzhi5vkfthmm6yi%2bkhan

KeyboardInterrupt: 