In [2]:
import requests
from urllib.robotparser import RobotFileParser

def is_scraping_allowed(url):
    """Report whether Booking.com's robots.txt permits fetching ``url``.

    robots.txt is downloaded from www.booking.com on every call and ``url`` is
    evaluated against the rules for the wildcard ("*") user agent. Exactly one
    line is printed: ALLOWED, BLOCKED, or ERROR when fetching or evaluating
    the rules raised. Nothing is returned.
    """
    robots_url = "https://www.booking.com/robots.txt"

    try:
        # Passing the URL to the constructor is the same as calling set_url().
        parser = RobotFileParser(robots_url)
        parser.read()

        verdict = (
            f"✅ ALLOWED: Scraping is permitted for {url}"
            if parser.can_fetch("*", url)
            else f"❌ BLOCKED: Scraping is NOT permitted for {url}"
        )
        print(verdict)
    except Exception as e:
        print(f"⚠️ ERROR: Could not check robots.txt for {url} - {e}")

# URLs to check against robots.txt, in the order they are reported.
pages_to_check = [
    # ✅ Search results page, bare and with a full query string
    "https://www.booking.com/searchresults.en-gb.html",
    "https://www.booking.com/searchresults.en-gb.html?ss=Singapore&checkin=2025-05-01&checkout=2025-05-02&group_adults=2&no_rooms=1&group_children=0",
    # Hotel detail page - expected yes, defined on robots.txt
    "https://www.booking.com/hotel/sg/marina-bay-sands.en-gb.html?aid=304142&label=gen173nr-1FCAQoggJCEHNlYXJjaF9zaW5nYXBvcmVICVgEaMkBiAEBmAEJuAEXyAEM2AEB6AEB-AEDiAIBqAIDuAK_n9a-BsACAdICJGQwZWQwM2I0LTczYTAtNDc1Mi04MzgzLWE3ODY2NjRiM2Y0N9gCBeACAQ&ucfs=1&arphpl=1&checkin=2025-05-01&checkout=2025-05-02&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=3&hapos=3&sr_order=popularity&srpvid=27c0665fd57806b5&srepoch=1742049218&all_sr_blocks=24588246_266353018_2_2_0_53975&highlighted_blocks=24588246_266353018_2_2_0_53975&matching_block_id=24588246_266353018_2_2_0_53975&sr_pri_blocks=24588246_266353018_2_2_0_53975_119900&from_sustainable_property_sr=1&from=searchresults",
    # ❌ Photo page - expected no, defined on robots.txt
    "https://www.booking.com/photo.html",
]

for page in pages_to_check:
    is_scraping_allowed(page)


✅ ALLOWED: Scraping is permitted for https://www.booking.com/searchresults.en-gb.html
✅ ALLOWED: Scraping is permitted for https://www.booking.com/searchresults.en-gb.html?ss=Singapore&checkin=2025-05-01&checkout=2025-05-02&group_adults=2&no_rooms=1&group_children=0
✅ ALLOWED: Scraping is permitted for https://www.booking.com/hotel/sg/marina-bay-sands.en-gb.html?aid=304142&label=gen173nr-1FCAQoggJCEHNlYXJjaF9zaW5nYXBvcmVICVgEaMkBiAEBmAEJuAEXyAEM2AEB6AEB-AEDiAIBqAIDuAK_n9a-BsACAdICJGQwZWQwM2I0LTczYTAtNDc1Mi04MzgzLWE3ODY2NjRiM2Y0N9gCBeACAQ&ucfs=1&arphpl=1&checkin=2025-05-01&checkout=2025-05-02&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=3&hapos=3&sr_order=popularity&srpvid=27c0665fd57806b5&srepoch=1742049218&all_sr_blocks=24588246_266353018_2_2_0_53975&highlighted_blocks=24588246_266353018_2_2_0_53975&matching_block_id=24588246_266353018_2_2_0_53975&sr_pri_blocks=24588246_266353018_2_2_0_53975_119900&from_sustainable_property_sr=1&from=searchresults
❌ BLOC

In [3]:
# Sample hotel-page URL used to work out how to derive a per-property slug.
url = (
    "https://www.booking.com/hotel/sg/marina-bay-sands.en-gb.html"
    "?aid=304142&label=gen173nr-1FCAQoggJCEHNlYXJjaF9zaW5nYXBvcmVICVgEaMkBiAEBmAEJuAEXyAEM2AEB6AEB-AEDiAIBqAIDuAK_n9a-BsACAdICJGQwZWQwM2I0LTczYTAtNDc1Mi04MzgzLWE3ODY2NjRiM2Y0N9gCBeACAQ&ucfs=1&arphpl=1&checkin=2025-05-01&checkout=2025-05-02&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&hpos=3&hapos=3&sr_order=popularity&srpvid=27c0665fd57806b5&srepoch=1742049218&all_sr_blocks=24588246_266353018_2_2_0_53975&highlighted_blocks=24588246_266353018_2_2_0_53975&matching_block_id=24588246_266353018_2_2_0_53975&sr_pri_blocks=24588246_266353018_2_2_0_53975_119900&from_sustainable_property_sr=1&from=searchresults"
)

# Splitting on "/" puts "<slug>.<locale>.html?<query>" at index 5.
parts = url.split("/")
slug = parts[5]

# Everything before the first "." is the bare slug.
clean_slug = slug.partition(".")[0]
print(clean_slug)

marina-bay-sands


### 1. Extracting Property Links

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime
import csv
import re

# --------------------------
# 1. Define function to scrape property listings
# --------------------------

def _try_or(default, getter):
    """Return ``getter()``, or ``default`` if it raises an ordinary exception.

    Only ``Exception`` subclasses are absorbed, so a kernel interrupt
    (KeyboardInterrupt) still stops a long scraping run.
    """
    try:
        return getter()
    except Exception:
        return default


def _read_property_card(prop):
    """Build the record for one search-result card element.

    The hotel name and link are mandatory: if either lookup raises, the
    exception propagates so the caller can skip the card. Every other field is
    optional and falls back to "N/A" (or "No" for the two badge flags).
    """
    def text_at(xpath):
        return prop.find_element(By.XPATH, xpath).text

    def price_at(xpath):
        # Remove the "S$" marker and thousands separators before converting.
        return int(text_at(xpath).strip().replace("S$", "").replace(",", "").strip())

    def badge(xpath):
        prop.find_element(By.XPATH, xpath)  # raises when the badge is absent
        return "Yes"

    hotel_name = text_at('.//div[@data-testid="title"]').strip()
    hotel_url = prop.find_element(By.XPATH, './/a').get_attribute("href")

    return {
        "hotel_name": hotel_name,
        "hotel_url": hotel_url,
        "original_price": _try_or(
            "N/A",
            lambda: price_at('.//span[@aria-hidden="true" and contains(@class, "abf093bdfe")]'),
        ),
        "current_price": _try_or(
            "N/A",
            lambda: price_at('.//span[@data-testid="price-and-discounted-price"]'),
        ),
        # Keep only the digits of the review-count label.
        "num_reviews": _try_or(
            "N/A",
            lambda: re.sub(
                r"[^\d]",
                "",
                text_at('.//div[@data-testid="review-score"]//div[contains(@class, "abf093bdfe f45d8e4c32 d935416c47")]'),
            ),
        ),
        # The score is the last line of the element's text.
        "review_score": _try_or(
            "N/A",
            lambda: text_at('.//div[@data-testid="review-score"]/div').strip().split("\n")[-1],
        ),
        "review_label": _try_or(
            "N/A",
            lambda: text_at('.//div[contains(@class, "a3b8729ab1 e6208ee469 cb2cbb3ccb")]').strip(),
        ),
        # aria-label reads "<n> out of <m>"; keep the part before " out of ".
        "star_rating": _try_or(
            "N/A",
            lambda: prop.find_element(By.XPATH, './/div[contains(@class, "b3f3c831be")]')
            .get_attribute("aria-label")
            .split(" out of ")[0],
        ),
        "preferred_partner": _try_or(
            "No", lambda: badge('.//span[@data-testid="preferred-badge"]')
        ),
        "sustainability_certified": _try_or(
            "No", lambda: badge('.//div[contains(@class, "abf093bdfe e6208ee469 f68ecd98ea")]')
        ),
        "last_updated": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


def scrape_property_listings(checkin_date, checkout_date):
    """Scrape the Booking.com Singapore search results for one stay.

    Opens a Chrome WebDriver on the search page (2 adults, 1 room, no
    children), repeatedly scrolls to the bottom and clicks "Load more results"
    until that step fails, then reads every property card on the page.

    Args:
        checkin_date: Check-in date string interpolated into the search URL
            (the notebook passes "YYYY-MM-DD").
        checkout_date: Check-out date string, same format.

    Returns:
        dict mapping each hotel URL to a record with the keys hotel_name,
        hotel_url, original_price, current_price, num_reviews, review_score,
        review_label, star_rating, preferred_partner,
        sustainability_certified and last_updated. Cards whose name or link
        cannot be read are skipped with a printed message.
    """
    # 1. Construct Booking.com search URL
    BASE_URL = "https://www.booking.com/searchresults.en-gb.html"
    SEARCH_URL = f"{BASE_URL}?ss=Singapore&checkin={checkin_date}&checkout={checkout_date}&group_adults=2&no_rooms=1&group_children=0"

    # 2. Initialize WebDriver; the finally block guarantees the browser is
    #    closed even if scraping raises part-way through.
    driver = webdriver.Chrome()
    property_data = {}

    try:
        driver.get(SEARCH_URL)
        time.sleep(5)

        # 3. Expand the result list: scroll, then click "Load more results"
        #    until the button can no longer be found or clicked.
        while True:
            try:
                # Scroll to the bottom of the page to trigger lazy loading
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(5)  # Wait for new content to load

                load_more_button = driver.find_element(By.XPATH, '//button[span[text()="Load more results"]]')
                print("Button found, clicking it...")
                driver.execute_script("arguments[0].click();", load_more_button)
                time.sleep(5)
            except Exception:
                print("No more results to load or button not found")
                break

        # 4. Extract property details from every card
        properties = driver.find_elements(By.XPATH, '//div[@data-testid="property-card-container"]')

        for prop in properties:
            try:
                record = _read_property_card(prop)
                property_data[record["hotel_url"]] = record
            except Exception as e:
                print(f"Skipping property due to error: {e}")
    finally:
        # 5. Close the driver
        driver.quit()

    return property_data

# --------------------------
# 2. Scrape multiple dates (1st and 14th of each month)
# --------------------------
import pandas as pd

# Generate check-in dates for the 1st and 14th of each month (April-December)
year = 2025
first_days = pd.date_range(start=f"{year}-04-01", end=f"{year}-12-01", freq="MS")  # 1st of every month
fourteenth_days = [pd.Timestamp(f"{year}-{month:02d}-14") for month in range(4, 13)]  # 14th of every month

# Combine the two lists in chronological order
checkin_dates = sorted(list(first_days) + fourteenth_days)

# Generate (check-in, check-out) date pairs (checkout = check-in + 1 day)
date_pairs = [(d.strftime("%Y-%m-%d"), (d + pd.Timedelta(days=1)).strftime("%Y-%m-%d")) for d in checkin_dates]

# Store results in a dictionary using `slug` as the unique key
all_listings = {}

for checkin, checkout in date_pairs:
    listings = scrape_property_listings(checkin, checkout)

    for url, details in listings.items():
        try:
            parts = url.split("/")
            slug = parts[5].split(".")[0]  # Extract slug (e.g., "marina-bay-sands")
        except (AttributeError, IndexError) as e:
            # AttributeError: the card had no href (url is None).
            # IndexError: the URL has fewer "/"-separated parts than expected.
            # Report the skip instead of silently dropping the listing.
            print(f"Skipping listing with unexpected URL {url!r}: {e}")
            continue

        # Keep only the first occurrence of each hotel across all dates
        all_listings.setdefault(slug, details)


# --------------------------
# Save to CSV
# --------------------------
df = pd.DataFrame.from_dict(all_listings, orient="index")

csv_filename = "property_data.csv"
df.to_csv(csv_filename, index=False)

print(f"Saved {len(df)} unique hotel listings to {csv_filename}")

Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
No more results to load or button not found
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
Button found, clicking it...
No more results to load or button not found
Button found, clicking it...
Button found,

### 2. Extracting Details from Each Property Page

In [61]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import datetime

In [62]:
# --------------------------
# Define functions for scraping property details
# --------------------------
def scrape_payment_methods(driver):
    """Extract accepted payment cards and the cash-acceptance status.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        dict with two keys:
        - "accepted_cards": list of non-empty ``alt`` texts of the payment
          images found on the page.
        - "cash_accepted": "No" if a span containing "Cash is not accepted"
          exists, "Yes" if some other span containing "Cash" exists, "No" if
          neither exists, and "N/A" if the extraction itself raised.
    """
    payment_methods = {
        "accepted_cards": [],
        "cash_accepted": "N/A"
    }

    try:
        # Locate all payment method images inside the section (credit cards)
        card_images = driver.find_elements(By.XPATH, "//div[contains(@class, 'c5e805a2e8')]//picture/img")

        for img in card_images:
            # get_attribute returns None when the image has no alt text;
            # treat that as "no name" instead of aborting the whole extraction.
            card_name = (img.get_attribute("alt") or "").strip()
            if card_name:
                payment_methods["accepted_cards"].append(card_name)

        # find_elements returns an empty list when nothing matches, so no
        # exception handling is needed for these presence checks. The explicit
        # refusal must be tested first because it also contains the word "Cash".
        if driver.find_elements(By.XPATH, "//span[contains(text(), 'Cash is not accepted')]"):
            payment_methods["cash_accepted"] = "No"
        elif driver.find_elements(By.XPATH, "//span[contains(text(), 'Cash')]"):
            payment_methods["cash_accepted"] = "Yes"
        else:
            payment_methods["cash_accepted"] = "No"  # Default assumption: Cash is not accepted

    except Exception as e:
        print(f"Error extracting payment methods: {e}")

    return payment_methods


def scrape_cot_extra_bed_policies(driver):
    """Extract the cot and extra bed policies as a dictionary.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        One of:
        - ``{age_group: {policy_text: cost}}`` when structured policy blocks
          are present; ``cost`` is the listed text, "Free", or "N/A".
        - ``{"availability": message}`` when there are no policy blocks but a
          paragraph containing "Cots and extra beds" is found.
        - ``{"availability": "N/A"}`` when the section heading or that
          paragraph cannot be found.

    Only ``Exception`` subclasses are absorbed, so a kernel interrupt still
    stops the run.
    """
    cot_extra_bed_policies = {}

    try:
        # Presence check only: raises when the section heading is missing.
        driver.find_element(By.XPATH, "//h2[contains(text(), 'Cot and extra bed policies')]")
        policy_blocks = driver.find_elements(By.XPATH, "//div[contains(@class, 'e88206330c')]")
    except Exception:
        cot_extra_bed_policies["availability"] = "N/A"
        return cot_extra_bed_policies

    if not policy_blocks:
        # If no structured policies exist, check for plain text message
        try:
            no_cots_message = driver.find_element(By.XPATH, "//p[contains(text(), 'Cots and extra beds')]").text.strip()
            cot_extra_bed_policies["availability"] = no_cots_message
        except Exception:
            cot_extra_bed_policies["availability"] = "N/A"
        return cot_extra_bed_policies

    for block in policy_blocks:
        try:
            # Extract age group (e.g., "0 - 2 years", "3+ years")
            age_group = block.find_element(By.XPATH, ".//div[contains(@class, 'df14f5d170')]").text.strip()

            # Find all policy entries (extra bed, cot, etc.)
            policies = block.find_elements(By.XPATH, ".//div[contains(@class, 'cbadf7c7a5')]")

            policy_details = {}

            for policy_entry in policies:
                try:
                    # Extract policy description (e.g., "Extra bed upon request", "Cot upon request")
                    policy = policy_entry.find_element(By.XPATH, ".//span[contains(@class, 'f149a96297')]").text.strip()

                    # Extract cost (if available)
                    try:
                        cost = policy_entry.find_element(By.XPATH, ".//div[contains(@class, 'a53cbfa6de')]").text.strip()
                    except Exception:
                        cost = "N/A"

                    # 'Free' is rendered in a different div and overrides the cost
                    try:
                        free_text = policy_entry.find_element(By.XPATH, ".//div[contains(@class, 'ccb65902b2')]").text.strip()
                        if free_text.lower() == "free":
                            cost = "Free"
                    except Exception:
                        pass  # No 'Free' marker: keep the cost found above

                    policy_details[policy] = cost

                except Exception as e:
                    print(f"Skipping policy entry due to error: {e}")

            # Store extracted data in dictionary
            cot_extra_bed_policies[age_group] = policy_details

        except Exception as e:
            print(f"Skipping policy block due to error: {e}")

    return cot_extra_bed_policies


def scrape_review_scores(driver):
    """Collect the per-category review scores shown in the score breakdown list.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        List of ``{"category": str, "score": float or None}`` dicts, one per
        list item with a non-empty category name. ``score`` is None when the
        score text is empty or cannot be converted to a float. Items whose
        lookups raise are skipped with a printed message.
    """
    review_scores = []

    def inner_text(node, xpath):
        # innerText is read via get_attribute rather than the .text property.
        return node.find_element(By.XPATH, xpath).get_attribute("innerText").strip()

    try:
        rows = driver.find_elements(By.XPATH, "//ul[@id='review_list_score_breakdown']//li[contains(@class, 'clearfix one_col')]")

        for row in rows:
            try:
                category = inner_text(row, ".//p[contains(@class, 'review_score_name')]")
                if not category:
                    print("Warning: Missing category name, skipping item.")
                    continue

                score_text = inner_text(row, ".//p[contains(@class, 'review_score_value')]")

                # Start from None and only replace it with a parsed number.
                score = None
                if score_text:
                    try:
                        score = float(score_text)
                    except ValueError:
                        print(f"Warning: '{category}' score '{score_text}' is invalid. Assigning None.")
                else:
                    print(f"Warning: '{category}' score is empty. Assigning None.")

                review_scores.append({"category": category, "score": score})

            except Exception as e:
                print(f"Skipping review item due to error: {e}")

    except Exception as e:
        print(f"Error extracting review scores: {e}")

    return review_scores


In [63]:
def scrape_property_details(driver):
    """Extract property-level details from the hotel page loaded in ``driver``.

    This function does not navigate: the caller must already have opened the
    hotel page (e.g. with ``driver.get(hotel_url)``). Every field is scraped on
    a best-effort basis; a field whose lookup fails keeps its default value.

    Args:
        driver: Selenium WebDriver positioned on a Booking.com hotel page.

    Returns:
        dict with the keys hotel_id, best_review_score_label,
        best_review_score_rating, review_scores, address, latitude, longitude,
        description, check_in_time, check_out_time, children_policies,
        cot_extra_bed_policies, age_restriction, payment_methods,
        smoking_policy and pets_policy. Scalar fields default to "N/A";
        review_scores, payment_methods and cot_extra_bed_policies hold whatever
        scrape_review_scores, scrape_payment_methods and
        scrape_cot_extra_bed_policies return.
    """
    # Imported locally so the regex extraction below cannot fail silently with
    # a NameError when this section is run without the cell that imports re.
    import re

    # Initialize dictionary upfront with default values
    property_details = {
        "hotel_id": "N/A",
        "best_review_score_label": "N/A",
        "best_review_score_rating": "N/A",
        "review_scores": [],
        "address": "N/A",
        "latitude": "N/A",
        "longitude": "N/A",
        "description": "N/A",
        "check_in_time": "N/A",
        "check_out_time": "N/A",
        "children_policies": "N/A",
        "cot_extra_bed_policies": {},
        "age_restriction": "N/A",
        "payment_methods": {},
        "smoking_policy": "N/A",
        "pets_policy": "N/A"
    }

    def fill(field, getter):
        """Store ``getter()`` under ``field``; keep the default if it raises."""
        try:
            property_details[field] = getter()
        except Exception:
            pass  # best effort: the "N/A" default stays in place

    def labelled_value(label_condition):
        """Return the value text of the policy row whose label matches."""
        label = driver.find_element(
            By.XPATH,
            f"//div[contains(@class, 'e1eebb6a1e') and {label_condition}]",
        )
        value = label.find_element(
            By.XPATH,
            "./ancestor::div[contains(@class, 'c6e1dbf31b')]/following-sibling::div[contains(@class, 'f565581f7e')]/div[contains(@class, 'a53cbfa6de')]",
        )
        return value.text.strip()

    def first_address_line():
        raw = driver.find_element(By.XPATH, "(//div[@data-testid='PropertyHeaderAddressDesktop-wrapper']//div[@tabindex='0'])[1]").text.strip()
        # Only the first non-blank line of the element's text is kept.
        return [line.strip() for line in raw.split("\n") if line.strip()][0]

    # Latitude, longitude and hotel id are read from script variables embedded
    # in the page source. The optional "-" lets negative coordinates match.
    try:
        page_source = driver.page_source

        lat_match = re.search(r"b_map_center_latitude\s*=\s*(-?[\d.]+);", page_source)
        lon_match = re.search(r"b_map_center_longitude\s*=\s*(-?[\d.]+);", page_source)
        hotel_id_match = re.search(r"b_hotel_id\s*=\s*'(\d+)';", page_source)

        if lat_match:
            property_details["latitude"] = lat_match.group(1)
        if lon_match:
            property_details["longitude"] = lon_match.group(1)
        if hotel_id_match:
            property_details["hotel_id"] = hotel_id_match.group(1)
    except Exception:
        pass  # page source unavailable: keep the defaults

    # Best review score label and rating
    fill("best_review_score_label", lambda: driver.find_element(By.XPATH, '//p[@class="best-review-score-label"]').text.strip())
    fill("best_review_score_rating", lambda: driver.find_element(By.XPATH, '//span[@class="review-score-badge"]').text.strip())

    # Address and description
    fill("address", first_address_line)
    fill("description", lambda: driver.find_element(By.XPATH, "//p[@data-testid='property-description']").text.strip())

    # Check-in / check-out times
    fill("check_in_time", lambda: labelled_value("contains(text(), 'Check-in')"))
    fill("check_out_time", lambda: labelled_value("contains(text(), 'Check-out')"))

    # Children policies: join the non-empty paragraphs; when there are none the
    # "N/A" default is kept rather than being replaced by an empty string.
    try:
        paragraphs = [p.text.strip() for p in driver.find_elements(By.XPATH, "//div[@class='c64ba425c8']/p")]
        children_text = " ".join(text for text in paragraphs if text)
        if children_text:
            property_details["children_policies"] = children_text
    except Exception:
        pass

    # Age restriction (the label appears with either capitalisation)
    fill("age_restriction", lambda: labelled_value("(contains(text(), 'Age restriction') or contains(text(), 'age restriction'))"))

    # Review scores, payment methods, cot and extra bed policies
    property_details["review_scores"] = scrape_review_scores(driver)
    property_details["payment_methods"] = scrape_payment_methods(driver)
    property_details["cot_extra_bed_policies"] = scrape_cot_extra_bed_policies(driver)

    # Smoking and pets policies
    fill("smoking_policy", lambda: labelled_value("contains(text(), 'Smoking')"))
    fill("pets_policy", lambda: labelled_value("contains(text(), 'Pets')"))

    return property_details

def scrape_room_details(driver):
    """Extract one record per room-type cell on the hotel page.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        List of dicts with the keys "Room Name", "Bed Types",
        "Free Cot Available", "Room Description", "Room Size" and
        "Room Highlights & Facilities". Amenities are de-duplicated while
        keeping page order (highlights first, then other facilities), so the
        output is identical from run to run. Rooms whose name cannot be read
        are skipped with a printed message.
    """
    rooms = []

    # Find all room blocks
    room_elements = driver.find_elements(By.XPATH, '//td[contains(@class, "hprt-table-cell-roomtype")]')

    for room in room_elements:
        try:
            # Room name is mandatory; a failure here skips the room.
            room_name = room.find_element(By.XPATH, './/a[contains(@class, "hprt-roomtype-link")]').text.strip()

            # Bed types (several may be listed for one room)
            bed_elements = room.find_elements(By.XPATH, './/ul[contains(@class, "rt-bed-types")]//li')
            bed_types = ", ".join(bed.text.strip() for bed in bed_elements)

            # "Free cot available on request" marker
            try:
                room.find_element(By.XPATH, './/span[contains(text(), "Free cot available on request")]')
                free_cot = "Yes"
            except Exception:
                free_cot = "No"

            # Room description
            try:
                room_description = room.find_element(By.XPATH, './/p[contains(@class, "short-room-desc")]').text.strip()
            except Exception:
                room_description = "N/A"

            # Room size
            try:
                room_size = room.find_element(By.XPATH, './/div[@data-name-en="room size"]').text.strip()
            except Exception:
                room_size = "N/A"

            # Room highlights (excluding room size)
            try:
                highlights = [
                    elem.text.strip() for elem in room.find_elements(
                        By.XPATH, './/div[contains(@class, "hprt-facilities-facility") and not(@data-name-en="room size")]//span'
                    )
                ]
            except Exception:
                highlights = []

            # Other facilities (e.g., Free Toiletries, Bathrobe)
            try:
                other_facilities = [
                    elem.text.strip() for elem in room.find_elements(
                        By.XPATH, './/ul[contains(@class, "hprt-facilities-others")]//span'
                    )
                ]
            except Exception:
                other_facilities = []

            # dict.fromkeys removes duplicates but, unlike set(), keeps
            # first-seen order; blank texts are dropped.
            all_amenities = [
                amenity
                for amenity in dict.fromkeys(highlights + other_facilities)
                if amenity
            ]
            room_facilities = ", ".join(all_amenities) if all_amenities else "No facilities listed"

            rooms.append({
                "Room Name": room_name,
                "Bed Types": bed_types,
                "Free Cot Available": free_cot,
                "Room Description": room_description,
                "Room Size": room_size,
                "Room Highlights & Facilities": room_facilities
            })

        except Exception as e:
            print(f"Error processing room: {e}")

    return rooms

In [None]:
def scrape_hotel_surroundings(driver):
    """Extract the points of interest listed in the hotel's surroundings section.

    Waits up to 10 seconds for the element with id "surroundings_block",
    scrolls to the bottom of the page, then reads every "poi-block" category.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        dict mapping each category title to a list of
        ``{"name": str, "category": str or None, "distance": str}`` dicts.
        Places lacking a name or a distance are left out, categories with no
        usable place are omitted, and ``{}`` is returned when the surroundings
        block never appears.
    """
    wait = WebDriverWait(driver, 10)
    surroundings = {}

    try:
        wait.until(EC.presence_of_element_located((By.ID, "surroundings_block")))
    except Exception:
        return {}

    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)

    category_elements = driver.find_elements(By.XPATH, "//div[@data-testid='poi-block']")

    for category_elem in category_elements:
        try:
            # Extract category name (e.g., "What's nearby", "Top attractions")
            category = category_elem.find_element(By.XPATH, ".//div[contains(@class, 'e1eebb6a1e')]").text.strip()
            places = category_elem.find_elements(By.XPATH, ".//ul[@data-testid='poi-block-list']/li")

            place_list = []
            for place in places:
                # Each place is handled on its own so one malformed entry does
                # not discard the rest of its category.
                try:
                    # Optional label (e.g. a place type) shown with the name
                    try:
                        category_label = place.find_element(By.XPATH, ".//span[contains(@class, 'b6f930dcc9')]").text.strip()
                    except Exception:
                        category_label = ""

                    # The name element's text includes the label; strip it out.
                    full_text = place.find_element(By.XPATH, ".//div[contains(@class, 'dc5041d860')]").text.strip()
                    name = full_text.replace(category_label, "").strip() if category_label else full_text

                    try:
                        distance = place.find_element(By.XPATH, ".//div[contains(@class, 'e018b15ee8')]//div").text.strip()
                    except Exception:
                        distance = ""
                except Exception as e:
                    print(f"Skipping place in '{category}' due to error: {e}")
                    continue

                # Ensure valid data
                if name and distance:
                    place_list.append({"name": name, "category": category_label if category_label else None, "distance": distance})

            # Only add non-empty categories
            if place_list:
                surroundings[category] = place_list

        except Exception as e:
            print(f"Error scraping category: {str(e)}")

    return surroundings


def scrape_hotel_facilities(driver):
    """Extract the facility lists grouped by category heading.

    Waits up to 10 seconds for the element with id "hp_facilities_box",
    scrolls to the bottom of the page, then reads the ``<li>`` items of the
    first ``<ul>`` following each ``<h3>`` heading in that section.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.

    Returns:
        dict mapping each heading text to the list of its non-empty item
        texts. Headings without items are omitted. ``{}`` is returned when the
        facilities box or its headings never appear.

    Only ``Exception`` subclasses are absorbed, so a kernel interrupt still
    stops the run.
    """
    wait = WebDriverWait(driver, 10)
    facilities = {}

    # Ensure facilities section is loaded
    try:
        wait.until(EC.presence_of_element_located((By.ID, "hp_facilities_box")))
    except Exception:
        return {}

    # Scroll down to ensure elements are loaded
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # Allow time for elements to load

    # Locate all categories; only their count is used below
    try:
        category_elements = wait.until(
            EC.presence_of_all_elements_located((By.XPATH, "//section[@id='hp_facilities_box']//h3"))
        )
    except Exception:
        return {}

    for i in range(len(category_elements)):
        try:
            # Look the headings up again on every iteration and index into the
            # fresh list instead of reusing element references from above.
            fresh_headings = driver.find_elements(By.XPATH, "//section[@id='hp_facilities_box']//h3")
            category = fresh_headings[i].text.strip()

            # Items of the first <ul> after the (i+1)-th heading (XPath is 1-based)
            try:
                items = WebDriverWait(driver, 5).until(
                    EC.presence_of_all_elements_located((By.XPATH, f"(//section[@id='hp_facilities_box']//h3)[{i+1}]/following-sibling::ul[1]/li"))
                )
            except Exception:
                continue

            facility_list = [item.text.strip() for item in items if item.text.strip()]
            if facility_list:
                facilities[category] = facility_list

        except Exception:
            # Best effort: a heading that cannot be read is skipped.
            continue

    return facilities


In [66]:
# --------------------------
# Scrape property details
# --------------------------

# csv and pandas are imported here because this section's import cell does not
# import them; without this the cell depends on an earlier section having run.
import csv
import pandas as pd

# Set to a small number (e.g. 3) for a quick trial run; None scrapes everything.
MAX_PROPERTIES = None

# 1. Load the listings saved by the search-results scraper
csv_filename = "property_data.csv"
df = pd.read_csv(csv_filename)
property_data = df.to_dict(orient="records")[:MAX_PROPERTIES]

# 2. Initialize WebDriver
driver = webdriver.Chrome()

# 3. Scrape each property
results = []

try:
    time.sleep(2)

    for hotel in property_data:
        hotel_name = hotel["hotel_name"]
        hotel_url = hotel["hotel_url"]

        print(f"Scraping: {hotel_name}")

        try:
            # Load the page
            driver.get(hotel_url)
            time.sleep(3)

            property_details = scrape_property_details(driver)
            room_details = scrape_room_details(driver)
            surroundings = scrape_hotel_surroundings(driver)
            facilities = scrape_hotel_facilities(driver)
        except Exception as e:
            # One failing page must not discard the results gathered so far.
            print(f"Skipping {hotel_name} due to error: {e}")
            continue

        # Move hotel_id to the front of the record
        hotel_id = property_details.pop("hotel_id", "N/A")  # Default to "N/A" if not found

        results.append({
            "hotel_id": hotel_id,
            "hotel_name": hotel_name,
            **property_details,
            "room_details": room_details,
            "surroundings": surroundings,
            "facilities": facilities
        })
finally:
    # 4. Close the driver, even if scraping was interrupted or failed
    driver.quit()


# --------------------------
# Save to CSV
# --------------------------
csv_filename = "booking_property_details.csv"
if results:
    with open(csv_filename, "w", newline="", encoding="utf-8") as file:
        # Column order follows the keys of the first record.
        fieldnames = list(results[0].keys())
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)

    print(f"\nScraping completed. Data saved to '{csv_filename}'.")
else:
    # results[0] would raise IndexError; say so instead.
    print(f"\nNo properties were scraped; '{csv_filename}' was not written.")




Scraping: A Hotel Joo Chiat
Scraping: M Social Singapore
Scraping: KINN Studios
Scraping: Park View Hotel
Scraping: ibis budget Singapore Crystal
Scraping: Park Avenue Changi
Scraping: ibis budget Singapore Ametrine
Scraping: ibis budget Singapore Emerald
Scraping: ibis budget Singapore Ruby
Scraping: ibis budget Singapore Bugis
Scraping: Vibe Hotel Singapore Orchard
Scraping: Rendezvous Hotel Singapore by Far East Hospitality
Scraping: Travelodge Harbourfront Singapore
Scraping: Hotel Grand Central
Scraping: Holiday Inn Express Singapore Katong by IHG
Scraping: Hotel 1900 Chinatown
Scraping: Champion Hotel City
Scraping: Mercure Singapore Bugis
Scraping: JEN Singapore Orchardgateway by Shangri-La
Scraping: Harbour Ville Hotel
Scraping: Aerotel Singapore - Transit Hotel in Terminal 1
Scraping: ibis budget Singapore Joo Chiat
Scraping: Marina Bay Sands
Scraping: 30 Bencoolen
Scraping: CapsulePod@Aljunied
Scraping: Orchard Rendezvous Hotel by Far East Hospitality
Scraping: Hotel Travelti

### 3. Extracting Reviews from Each Property Page

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import os
import time
import datetime
import csv
import random
import re

In [3]:
def click_read_all_reviews(driver):
    """
    Open the full review list on a Booking.com hotel page.

    Finds the 'Read all reviews' button, scrolls it to the centre of the
    viewport, clicks it, and then waits until at least one review card is
    present in the DOM. Any failure along the way is printed, not raised.
    """
    try:
        button_locator = (By.XPATH, "//button[@data-testid='read-all-actionable']")
        card_locator = (By.XPATH, "//div[@data-testid='review-card']")

        # The button only needs to be present in the DOM (waits up to 10 s)
        button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(button_locator)
        )

        # Bring the button into view and let the smooth scroll settle
        driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
            button,
        )
        time.sleep(2)

        button.click()

        # Block until a review card shows up (waits up to 10 s)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(card_locator)
        )

    except Exception as error:
        print(f"Failed to click 'Read all reviews' button: {error}")
        
        
def click_next_page(driver):
    """
    Move the review list forward by one page, pausing for random intervals.

    Returns:
        bool: True once the 'Next page' button has been clicked; False when
        waiting for the button or clicking it raised (treated as the last page).
    """
    try:
        # Randomised timeout for the button to become clickable
        timeout = random.uniform(3, 6)
        locator = (By.XPATH, "//button[@aria-label='Next page']")
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable(locator)
        )

        # Pause, scroll the button to the centre of the viewport, pause again
        time.sleep(random.uniform(1, 3))
        driver.execute_script(
            "arguments[0].scrollIntoView({behavior: 'smooth', block: 'center'});",
            button,
        )
        time.sleep(random.uniform(1, 2))

        button.click()

        # Give the next batch of reviews a random amount of time to load
        time.sleep(random.uniform(1, 5))

    except Exception as error:
        print(f"🚨 No more 'Next page' button found (last page or error): {error}")
        return False  # Last page reached (or the click failed)

    return True  # The click went through, so another page was requested


In [4]:
def extract_review_score(card):
    """
    Read the numeric score from a review card.

    Returns the first integer or decimal number found in the score element's
    text, as a string. Returns "N/A" when the text holds no number or when
    anything raises (the error is printed).
    """
    try:
        # Accessing this property scrolls the card into view
        card.location_once_scrolled_into_view
        time.sleep(random.uniform(1, 3))  # random pause after scrolling

        # Wait (random 2-6 s timeout) for the score element inside this card
        score_locator = (
            By.XPATH,
            ".//div[@data-testid='review-score']//div[contains(@class, 'ac4a7896c7')]",
        )
        score_element = WebDriverWait(card, random.uniform(2, 6)).until(
            EC.presence_of_element_located(score_locator)
        )

        # Keep only the numeric part of the text
        number = re.search(r"(\d+\.\d+|\d+)", score_element.text.strip())
        if number is None:
            return "N/A"
        return number.group(1)

    except Exception as error:
        print(f"🚨 Error extracting review_score: {error}")
        return "N/A"


In [5]:
import time
import random
import re
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# ✅ Helper function to extract text safely
def extract_text(card, xpath):
    """
    Return the stripped text of the first element under `card` matching `xpath`.

    Args:
        card: Object exposing `find_element(by, value)`; the callers in this
            notebook pass Selenium review-card elements.
        xpath: XPath expression passed to `card.find_element`.

    Returns:
        str: The element text with surrounding whitespace removed, or "N/A"
        when the lookup or the text read raises.
    """
    try:
        return card.find_element(By.XPATH, xpath).text.strip()
    except Exception:
        # Deliberate best-effort: a missing or stale element becomes "N/A".
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, so a long scrape can still be interrupted.
        return "N/A"


def _extract_review_fields(card):
    """Read every field of one review card into a dict; unreadable fields become "N/A"."""
    reviewer_name = extract_text(card, ".//div[contains(@class, 'a3332d346a e6208ee469')]")
    reviewer_country = extract_text(card, ".//span[contains(@class, 'afac1f68d9')]")
    review_room_name = extract_text(card, ".//span[@data-testid='review-room-name']")

    # Keep only the first whitespace-separated token of the nights text
    # (presumably the number). An empty text has no tokens, so fall back to
    # "N/A" instead of raising IndexError and losing the whole review.
    nights_tokens = extract_text(card, ".//span[@data-testid='review-num-nights']").split()
    review_num_nights = nights_tokens[0] if nights_tokens else "N/A"

    review_stay_date = extract_text(card, ".//span[@data-testid='review-stay-date']")
    review_traveler_type = extract_text(card, ".//span[@data-testid='review-traveler-type']")
    review_title = extract_text(card, ".//h3[@data-testid='review-title']")
    review_score = extract_review_score(card)
    review_positive_text = extract_text(card, ".//div[@data-testid='review-positive-text']//div[contains(@class, 'a53cbfa6de b5726afd0b')]")
    review_negative_text = extract_text(card, ".//div[@data-testid='review-negative-text']//div[contains(@class, 'a53cbfa6de b5726afd0b')]")

    # Key order is kept stable because CSV headers are derived from it downstream
    return {
        "reviewer_name": reviewer_name,
        "reviewer_country": reviewer_country,
        "review_room_name": review_room_name,
        "review_num_nights": review_num_nights,
        "review_stay_date": review_stay_date,
        "review_traveler_type": review_traveler_type,
        "review_score": review_score,
        "review_title": review_title,
        "review_positive_text": review_positive_text,
        "review_negative_text": review_negative_text
    }


def scrape_booking_reviews(driver, max_retries=3):
    """
    Scrapes hotel reviews from the Booking.com page currently loaded in `driver`.

    Each attempt:
    - clicks "Read all reviews" (and "Show all reviews" if that link is displayed),
    - walks the review pages with `click_next_page`, reading every review card,
    - stops early after 5 consecutive reviews without positive or negative text.

    An attempt that finds no reviews, or that raises, is followed by a page
    refresh and a retry, up to `max_retries` attempts. Every attempt starts
    from an empty list, so a retry after a mid-pagination failure cannot
    append the same reviews twice.

    Args:
        driver: Selenium WebDriver positioned on a hotel page.
        max_retries: Maximum number of attempts.

    Returns:
        list[dict]: One dict per review. If every attempt failed, the largest
        partial result collected by any single attempt (possibly empty).
    """
    time.sleep(random.uniform(3, 7))  # Wait randomly to mimic human behavior
    wait = WebDriverWait(driver, 10)
    best_partial = []  # largest result of any attempt that ended in an exception
    attempts = 0

    while attempts < max_retries:
        # Per-attempt state: a refresh restarts pagination from page 1
        attempt_reviews = []
        no_comment_count = 0

        try:
            # Step 1: Click "Read all reviews" (needed again after every refresh)
            click_read_all_reviews(driver)

            # Step 2: Check if Booking.com is hiding reviews
            try:
                show_all_reviews_btn = driver.find_element(By.XPATH, "//a[contains(text(), 'Show all reviews')]")
                if show_all_reviews_btn.is_displayed():
                    print("🔄 Clicking 'Show All Reviews' button...")
                    driver.execute_script("arguments[0].click();", show_all_reviews_btn)
                    time.sleep(random.uniform(2, 4))  # Wait for reviews to load
            except Exception:
                pass  # Button not found = continue scraping

            # Step 3: Loop through review pages
            while True:
                review_cards = wait.until(
                    EC.presence_of_all_elements_located((By.XPATH, "//div[@data-testid='review-card']"))
                )

                for card in review_cards:
                    try:
                        review = _extract_review_fields(card)
                    except Exception as e:
                        # Best-effort: skip the card, but say so instead of hiding it
                        print(f"⚠️ Skipping a review card that could not be read: {e}")
                        continue

                    # Check for reviews with no comments
                    if review["review_positive_text"] == "N/A" and review["review_negative_text"] == "N/A":
                        no_comment_count += 1
                    else:
                        no_comment_count = 0

                    if no_comment_count >= 5:
                        print("⚠️ Stopping: 5 consecutive reviews without comments.")
                        # Prefer this attempt; fall back to an earlier partial only if it is larger
                        return max(attempt_reviews, best_partial, key=len)

                    # Store the review data
                    attempt_reviews.append(review)

                # Step 4: Click 'Next page' (break loop if last page)
                if not click_next_page(driver):
                    break

            if attempt_reviews:
                print(f"✅ Successfully extracted {len(attempt_reviews)} reviews.")
                return max(attempt_reviews, best_partial, key=len)

            print(f"⚠️ No reviews found, retrying... ({attempts+1}/{max_retries})")
            time.sleep(random.uniform(3, 6))  # Random wait before retrying
            driver.refresh()  # Reload page to try again
            attempts += 1

        except Exception as e:
            print(f"🚨 Error while scraping reviews: {e}")
            # Remember what this attempt managed to collect before it failed
            if len(attempt_reviews) > len(best_partial):
                best_partial = attempt_reviews
            time.sleep(random.uniform(3, 6))
            driver.refresh()
            attempts += 1

    if best_partial:
        print(f"❌ Maximum retries reached. Returning {len(best_partial)} partially extracted reviews.")
    else:
        print("❌ Maximum retries reached. No reviews extracted.")
    return best_partial



In [6]:
# --------------------------
# Scrape Reviews (SELENIUM GRID + PARALLEL) - MAIN SCRIPT
# --------------------------
import os
import time
import csv
import random
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


# ---------- Setup ----------
# Property listing produced earlier; each record supplies hotel_name and hotel_url
csv_filename = "property_data.csv"
df = pd.read_csv(csv_filename)
property_data = df.to_dict(orient="records")

batch_size = 1
# Ceiling division: a trailing partial batch still counts as one batch
num_batches = (len(property_data) + batch_size - 1) // batch_size

log_file = "completed_batches.log"
output_folder = "scraped_reviews"
# exist_ok avoids the check-then-create race and is a no-op when the folder exists
os.makedirs(output_folder, exist_ok=True)

# Zero-based indices of batches already finished in a previous run.
# Blank or non-numeric lines (e.g. a trailing newline or a garbled write)
# are ignored instead of aborting the whole cell with ValueError.
completed_batches = set()
if os.path.exists(log_file):
    with open(log_file, "r") as f:
        completed_batches = {
            int(entry) for entry in (line.strip() for line in f) if entry.isdigit()
        }

SELENIUM_GRID_URL = "http://localhost:4445"

# Pool of desktop Chrome user-agent strings; one is picked at random per browser session
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
]

# ---------- Parallel Scraping Function ----------
def scrape_batch(i, batch, batch_output):
    """
    Scrape the reviews of every hotel in one batch through a Selenium Grid session.

    Args:
        i: Zero-based batch index; shown as i+1 in messages and written as i
            to the completion log.
        batch: Sequence of property records, each with "hotel_name" and
            "hotel_url" keys.
        batch_output: Path of the CSV file that receives the batch's reviews.

    Side effects:
        Opens a remote browser session and always closes it again, even when
        scraping raises, so failed batches do not leave sessions occupying
        Grid slots. When at least one review was collected, writes
        `batch_output` and appends `i` to `log_file`.

    Any exception is caught and printed, so a failing batch never propagates
    out of its worker thread.
    """
    try:
        print(f"[Batch {i+1}] Starting scrape...")

        options = Options()
        options.add_argument("--incognito")
        options.add_argument(f"user-agent={random.choice(USER_AGENTS)}")

        driver = webdriver.Remote(
            command_executor=SELENIUM_GRID_URL,
            options=options
        )

        all_hotel_reviews = []
        try:
            time.sleep(2)

            for hotel in batch:
                hotel_name = hotel["hotel_name"]
                hotel_url = hotel["hotel_url"]

                print(f"[Batch {i+1}] Scraping: {hotel_name}")

                driver.get(hotel_url)

                time.sleep(3)

                reviews = scrape_booking_reviews(driver)

                # Tag every review with the hotel it belongs to
                for review in reviews:
                    review["hotel_name"] = hotel_name
                    all_hotel_reviews.append(review)

                print(f"[Batch {i+1}] Scraped {len(reviews)} reviews for {hotel_name}")
        finally:
            # Release the Grid session whether or not scraping succeeded
            driver.quit()

        if all_hotel_reviews:
            with open(batch_output, "w", newline="", encoding="utf-8") as file:
                fieldnames = list(all_hotel_reviews[0].keys())
                writer = csv.DictWriter(file, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(all_hotel_reviews)

            print(f"[Batch {i+1}] Saved to '{batch_output}'.")

            # Only batches that produced a file are marked complete, so batches
            # with no reviews are attempted again on the next run
            with open(log_file, "a") as f:
                f.write(f"{i}\n")
    except Exception as e:
        print(f"[Batch {i+1}] 🚨 Error: {e}")

# ---------- Run Parallel Batches ----------
# One task per batch that is not yet in the completion log. Leaving the `with`
# block waits for every submitted task to finish.
tasks = []
with ThreadPoolExecutor(max_workers=15) as executor:  # Adjust based on your Grid capacity
    for i in range(num_batches):
        if i not in completed_batches:
            # Records belonging to batch i, and the CSV that batch writes to
            start, stop = i * batch_size, (i + 1) * batch_size
            batch = property_data[start:stop]
            batch_output = os.path.join(output_folder, f"booking_reviews_batch_{i+1}.csv")

            future = executor.submit(scrape_batch, i, batch, batch_output)
            tasks.append(future)
        else:
            print(f"[Batch {i+1}] Skipped (already completed).")

print("\n✅ Scraping submitted for all remaining batches.")


[Batch 1] Skipped (already completed).
[Batch 2] Skipped (already completed).
[Batch 3] Skipped (already completed).
[Batch 4] Skipped (already completed).
[Batch 5] Skipped (already completed).
[Batch 6] Skipped (already completed).
[Batch 7] Skipped (already completed).
[Batch 8] Skipped (already completed).
[Batch 9] Skipped (already completed).
[Batch 10] Skipped (already completed).
[Batch 11] Skipped (already completed).
[Batch 12] Skipped (already completed).
[Batch 13] Skipped (already completed).
[Batch 14] Skipped (already completed).
[Batch 15] Skipped (already completed).
[Batch 16] Skipped (already completed).
[Batch 17] Skipped (already completed).
[Batch 18] Skipped (already completed).
[Batch 19] Skipped (already completed).
[Batch 20] Skipped (already completed).
[Batch 21] Skipped (already completed).
[Batch 22] Skipped (already completed).
[Batch 23] Skipped (already completed).
[Batch 24] Skipped (already completed).
[Batch 25] Skipped (already completed).
[Batch 26

In [26]:
# Ad-hoc inspection: load booking_reviews_batch_1_old.csv and display the row at
# integer position 11760. The recorded output shows NaN for both hotel_id and
# review_score on that row.
# NOTE(review): this rebinds the global `df`, which the review-scraping setup
# cell also assigns — confirm no other cell relies on the earlier value.
df = pd.read_csv("booking_reviews_batch_1_old.csv", encoding="utf-8")
df.iloc[11760]


hotel_id                                                              NaN
reviewer_name                                                      Sergey
reviewer_country                                                   Russia
review_room_name                          Deluxe Plus Double or Twin Room
review_num_nights                                                     3.0
review_stay_date                                             October 2023
review_traveler_type                                               Family
review_score                                                          NaN
review_title                                  Не соответствует ожиданиям.
review_positive_text             Близко к метро, удобный спокойный район.
review_negative_text    Старый отель, не соответствует описанию и цене...
hotel_name              Rendezvous Hotel Singapore by Far East Hospita...
Name: 11760, dtype: object

### 4. Merging Property Data

In [None]:
import pandas as pd
import json

# Load the property listing table and the per-property details table
property_listing_df = pd.read_csv("property_data.csv")
property_details_df = pd.read_csv("booking_property_details.csv")

# Left join on hotel_name: every listing row is kept, with detail columns
# attached wherever the hotel name matches
property_df = pd.merge(
    property_listing_df,
    property_details_df,
    how="left",
    on="hotel_name",
)

# Persist the merged table without the DataFrame index
csv_filename = "property_df.csv"
property_df.to_csv(csv_filename, index=False)
print(f"Merged data saved to '{csv_filename}'.")


Merged data saved to 'property_df.csv'.


### 5. Merging Reviews Data

In [7]:
import pandas as pd
import os
from glob import glob

# Folder holding the per-batch review CSVs, and the file the merged result goes to
folder_path = "scraped_reviews"
combined_csv_filename = "booking_reviews_combined.csv"

# Find all matching CSV files. glob gives no ordering guarantee, so sort the
# paths to make the row order of the merged file reproducible between runs.
csv_files = sorted(glob(os.path.join(folder_path, "booking_reviews_batch_*.csv")))
if not csv_files:
    # pd.concat on an empty sequence fails with an unhelpful message; fail clearly instead
    raise FileNotFoundError(
        f"No 'booking_reviews_batch_*.csv' files found in '{folder_path}'."
    )

# Read and concatenate all CSV files into one DataFrame
combined_df = pd.concat((pd.read_csv(file) for file in csv_files), ignore_index=True)

# Save combined DataFrame to a new CSV file and report the file actually
# written (the stale global `csv_filename` from an earlier cell named a
# different file)
combined_df.to_csv(combined_csv_filename, index=False)
print(f"Merged reviews saved to '{combined_csv_filename}'.")
combined_df.head()

Merged book saved to 'property_data.csv'.


Unnamed: 0,reviewer_name,reviewer_country,review_room_name,review_num_nights,review_stay_date,review_traveler_type,review_score,review_title,review_positive_text,review_negative_text,hotel_name
0,Gabriela,United States,"Deluxe Twin, Window",2.0,January 2025,Family,10.0,Comfortable and convenient,This was a last minute booking so we arrived t...,,A Hotel Joo Chiat
1,James,Australia,"Superior Double, No Window",1.0,February 2025,Couple,8.0,Very good,no breakfast,water in foyer,A Hotel Joo Chiat
2,A,Singapore,"Superior Double, No Window",4.0,February 2025,Couple,7.0,Overall I'm quite happy with the stay except f...,The location is good.,A little costly for a budget hotel.,A Hotel Joo Chiat
3,Sumarni,Australia,"Superior Twin, No Window",3.0,February 2025,Group,8.0,Good and achieve my expectation,Easy access to anywhere,The fire alarm and no communication with the p...,A Hotel Joo Chiat
4,Samsul,Australia,"Deluxe Double, Window",6.0,February 2025,Couple,8.0,Need room makeup… every stay..,we are regularly…the hotel need to overhaul…\n...,Paid extra for little bit big & View..\nwith W...,A Hotel Joo Chiat
