In [1]:
# Hotel Web Scraper - Setup and Testing
# Initial setup and environment verification

import sys
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime, timedelta
import time
import logging

# Display Python version and key package versions
print(f"Python Version: {sys.version}")
print(f"Pandas Version: {pd.__version__}")
print(f"Requests Version: {requests.__version__}")

# Create basic project structure
import os

directories = [
    'scrapers',
    'data',
    'config',
    'logs',
    'exports'
]

for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

print("\nHotel Web Scraper environment setup complete!")


Python Version: 3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]
Pandas Version: 2.2.3
Requests Version: 2.32.3
Created directory: scrapers
Created directory: data
Created directory: config
Created directory: logs
Created directory: exports

Hotel Web Scraper environment setup complete!


In [2]:
# Development workflow helpers for Hotel Web Scraper
def save_and_commit(message="Update Hotel Web Scraper notebook"):
    """Helper function to save work and commit to git"""
    print("Remember to:")
    print("1. Save notebook (Ctrl+S)")
    print("2. In terminal: git add .")
    print(f"3. In terminal: git commit -m '{message}'")
    print("4. In terminal: git push")

# Create config template
config_template = {
    "scraping": {
        "delay_min": 1,
        "delay_max": 3,
        "timeout": 30,
        "max_retries": 3
    },
    "database": {
        "type": "sqlite",
        "path": "data/hotel_prices.db"
    },
    "targets": {
        "booking_com": True,
        "hotels_com": True,
        "expedia": True
    }
}

# Save config file
import json
with open('config/settings.json', 'w') as f:
    json.dump(config_template, f, indent=2)

print("Configuration template created at config/settings.json")
print("Hotel Web Scraper development helpers loaded!")


Configuration template created at config/settings.json
Hotel Web Scraper development helpers loaded!


In [13]:
# Hotel Web Scraper - Load Hotel Data from Excel File
# Read hotel information including booking URLs from column K

import pandas as pd
import os

# File configuration
excel_file_path = r'D:\Hotel Pricing Scraper\NYC Hotel Pricing Crawler.xlsx'
sheet_name = 'Hotel Data'

def load_hotel_data():
    """Load hotel data from Excel file including booking URLs from column K"""
    
    print("LOADING HOTEL DATA FROM EXCEL")
    print("=" * 40)
    print(f"File: {excel_file_path}")
    print(f"Sheet: {sheet_name}")
    print(f"Booking URLs: Column K")
    
    try:
        # Check if file exists
        if not os.path.exists(excel_file_path):
            print(f"ERROR: File not found at {excel_file_path}")
            return None
        
        # Read the Hotel Data sheet
        hotels_df = pd.read_excel(excel_file_path, 
                                 sheet_name=sheet_name, 
                                 header=1)  # Headers are in row 2
        
        # Clean the data - remove empty rows
        hotels_df = hotels_df.dropna(subset=['Hotel Name'])
        
        print(f"Successfully loaded {len(hotels_df)} hotels")
        print()
        
        # Display hotel information with booking URLs from column K
        ready_to_scrape = 0
        
        for idx, hotel in hotels_df.iterrows():
            print(f"{idx+1}. {hotel['Hotel Name']}")
            print(f"   Address: {hotel['Address']}, {hotel['City']}, {hotel['State']} {hotel['ZIP Code']}")
            print(f"   Rooms: {hotel['Number of rooms']}, Suites: {hotel['Suites']}")
            print(f"   Website: {hotel['Website']}")
            
            # Get booking URL from column K
            booking_url = hotel.get('Booking URL', '')
            if pd.notna(booking_url) and str(booking_url).strip() and str(booking_url).strip() != '':
                print(f"   Booking URL: {booking_url}")
                print(f"   Status: READY TO SCRAPE")
                ready_to_scrape += 1
            else:
                print(f"   Booking URL: Not available")
                print(f"   Status: NEEDS BOOKING URL")
            print()
        
        # Save to local CSV for easy access
        hotels_df.to_csv('data/loaded_hotel_data.csv', index=False)
        print("Hotel data saved to: data/loaded_hotel_data.csv")
        
        print("=" * 40)
        print("SCRAPING READINESS SUMMARY")
        print(f"Total hotels: {len(hotels_df)}")
        print(f"Ready to scrape: {ready_to_scrape}")
        print(f"Need booking URLs: {len(hotels_df) - ready_to_scrape}")
        
        return hotels_df
        
    except Exception as e:
        print(f"ERROR loading hotel data: {str(e)}")
        return None

# Load the hotel data
hotels_df = load_hotel_data()

if hotels_df is not None:
    # Show which specific hotels are ready
    ready_hotels = hotels_df[hotels_df['Booking URL'].notna() & 
                           (hotels_df['Booking URL'].astype(str).str.strip() != '') &
                           (hotels_df['Booking URL'].astype(str).str.strip() != 'nan')]
    
    if len(ready_hotels) > 0:
        print("\nHOTELS READY FOR SCRAPING:")
        for _, hotel in ready_hotels.iterrows():
            print(f"✓ {hotel['Hotel Name']}")
    else:
        print("\nNo hotels ready for scraping yet - need to add booking URLs to column K")
else:
    print("Failed to load hotel data") 


LOADING HOTEL DATA FROM EXCEL
File: D:\Hotel Pricing Scraper\NYC Hotel Pricing Crawler.xlsx
Sheet: Hotel Data
Booking URLs: Column K
Successfully loaded 6 hotels

1. Pendry Manhattan West
   Address: 438 W 33rd St, New York, NY 10001
   Rooms: 164, Suites: 30
   Website: https://www.pendry.com/manhattan-west/
   Booking URL: https://www.pendry.com/manhattan-west/booking/#/booking/step-1?data=('hBhd!'pendry-manhattan-west'~ae624dt634fs.~rBat!2~cn!0~cg.~al9po1co1gp1rn.)Ary1rk1re.~rr*)Aax!0~cy1ds!('pe1ls1as1st*)~my9se1ce1ne*)*!null.8%5D1*~4%2F2025'~6!'07%2F08!%5B9!false~A%5D~Bs8('%01BA98641.*_ 
   Status: READY TO SCRAPE

2. The Mercer
   Address: 147 Mercer St, New York, NY 10012
   Rooms: 73, Suites: 73
   Website: https://mercerhotel.com/
   Booking URL: Not available
   Status: NEEDS BOOKING URL

3. Hotel Barriere Fouquets
   Address: 456 Greenwich St, New York, NY 10013
   Rooms: 97, Suites: 31
   Website: https://www.hotelsbarriere.com/en/collection-fouquet-s/new-york/fouquet-s-new-

In [14]:
# Hotel Web Scraper - Test Booking URL Accessibility
# Verify all booking URLs are accessible and return booking data

import requests
import time
from bs4 import BeautifulSoup

def test_booking_urls(hotels_df):
    """Test accessibility of all booking URLs"""
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    
    print("TESTING BOOKING URL ACCESSIBILITY")
    print("=" * 45)
    
    results = []
    
    # Filter hotels that have booking URLs
    ready_hotels = hotels_df[hotels_df['Booking URL'].notna() & 
                           (hotels_df['Booking URL'].astype(str).str.strip() != '') &
                           (hotels_df['Booking URL'].astype(str).str.strip() != 'nan')]
    
    for _, hotel in ready_hotels.iterrows():
        hotel_name = hotel['Hotel Name']
        booking_url = hotel['Booking URL']
        
        print(f"\nTesting: {hotel_name}")
        print(f"URL: {booking_url[:100]}...")  # Truncate long URLs
        
        try:
            start_time = time.time()
            response = requests.get(booking_url, headers=headers, timeout=20)
            response_time = time.time() - start_time
            
            print(f"   Status Code: {response.status_code}")
            print(f"   Response Time: {response_time:.2f}s")
            print(f"   Content Size: {len(response.content):,} bytes")
            
            # Quick analysis of content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Look for pricing indicators
            price_patterns = soup.find_all(text=re.compile(r'\$\d+'))
            room_elements = len(soup.find_all(attrs={'class': re.compile(r'room|suite|rate', re.I)}))
            
            print(f"   Price indicators: {len(price_patterns)}")
            print(f"   Room/rate elements: {room_elements}")
            
            # Determine scraping readiness
            if response.status_code == 200:
                if len(price_patterns) > 0 or room_elements > 0:
                    status = "EXCELLENT - Ready to scrape"
                else:
                    status = "GOOD - May need Selenium for dynamic content"
            else:
                status = f"WARNING - Status {response.status_code}"
            
            print(f"   Assessment: {status}")
            
            results.append({
                'hotel': hotel_name,
                'status_code': response.status_code,
                'response_time': response_time,
                'price_indicators': len(price_patterns),
                'room_elements': room_elements,
                'assessment': status
            })
            
        except requests.exceptions.Timeout:
            print(f"   TIMEOUT - Request took too long")
            results.append({
                'hotel': hotel_name,
                'assessment': 'TIMEOUT'
            })
        except Exception as e:
            print(f"   ERROR: {str(e)}")
            results.append({
                'hotel': hotel_name,
                'assessment': f'ERROR: {str(e)}'
            })
        
        time.sleep(2)  # Be respectful between requests
    
    return results

# Test all booking URLs
test_results = test_booking_urls(hotels_df)

print(f"\n{'='*45}")
print("BOOKING URL TEST SUMMARY:")
successful_tests = [r for r in test_results if r.get('status_code') == 200]
print(f"Successful connections: {len(successful_tests)}/{len(test_results)}")

for result in test_results:
    print(f"{result['hotel']}: {result['assessment']}")

print(f"\nReady to begin scraper development!")


TESTING BOOKING URL ACCESSIBILITY

Testing: Pendry Manhattan West
URL: https://www.pendry.com/manhattan-west/booking/#/booking/step-1?data=('hBhd!'pendry-manhattan-west'~a...
   Status Code: 200
   Response Time: 0.10s
   Content Size: 296,247 bytes
   Price indicators: 0
   Room/rate elements: 10
   Assessment: EXCELLENT - Ready to scrape


  price_patterns = soup.find_all(text=re.compile(r'\$\d+'))



Testing: Hotel Barriere Fouquets
URL: https://reservations.hotelsbarriere.com/?adult=2&arrive=2025-07-02&chain=32004&child=0&currency=USD&...
   Status Code: 200
   Response Time: 1.03s
   Content Size: 539,293 bytes
   Price indicators: 1
   Room/rate elements: 0
   Assessment: EXCELLENT - Ready to scrape

Testing: Casa Cipriani
URL: https://reservations.casaciprianinewyork.com/?adult=2&arrive=2025-07-02&chain=27604&child=0&currency...
   Status Code: 200
   Response Time: 0.97s
   Content Size: 441,016 bytes
   Price indicators: 2
   Room/rate elements: 17
   Assessment: EXCELLENT - Ready to scrape

Testing: The Mark
URL: https://be.synxis.com/?_ga=2.160812469.948820205.1751430792-153109825.1751227550&_gac=1.19414090.175...
   Status Code: 200
   Response Time: 0.12s
   Content Size: 212 bytes
   Price indicators: 0
   Room/rate elements: 0
   Assessment: GOOD - May need Selenium for dynamic content

Testing: Baccarat Hotel and Residences 
URL: https://www.baccarathotels.com/book?cu

In [15]:
# Hotel Web Scraper - HTTP Scraper MVP
# Build scraper for the 3 HTTP-ready hotels

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
from datetime import datetime

class HTTPHotelScraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        
        # Hotels ready for HTTP scraping
        self.http_ready_hotels = ['Pendry Manhattan West', 'Hotel Barriere Fouquets', 'Casa Cipriani']
        
    def scrape_hotel_prices(self, hotel_name, booking_url):
        """Scrape pricing data from a hotel booking page"""
        
        print(f"\nScraping: {hotel_name}")
        print(f"URL: {booking_url[:80]}...")
        
        try:
            response = requests.get(booking_url, headers=self.headers, timeout=20)
            
            if response.status_code != 200:
                return {'error': f'HTTP {response.status_code}'}
            
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Extract pricing information
            prices = self.extract_prices(soup)
            rooms = self.extract_room_info(soup)
            
            result = {
                'hotel_name': hotel_name,
                'scrape_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'url': booking_url,
                'prices_found': len(prices),
                'rooms_found': len(rooms),
                'prices': prices,
                'rooms': rooms,
                'status': 'SUCCESS'
            }
            
            print(f"   Found: {len(prices)} prices, {len(rooms)} room types")
            return result
            
        except Exception as e:
            print(f"   ERROR: {str(e)}")
            return {
                'hotel_name': hotel_name,
                'error': str(e),
                'status': 'ERROR'
            }
    
    def extract_prices(self, soup):
        """Extract price information from the page"""
        prices = []
        
        # Look for price patterns - various formats
        price_patterns = [
            r'\$[\d,]+\.?\d*',  # $500, $1,200.50
            r'USD\s*[\d,]+',    # USD 500
            r'[\d,]+\s*USD'     # 500 USD
        ]
        
        for pattern in price_patterns:
            matches = soup.find_all(string=re.compile(pattern))
            for match in matches:
                # Clean and extract price
                price_text = re.search(pattern, match)
                if price_text:
                    clean_price = re.sub(r'[^\d.]', '', price_text.group())
                    if clean_price:
                        try:
                            prices.append(float(clean_price))
                        except ValueError:
                            continue
        
        # Remove duplicates and sort
        prices = sorted(list(set(prices)))
        return prices
    
    def extract_room_info(self, soup):
        """Extract room/suite information"""
        rooms = []
        
        # Look for room-related elements
        room_selectors = [
            '[class*="room"]', '[class*="suite"]', '[class*="accommodation"]',
            '[data-room]', '[data-suite]', 'h2, h3, h4'  # Headers often contain room names
        ]
        
        for selector in room_selectors:
            elements = soup.select(selector)
            for element in elements:
                text = element.get_text(strip=True)
                if text and any(keyword in text.lower() for keyword in ['room', 'suite', 'king', 'queen', 'deluxe', 'standard']):
                    if len(text) < 100:  # Avoid very long text blocks
                        rooms.append(text)
        
        # Remove duplicates
        rooms = list(set(rooms))
        return rooms
    
    def scrape_all_ready_hotels(self, hotels_df):
        """Scrape all HTTP-ready hotels"""
        
        print("HTTP SCRAPER MVP - SCRAPING READY HOTELS")
        print("=" * 50)
        
        results = []
        
        # Filter to HTTP-ready hotels
        ready_hotels = hotels_df[hotels_df['Hotel Name'].isin(self.http_ready_hotels) &
                                hotels_df['Booking URL'].notna() &
                                (hotels_df['Booking URL'].astype(str).str.strip() != '')]
        
        print(f"Scraping {len(ready_hotels)} HTTP-ready hotels")
        
        for _, hotel in ready_hotels.iterrows():
            result = self.scrape_hotel_prices(hotel['Hotel Name'], hotel['Booking URL'])
            results.append(result)
            
            # Be respectful - pause between requests
            time.sleep(3)
        
        return results

# Initialize scraper and run
scraper = HTTPHotelScraper()
scraping_results = scraper.scrape_all_ready_hotels(hotels_df)

print(f"\n{'='*50}")
print("SCRAPING RESULTS SUMMARY:")
for result in scraping_results:
    if result['status'] == 'SUCCESS':
        print(f"✓ {result['hotel_name']}: {result['prices_found']} prices, {result['rooms_found']} rooms")
    else:
        print(f"✗ {result['hotel_name']}: {result.get('error', 'Unknown error')}")

print(f"\nMVP HTTP scraper complete!")


HTTP SCRAPER MVP - SCRAPING READY HOTELS
Scraping 3 HTTP-ready hotels

Scraping: Pendry Manhattan West
URL: https://www.pendry.com/manhattan-west/booking/#/booking/step-1?data=('hBhd!'pend...
   Found: 0 prices, 0 room types

Scraping: Hotel Barriere Fouquets
URL: https://reservations.hotelsbarriere.com/?adult=2&arrive=2025-07-02&chain=32004&c...
   Found: 1 prices, 0 room types

Scraping: Casa Cipriani
URL: https://reservations.casaciprianinewyork.com/?adult=2&arrive=2025-07-02&chain=27...
   Found: 2 prices, 9 room types

SCRAPING RESULTS SUMMARY:
✓ Pendry Manhattan West: 0 prices, 0 rooms
✓ Hotel Barriere Fouquets: 1 prices, 0 rooms
✓ Casa Cipriani: 2 prices, 9 rooms

MVP HTTP scraper complete!


In [18]:
# Hotel Web Scraper - Simplified Data Check
# Quick assessment of what we can actually scrape

def simple_data_check(hotels_df):
    """Simple check to see if sites have scrapeable data"""
    
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    
    print("SIMPLE DATA CHECK")
    print("=" * 30)
    
    ready_hotels = hotels_df[hotels_df['Booking URL'].notna() & 
                           (hotels_df['Booking URL'].astype(str).str.strip() != '')]
    
    for _, hotel in ready_hotels.iterrows():  # Fixed: changed * to _
        print(f"\n{hotel['Hotel Name']}:")
        
        try:
            response = requests.get(hotel['Booking URL'], headers=headers, timeout=15)
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Simple checks
            has_prices = len(soup.find_all(string=re.compile(r'\$[0-9,]+\.[0-9]{2}'))) > 0
            has_rooms = len(soup.find_all(string=re.compile(r'[Ss]uite|[Rr]oom', re.I))) > 5
            page_size = len(response.content)
            
            print(f"   Has visible prices: {has_prices}")
            print(f"   Has room info: {has_rooms}")
            print(f"   Page size: {page_size:,} bytes")
            
            if page_size < 50000 and not has_prices:
                print(f"   Assessment: Likely needs Selenium")
            elif has_prices and has_rooms:
                print(f"   Assessment: Good for HTTP scraping")
            else:
                print(f"   Assessment: Mixed - may need hybrid approach")
                
        except Exception as e:
            print(f"   Error: {str(e)}")
        
        time.sleep(1)

# Run simplified check
simple_data_check(hotels_df)


SIMPLE DATA CHECK

Pendry Manhattan West:
   Has visible prices: False
   Has room info: False
   Page size: 5,451 bytes
   Assessment: Likely needs Selenium

Hotel Barriere Fouquets:
   Has visible prices: False
   Has room info: False
   Page size: 1,029 bytes
   Assessment: Likely needs Selenium

Casa Cipriani:
   Has visible prices: False
   Has room info: False
   Page size: 1,035 bytes
   Assessment: Likely needs Selenium

The Mark:
   Has visible prices: False
   Has room info: False
   Page size: 903 bytes
   Assessment: Likely needs Selenium

Baccarat Hotel and Residences :
   Has visible prices: False
   Has room info: False
   Page size: 1,516 bytes
   Assessment: Likely needs Selenium


In [19]:
# Hotel Web Scraper - Selenium Setup
# Install and configure Selenium for dynamic content scraping

# First, install selenium if not already installed
import subprocess
import sys

def install_selenium():
    """Install selenium package"""
    try:
        import selenium
        print("Selenium already installed")
        return True
    except ImportError:
        print("Installing Selenium...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "selenium"])
        print("Selenium installed successfully")
        return True

# Install selenium
install_selenium()

# Import selenium components
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

class SeleniumHotelScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver with options"""
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # Run in background
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            print("Chrome driver initialized successfully")
            return True
        except Exception as e:
            print(f"Error setting up Chrome driver: {str(e)}")
            print("You may need to install ChromeDriver")
            return False
    
    def test_selenium_access(self, hotel_name, booking_url):
        """Test if we can access booking page with Selenium"""
        
        print(f"\nTesting Selenium access: {hotel_name}")
        print(f"URL: {booking_url[:60]}...")
        
        try:
            self.driver.get(booking_url)
            time.sleep(5)  # Wait for page to load
            
            # Check what we can find now
            page_title = self.driver.title
            page_source_length = len(self.driver.page_source)
            
            # Look for price elements
            price_elements = self.driver.find_elements(By.XPATH, "//*[contains(text(), '$')]")
            room_elements = self.driver.find_elements(By.XPATH, "//*[contains(text(), 'room') or contains(text(), 'suite')]")
            
            print(f"   Page title: {page_title}")
            print(f"   Page source length: {page_source_length:,} characters")
            print(f"   Price elements found: {len(price_elements)}")
            print(f"   Room elements found: {len(room_elements)}")
            
            if len(price_elements) > 0 or len(room_elements) > 0:
                print(f"   Assessment: SUCCESS - Data available with Selenium")
                return True
            else:
                print(f"   Assessment: May need more time or interaction")
                return False
                
        except Exception as e:
            print(f"   Error: {str(e)}")
            return False
    
    def close(self):
        """Close the driver"""
        if hasattr(self, 'driver'):
            self.driver.quit()

# Test Selenium setup
print("SELENIUM SETUP AND TESTING")
print("=" * 40)

scraper = SeleniumHotelScraper()

# Test with one hotel first
if hasattr(scraper, 'driver'):
    test_hotel = hotels_df.iloc[0]  # Test with first hotel
    scraper.test_selenium_access(test_hotel['Hotel Name'], test_hotel['Booking URL'])
    scraper.close()
else:
    print("Selenium setup failed - will need to troubleshoot driver installation")
    

Installing Selenium...
Selenium installed successfully
SELENIUM SETUP AND TESTING
Chrome driver initialized successfully

Testing Selenium access: Pendry Manhattan West
URL: https://www.pendry.com/manhattan-west/booking/#/booking/step...
   Page title: Booking | Manhattan West
   Page source length: 586,811 characters
   Price elements found: 2
   Room elements found: 5
   Assessment: SUCCESS - Data available with Selenium


In [20]:
# Hotel Web Scraper - Test All Hotels with Selenium
# Test Selenium access across all our hotels

# Test all hotels with Selenium
def test_all_hotels_selenium(hotels_df):
    """Test Selenium access for all hotels with booking URLs"""
    
    scraper = SeleniumHotelScraper()
    
    if not hasattr(scraper, 'driver'):
        print("Selenium setup failed")
        return
    
    print("TESTING ALL HOTELS WITH SELENIUM")
    print("=" * 45)
    
    ready_hotels = hotels_df[hotels_df['Booking URL'].notna() & 
                           (hotels_df['Booking URL'].astype(str).str.strip() != '')]
    
    results = []
    
    for _, hotel in ready_hotels.iterrows():
        success = scraper.test_selenium_access(hotel['Hotel Name'], hotel['Booking URL'])
        results.append({
            'hotel': hotel['Hotel Name'],
            'success': success
        })
        
        time.sleep(3)  # Be respectful between requests
    
    scraper.close()
    
    print(f"\n{'='*45}")
    print("SELENIUM TEST SUMMARY:")
    successful = [r for r in results if r['success']]
    print(f"Successfully scraped: {len(successful)}/5 hotels")
    
    for result in results:
        status = "✓ SUCCESS" if result['success'] else "✗ NEEDS WORK"
        print(f"{status}: {result['hotel']}")
    
    return results

# Test all hotels
selenium_results = test_all_hotels_selenium(hotels_df)


Chrome driver initialized successfully
TESTING ALL HOTELS WITH SELENIUM

Testing Selenium access: Pendry Manhattan West
URL: https://www.pendry.com/manhattan-west/booking/#/booking/step...
   Page title: Booking | Manhattan West
   Page source length: 586,811 characters
   Price elements found: 2
   Room elements found: 5
   Assessment: SUCCESS - Data available with Selenium

Testing Selenium access: Hotel Barriere Fouquets
URL: https://reservations.hotelsbarriere.com/?adult=2&arrive=2025...
   Page title: 
   Page source length: 1,062 characters
   Price elements found: 0
   Room elements found: 0
   Assessment: May need more time or interaction

Testing Selenium access: Casa Cipriani
URL: https://reservations.casaciprianinewyork.com/?adult=2&arrive...
   Page title: 
   Page source length: 1,072 characters
   Price elements found: 0
   Room elements found: 0
   Assessment: May need more time or interaction

Testing Selenium access: The Mark
URL: https://be.synxis.com/?_ga=2.160812469

In [21]:
# Hotel Web Scraper - Build Functional Scraper
# Create working scraper for Pendry and Baccarat

class WorkingHotelScraper:
    def __init__(self):
        self.setup_driver()
        self.working_hotels = ['Pendry Manhattan West', 'Baccarat Hotel and Residences']
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
        
        self.driver = webdriver.Chrome(options=chrome_options)
    
    def scrape_hotel_data(self, hotel_name, booking_url):
        """Scrape actual pricing data from working hotels"""
        
        print(f"\nSCRAPING: {hotel_name}")
        print("=" * 40)
        
        try:
            self.driver.get(booking_url)
            time.sleep(8)  # Give more time for content to load
            
            # Extract prices
            price_elements = self.driver.find_elements(By.XPATH, "//*[contains(text(), '$')]")
            prices = []
            
            for element in price_elements:
                price_text = element.text
                # Extract actual dollar amounts
                price_matches = re.findall(r'\$([0-9,]+(?:\.[0-9]{2})?)', price_text)
                for match in price_matches:
                    try:
                        clean_price = float(match.replace(',', ''))
                        if 50 <= clean_price <= 5000:  # Reasonable hotel price range
                            prices.append(clean_price)
                    except ValueError:
                        continue
            
            # Extract room names
            room_elements = self.driver.find_elements(By.XPATH, "//*[contains(text(), 'room') or contains(text(), 'suite') or contains(text(), 'Room') or contains(text(), 'Suite')]")
            rooms = []
            
            for element in room_elements:
                room_text = element.text.strip()
                if room_text and len(room_text) < 100:  # Avoid long descriptions
                    if any(keyword in room_text.lower() for keyword in ['room', 'suite', 'king', 'queen', 'deluxe']):
                        rooms.append(room_text)
            
            # Remove duplicates
            unique_prices = sorted(list(set(prices)))
            unique_rooms = list(set(rooms))
            
            print(f"Extracted Prices: {unique_prices}")
            print(f"Number of unique prices: {len(unique_prices)}")
            print(f"Extracted Rooms ({len(unique_rooms)}):")
            for i, room in enumerate(unique_rooms[:10], 1):  # Show first 10
                print(f"   {i}. {room}")
            
            return {
                'hotel_name': hotel_name,
                'prices': unique_prices,
                'rooms': unique_rooms,
                'scrape_time': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error scraping {hotel_name}: {str(e)}")
            return {
                'hotel_name': hotel_name,
                'error': str(e),
                'status': 'ERROR'
            }
    
    def scrape_working_hotels(self, hotels_df):
        """Scrape data from our 2 working hotels"""
        
        results = []
        
        for hotel_name in self.working_hotels:
            hotel_data = hotels_df[hotels_df['Hotel Name'] == hotel_name]
            if not hotel_data.empty:
                booking_url = hotel_data.iloc[0]['Booking URL']
                result = self.scrape_hotel_data(hotel_name, booking_url)
                results.append(result)
        
        return results
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test the working scraper
print("FUNCTIONAL SCRAPER TEST")
print("=" * 30)

working_scraper = WorkingHotelScraper()
scraping_results = working_scraper.scrape_working_hotels(hotels_df)
working_scraper.close()

print(f"\nSCRAPING COMPLETE!")
print(f"Successfully scraped: {len([r for r in scraping_results if r['status'] == 'SUCCESS'])}/2 hotels")


FUNCTIONAL SCRAPER TEST

SCRAPING: Pendry Manhattan West
Extracted Prices: []
Number of unique prices: 0
Extracted Rooms (0):

SCRAPING COMPLETE!
Successfully scraped: 1/2 hotels


In [22]:
# Hotel Web Scraper - Improved Selenium Scraper
# Better handling of dynamic content and element detection

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

class ImprovedHotelScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver with better options"""
        chrome_options = Options()
        # Remove headless mode to see what's happening
        # chrome_options.add_argument('--headless')  # Comment out for debugging
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.wait = WebDriverWait(self.driver, 20)
    
    def scrape_with_debug(self, hotel_name, booking_url):
        """Scrape with detailed debugging to see what's happening"""
        
        print(f"\nDEBUG SCRAPING: {hotel_name}")
        print("=" * 50)
        
        try:
            print("Loading page...")
            self.driver.get(booking_url)
            
            print("Waiting for page to load...")
            time.sleep(10)  # Give plenty of time
            
            print(f"Page title: {self.driver.title}")
            print(f"Current URL: {self.driver.current_url}")
            
            # Take a screenshot for debugging (optional)
            # self.driver.save_screenshot(f'{hotel_name.replace(" ", "_")}_debug.png')
            
            # Try different methods to find content
            print("\n--- Method 1: Look for any text with $ ---")
            dollar_elements = self.driver.find_elements(By.XPATH, "//*[contains(text(), '$')]")
            print(f"Found {len(dollar_elements)} elements with '$'")
            for i, elem in enumerate(dollar_elements[:5]):  # Show first 5
                try:
                    print(f"   ${i+1}: '{elem.text}' (tag: {elem.tag_name})")
                except:
                    print(f"   ${i+1}: Could not read text")
            
            print("\n--- Method 2: Look for common price classes/IDs ---")
            price_selectors = [
                "[class*='price']", "[id*='price']", "[class*='rate']", "[id*='rate']",
                "[class*='cost']", "[class*='amount']", ".price", "#price"
            ]
            
            for selector in price_selectors:
                try:
                    elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    if elements:
                        print(f"   Selector '{selector}': {len(elements)} elements")
                        for elem in elements[:2]:  # Show first 2
                            try:
                                print(f"      Text: '{elem.text}'")
                            except:
                                print(f"      Could not read text")
                except:
                    continue
            
            print("\n--- Method 3: Look for room information ---")
            room_keywords = ['room', 'suite', 'king', 'queen', 'deluxe', 'standard']
            for keyword in room_keywords:
                elements = self.driver.find_elements(By.XPATH, f"//*[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), '{keyword}')]")
                if elements:
                    print(f"   Keyword '{keyword}': {len(elements)} elements")
                    for elem in elements[:3]:  # Show first 3
                        try:
                            text = elem.text.strip()
                            if text and len(text) < 100:
                                print(f"      '{text}'")
                        except:
                            continue
            
            print("\n--- Method 4: Check if page is still loading ---")
            # Look for loading indicators
            loading_indicators = self.driver.find_elements(By.XPATH, "//*[contains(@class, 'loading') or contains(@class, 'spinner') or contains(text(), 'Loading')]")
            print(f"Loading indicators: {len(loading_indicators)}")
            
            # Look for JavaScript errors in console
            try:
                logs = self.driver.get_log('browser')
                errors = [log for log in logs if log['level'] == 'SEVERE']
                if errors:
                    print(f"JavaScript errors: {len(errors)}")
                    for error in errors[:2]:
                        print(f"   {error['message']}")
            except:
                print("Could not check browser logs")
            
            print(f"\nPage source length: {len(self.driver.page_source):,} characters")
            
            return {
                'hotel_name': hotel_name,
                'status': 'DEBUG_COMPLETE',
                'dollar_elements': len(dollar_elements),
                'page_loaded': len(self.driver.page_source) > 10000
            }
            
        except Exception as e:
            print(f"Error during debug: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test with debugging
print("IMPROVED SCRAPER WITH DEBUG")
print("=" * 40)

debug_scraper = ImprovedHotelScraper()

# Test with Pendry first
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
debug_result = debug_scraper.scrape_with_debug('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nDEBUG RESULT: {debug_result}")

debug_scraper.close()


IMPROVED SCRAPER WITH DEBUG

DEBUG SCRAPING: Pendry Manhattan West
Loading page...
Waiting for page to load...
Page title: Booking | Manhattan West
Current URL: https://www.pendry.com/manhattan-west/booking/#/booking/step-1?data=('hBhd!'pendry-manhattan-west'~ae624dt634fs.~rBat!2~cn!0~cg.~al9po1co1gp1rn.)Ary1rk1re.~rr*)Aax!0~cy1ds!('pe1ls1as1st*)~my9se1ce1ne*)*!null.8%5D1*~4%2F2025'~6!'07%2F08!%5B9!false~A%5D~Bs8('%01BA98641.*_

--- Method 1: Look for any text with $ ---
Found 2 elements with '$'
   $1: '' (tag: script)
   $2: '' (tag: script)

--- Method 2: Look for common price classes/IDs ---
   Selector '[class*='rate']': 10 elements
      Text: ''
      Text: ''
   Selector '[id*='rate']': 1 elements
      Text: ''

--- Method 3: Look for room information ---
   Keyword 'room': 15 elements
   Keyword 'suite': 3 elements
      'Suite'
   Keyword 'king': 20 elements
   Keyword 'queen': 3 elements

--- Method 4: Check if page is still loading ---
Loading indicators: 2

Page source le

In [23]:
# Hotel Web Scraper - Enhanced Element Extraction
# Better methods to extract content from elements

class EnhancedScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')  # Keep visible for now
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        self.wait = WebDriverWait(self.driver, 30)
    
    def wait_for_content(self):
        """Wait for dynamic content to load"""
        print("Waiting for content to load...")
        
        # Wait for loading indicators to disappear
        try:
            self.wait.until_not(EC.presence_of_element_located((By.XPATH, "//*[contains(@class, 'loading') or contains(text(), 'Loading')]")))
            print("Loading indicators gone")
        except:
            print("Loading indicators timeout - continuing anyway")
        
        # Additional wait for content
        time.sleep(5)
        
        # Try to wait for specific content
        try:
            self.wait.until(EC.presence_of_element_located((By.XPATH, "//*[contains(@class, 'rate') or contains(@class, 'price')]")))
            print("Price elements detected")
        except:
            print("Price elements timeout - continuing anyway")
    
    def extract_content_advanced(self, hotel_name, booking_url):
        """Advanced content extraction with multiple methods"""
        
        print(f"\nADVANCED EXTRACTION: {hotel_name}")
        print("=" * 50)
        
        try:
            self.driver.get(booking_url)
            self.wait_for_content()
            
            # Method 1: Check element attributes and innerHTML
            print("\n--- Method 1: Extract from element attributes ---")
            rate_elements = self.driver.find_elements(By.CSS_SELECTOR, "[class*='rate']")
            
            for i, elem in enumerate(rate_elements[:5]):
                try:
                    # Try different ways to get content
                    text = elem.text
                    inner_html = elem.get_attribute('innerHTML')
                    value = elem.get_attribute('value')
                    data_price = elem.get_attribute('data-price')
                    
                    print(f"   Element {i+1}:")
                    if text: print(f"      Text: '{text}'")
                    if value: print(f"      Value: '{value}'")
                    if data_price: print(f"      Data-price: '{data_price}'")
                    if inner_html and len(inner_html) < 200:
                        print(f"      innerHTML: '{inner_html}'")
                except Exception as e:
                    print(f"      Error reading element: {str(e)}")
            
            # Method 2: Execute JavaScript to find content
            print("\n--- Method 2: JavaScript content extraction ---")
            try:
                # Look for price data in JavaScript
                price_script = """
                    var prices = [];
                    var elements = document.querySelectorAll('*');
                    for (var i = 0; i < elements.length; i++) {
                        var text = elements[i].textContent || elements[i].innerText || '';
                        var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                        if (matches) {
                            prices = prices.concat(matches);
                        }
                    }
                    return prices.slice(0, 10); // Return first 10
                """
                
                js_prices = self.driver.execute_script(price_script)
                print(f"   JavaScript found prices: {js_prices}")
                
                # Look for room data
                room_script = """
                    var rooms = [];
                    var elements = document.querySelectorAll('*');
                    for (var i = 0; i < elements.length; i++) {
                        var text = elements[i].textContent || elements[i].innerText || '';
                        if (text.length > 5 && text.length < 50 && 
                            (text.toLowerCase().includes('room') || 
                             text.toLowerCase().includes('suite') || 
                             text.toLowerCase().includes('king') || 
                             text.toLowerCase().includes('queen'))) {
                            rooms.push(text.trim());
                        }
                    }
                    return [...new Set(rooms)].slice(0, 10); // Return unique first 10
                """
                
                js_rooms = self.driver.execute_script(room_script)
                print(f"   JavaScript found rooms: {js_rooms}")
                
                return {
                    'hotel_name': hotel_name,
                    'js_prices': js_prices,
                    'js_rooms': js_rooms,
                    'status': 'SUCCESS'
                }
                
            except Exception as e:
                print(f"   JavaScript extraction error: {str(e)}")
                return {'hotel_name': hotel_name, 'status': 'JS_ERROR', 'error': str(e)}
            
        except Exception as e:
            print(f"Overall error: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test enhanced extraction
print("ENHANCED CONTENT EXTRACTION")
print("=" * 40)

enhanced_scraper = EnhancedScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
enhanced_result = enhanced_scraper.extract_content_advanced('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nENHANCED RESULT:")
print(f"Status: {enhanced_result['status']}")
if 'js_prices' in enhanced_result:
    print(f"Prices found: {enhanced_result['js_prices']}")
if 'js_rooms' in enhanced_result:
    print(f"Rooms found: {enhanced_result['js_rooms']}")

enhanced_scraper.close()


ENHANCED CONTENT EXTRACTION

ADVANCED EXTRACTION: Pendry Manhattan West
Waiting for content to load...
Loading indicators gone
Price elements detected

--- Method 1: Extract from element attributes ---
   Element 1:
   Element 2:
   Element 3:
   Element 4:
   Element 5:

--- Method 2: JavaScript content extraction ---
   JavaScript found prices: []
   JavaScript found rooms: ['Booking | Manhattan West', 'Suites', 'Garden Room', 'In-Room Dining', 'Room Type', 'Bedrooms', 'Room TypeAll Room  Suite  Accessible', 'All Room  Suite  Accessible', 'Room', 'Suite']

ENHANCED RESULT:
Status: SUCCESS
Prices found: []
Rooms found: ['Booking | Manhattan West', 'Suites', 'Garden Room', 'In-Room Dining', 'Room Type', 'Bedrooms', 'Room TypeAll Room  Suite  Accessible', 'All Room  Suite  Accessible', 'Room', 'Suite']


In [24]:
# Hotel Web Scraper - Date Selection and Search
# Add proper booking flow: select dates → search → extract prices

class DateAwareHotelScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')  # Keep visible for debugging
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)
    
    def select_dates_and_search(self):
        """Select check-in/check-out dates and trigger search"""
        
        print("Selecting dates and searching...")
        
        try:
            # Method 1: Look for date input fields
            date_inputs = self.driver.find_elements(By.CSS_SELECTOR, "input[type='date'], input[placeholder*='date'], input[class*='date']")
            print(f"Found {len(date_inputs)} date input fields")
            
            # Method 2: Look for calendar/datepicker elements
            calendar_elements = self.driver.find_elements(By.CSS_SELECTOR, "[class*='calendar'], [class*='datepicker'], [class*='date-picker']")
            print(f"Found {len(calendar_elements)} calendar elements")
            
            # Method 3: Look for check-in/check-out specific elements
            checkin_elements = self.driver.find_elements(By.XPATH, "//*[contains(@placeholder, 'check') or contains(@class, 'checkin') or contains(@class, 'check-in')]")
            checkout_elements = self.driver.find_elements(By.XPATH, "//*[contains(@placeholder, 'check') or contains(@class, 'checkout') or contains(@class, 'check-out')]")
            print(f"Found {len(checkin_elements)} check-in elements, {len(checkout_elements)} check-out elements")
            
            # Method 4: Look for search/availability buttons
            search_buttons = self.driver.find_elements(By.XPATH, "//button[contains(text(), 'Search') or contains(text(), 'Check') or contains(text(), 'Availability') or contains(@class, 'search')]")
            print(f"Found {len(search_buttons)} search buttons")
            
            # Try to interact with date fields
            if date_inputs:
                print("Attempting to set dates in date inputs...")
                try:
                    # Set check-in date (July 2, 2025)
                    date_inputs[0].clear()
                    date_inputs[0].send_keys("07/02/2025")
                    
                    if len(date_inputs) > 1:
                        # Set check-out date (July 3, 2025) 
                        date_inputs[1].clear()
                        date_inputs[1].send_keys("07/03/2025")
                    
                    print("Dates entered successfully")
                except Exception as e:
                    print(f"Error setting dates: {str(e)}")
            
            # Try to click search button
            if search_buttons:
                print("Clicking search button...")
                try:
                    search_buttons[0].click()
                    print("Search button clicked")
                    time.sleep(5)  # Wait for results
                except Exception as e:
                    print(f"Error clicking search: {str(e)}")
            
            return True
            
        except Exception as e:
            print(f"Error in date selection: {str(e)}")
            return False
    
    def scrape_with_dates(self, hotel_name, booking_url):
        """Complete scraping flow with date selection"""
        
        print(f"\nDATE-AWARE SCRAPING: {hotel_name}")
        print("=" * 50)
        
        try:
            # Step 1: Load page
            print("Step 1: Loading page...")
            self.driver.get(booking_url)
            time.sleep(5)
            
            # Step 2: Select dates and search
            print("Step 2: Selecting dates and searching...")
            dates_selected = self.select_dates_and_search()
            
            # Step 3: Wait for results
            print("Step 3: Waiting for results...")
            time.sleep(8)
            
            # Step 4: Extract prices after search
            print("Step 4: Extracting prices after search...")
            price_script = """
                var prices = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                    if (matches) {
                        prices = prices.concat(matches);
                    }
                }
                return [...new Set(prices)]; // Return unique prices
            """
            
            prices = self.driver.execute_script(price_script)
            print(f"Prices found after search: {prices}")
            
            # Step 5: Extract room info
            room_script = """
                var rooms = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    if (text.length > 5 && text.length < 80 && 
                        (text.toLowerCase().includes('room') || 
                         text.toLowerCase().includes('suite') || 
                         text.toLowerCase().includes('king') || 
                         text.toLowerCase().includes('queen') ||
                         text.toLowerCase().includes('deluxe'))) {
                        rooms.push(text.trim());
                    }
                }
                return [...new Set(rooms)].slice(0, 15);
            """
            
            rooms = self.driver.execute_script(room_script)
            clean_rooms = [room for room in rooms if not any(skip in room.lower() for skip in ['booking', 'manhattan', 'dining', 'type'])]
            
            print(f"Rooms found: {clean_rooms}")
            
            return {
                'hotel_name': hotel_name,
                'prices': prices,
                'rooms': clean_rooms,
                'dates_selected': dates_selected,
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error in date-aware scraping: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test date-aware scraping
print("DATE-AWARE SCRAPING TEST")
print("=" * 30)

date_scraper = DateAwareHotelScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
final_result = date_scraper.scrape_with_dates('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nFINAL RESULT:")
print(f"Status: {final_result['status']}")
if 'prices' in final_result:
    print(f"Prices: {final_result['prices']}")
if 'rooms' in final_result:
    print(f"Rooms: {final_result['rooms']}")

date_scraper.close()


DATE-AWARE SCRAPING TEST

DATE-AWARE SCRAPING: Pendry Manhattan West
Step 1: Loading page...
Step 2: Selecting dates and searching...
Selecting dates and searching...
Found 3 date input fields
Found 85 calendar elements
Found 0 check-in elements, 0 check-out elements
Found 2 search buttons
Attempting to set dates in date inputs...
Dates entered successfully
Clicking search button...
Error clicking search: Message: element not interactable
  (Session info: chrome=137.0.7151.122); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#elementnotinteractableexception
Stacktrace:
	GetHandleVerifier [0x0x7ff63b13cda5+78885]
	GetHandleVerifier [0x0x7ff63b13ce00+78976]
	(No symbol) [0x0x7ff63aef99fc]
	(No symbol) [0x0x7ff63af51c64]
	(No symbol) [0x0x7ff63af43654]
	(No symbol) [0x0x7ff63af78b8a]
	(No symbol) [0x0x7ff63af42f06]
	(No symbol) [0x0x7ff63af78da0]
	(No symbol) [0x0x7ff63afa122f]
	(No symbol) [0x0x7ff63af78963]
	(No symb

In [25]:
# Hotel Web Scraper - Complete Booking Flow (1 Person, 1 Night)
# Add guest selection: 1 adult, 0 children + 1-night stay

class CompleteBookingFlowScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)
    
    def set_guest_count_1_person(self):
        """Set guest count to 1 adult, 0 children"""
        
        print("Setting guest count: 1 adult, 0 children...")
        
        try:
            # Look for adult/guest selectors
            adult_selectors = [
                "select[name*='adult']", "input[name*='adult']",
                "select[class*='adult']", "input[class*='adult']",
                "select[name*='guest']", "input[name*='guest']",
                "//*[contains(text(), 'Adults')]/..//select",
                "//*[contains(text(), 'Adults')]/..//input",
                "//*[contains(text(), 'Guests')]/..//select"
            ]
            
            for selector in adult_selectors:
                try:
                    if selector.startswith("//"):
                        elements = self.driver.find_elements(By.XPATH, selector)
                    else:
                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    
                    if elements:
                        element = elements[0]
                        
                        # Try different methods to set to 1
                        if element.tag_name == 'select':
                            # Dropdown selection
                            from selenium.webdriver.support.ui import Select
                            select = Select(element)
                            select.select_by_value('1')
                            print(f"Set adults to 1 via dropdown: {selector}")
                            break
                        elif element.tag_name == 'input':
                            # Input field
                            element.clear()
                            element.send_keys('1')
                            print(f"Set adults to 1 via input: {selector}")
                            break
                            
                except Exception as e:
                    continue
            
            # Look for children selectors and set to 0
            children_selectors = [
                "select[name*='child']", "input[name*='child']",
                "select[class*='child']", "input[class*='child']",
                "//*[contains(text(), 'Children')]/..//select",
                "//*[contains(text(), 'Children')]/..//input"
            ]
            
            for selector in children_selectors:
                try:
                    if selector.startswith("//"):
                        elements = self.driver.find_elements(By.XPATH, selector)
                    else:
                        elements = self.driver.find_elements(By.CSS_SELECTOR, selector)
                    
                    if elements:
                        element = elements[0]
                        
                        if element.tag_name == 'select':
                            from selenium.webdriver.support.ui import Select
                            select = Select(element)
                            select.select_by_value('0')
                            print(f"Set children to 0 via dropdown: {selector}")
                            break
                        elif element.tag_name == 'input':
                            element.clear()
                            element.send_keys('0')
                            print(f"Set children to 0 via input: {selector}")
                            break
                            
                except Exception as e:
                    continue
            
            return True
            
        except Exception as e:
            print(f"Error setting guest count: {str(e)}")
            return False
    
    def select_dates_1_night(self):
        """Select 1-night stay: July 2-3, 2025"""
        
        print("Selecting 1-night stay (July 2-3, 2025)...")
        
        try:
            date_inputs = self.driver.find_elements(By.CSS_SELECTOR, "input[type='date'], input[placeholder*='date'], input[class*='date']")
            
            if date_inputs:
                print("Setting check-in: July 2, 2025")
                date_inputs[0].clear()
                date_inputs[0].send_keys("2025-07-02")
                
                if len(date_inputs) > 1:
                    print("Setting check-out: July 3, 2025 (1 night)")
                    date_inputs[1].clear()
                    date_inputs[1].send_keys("2025-07-03")
                
                return True
            else:
                print("No date inputs found")
                return False
                
        except Exception as e:
            print(f"Error setting dates: {str(e)}")
            return False
    
    def trigger_search(self):
        """Click search/availability button"""
        
        print("Triggering availability search...")
        
        try:
            search_selectors = [
                "//button[contains(text(), 'Search')]",
                "//button[contains(text(), 'Check Availability')]", 
                "//button[contains(text(), 'Find Rooms')]",
                "//input[@type='submit']",
                "//*[@class*='search' and (@type='button' or name()='button')]"
            ]
            
            for selector in search_selectors:
                try:
                    buttons = self.driver.find_elements(By.XPATH, selector)
                    if buttons:
                        print(f"Clicking search button: {selector}")
                        buttons[0].click()
                        time.sleep(5)
                        return True
                except:
                    continue
            
            print("No search button found")
            return False
            
        except Exception as e:
            print(f"Error triggering search: {str(e)}")
            return False
    
    def select_room_and_get_price(self):
        """Select a room type and get final price"""
        
        print("Looking for room selection options...")
        
        try:
            room_selectors = [
                "//button[contains(text(), 'Select')]",
                "//button[contains(text(), 'Book')]", 
                "//button[contains(text(), 'Choose')]",
                "//*[@class*='room']//button",
                "//*[@class*='rate']//button"
            ]
            
            room_buttons = []
            for selector in room_selectors:
                try:
                    buttons = self.driver.find_elements(By.XPATH, selector)
                    room_buttons.extend(buttons)
                except:
                    continue
            
            print(f"Found {len(room_buttons)} room selection buttons")
            
            if room_buttons:
                print("Selecting first available room...")
                room_buttons[0].click()
                time.sleep(3)
                
                # Extract final prices
                price_script = """
                    var prices = [];
                    var elements = document.querySelectorAll('*');
                    for (var i = 0; i < elements.length; i++) {
                        var text = elements[i].textContent || elements[i].innerText || '';
                        var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                        if (matches) {
                            prices = prices.concat(matches);
                        }
                    }
                    return [...new Set(prices)];
                """
                
                final_prices = self.driver.execute_script(price_script)
                print(f"Final prices after room selection: {final_prices}")
                return final_prices
            else:
                print("No room selection buttons found")
                return []
                
        except Exception as e:
            print(f"Error in room selection: {str(e)}")
            return []
    
    def complete_booking_flow_1_person(self, hotel_name, booking_url):
        """Complete booking flow: 1 person, 1 night"""
        
        print(f"\nCOMPLETE BOOKING FLOW: {hotel_name}")
        print("Criteria: 1 adult, 0 children, 1 night (July 2-3, 2025)")
        print("=" * 60)
        
        try:
            # Step 1: Load page
            print("Step 1: Loading booking page...")
            self.driver.get(booking_url)
            time.sleep(5)
            
            # Step 2: Set guest count
            print("Step 2: Setting guest count (1 person)...")
            guests_set = self.set_guest_count_1_person()
            
            # Step 3: Select dates
            print("Step 3: Selecting dates (1 night)...")
            dates_set = self.select_dates_1_night()
            
            # Step 4: Trigger search
            print("Step 4: Searching availability...")
            search_triggered = self.trigger_search()
            
            # Step 5: Wait for results
            print("Step 5: Waiting for room options...")
            time.sleep(8)
            
            # Step 6: Select room and get prices
            print("Step 6: Selecting room type...")
            final_prices = self.select_room_and_get_price()
            
            return {
                'hotel_name': hotel_name,
                'criteria': '1 adult, 1 night',
                'guests_set': guests_set,
                'dates_set': dates_set,
                'search_triggered': search_triggered, 
                'final_prices': final_prices,
                'price_count': len(final_prices),
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error in complete flow: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test complete 1-person booking flow
print("COMPLETE BOOKING FLOW: 1 PERSON, 1 NIGHT")
print("=" * 45)

complete_scraper = CompleteBookingFlowScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
complete_result = complete_scraper.complete_booking_flow_1_person('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nFINAL RESULT:")
for key, value in complete_result.items():
    print(f"{key}: {value}")

complete_scraper.close()

COMPLETE BOOKING FLOW: 1 PERSON, 1 NIGHT

COMPLETE BOOKING FLOW: Pendry Manhattan West
Criteria: 1 adult, 0 children, 1 night (July 2-3, 2025)
Step 1: Loading booking page...
Step 2: Setting guest count (1 person)...
Setting guest count: 1 adult, 0 children...
Step 3: Selecting dates (1 night)...
Selecting 1-night stay (July 2-3, 2025)...
Setting check-in: July 2, 2025
Setting check-out: July 3, 2025 (1 night)
Step 4: Searching availability...
Triggering availability search...
Clicking search button: //button[contains(text(), 'Check Availability')]
Clicking search button: //input[@type='submit']
No search button found
Step 5: Waiting for room options...
Step 6: Selecting room type...
Looking for room selection options...
Found 0 room selection buttons
No room selection buttons found

FINAL RESULT:
hotel_name: Pendry Manhattan West
criteria: 1 adult, 1 night
guests_set: True
dates_set: True
search_triggered: False
final_prices: []
price_count: 0
status: SUCCESS


In [26]:
# Hotel Web Scraper - Hotel-Specific Interactions
# Add Pendry's "View Rates & Reserve" button click

class HotelSpecificScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)
    
    def handle_pendry_specific(self):
        """Handle Pendry's specific 'View Rates & Reserve' button"""
        
        print("Looking for Pendry's 'View Rates & Reserve' button...")
        
        try:
            # Look for the specific button
            button_selectors = [
                "//button[contains(text(), 'View Rates & Reserve')]",
                "//button[contains(text(), 'View Rates')]",
                "//a[contains(text(), 'View Rates & Reserve')]",
                "//a[contains(text(), 'View Rates')]",
                "//*[contains(@class, 'reserve') and contains(text(), 'View')]"
            ]
            
            for selector in button_selectors:
                try:
                    buttons = self.driver.find_elements(By.XPATH, selector)
                    if buttons:
                        print(f"Found 'View Rates & Reserve' button: {selector}")
                        buttons[0].click()
                        print("Clicked 'View Rates & Reserve' button")
                        time.sleep(5)  # Wait for booking engine to load
                        return True
                except Exception as e:
                    print(f"Error with selector {selector}: {str(e)}")
                    continue
            
            print("'View Rates & Reserve' button not found")
            return False
            
        except Exception as e:
            print(f"Error handling Pendry button: {str(e)}")
            return False
    
    def pendry_complete_flow(self, hotel_name, booking_url):
        """Complete Pendry booking flow with special button"""
        
        print(f"\nPENDRY COMPLETE FLOW: {hotel_name}")
        print("=" * 50)
        
        try:
            # Step 1: Load page
            print("Step 1: Loading Pendry page...")
            self.driver.get(booking_url)
            time.sleep(5)
            
            # Step 2: Click "View Rates & Reserve" button
            print("Step 2: Clicking 'View Rates & Reserve'...")
            button_clicked = self.handle_pendry_specific()
            
            # Step 3: Now look for the booking interface
            print("Step 3: Looking for booking interface...")
            time.sleep(5)
            
            # Step 4: Set guest count (1 person)
            print("Step 4: Setting guest count...")
            # Try to find guest selectors after booking engine loads
            adult_elements = self.driver.find_elements(By.XPATH, "//*[contains(@class, 'adult') or contains(@name, 'adult') or contains(text(), 'Adult')]")
            print(f"Found {len(adult_elements)} adult-related elements")
            
            # Step 5: Set dates if needed
            print("Step 5: Checking dates...")
            date_inputs = self.driver.find_elements(By.CSS_SELECTOR, "input[type='date'], input[placeholder*='date']")
            print(f"Found {len(date_inputs)} date inputs")
            
            # Step 6: Look for room rates
            print("Step 6: Looking for room rates...")
            time.sleep(8)  # Give more time for rates to load
            
            # Extract prices using JavaScript
            price_script = """
                var prices = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                    if (matches) {
                        prices = prices.concat(matches);
                    }
                }
                return [...new Set(prices)];
            """
            
            prices = self.driver.execute_script(price_script)
            
            # Clean up prices (remove very small amounts like $0.00)
            clean_prices = [p for p in prices if not p.endswith('$0.00') and not p.endswith('$0')]
            
            print(f"Raw prices found: {prices}")
            print(f"Clean prices found: {clean_prices}")
            
            # Also look for room names
            room_script = """
                var rooms = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    if (text.length > 5 && text.length < 80 && 
                        (text.toLowerCase().includes('room') || 
                         text.toLowerCase().includes('suite') || 
                         text.toLowerCase().includes('king') || 
                         text.toLowerCase().includes('queen'))) {
                        rooms.push(text.trim());
                    }
                }
                return [...new Set(rooms)].slice(0, 10);
            """
            
            rooms = self.driver.execute_script(room_script)
            clean_rooms = [r for r in rooms if not any(skip in r.lower() for skip in ['booking', 'manhattan', 'dining', 'type', 'all room'])]
            
            print(f"Rooms found: {clean_rooms}")
            
            return {
                'hotel_name': hotel_name,
                'button_clicked': button_clicked,
                'raw_prices': prices,
                'clean_prices': clean_prices,
                'rooms': clean_rooms,
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error in Pendry flow: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test Pendry-specific flow
print("PENDRY-SPECIFIC BOOKING FLOW")
print("=" * 35)

pendry_scraper = HotelSpecificScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
pendry_result = pendry_scraper.pendry_complete_flow('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nPENDRY RESULT:")
for key, value in pendry_result.items():
    print(f"{key}: {value}")

pendry_scraper.close()


PENDRY-SPECIFIC BOOKING FLOW

PENDRY COMPLETE FLOW: Pendry Manhattan West
Step 1: Loading Pendry page...
Step 2: Clicking 'View Rates & Reserve'...
Looking for Pendry's 'View Rates & Reserve' button...
'View Rates & Reserve' button not found
Step 3: Looking for booking interface...
Step 4: Setting guest count...
Found 1 adult-related elements
Step 5: Checking dates...
Found 0 date inputs
Step 6: Looking for room rates...
Raw prices found: []
Clean prices found: []
Rooms found: ['Suites', 'Garden Room', 'Rooms', 'Bedrooms']

PENDRY RESULT:
hotel_name: Pendry Manhattan West
button_clicked: False
raw_prices: []
clean_prices: []
rooms: ['Suites', 'Garden Room', 'Rooms', 'Bedrooms']
status: SUCCESS


In [27]:
# Hotel Web Scraper - Improved Button Detection
# More comprehensive search for the Pendry button

class ImprovedButtonScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)
    
    def find_all_clickable_elements(self):
        """Find all clickable elements that might be the booking button"""
        
        print("Searching for ALL clickable elements...")
        
        try:
            # Get all potentially clickable elements
            clickable_elements = self.driver.find_elements(By.XPATH, 
                "//button | //a | //*[@onclick] | //*[contains(@class, 'btn')] | //*[contains(@class, 'button')] | //*[@role='button']"
            )
            
            print(f"Found {len(clickable_elements)} clickable elements")
            
            # Check each one for booking-related text
            booking_candidates = []
            
            for i, element in enumerate(clickable_elements):
                try:
                    # Get text content
                    text = element.text.strip()
                    
                    # Check for booking-related keywords
                    booking_keywords = ['view', 'rate', 'reserve', 'book', 'check', 'availability']
                    
                    if text and any(keyword in text.lower() for keyword in booking_keywords):
                        booking_candidates.append({
                            'element': element,
                            'text': text,
                            'tag': element.tag_name,
                            'class': element.get_attribute('class') or 'no-class'
                        })
                        
                        print(f"   Candidate {len(booking_candidates)}: '{text}' ({element.tag_name})")
                        
                except Exception as e:
                    continue
            
            return booking_candidates
            
        except Exception as e:
            print(f"Error finding clickable elements: {str(e)}")
            return []
    
    def try_clicking_candidates(self, candidates):
        """Try clicking each candidate button"""
        
        print(f"\nTrying to click {len(candidates)} candidate buttons...")
        
        for i, candidate in enumerate(candidates):
            try:
                print(f"Trying candidate {i+1}: '{candidate['text']}'")
                
                # Scroll to element first
                self.driver.execute_script("arguments[0].scrollIntoView(true);", candidate['element'])
                time.sleep(1)
                
                # Try to click
                candidate['element'].click()
                print(f"Successfully clicked: '{candidate['text']}'")
                time.sleep(5)  # Wait for page to respond
                
                # Check if page changed (new content loaded)
                new_content_length = len(self.driver.page_source)
                print(f"Page source length after click: {new_content_length:,} characters")
                
                return True, candidate['text']
                
            except Exception as e:
                print(f"Failed to click candidate {i+1}: {str(e)}")
                continue
        
        return False, "No button clicked successfully"
    
    def comprehensive_pendry_search(self, hotel_name, booking_url):
        """Comprehensive search for Pendry booking elements"""
        
        print(f"\nCOMPREHENSIVE PENDRY SEARCH: {hotel_name}")
        print("=" * 50)
        
        try:
            # Step 1: Load page
            print("Step 1: Loading page...")
            self.driver.get(booking_url)
            time.sleep(8)  # Give more time for dynamic content
            
            initial_content_length = len(self.driver.page_source)
            print(f"Initial page content: {initial_content_length:,} characters")
            
            # Step 2: Find all clickable elements
            print("Step 2: Finding clickable elements...")
            candidates = self.find_all_clickable_elements()
            
            # Step 3: Try clicking candidates
            button_success = False
            clicked_button = "None"
            
            if candidates:
                print("Step 3: Trying to click booking buttons...")
                button_success, clicked_button = self.try_clicking_candidates(candidates)
            else:
                print("Step 3: No booking button candidates found")
            
            # Step 4: Look for prices after potential button click
            print("Step 4: Looking for prices...")
            time.sleep(5)
            
            price_script = """
                var prices = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                    if (matches) {
                        prices = prices.concat(matches);
                    }
                }
                return [...new Set(prices)];
            """
            
            all_prices = self.driver.execute_script(price_script)
            
            # Filter out $0.00 and very low amounts
            meaningful_prices = [p for p in all_prices if not p in ['$0.00', '$0', '$1.00', '$1']]
            
            print(f"All prices found: {all_prices}")
            print(f"Meaningful prices: {meaningful_prices}")
            
            return {
                'hotel_name': hotel_name,
                'candidates_found': len(candidates),
                'button_clicked': button_success,
                'clicked_button_text': clicked_button,
                'all_prices': all_prices,
                'meaningful_prices': meaningful_prices,
                'price_count': len(meaningful_prices),
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error in comprehensive search: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test comprehensive search
print("COMPREHENSIVE PENDRY BUTTON SEARCH")
print("=" * 40)

comprehensive_scraper = ImprovedButtonScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
comprehensive_result = comprehensive_scraper.comprehensive_pendry_search('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nCOMPREHENSIVE RESULT:")
for key, value in comprehensive_result.items():
    print(f"{key}: {value}")

comprehensive_scraper.close()


COMPREHENSIVE PENDRY BUTTON SEARCH

COMPREHENSIVE PENDRY SEARCH: Pendry Manhattan West
Step 1: Loading page...
Initial page content: 586,837 characters
Step 2: Finding clickable elements...
Searching for ALL clickable elements...
Found 141 clickable elements
   Candidate 1: 'RESERVE' (a)
   Candidate 2: 'CHECK AVAILABILITY' (div)
   Candidate 3: 'CHECK AVAILABILITY' (button)
   Candidate 4: 'Facebook' (a)
Step 3: Trying to click booking buttons...

Trying to click 4 candidate buttons...
Trying candidate 1: 'RESERVE'
Failed to click candidate 1: Message: element click intercepted: Element <a class="c-button c-button--secondary header__reserve-button inverse-default" href="https://www.pendry.com/manhattan-west/booking/#/booking/step-1">...</a> is not clickable at point (847, 11). Other element would receive the click: <div class="onetrust-pc-dark-filter ot-fade-in" style="z-index:2147483645;"></div>
  (Session info: chrome=137.0.7151.122); For documentation on this error, please visit: h

In [28]:
# Hotel Web Scraper - Handle Popups and Click Buttons
# Dismiss overlay popups then click booking buttons

class PopupHandlingScraper:
    def __init__(self):
        self.setup_driver()
    
    def setup_driver(self):
        """Set up Chrome driver"""
        chrome_options = Options()
        # chrome_options.add_argument('--headless')
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        
        self.driver = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.driver, 30)
    
    def dismiss_popups(self):
        """Dismiss cookie/privacy popups and overlays"""
        
        print("Looking for and dismissing popups...")
        
        try:
            # Common popup/overlay dismissal strategies
            popup_dismissers = [
                # Cookie consent buttons
                "//button[contains(text(), 'Accept')]",
                "//button[contains(text(), 'OK')]", 
                "//button[contains(text(), 'Close')]",
                "//button[contains(text(), 'Dismiss')]",
                "//button[contains(@class, 'accept')]",
                "//button[contains(@class, 'close')]",
                
                # OneTrust specific (from error message)
                "//*[contains(@class, 'onetrust-close-btn-handler')]",
                "//*[contains(@class, 'ot-sdk-close')]",
                "//*[@id='onetrust-accept-btn-handler']",
                "//*[@id='onetrust-pc-btn-handler']",
                
                # Generic overlay closers
                "//*[contains(@class, 'overlay')]//button",
                "//*[contains(@class, 'modal')]//button[contains(@class, 'close')]",
                
                # X buttons
                "//button[text()='×']",
                "//button[text()='✕']",
                "//*[@class='close' or @class='x']"
            ]
            
            for selector in popup_dismissers:
                try:
                    elements = self.driver.find_elements(By.XPATH, selector)
                    if elements:
                        print(f"Found popup dismisser: {selector}")
                        elements[0].click()
                        print("Clicked popup dismisser")
                        time.sleep(2)
                        return True
                except Exception as e:
                    continue
            
            # Try JavaScript to remove overlay
            print("Trying to remove overlay with JavaScript...")
            overlay_removal_script = """
                // Remove OneTrust overlays
                var overlays = document.querySelectorAll('.onetrust-pc-dark-filter, .ot-fade-in, [class*="onetrust"]');
                for (var i = 0; i < overlays.length; i++) {
                    overlays[i].remove();
                }
                
                // Remove other common overlays
                var genericOverlays = document.querySelectorAll('[style*="z-index"], .overlay, .modal-backdrop');
                for (var i = 0; i < genericOverlays.length; i++) {
                    if (genericOverlays[i].style.zIndex > 1000) {
                        genericOverlays[i].remove();
                    }
                }
                
                return 'Overlay removal attempted';
            """
            
            result = self.driver.execute_script(overlay_removal_script)
            print(f"JavaScript overlay removal: {result}")
            
            return True
            
        except Exception as e:
            print(f"Error dismissing popups: {str(e)}")
            return False
    
    def click_booking_button_force(self):
        """Force click booking buttons using multiple methods"""
        
        print("Force clicking booking buttons...")
        
        try:
            # Find the booking buttons we identified earlier
            booking_selectors = [
                "//a[contains(text(), 'RESERVE')]",
                "//button[contains(text(), 'CHECK AVAILABILITY')]",
                "//div[contains(text(), 'CHECK AVAILABILITY')]"
            ]
            
            for selector in booking_selectors:
                try:
                    elements = self.driver.find_elements(By.XPATH, selector)
                    if elements:
                        element = elements[0]
                        
                        print(f"Attempting to click: {selector}")
                        
                        # Method 1: Regular click
                        try:
                            element.click()
                            print("Regular click successful")
                            time.sleep(5)
                            return True, "Regular click"
                        except:
                            print("Regular click failed, trying JavaScript...")
                        
                        # Method 2: JavaScript click
                        try:
                            self.driver.execute_script("arguments[0].click();", element)
                            print("JavaScript click successful")
                            time.sleep(5)
                            return True, "JavaScript click"
                        except:
                            print("JavaScript click failed, trying ActionChains...")
                        
                        # Method 3: ActionChains
                        try:
                            from selenium.webdriver.common.action_chains import ActionChains
                            actions = ActionChains(self.driver)
                            actions.move_to_element(element).click().perform()
                            print("ActionChains click successful")
                            time.sleep(5)
                            return True, "ActionChains click"
                        except:
                            print("ActionChains click failed")
                            continue
                            
                except Exception as e:
                    print(f"Error with selector {selector}: {str(e)}")
                    continue
            
            return False, "All click methods failed"
            
        except Exception as e:
            print(f"Error in force click: {str(e)}")
            return False, str(e)
    
    def pendry_with_popup_handling(self, hotel_name, booking_url):
        """Complete Pendry flow with popup handling"""
        
        print(f"\nPENDRY WITH POPUP HANDLING: {hotel_name}")
        print("=" * 50)
        
        try:
            # Step 1: Load page
            print("Step 1: Loading page...")
            self.driver.get(booking_url)
            time.sleep(8)
            
            # Step 2: Dismiss popups
            print("Step 2: Dismissing popups...")
            popup_dismissed = self.dismiss_popups()
            
            # Step 3: Force click booking button
            print("Step 3: Force clicking booking button...")
            button_clicked, click_method = self.click_booking_button_force()
            
            # Step 4: Look for prices
            print("Step 4: Looking for prices after button click...")
            time.sleep(8)
            
            price_script = """
                var prices = [];
                var elements = document.querySelectorAll('*');
                for (var i = 0; i < elements.length; i++) {
                    var text = elements[i].textContent || elements[i].innerText || '';
                    var matches = text.match(/\\$[0-9,]+(?:\\.[0-9]{2})?/g);
                    if (matches) {
                        prices = prices.concat(matches);
                    }
                }
                return [...new Set(prices)];
            """
            
            all_prices = self.driver.execute_script(price_script)
            meaningful_prices = [p for p in all_prices if not p in ['$0.00', '$0', '$1.00', '$1']]
            
            print(f"All prices: {all_prices}")
            print(f"Meaningful prices: {meaningful_prices}")
            
            return {
                'hotel_name': hotel_name,
                'popup_dismissed': popup_dismissed,
                'button_clicked': button_clicked,
                'click_method': click_method,
                'all_prices': all_prices,
                'meaningful_prices': meaningful_prices,
                'success': len(meaningful_prices) > 0,
                'status': 'SUCCESS'
            }
            
        except Exception as e:
            print(f"Error in popup handling flow: {str(e)}")
            return {'hotel_name': hotel_name, 'status': 'ERROR', 'error': str(e)}
    
    def close(self):
        """Close the driver"""
        self.driver.quit()

# Test popup handling
print("PENDRY WITH POPUP HANDLING")
print("=" * 30)

popup_scraper = PopupHandlingScraper()

# Test with Pendry
pendry_data = hotels_df[hotels_df['Hotel Name'] == 'Pendry Manhattan West'].iloc[0]
popup_result = popup_scraper.pendry_with_popup_handling('Pendry Manhattan West', pendry_data['Booking URL'])

print(f"\nPOPUP HANDLING RESULT:")
for key, value in popup_result.items():
    print(f"{key}: {value}")

popup_scraper.close()


PENDRY WITH POPUP HANDLING

PENDRY WITH POPUP HANDLING: Pendry Manhattan West
Step 1: Loading page...
Step 2: Dismissing popups...
Looking for and dismissing popups...
Found popup dismisser: //button[contains(text(), 'OK')]
Clicked popup dismisser
Step 3: Force clicking booking button...
Force clicking booking buttons...
Step 4: Looking for prices after button click...
All prices: []
Meaningful prices: []

POPUP HANDLING RESULT:
hotel_name: Pendry Manhattan West
popup_dismissed: True
button_clicked: False
click_method: All click methods failed
all_prices: []
meaningful_prices: []
success: False
status: SUCCESS
