In [3]:
from urllib.parse import urlsplit, urlunsplit
from urllib.request import urlopen

import pandas as pd
import requests
from bs4 import BeautifulSoup

class BookingScraper:
    """Scrape hotel cards from a Booking.com search-results URL into a CSV file.

    The scraper reads the pagination control on the first results page to
    learn how many pages exist, fetches every page by varying the ``offset``
    query parameter, and writes one CSV row per hotel card.

    Args:
        url: Search-results URL to start from. Any ``offset`` parameter it
            already carries is replaced for each page request.
        output_file: Prefix of the output file; ``'_hotels_data.csv'`` is
            appended to it.
    """

    # Browser-like headers sent with every request.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    # Step between consecutive page offsets (presumably the number of results
    # the site lists per page -- verify against the live site).
    RESULTS_PER_PAGE = 25

    # Seconds to wait for the server before giving up, so a stalled
    # connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, output_file):
        self.url = url
        self.output_file = f'{output_file}_hotels_data.csv'

    def _fetch_soup(self, url):
        """Download ``url`` and return its HTML parsed with BeautifulSoup."""
        response = requests.get(url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _text_or_none(element):
        """Return the stripped text of ``element``, or None when it is missing."""
        return element.text.strip() if element is not None else None

    @staticmethod
    def _url_with_offset(url, offset):
        """Return ``url`` with its ``offset`` query parameter set to ``offset``.

        Existing ``offset`` parameters are dropped first so the request never
        carries two conflicting values; all other parameters are kept
        byte-for-byte (no re-encoding).
        """
        parts = urlsplit(url)
        kept = [
            param for param in parts.query.split('&')
            if param and param != 'offset' and not param.startswith('offset=')
        ]
        kept.append(f'offset={offset}')
        return urlunsplit(parts._replace(query='&'.join(kept)))

    def get_last_page_number(self):
        """Return the number shown on the last pagination button.

        Returns:
            The page count as an int, or None when the pagination list is
            absent, has no buttons, or its last button is not numeric.
        """
        soup = self._fetch_soup(self.url)

        # NOTE(review): 'ef2dbaeb17' looks like a generated CSS class name and
        # may change when the site is redeployed -- confirm it still matches.
        pagination = soup.find('ol', class_='ef2dbaeb17')
        if pagination is None:
            return None

        buttons = pagination.find_all('button')
        if not buttons:
            return None

        try:
            return int(buttons[-1].text)
        except ValueError:
            return None

    def scrape_hotels_from_page(self, url):
        """Extract one record per hotel card found on the page at ``url``.

        Returns:
            A list of dicts with the keys ``name``, ``location``, ``bed``,
            ``price``, ``rating``, ``hotel_link`` and ``sustainable_level``.
            A field whose element is missing from a card is None instead of
            aborting the scrape.
        """
        soup = self._fetch_soup(url)

        hotels_data = []
        for hotel in soup.find_all('div', {'data-testid': 'property-card'}):
            # The link sits on an <a> inside the card's <h3>; either may be absent.
            heading = hotel.find('h3', class_='aab71f8e4e')
            anchor = heading.find('a') if heading is not None else None
            link = anchor.get('href') if anchor is not None else None

            hotels_data.append({
                'name': self._text_or_none(
                    hotel.find('div', {'data-testid': 'title'})),
                'location': self._text_or_none(
                    hotel.find('span', {'data-testid': 'address'})),
                'bed': self._text_or_none(
                    hotel.find('h4', {'class': 'abf093bdfe e8f7c070a7'})),
                'price': self._text_or_none(
                    hotel.find('span', {'data-testid': 'price-and-discounted-price'})),
                'rating': self._text_or_none(
                    hotel.find('div', {'class': 'a3b8729ab1 d86cee9b25'})),
                'hotel_link': link,
                'sustainable_level': self._text_or_none(
                    hotel.find('span', {'class': 'abf093bdfe d068504c75 f68ecd98ea'})),
            })

        return hotels_data

    def scrape_and_save_all_hotels(self):
        """Scrape every results page and write the combined rows to ``self.output_file``.

        Prints a confirmation message after saving, or a notice (and writes
        nothing) when the page count cannot be determined.
        """
        last_page_number = self.get_last_page_number()
        if last_page_number is None:
            print("Last page number not found.")
            return

        all_hotels_data = []
        for page_index in range(last_page_number):
            page_url = self._url_with_offset(self.url, page_index * self.RESULTS_PER_PAGE)
            all_hotels_data.extend(self.scrape_hotels_from_page(page_url))

        hotels_df = pd.DataFrame(all_hotels_data)
        hotels_df.to_csv(self.output_file, index=False)
        print(f"Data saved to {self.output_file}")

# Example usage: prompt for a search URL and an output-name prefix, then scrape.
# Pasted input is stripped so stray whitespace does not end up in the request
# URL or in the file name.
user_url = input("Enter the Booking.com search results URL: ").strip()
# BookingScraper appends '_hotels_data.csv' to this value, so ask for a prefix only.
user_output_file = input("Enter the output file name prefix (e.g., Malaysia18 -> Malaysia18_hotels_data.csv): ").strip()

booking_scraper = BookingScraper(user_url, user_output_file)
booking_scraper.scrape_and_save_all_hotels()


Enter the Booking.com search results URL: https://www.booking.com/searchresults.en-gb.html?ss=Malaysia&ssne=Singapore&ssne_untouched=Singapore&label=gen173nr-1BCAEoggI46AdIM1gEaMkBiAEBmAEJuAEHyAEM2AEB6AEBiAIBqAIDuAKm_f-rBsACAdICJGRkOGUyZmRjLTEwODAtNGU2Yy04ZTE5LTllZjY3YmY0MTE0MtgCBeACAQ&sid=52e2fb6e5594864897f2feac2959255c&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=index&dest_id=128&dest_type=country&ac_position=1&ac_click_type=b&ac_langcode=en&ac_suggestion_list_length=5&search_selected=true&search_pageview_id=e0333993501d00e2&ac_meta=GhBlMDMzMzk5MzUwMWQwMGUyIAEoATICZW46BE1hbGFAAEoAUAA%3D&checkin=2024-01-01&checkout=2024-01-02&group_adults=1&no_rooms=1&group_children=0&sb_travel_purpose=leisure&offset=0
Enter the desired output file name (e.g., hotels_data.csv): Malaysia18
Data saved to Malaysia18_hotels_data.csv


In [None]:
# Malaysia
# Pasted input is stripped so stray whitespace does not end up in the request
# URL or in the file name.
user_url = input("Enter the Booking.com search results URL: ").strip()
# BookingScraper appends '_hotels_data.csv' to this value, so ask for a prefix only.
user_output_file = input("Enter the output file name prefix (e.g., Malaysia18 -> Malaysia18_hotels_data.csv): ").strip()

booking_scraper = BookingScraper(user_url, user_output_file)
booking_scraper.scrape_and_save_all_hotels()
