# Scraping Booking.com hotel listings with BeautifulSoup

In [3]:
%pip install pandas
from bs4 import BeautifulSoup
import requests
import datetime

import pandas as pd

Note: you may need to restart the kernel to use updated packages.


In [4]:
def outAfterOneDay(checkIn):
    """Return the check-out date for a one-night stay starting on ``checkIn``.

    Args:
        checkIn: A ``datetime.datetime`` (or any object exposing ``year``,
            ``month`` and ``day`` attributes) identifying the check-in day.

    Returns:
        A naive ``datetime.datetime`` at 00:00 on the calendar day after
        ``checkIn``. Any time-of-day carried by ``checkIn`` is dropped.
    """
    # Rebuild a plain date from the calendar fields so the result is always
    # midnight, regardless of any time component on the input.
    check_in_day = datetime.date(checkIn.year, checkIn.month, checkIn.day)

    # Let the standard library handle month lengths, leap years and the
    # year rollover instead of a hand-maintained month table, which raised
    # ValueError for end-of-February dates outside the hard-coded years.
    next_day = check_in_day + datetime.timedelta(days=1)

    checkOut = datetime.datetime(next_day.year, next_day.month, next_day.day)
    return checkOut

In [5]:
# Main scraper function
def _tag_text(tag, strip=False):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""
    if tag is None:
        return None
    return tag.text.strip() if strip else tag.text


def _fetch_lat_lng(link, timeout):
    """Fetch a hotel's detail page and return its (lat, lng) strings, or (None, None)."""
    hotel_page = requests.get(link, timeout=timeout)
    hotel_soup = BeautifulSoup(hotel_page.text, 'html.parser')

    hotel_address = hotel_soup.find(id="hotel_address")
    if hotel_address is None:
        return None, None

    # The coordinates are stored as a single "lat,lng" attribute value.
    lat, separator, lng = hotel_address.attrs.get('data-atlas-latlng', '').partition(',')
    if not separator:
        return None, None
    return lat, lng


def scraper(checkIn, max_hotels=100, timeout=30):
    """Scrape Booking.com search results for a one-night, one-adult stay in Madrid.

    Result pages are requested one after another (using the number of hotels
    collected so far as the ``offset`` query parameter) until at least
    ``max_hotels`` records have been gathered or a page contains no property
    cards. Because whole pages are consumed, the returned list may hold
    slightly more than ``max_hotels`` entries.

    Args:
        checkIn: ``datetime.datetime`` of the check-in day.
        max_hotels: Minimum number of hotels to collect before stopping.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A tuple ``(hotelData, checkOut)`` where ``hotelData`` is a list of
        dicts (one per property card) and ``checkOut`` is the check-out
        ``datetime.datetime`` computed by ``outAfterOneDay``. Fields that
        could not be found on the page are stored as ``None``.

    Raises:
        requests.RequestException: If an HTTP request fails or times out.
    """
    hotelData = []

    # Calculate the check-out date using the outAfterOneDay function
    checkOut = outAfterOneDay(checkIn)
    check_in_str = checkIn.strftime("%Y-%m-%d")
    check_out_str = checkOut.strftime("%Y-%m-%d")

    # Build the URL for the initial search on Booking.com
    url = (
        'https://www.booking.com/searchresults.html?ss=Madrid%2C+Spain'
        '&aid=304142&lang=en-us&sb=1&src_elem=sb&src=index'
        '&dest_id=-390625&dest_type=city'
        f'&checkin={check_in_str}&checkout={check_out_str}'
        '&group_adults=1&no_rooms=1&group_children=0'
        '&sb_travel_purpose=leisure&selected_currency=EUR'
    )

    # Set headers for the HTTP request
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
        'Accept-Language': 'en-US, en;q=0.5'
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    while len(hotelData) < max_hotels:
        # Find all hotel elements on the current page
        hotels = soup.find_all('div', {'data-testid': 'property-card'})
        if not hotels:
            # No cards means the results ran out (or the page was not a
            # results page); without this check the loop would never end.
            break

        for hotel in hotels:
            name = _tag_text(hotel.find('div', {'data-testid': 'title'}), strip=True)
            price = _tag_text(
                hotel.find('span', {'data-testid': 'price-and-discounted-price'}),
                strip=True,
            )

            link_tag = hotel.find('a', {'data-testid': 'title-link'})
            link = link_tag.attrs.get('href') if link_tag is not None else None

            # Reset the coordinates for every hotel so a page without an
            # address block cannot inherit the previous hotel's position.
            hotel_lat, hotel_lng = None, None
            if link:
                hotel_lat, hotel_lng = _fetch_lat_lng(link, timeout)

            # Extract room type information
            units = hotel.find('div', {'data-testid': 'recommended-units'})
            room_type = _tag_text(units.find('h4')) if units is not None else None

            # Every card yields exactly one record, so len(hotelData) stays
            # a valid pagination offset even when some fields are missing.
            hotelData.append({
                'check-in date': check_in_str,
                'check-out date': check_out_str,
                'year': checkIn.year,
                'month': checkIn.month,
                'name': name,
                'price': price,
                'lat': hotel_lat,
                'lng': hotel_lng,
                'room type': room_type,
                'url': link
            })

        print(check_in_str, check_out_str)

        # Progress output: the offset the next page would start from
        offset_param = f'&offset={len(hotelData)}'
        print(offset_param)

        if len(hotelData) >= max_hotels:
            # Enough hotels collected; skip downloading a page that would
            # only be thrown away.
            break

        # Go to the next page by updating the URL with an offset
        response = requests.get(url + offset_param, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

    # Return the collected hotel data and the check-out date
    return hotelData, checkOut

In [20]:
# Set the initial check-in date and the (exclusive) end of the scraping window
checkIn = datetime.datetime(2024, 1, 1)
endDate = datetime.datetime(2024, 1, 31)

# Initialize an empty list to store the final scraped data
finalData = []

# Scrape one night at a time. Comparing with `<` instead of `!=` guarantees
# the loop terminates even if the start date is changed to fall after endDate.
while checkIn < endDate:
    # Call the scraper function to get hotel data for the current check-in date
    data, checkOut = scraper(checkIn)

    # Extend the finalData list with the data collected for the current date
    finalData.extend(data)

    # Print the running total and the most recent record as a progress check;
    # the guard avoids an IndexError when nothing has been collected yet.
    print(len(finalData))
    if finalData:
        print(finalData[-1])

    # The next stay starts on the day this one ends
    checkIn = checkOut

# Create a DataFrame from the collected list of record dicts
df = pd.DataFrame(finalData)

2024-01-01 2024-01-02
&offset=26
2024-01-01 2024-01-02
&offset=51
2024-01-01 2024-01-02
&offset=76
2024-01-01 2024-01-02
&offset=101
101
{'check-in date': '2024-01-01', 'check-out date': '2024-01-02', 'year': 2024, 'month': 1, 'name': 'Hotel Madrid Gran Via 25, Affiliated by Meliá', 'price': '€\xa0159', 'lat': '40.41985106', 'lng': '-3.70224573', 'room type': 'Single Room', 'url': 'https://www.booking.com/hotel/es/hotel-madrid-gran-via-25.html?aid=304142&label=gen173nr-1FCAQoggI4-gNIMVgEaEaIAQGYATG4ARnIAQzYAQHoAQH4AQaIAgGoAgO4AurNvasGwAIB0gIkZDc3OTI1OTktMjQwMC00NGUyLTg2NjktYWJiZWVjZTNkMDNm2AIF4AIB&ucfs=1&arphpl=1&checkin=2024-01-01&checkout=2024-01-02&dest_id=-390625&dest_type=city&group_adults=1&req_adults=1&no_rooms=1&group_children=0&req_children=0&hpos=25&hapos=100&sr_order=popularity&srpvid=c9c27f756a8e0224&srepoch=1701799660&all_sr_blocks=9161901_91989069_1_1_0&highlighted_blocks=9161901_91989069_1_1_0&matching_block_id=9161901_91989069_1_1_0&sr_pri_blocks=9161901_91989069_1_1_0_

KeyboardInterrupt: 

In [18]:
# Turn the scraped records into a table and persist it next to the notebook
# (the DataFrame index is written as the first CSV column).
bookingData = pd.DataFrame(data=finalData)
bookingData.to_csv(path_or_buf='bookingData.csv')