In [80]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse  # Python 2's `urlparse` module became `urllib.parse` in Python 3

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd
from pathlib import Path
import time

## Functions for scraping listings from Airbnb

In [108]:
def initDriver():
    """Create and return a Chrome WebDriver started with the ``--incognito`` flag.

    Headless mode is not enabled; its flag is kept below as a commented-out
    toggle.
    """
    chrome_options = Options()
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--incognito")
    return webdriver.Chrome(options=chrome_options)

def getRootUrl(url):
    """Return the ``scheme://netloc`` prefix of ``url``.

    The path, query string and fragment are dropped; the result is used to
    turn the relative hrefs found on a page into absolute links.
    """
    parts = urlparse(url)
    return "://".join((parts.scheme, parts.netloc))

def waitForElements(driver):
    """Block until the search-results page has rendered the parts we scrape.

    Two explicit waits of up to 10 seconds each are performed in order:
    first for the listing cards to be present, then for the pagination
    container to be clickable. Nothing is returned; a wait that is not
    satisfied in time propagates Selenium's ``TimeoutException``.
    """
    listing_locator = (By.XPATH, "//div[@itemprop='itemListElement']")
    pagination_locator = (By.CLASS_NAME, "p1j2gy66")

    # listing container
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located(listing_locator)
    )
    # pagination container
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable(pagination_locator)
    )

def scrapeListings(driver, soup, root_url): # 1 page
    """Extract every complete listing card from one parsed results page.

    Parameters
    ----------
    driver : unused; kept so existing callers do not need to change.
    soup : parsed page exposing ``find_all`` (a BeautifulSoup document).
    root_url : ``scheme://netloc`` prefix prepended to each listing's href.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``Title``, ``Rating``,
        ``Price/night (SGD)``, ``Total Price (SGD)`` and ``Link``. The frame
        is empty (but still has these columns) when no listing is complete.

    Listings that lack any of the fields are skipped.
    """
    columns = ["Title", "Rating", "Price/night (SGD)", "Total Price (SGD)", "Link"]
    records = []
    for listing in soup.find_all("div", {"itemprop": "itemListElement"}):
        try:
            title = listing.find('div', {'data-testid': 'listing-card-title'}).text.strip()
            rating = listing.find('div', {'class': 't1a9j9y7'}).text.strip().split()[0]
            price = listing.find('span', {"class": "_11jcbg2"}).text.strip().split()[0]
            price_tax = (
                listing.find('div', {'class': '_i5duul'})
                .find('div', {"class": "_10d7v0r"})
                .text.strip().split()[0]
            )
            # Reconstruct the absolute link
            link = root_url + listing.find('a', {'class': 'l1ovpqvx'}).get('href')
        except (AttributeError, IndexError, TypeError):
            # Skip listings with missing info: a failed find() yields None
            # (AttributeError), empty text has no first token (IndexError),
            # and a missing href cannot be concatenated (TypeError). Anything
            # else is a real bug and is allowed to surface.
            continue
        records.append({
            "Title": title,
            "Rating": rating,
            "Price/night (SGD)": price,
            "Total Price (SGD)": price_tax,
            "Link": link,
        })
    # Build the frame once instead of concatenating row by row.
    return pd.DataFrame(records, columns=columns)

def nextPage(soup, root_url):
    """Return the absolute URL of the next results page.

    Looks up the anchor whose ``aria-label`` is ``Next``. When that lookup
    comes back falsy (the last page), the falsy lookup result itself is
    returned so callers can simply test the return value for truthiness.
    """
    anchor = soup.find('a', {'aria-label': 'Next'})
    # check for last page
    if not anchor:
        return anchor
    return root_url + anchor.get('href')

def quitProgram(driver):
    """Shut down the browser and persist the scraped listings to CSV.

    Quits ``driver``, makes sure the ``./dataset`` directory exists, then
    writes the module-level ``airbnb_data`` frame (not passed in as an
    argument) to ``./dataset/airbnb.csv`` without the index column.
    """
    driver.quit()
    output_csv = Path('./dataset/airbnb.csv')
    # Create the target directory (and any missing parents) on first run.
    output_csv.parent.mkdir(parents=True, exist_ok=True)
    airbnb_data.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}...")


def scrapeListingDetails(path, driver):
    """Visit every listing saved in the CSV and open its amenities section.

    Parameters
    ----------
    path : path to a CSV file that has a ``Link`` column of listing URLs.
    driver : Selenium WebDriver used to load each listing page.

    For each URL the page is loaded, a dialog (if one appears within 2
    seconds) is closed, and the first clickable button inside the
    ``AMENITIES_DEFAULT`` section is clicked. Nothing is returned and no
    details are extracted yet. The loop stops at the first listing whose
    amenities section or button cannot be found.
    """
    data = pd.read_csv(path, header=0)
    listing_url = data["Link"].to_list()
    for url in listing_url:
        driver.get(url)
        # close the pop-up dialog for translation
        try:
            modal = WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']"))
            )
            # The leading "." scopes the XPath to the dialog; a bare "//"
            # would search the whole document even when called on `modal`.
            close_button = modal.find_element(By.XPATH, ".//button[@aria-label='Close']")
            close_button.click()
        except TimeoutException:
            # No dialog showed up within the wait; nothing to close.
            pass
        except NoSuchElementException:
            print("Couldn't find button")
        try:
            container = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//div[@data-section-id='AMENITIES_DEFAULT']"))
            )
            # Waiting on `container` restricts the button lookup to that section.
            button = WebDriverWait(container, 10).until(
                EC.element_to_be_clickable((By.TAG_NAME, 'button'))
            )
            button.click()
            # Fixed pause after the click (presumably to let the amenities
            # panel render -- TODO replace with an explicit wait).
            time.sleep(5)
        except TimeoutException:
            print("Couldn't find container")
            break
        except NoSuchElementException:
            print("Couldn't find button to click")
            break

    

In [23]:
# Search-results URL to scrape (Bangkok, 2024-10-04 to 2024-10-10, 3 adults).
url = "https://www.airbnb.com.sg/s/Bangkok--Thailand/homes?refinement_paths%5B%5D=%2Fhomes&checkin=2024-10-04&checkout=2024-10-10&adults=3&tab_id=home_tab&query=Bangkok%2C%20Thailand&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-08-01&monthly_length=3&monthly_end_date=2024-11-01&price_filter_input_type=0&price_filter_num_nights=6&channel=EXPLORE&date_picker_type=calendar&place_id=ChIJ82ENKDJgHTERIEjiXbIAAQE&source=structured_search_input_header&search_type=user_map_move&search_mode=regular_search&ne_lat=13.810393924485789&ne_lng=100.5597668742775&sw_lat=13.71428581582692&sw_lng=100.48846329668004&zoom=12.786708747368644&zoom_level=12.786708747368644&search_by_map=true"
root_url = getRootUrl(url)
driver = initDriver()

# Start from an empty frame so quitProgram() still writes a header-only CSV
# when nothing could be scraped.
airbnb_data = pd.DataFrame(columns=["Title", "Rating", "Price/night (SGD)", "Total Price (SGD)", "Link"])
# One DataFrame per results page; concatenated once at the end instead of
# re-copying the accumulated frame on every page.
page_frames = []

try:
    driver.get(url)
    while True:
        waitForElements(driver)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_frames.append(scrapeListings(driver, soup, root_url))
        result = nextPage(soup, root_url)
        if not result:
            # No "Next" link: this was the last page.
            break
        driver.get(result)

finally:
    # Runs on success and on failure, so pages scraped before an error are
    # still saved. quitProgram() reads the module-level `airbnb_data`.
    if page_frames:
        airbnb_data = pd.concat(page_frames, axis=0, ignore_index=True)
    quitProgram(driver)


In [109]:
driver = initDriver()
try:
    scrapeListingDetails("./dataset/airbnb.csv", driver)
finally:
    # Always release the browser session, including when the scrape raises
    # (e.g. the window is closed by hand mid-run).
    driver.quit()

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=126.0.6478.183)
Stacktrace:
	GetHandleVerifier [0x00007FF61CB3EEB2+31554]
	(No symbol) [0x00007FF61CAB7EE9]
	(No symbol) [0x00007FF61C97872A]
	(No symbol) [0x00007FF61C94D995]
	(No symbol) [0x00007FF61C9F44D7]
	(No symbol) [0x00007FF61CA0C051]
	(No symbol) [0x00007FF61C9ECDD3]
	(No symbol) [0x00007FF61C9BA33B]
	(No symbol) [0x00007FF61C9BAED1]
	GetHandleVerifier [0x00007FF61CE48B2D+3217341]
	GetHandleVerifier [0x00007FF61CE95AF3+3532675]
	GetHandleVerifier [0x00007FF61CE8B0F0+3489152]
	GetHandleVerifier [0x00007FF61CBEE786+750614]
	(No symbol) [0x00007FF61CAC376F]
	(No symbol) [0x00007FF61CABEB24]
	(No symbol) [0x00007FF61CABECB2]
	(No symbol) [0x00007FF61CAAE17F]
	BaseThreadInitThunk [0x00007FFEC801257D+29]
	RtlUserThreadStart [0x00007FFEC8F4AF28+40]


In [25]:
# Stand-alone version of the search-results scrape: it does not call the
# helper functions defined earlier in the notebook.
url = "https://www.airbnb.com.sg/s/Bangkok--Thailand/homes?refinement_paths%5B%5D=%2Fhomes&checkin=2024-10-04&checkout=2024-10-10&adults=3&tab_id=home_tab&query=Bangkok%2C%20Thailand&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-08-01&monthly_length=3&monthly_end_date=2024-11-01&price_filter_input_type=0&price_filter_num_nights=6&channel=EXPLORE&date_picker_type=calendar&place_id=ChIJ82ENKDJgHTERIEjiXbIAAQE&source=structured_search_input_header&search_type=user_map_move&search_mode=regular_search&ne_lat=13.810393924485789&ne_lng=100.5597668742775&sw_lat=13.71428581582692&sw_lng=100.48846329668004&zoom=12.786708747368644&zoom_level=12.786708747368644&search_by_map=true"
# url = "https://www.airbnb.com.sg/s/Indonesia/homes?refinement_paths%5B%5D=%2Fhomes&checkin=2024-07-25&checkout=2024-07-27&adults=2&children=0"
parsed_url = urlparse(url)
root_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
listing_columns = ["Title", "Rating", "Price/night (SGD)", "Total Price (SGD)", "Link"]
# One dict per listing; the DataFrame is built once at the end instead of
# being re-copied by pd.concat for every row.
listing_records = []
driver = webdriver.Chrome()
try:
    # Inside the try so the browser is closed even if the first load fails.
    driver.get(url)
    while True:
        # Wait (up to 10 s each) for the listing cards and the pagination
        # container before snapshotting the page source.
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, "//div[@itemprop='itemListElement']"))
        )
        WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "p1j2gy66"))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for listing in soup.find_all("div", {"itemprop": "itemListElement"}):
            try:
                title = listing.find('div', {'data-testid': 'listing-card-title'}).text.strip()
                rating = listing.find('div', {'class': 't1a9j9y7'}).text.strip().split()[0]
                price = listing.find('span', {"class": "_11jcbg2"}).text.strip().split()[0]
                price_tax = listing.find('div', {'class': '_i5duul'}).find('div', {"class": "_10d7v0r"}).text.strip().split()[0]
                # Reconstruct the absolute link
                link = root_url + listing.find('a', {'class': 'l1ovpqvx'}).get('href')
            except (AttributeError, IndexError, TypeError):
                # Skip listings with missing info. Only the errors a missing
                # element/text/href produces are caught, so a kernel
                # interrupt or a genuine bug is no longer swallowed.
                continue
            listing_records.append({
                "Title": title,
                "Rating": rating,
                "Price/night (SGD)": price,
                "Total Price (SGD)": price_tax,
                "Link": link,
            })

        button = soup.find('a', {'aria-label': 'Next'})
        # check for last page
        if not button:
            break
        driver.get(root_url + button.get('href'))

finally:
    try:
        driver.quit()
    finally:
        # Save whatever was collected, even if quitting the browser failed.
        airbnb_data = pd.DataFrame(listing_records, columns=listing_columns)
        file_path = Path('./dataset/airbnb.csv')
        file_path.parent.mkdir(parents=True, exist_ok=True)
        airbnb_data.to_csv(file_path, index=False)

In [18]:
# Load the scraped listings and show the five cheapest per night.
data = pd.read_csv("./dataset/airbnb.csv")
# Prices are stored as text such as "$31". Strip the currency sign and any
# thousands separator literally (regex=False, because "$" is a regex anchor)
# before casting to int.
data["Price/night (SGD)"] = (
    data["Price/night (SGD)"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype('int')
)
print(data.sort_values(by="Price/night (SGD)").head())

                                 Title Rating  Price/night (SGD)  \
214  Place to stay in Khet Phra Nakhon   4.45                 22   
98                     Flat in Bangkok   4.73                 31   
192  Flat in Khet Pom Prap Sattru Phai   4.64                 31   
244            Hostel in Khet Bang Rak   4.52                 32   
110  Flat in Khet Pom Prap Sattru Phai   4.76                 33   

    Total Price (SGD)                                               Link  
214              $142  https://www.airbnb.com.sg/rooms/93675883819483...  
98               $199  https://www.airbnb.com.sg/rooms/21942959?adult...  
192              $199  https://www.airbnb.com.sg/rooms/11722665?adult...  
244              $191  https://www.airbnb.com.sg/rooms/31764152?adult...  
110              $212  https://www.airbnb.com.sg/rooms/21892849?adult...  
