In [2]:
import os
import time, json, re
from urllib.parse import unquote

import pymongo
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm

# Utility functions
Helper functions that extract individual property attributes from an open Airbnb property page.
### Extract price

In [3]:
def extract_price(browser):
    """Extract the price per night from the currently open property page.

    Two elements are probed in order: the regular price element
    (class ``_tyxjp1``) and, if that one cannot be read, the discounted
    price element (class ``_1y74zjx``).

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        The price as a float, or None if neither element yields a number.
    """
    price_xpaths = (
        "//*[@class='_tyxjp1']",   # If no discount
        "//*[@class='_1y74zjx']",  # If discount
    )
    for xpath in price_xpaths:
        try:
            inner_html = browser.find_element(By.XPATH, xpath).get_attribute('innerHTML')
            # The amount is the text between the first ';' and the next '&'
            return float(inner_html.split(';')[1].split('&')[0])
        except Exception:
            # Best effort: a missing element or unparsable markup just means
            # "try the next candidate". Catching Exception (not a bare except)
            # lets KeyboardInterrupt/SystemExit stop a long scraping run.
            continue
    return None

### Extract details

In [4]:
def extract_details(browser):
    """Extract the guests / bedrooms / beds / baths counts of a property.

    Each ``<li>`` of the details list is matched against a set of keywords;
    the first number found in that item's text becomes the value.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        A dict with the keys 'guests', 'beds', 'bedrooms' and 'baths'.
        Values are ints for whole numbers, floats for fractional ones
        (e.g. "1.5 baths"), or None when the detail was not found.
    """
    # (substring to look for, key in the result). Matching is by substring,
    # so the singular also covers the plural, and 'bedroom' MUST be tested
    # before 'bed' or every bedroom count would be stored as beds.
    keywords = (
        ('guest', 'guests'),
        ('bedroom', 'bedrooms'),
        ('bed', 'beds'),
        ('bath', 'baths'),
    )

    details = {'guests': None, 'beds': None, 'bedrooms': None, 'baths': None}  # Default to null if not found
    details_ol = browser.find_elements(By.XPATH, '//div[contains(@class, "o1kjrihn")]//ol/li')  # Attempt to locate the details list

    for detail_li in details_ol:
        # get_attribute may return None; treat that as empty text
        text = (detail_li.get_attribute('innerText') or '').strip()

        detail_key = next((key for needle, key in keywords if needle in text), None)
        if detail_key is None:
            continue

        # Accept an optional decimal part so "1.5 baths" is not truncated to 1
        match = re.search(r'\d+(?:\.\d+)?', text)
        if match:
            value = float(match.group())
            details[detail_key] = int(value) if value.is_integer() else value
    return details

### Extract superhost

In [5]:
def extract_superhost(browser):
    """Tell whether the host of the open property page is a Superhost.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        True if a list item with the exact text "Superhost" is found inside
        the host badge container, False otherwise (including when the
        container is missing or any lookup fails).
    """
    try:
        elem = browser.find_element(By.XPATH, "//div[@class='s1l7gi0l atm_c8_km0zk7 atm_g3_18khvle atm_fr_1m9t47k atm_7l_1esdqks dir dir-ltr']")
        # The leading '.' makes the XPath relative to `elem`. A path starting
        # with '//' is evaluated against the whole document even when it is
        # called on an element, which would ignore the container entirely.
        elems = elem.find_elements(By.XPATH, ".//li[@class='l7n4lsf atm_9s_1o8liyq_keqd55 dir dir-ltr']")
        return any(el.text == "Superhost" for el in elems)
    except Exception:
        # Default to false if not found. Exception (not a bare except) so a
        # kernel interrupt still stops the scraping run.
        return False

### Extract guest favorite

In [6]:
def extract_guest_favorite(browser):
    """Tell whether the open property page carries the "Guest favorite" badge.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        True if a <div> whose text contains 'Guest favorite' exists and its
        visible text is exactly "Guest favorite"; False otherwise, including
        when the element is missing or the lookup fails.
    """
    try:
        elem = browser.find_element(By.XPATH, "//div[contains(text(), 'Guest favorite')]")
        return elem.text == "Guest favorite"
    except Exception:
        # Default to false if not found. Exception (not a bare except) so a
        # kernel interrupt still stops the scraping run.
        return False

### Extract review index and number of reviews

In [7]:
def extract_review_index_number_reviews(browser):
    """Extract the review index (rating) and the number of reviews.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        A ``(review_index, number_reviews)`` tuple. ``review_index`` is a
        float or None; ``number_reviews`` is an int or None. Each value is
        None when it could not be found.
    """
    review_index, number_reviews = None, None

    # Attempt to find the review index
    try:
        elem = browser.find_element(By.XPATH, "//*[contains(text(), 'Rated')]")
        full_text = elem.get_attribute("innerHTML") or ""
        numbers = re.findall(r"\d+,\d+|\d+\.\d+|\d+", full_text)
        # Guard against a 'Rated' element that contains no digits at all;
        # indexing numbers[0] unconditionally would raise IndexError.
        if numbers:
            # A decimal comma is normalised to a decimal point
            review_index = float(numbers[0].replace(',', '.'))
    except NoSuchElementException:
        review_index = None  # Default to null if not found

    # Attempt to find the number of reviews. find_elements returns an empty
    # list instead of raising when nothing matches, so no try/except is needed.
    for elem in browser.find_elements(By.XPATH, "//*[contains(text(), 'reviews')]"):
        # First alternative handles comma-grouped thousands ("1,234");
        # otherwise fall back to the first plain run of digits.
        match = re.search(r'\d{1,3}(?:,\d{3})+|\d+', elem.text)
        if match:
            number_reviews = int(match.group().replace(',', ''))
            break

    return review_index, number_reviews

### Extract host

In [8]:
def extract_host(browser):
    """Extract the name of the property host.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        The text that follows "Hosted by" on the same line, with surrounding
        whitespace removed, or None when no such element or name is found.
    """
    try:
        # Search for elements that contain "Hosted by"
        elem = browser.find_element(By.XPATH, "//*[contains(text(), 'Hosted by')]")
    except NoSuchElementException:
        return None  # Default to null if not found

    # Capture everything after "Hosted by" up to the end of the line
    host_name_match = re.search(r'(?:Hosted by)(.+)', elem.text)
    if host_name_match:
        # The capture starts right after "Hosted by", so it includes the
        # separating space; strip it so names are stored without padding.
        return host_name_match.group(1).strip() or None
    return None

### Extract characteristics

In [9]:
def extract_characteristics(browser):
    """Collect the visible text of every characteristic element on the page.

    Args:
        browser: Selenium WebDriver positioned on a property page.

    Returns:
        A list with the text of each ``<div class="_sg8691">`` element, in
        document order; an empty list when none exist or the lookup fails.
    """
    try:
        return [elem.text for elem in browser.find_elements(By.XPATH, "//div[@class='_sg8691']")]
    except Exception:
        # Default to empty list if not found. Exception (not a bare except)
        # so a kernel interrupt still stops the scraping run.
        return []

### Extract coordinates

In [10]:
def extract_coordinates(browser):
    """Extract the approximate coordinates of the open property.

    The page is scrolled so that the map loads, then Chrome's performance
    log is scanned for a request whose URL contains "MapViewportInfoQuery".
    The coordinates are the centre of the bounding box carried in that URL.

    Args:
        browser: Selenium WebDriver positioned on a property page. It must
            support ``get_log("performance")``.

    Returns:
        ``{'lat': float, 'lng': float}`` on success, otherwise an empty dict.
        The result never contains only one of the two keys.
    """
    coordinates = dict()

    try:
        # Scroll to the middle of the page in order to load the Google map
        footer = browser.find_element(By.XPATH, "//*[@class='ff6a337 atm_26_116dmco atm_67_1vlbu9m dir dir-ltr']")
        delta_y = footer.rect['y']
        ActionChains(browser).scroll_by_amount(0, int(delta_y * 0.65)).perform()
    except Exception:
        return coordinates  # Default to empty dict if not found

    # Give the map time to issue its network request
    time.sleep(2)

    # Gets all the logs from performance in Chrome
    logs = browser.get_log("performance")

    # Iterate the network logs to find the map location query request
    for log in logs:
        network_log = json.loads(log["message"])["message"]
        method = network_log["method"]

        if not ("Network.response" in method
                or "Network.request" in method
                or "Network.webSocket" in method):
            continue

        try:
            url = network_log["params"]["request"]["url"]
        except Exception:
            continue  # This event carries no request URL

        if "MapViewportInfoQuery" not in url:
            continue

        try:
            # The bounding box is URL-encoded JSON in the second-to-last
            # query parameter (position-based, as the parameter name is not
            # known here).
            loc = url.split("&")[-2].split("=", 1)[1]
            bounding_box = json.loads(unquote(loc))['request']['boundingBox']
            southwest, northeast = bounding_box['southwest'], bounding_box['northeast']

            lat = float((southwest['lat'] + northeast['lat']) / 2)
            lng = float((southwest['lng'] + northeast['lng']) / 2)
        except Exception:
            # Malformed request: try the next log entry rather than giving up
            continue

        # Assign both values together so callers never see a partial result
        coordinates["lat"], coordinates["lng"] = lat, lng
        break
    return coordinates

### Accept cookies on page

In [11]:
def accept_cookies():
    """Click the cookie banner's 'OK' button if it shows up.

    Relies on the module-level ``wait`` (a WebDriverWait created elsewhere
    in the notebook) and therefore must be called after ``wait`` exists.
    Failures are ignored on purpose: no banner simply means nothing to do.
    """
    try:
        # Wait for the cookie button to be clickable
        cookie_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'OK')]")))
        cookie_button.click()
    except Exception:
        # If the button never becomes clickable within the wait's timeout we
        # assume it didn't appear (or was already closed). Exception rather
        # than a bare except, so a kernel interrupt is not swallowed.
        pass


### Create the property object

In [12]:
def create_db_object(browser, region):
    """Build the document describing the property open in ``browser``.

    Runs every ``extract_*`` helper against the page and gathers the results
    in one dict. The coordinates are extracted last because that helper
    scrolls the page.

    Args:
        browser: Selenium WebDriver positioned on a property page.
        region: Region label stored under the 'region' key as given.

    Returns:
        A dict with the keys 'price', 'details', 'superhost',
        'guest_favorite', 'review_index', 'number_reviews', 'host',
        'characteristics' and 'region', plus 'location' when both
        coordinates could be extracted.
    """
    # Instantiate a dictionary to store the property attributes
    db_object = dict()

    db_object['price'] = extract_price(browser)
    db_object['details'] = extract_details(browser)
    db_object['superhost'] = extract_superhost(browser)
    db_object['guest_favorite'] = extract_guest_favorite(browser)

    review_index, number_reviews = extract_review_index_number_reviews(browser)
    db_object['review_index'] = review_index
    db_object['number_reviews'] = number_reviews

    db_object['host'] = extract_host(browser)
    db_object['characteristics'] = extract_characteristics(browser)

    # Only store a location when BOTH values are present; checking merely for
    # a non-empty dict would raise KeyError on a partial result.
    coordinates = extract_coordinates(browser)
    if 'lat' in coordinates and 'lng' in coordinates:
        # NOTE(review): GeoJSON Points are specified as [longitude, latitude],
        # but this stores [lat, lng]. The order is kept unchanged so existing
        # documents and readers stay consistent - confirm before relying on
        # geospatial queries.
        db_object['location'] = {
            'type': "Point",
            'coordinates': [coordinates['lat'], coordinates['lng']]
        }

    # Set the property region
    db_object['region'] = region

    return db_object

# Airbnb scraping process
The process of accessing the Airbnb website and scraping information about properties
### Initiate the browser driver and access the Airbnb website

In [None]:
# Create the webdriver object and set the options
options = webdriver.ChromeOptions()
# Enable Chrome's performance log; extract_coordinates reads it through
# browser.get_log("performance") to find the map request.
options.set_capability("goog:loggingPrefs", {"performance": "ALL"})
options.add_argument("--start-maximized")

browser = webdriver.Chrome(options=options)

browser.get('https://www.airbnb.com/')

# Set a driver wait which will wait for max 5 seconds when called.
# accept_cookies() reads this module-level `wait`, so it must be defined first.
wait = WebDriverWait(browser, 5)
accept_cookies()

# Keep the original window of the browser for future redirects to new tabs
# (the scraping loop switches back to it after closing each property tab)
original_window = browser.current_window_handle

### Setup MongoDB

In [14]:
# MongoDB info
# Read from environment variables so that credentials never have to be
# written into (and committed with) the notebook. The fallbacks are the
# notebook's original placeholder values, so behaviour is unchanged when the
# variables are not set.
mongo_host = os.environ.get("MONGO_HOST", "")
mongo_port = int(os.environ.get("MONGO_PORT", "0"))
mongo_database = os.environ.get("MONGO_DATABASE", "")
mongo_collection = os.environ.get("MONGO_COLLECTION", "")
mongo_username = os.environ.get("MONGO_USERNAME", "")
mongo_password = os.environ.get("MONGO_PASSWORD", "")

# Connect to MongoDB
client = pymongo.MongoClient(mongo_host, mongo_port, username=mongo_username, password=mongo_password)
db = client[mongo_database]
# `collection` is the insert target of the scraping loop
collection = db[mongo_collection]

### Navigate to the Thessaloniki (Kalamaria/Panorama/Neapoli-Sikies) properties listing page

In [15]:
# Find the location input of the search bar
elem = browser.find_element(By.ID, 'bigsearch-query-location-input')
# Region to search for. The same string is stored with every scraped
# property (it is passed to create_db_object in the scraping loop).
# Swap in one of the commented alternatives to scrape another region.
region = 'Kalamaria, Greece'
# region = 'Panorama, Greece'
# region = 'Neapoli-Sikies, Greece'
# Type the region and confirm it with RETURN
elem.send_keys(region + Keys.RETURN)

# Find and click the search button
elem = browser.find_element(By.XPATH, "//*[@data-testid='structured-search-input-search-button']")
elem.click()

### Scrape all properties listed in a page and repeat for multiple pages

In [16]:
db_objects = []
num_pages = 15

# Iterate through the result pages. Each property opens in a new tab, is
# scraped, and the tab is closed again; a page's properties are written to
# MongoDB in one batch before moving on.
for i in tqdm(range(num_pages)):
    # Wait until the page loads
    wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[@data-testid='card-container']")))
    time.sleep(3)

    # Find all listed properties specified by the data-testid
    elems = browser.find_elements(By.XPATH, "//*[@data-testid='card-container']")

    # For every property listed on the page
    for elem in tqdm(elems):

        # Click on the property
        elem.click()

        # Wait until the property is opened in a new tab
        wait.until(EC.number_of_windows_to_be(2))

        # Switch the browser to the new tab
        for window_handle in browser.window_handles:
            if window_handle != original_window:
                browser.switch_to.window(window_handle)
                break

        try:
            # Close the translation popup if it shows up. wait.until returns
            # the located element, so no second lookup is needed.
            try:
                close_button = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@d='m6 6 20 20M26 6 6 26']")))
                close_button.click()
            except Exception:
                pass  # No popup within the timeout: nothing to dismiss

            # Extract the property information and keep it for the batch insert
            db_objects.append(create_db_object(browser, region))
        finally:
            # Always close the property tab and return to the listings tab,
            # even when scraping fails, so the browser is never left with a
            # stray tab.
            browser.close()
            browser.switch_to.window(original_window)

    # Insert all objects of this page to MongoDB (skipped for an empty batch)
    if db_objects:
        collection.insert_many(db_objects)
    db_objects = []

    # Go to the next page; stop when there is no usable 'Next' link
    try:
        browser.find_element(By.XPATH, "//a[@aria-label='Next']").click()
    except Exception:
        break

  0%|          | 0/15 [00:00<?, ?it/s]
  0%|          | 0/18 [00:00<?, ?it/s][A
  6%|▌         | 1/18 [00:05<01:31,  5.39s/it][A
 11%|█         | 2/18 [00:14<02:00,  7.52s/it][A
 17%|█▋        | 3/18 [00:23<02:00,  8.01s/it][A
 22%|██▏       | 4/18 [00:31<01:53,  8.13s/it][A
 28%|██▊       | 5/18 [00:40<01:48,  8.35s/it][A
 33%|███▎      | 6/18 [00:48<01:41,  8.44s/it][A
 39%|███▉      | 7/18 [00:57<01:33,  8.54s/it][A
 44%|████▍     | 8/18 [01:05<01:25,  8.52s/it][A
 50%|█████     | 9/18 [01:14<01:17,  8.56s/it][A
 56%|█████▌    | 10/18 [01:23<01:09,  8.63s/it][A
 61%|██████    | 11/18 [01:32<01:00,  8.70s/it][A
 67%|██████▋   | 12/18 [01:40<00:52,  8.73s/it][A
 72%|███████▏  | 13/18 [01:49<00:43,  8.74s/it][A
 78%|███████▊  | 14/18 [01:58<00:35,  8.81s/it][A
 83%|████████▎ | 15/18 [02:07<00:26,  8.81s/it][A
 89%|████████▉ | 16/18 [02:16<00:17,  8.77s/it][A
 94%|█████████▍| 17/18 [02:24<00:08,  8.72s/it][A
100%|██████████| 18/18 [02:33<00:00,  8.54s/it][A
  7%|▋    