In [3]:
# all imports
import json
import os
import time
from multiprocessing import Pool

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [None]:
import requests
from bs4 import BeautifulSoup

In [4]:
def get_listings(search_page):
    """Download a search results page and return its listing containers.

    Parameters
    ----------
    search_page : str
        URL of the search results page to fetch.

    Returns
    -------
    The result of ``find_all`` for every ``div`` carrying the CSS class
    ``_8s3ctt`` in the downloaded HTML.
    """
    # timeout=5 keeps a stalled request from blocking the notebook forever.
    response = requests.get(search_page, timeout=5)
    parsed_page = BeautifulSoup(response.content, 'html.parser')
    return parsed_page.find_all('div', '_8s3ctt')

In [5]:
# Search results page for homes in Mayrhofen, Austria
# (check-in 2021-04-03, checkout 2021-04-10, as encoded in the query string).
airbnb_url = 'https://www.airbnb.com/s/Mayrhofen--Austria/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&flexible_trip_dates%5B%5D=april&flexible_trip_dates%5B%5D=march&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=calendar&query=Mayrhofen%2C%20Austria&place_id=ChIJbzLYLzjdd0cRDtGuTzM_vt4&checkin=2021-04-03&checkout=2021-04-10&source=structured_search_input_header&search_type=autocomplete_click'

In [6]:
listings = get_listings(airbnb_url)

In [7]:
len(listings)

0

In [None]:
print(listings[0].prettify())

##### Extract the data

In [None]:
# Extraction rules for one listing card on a search results page.
# Each key is the name of an output feature; each value tells
# extract_element() how to locate it:
#   'tag'   - HTML tag name to search for (required)
#   'class' - CSS class the tag must carry (optional)
#   'get'   - attribute to read; when absent the tag's text is used (optional)
#   'order' - index of the match to use, 0 when absent (optional)
# NOTE(review): the class names look auto-generated by the site and may
# change over time - confirm they still match the live markup.
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'name': {'tag': 'a', 'get': 'aria-label'},
    'header': {'tag': 'div', 'class': '_b14dlit'},
    'rooms': {'tag': 'div', 'class': '_kqh46o', 'order': 0},
    'facilities': {'tag': 'div', 'class': '_kqh46o', 'order': 1},
    'badge': {'tag': 'div', 'class': '_17bkx6k'},
    'rating_n_reviews': {'tag': 'span', 'class': '_18khxk1'},
    'price': {'tag': 'span', 'class': '_1p7iugi'},
    'superhost': {'tag': 'div', 'class': '_ufoy4t'},
}

In [None]:
def extract_element(listing_html, params):
    """Pull a single value out of ``listing_html`` according to ``params``.

    ``params`` must contain ``'tag'`` and may contain ``'class'`` (CSS class
    filter), ``'order'`` (index of the match to use, 0 when absent) and
    ``'get'`` (attribute to read instead of the element's text).

    Raises ``IndexError`` when fewer matches exist than ``'order'`` requires.
    """
    # Positional arguments for find_all: the tag, plus the class when given.
    search_args = [params['tag']]
    if 'class' in params:
        search_args.append(params['class'])
    matches = listing_html.find_all(*search_args)

    # Pick the requested match (first one by default).
    chosen = matches[params.get('order', 0)]

    # Attribute value when requested, otherwise the element's text.
    return chosen.get(params['get']) if 'get' in params else chosen.get_text()

In [None]:
extract_element(listings[0], RULES_SEARCH_PAGE['url'])

In [None]:
extract_element(listings[0], RULES_SEARCH_PAGE['name'])

In [None]:
def extract_page_features(soup, rules):
    """Apply every extraction rule to ``soup`` and collect the results.

    Parameters
    ----------
    soup
        Parsed HTML fragment that is handed to ``extract_element``.
    rules : dict
        Maps a feature name to the parameter dict for ``extract_element``.

    Returns
    -------
    dict
        Feature name -> extracted value, or the string ``'empty'`` when the
        extraction of that feature failed.
    """
    features_dict = {}
    for feature, params in rules.items():
        try:
            features_dict[feature] = extract_element(soup, params)
        except Exception:
            # A feature that cannot be extracted is recorded as 'empty'.
            # Catching Exception instead of using a bare except lets
            # KeyboardInterrupt / SystemExit still stop a long scrape.
            features_dict[feature] = 'empty'

    return features_dict

In [None]:
extract_page_features(listings[0], RULES_SEARCH_PAGE)

##### Pagination

In [None]:
def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    """Return the paginated variants of a search URL.

    One URL is produced per page; page ``i`` gets ``&items_offset=`` followed
    by ``listings_per_page * i`` appended to ``main_url``.
    """
    offsets = (listings_per_page * page_no for page_no in range(pages_per_location))
    return [main_url + f'&items_offset={offset}' for offset in offsets]

In [None]:
url_list = build_urls(airbnb_url)

In [None]:
url_list

##### Scrape search pages

In [None]:
def process_search_pages(url_list):
    """Scrape every search page in ``url_list``.

    Returns a flat list with one feature dict (built from
    ``RULES_SEARCH_PAGE``) per listing, in page order.
    """
    return [
        extract_page_features(listing, RULES_SEARCH_PAGE)
        for page in url_list
        for listing in get_listings(page)
    ]

In [None]:
# try for one page
base_features = process_search_pages(url_list[4:5])

In [None]:
base_features

## 2. Dynamic pages

Let's inspect a listing's detail page and try to extract one of its elements with plain `requests`.

In [None]:
# listing name: div, _xcsyj0
detail_url = 'https://airbnb.com' + base_features[0]['url']

# Same timeout and explicit parser as get_listings(): without a timeout a
# stalled request hangs the cell, and without a parser name BeautifulSoup
# picks whichever parser happens to be installed.
answer = requests.get(detail_url, timeout=5)
detail_soup = BeautifulSoup(answer.content, 'html.parser')

In [None]:
detail_soup.find_all('div', '_xcsyj0')

In [None]:
# some JS functions inside
detail_soup

### Selenium

Selenium drives a real browser, so we also have to install ChromeDriver.

In [None]:
from selenium import webdriver

In [None]:
# initialize the driver
driver = webdriver.Chrome()

In [None]:
# open the page
driver.get(detail_url)

In [None]:
# get html
page_detailed = driver.page_source

# close the driver
driver.quit()

# BS - name the parser explicitly so the result does not depend on which
# optional parsers are installed
detail_soup = BeautifulSoup(page_detailed, 'html.parser')

In [None]:
detail_soup.find_all('div', '_xcsyj0')

### Buttons, Loading time

In [None]:
# we can click the buttons
driver = webdriver.Chrome()

driver.get(detail_url)

In [None]:
# amenities button
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
# which no longer exists in Selenium 4.3+
element = driver.find_element(By.CLASS_NAME, '_13e0raay')

In [None]:
element.click()

In [None]:
# don't forget to stop the driver
driver.quit()

In [None]:
# amenities button
driver = webdriver.Chrome()

driver.get(detail_url)
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
# which no longer exists in Selenium 4.3+
driver.find_element(By.CLASS_NAME, '_13e0raay').click()

In [None]:
driver.quit()

In [None]:
driver = webdriver.Chrome()

driver.get(detail_url)
# give the page time to render before looking for the button
time.sleep(7)
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
# which no longer exists in Selenium 4.3+
driver.find_element(By.CLASS_NAME, '_13e0raay').click()

In [None]:
page_detailed = driver.page_source
driver.quit()
# explicit parser: keeps the parse independent of which parsers are installed
detail_soup_clicked = BeautifulSoup(page_detailed, 'html.parser')

In [None]:
# no amenities before
amenities = detail_soup.find_all('div', {'class': '_aujnou'})
len(amenities)

In [None]:
# have them now
amenities = detail_soup_clicked.find_all('div', {'class': '_aujnou'})
len(amenities)

In [None]:
options = Options()
options.add_argument('--blink-settings=imagesEnabled=false')
driver = webdriver.Chrome(options=options)

driver.get(detail_url)

In [None]:
driver.quit()

In [None]:
driver = webdriver.Chrome()

detail_url = 'https://www.airbnb.com/rooms/31741201?adults=4&check_in=2021-04-06&check_out=2021-04-13&federated_search_id=7941b65b-bf17-47dc-8fe0-bce247d0657e&source_impression_id=p3_1613151799_D%2BvOz7MMKLyJexNa&guests=1'
driver.get(detail_url)

In [None]:
# looking for the button element
# find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
# which no longer exists in Selenium 4.3+
element = driver.find_element(By.CLASS_NAME, '_gby1jkw')

In [None]:
# that doesn't work (in some cases)
element.click()

In [None]:
from selenium.webdriver import ActionChains

In [None]:
# let's try without seeing the button
actions = ActionChains(driver)
actions.move_to_element(element)
actions.click().perform()

In [None]:
# let's scroll manually
actions = ActionChains(driver)
actions.move_to_element(element)
actions.click().perform()

### Scrolling with Selenium

In [None]:
actions = ActionChains(driver)
driver.execute_script("arguments[0].scrollIntoView(true);", element)

In [None]:
# finally clicking
actions.move_to_element(element)
actions.click().perform()

In [None]:
# or
element.click()

In [None]:
driver.quit()

In [None]:
# Next Generation :)
def extract_element(listing_html, params):
    """Extract one value, or all values joined, from ``listing_html``.

    Parameters
    ----------
    listing_html
        Parsed HTML object offering ``find_all``.
    params : dict
        ``'tag'`` (required), ``'class'`` (optional CSS class filter),
        ``'get'`` (optional attribute to read instead of the text) and
        ``'order'`` (optional; index of the match to return, 0 when absent,
        or -1 to return every match joined with ``'**__**'``).

    Returns
    -------
    The selected text/attribute value, or the joined string for order -1.

    Raises
    ------
    IndexError
        When nothing matched, or ``'order'`` points past the last match.
    """
    # 1. Find the right tags
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # 2. Extract the text (or the requested attribute) from these tags
    if 'get' in params:
        element_texts = [el.get(params['get']) for el in elements_found]
    else:
        element_texts = [el.get_text() for el in elements_found]

    # No match at all is an error even for order == -1, so that a caller
    # catching the exception can tell "missing" apart from an empty string.
    if not element_texts:
        raise IndexError('no element matched the extraction rule')

    # 3. Select a particular text or concatenate all of them
    tag_order = params.get('order', 0)
    if tag_order == -1:
        return '**__**'.join(element_texts)
    return element_texts[tag_order]

In [None]:
extract_element(detail_soup_clicked, {'tag': 'div', 'class': '_u827kd', 'order': 0})

In [None]:
extract_element(detail_soup_clicked, {'tag': 'div', 'class': '_u827kd', 'order': -1})

### Process amenities

In [None]:
amenities[0]

In [None]:
amenities[0].find('div', '_1crk6cd').get_text()

In [None]:
amenities[0].find_all('div', '_1dotkqq')

In [None]:
# sometimes there are more elements within
[a.get_text() for a in amenities[1].find_all('div', '_1dotkqq')]

In [None]:
# let's not get deeper
[a.find(text=True) for a in amenities[1].find_all('div', '_1dotkqq')]

##### Wrap in a function

In [None]:
import json

def extract_amenities(soup):
    """Collect the amenity groups of a detail page as a JSON string.

    Every ``div`` with class ``_aujnou`` is treated as one group: the text of
    its ``_1crk6cd`` child is the group header and the first text node of
    each ``_1dotkqq`` child is one value.

    Returns
    -------
    str
        JSON object mapping ``'amenity_' + header`` to the list of values;
        ``'{}'`` when the page has no amenity groups.
    """
    amenities_dict = {}
    for amenity in soup.find_all('div', {'class': '_aujnou'}):
        header_tag = amenity.find('div', {'class': '_1crk6cd'})
        if header_tag is None:
            # find() returns None when the group has no header element;
            # skip the group instead of failing with AttributeError.
            continue

        # find(text=True) yields only the first text node, so nested
        # elements inside a value are not included.
        values = [
            value_tag.find(text=True)
            for value_tag in amenity.find_all('div', {'class': '_1dotkqq'})
        ]
        amenities_dict['amenity_' + header_tag.get_text()] = values

    return json.dumps(amenities_dict)

In [None]:
extract_amenities(detail_soup_clicked)

### All features

In [None]:
# Extraction rules for a listing's detail page, in the format consumed by
# extract_element(): 'tag' and 'class' select the elements, 'order' picks
# one match by index, and 'order': -1 returns all matches joined with '**__**'.
# NOTE(review): the paired rules (specialties_1/2, prices_1/2) look like
# alternative class names for the same information - confirm.
RULES_DETAIL_PAGE = {
    'location': {'tag': 'span', 'class': '_jfp88qr'},
    
    'specialties_1': {'tag': 'div', 'class': 't1bchdij', 'order': -1},
    'specialties_2': {'tag': 'div', 'class': '_1qsawv5', 'order': -1},

    'price_per_night': {'tag': 'div', 'class': '_ymq6as'},
    
    'refundables': {'tag': 'div', 'class': '_cexc0g', 'order': -1},
        
    'prices_1': {'tag': 'li', 'class': '_ryvszj', 'order': -1},
    'prices_2': {'tag': 'li', 'class': '_adhikmk', 'order': -1},
    
    'listing_ratings': {'tag': 'span', 'class': '_4oybiu', 'order': -1},
    
    'host_joined': {'tag': 'div', 'class': '_1fg5h8r', 'order': 1},
    'host_feats': {'tag': 'span', 'class': '_pog3hg', 'order': -1},
    
    'lang_responses': {'tag': 'li', 'class': '_1q2lt74', 'order': -1},
    'house_rules': {'tag': 'div', 'class': '_u827kd', 'order': -1},
}

In [None]:
def extract_soup_js(listing_url, waiting_time=[3, 1]):
    """Extracts HTML from JS pages: open, wait, click, wait, extract

    Parameters
    ----------
    listing_url : str
        URL of the page to open in headless Chrome.
    waiting_time : sequence of two numbers
        Seconds to sleep after opening the page and, later, right before the
        page source is read. The default list is only read, never modified.

    Returns
    -------
    BeautifulSoup
        The rendered page parsed with ``html.parser``.
    """

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--blink-settings=imagesEnabled=false')
    driver = webdriver.Chrome(options=options)

    # try/finally guarantees the browser is shut down even when loading or
    # scripting the page raises; otherwise every failure leaks a Chrome process.
    try:
        driver.get(listing_url)
        time.sleep(waiting_time[0])

        # looking for price details (the button is optional)
        try:
            element = driver.find_element(By.CLASS_NAME, '_gby1jkw')
        except Exception:
            element = None

        # if the element is present - click on it
        if element is not None:
            for _ in range(10):  # 10 attempts to scroll to the price button
                try:
                    actions = ActionChains(driver)
                    driver.execute_script("arguments[0].scrollIntoView(true);", element)
                    actions.move_to_element_with_offset(element, 5, 5)
                    actions.click().perform()
                    break
                except Exception:
                    # best effort: try again until the attempts run out
                    pass

        driver.execute_script("window.scrollTo(0, 0);")
        try:
            driver.find_element(By.CLASS_NAME, '_13e0raay').click()
        except Exception:
            pass  # amenities button not found

        time.sleep(waiting_time[1])

        detail_page = driver.page_source
    finally:
        driver.quit()

    return BeautifulSoup(detail_page, features='html.parser')

In [None]:
# Scrape single detail page
def process_detail_page(url, waiting_time=(3, 1)):
    """Scrape one detail page and return its features.

    Parameters
    ----------
    url : str
        Detail page to open.
    waiting_time : sequence of two numbers
        Passed through to ``extract_soup_js``; defaults to the values this
        function previously hard-coded.

    Returns
    -------
    list
        A one-element list holding the feature dict: every
        ``RULES_DETAIL_PAGE`` feature plus ``'amenities'``.
    """
    soup = extract_soup_js(url, waiting_time=waiting_time)

    features = extract_page_features(soup, RULES_DETAIL_PAGE)
    features['amenities'] = extract_amenities(soup)

    return [features]

##### Measuring time

In [None]:
t0 = time.time()
detail_features = process_detail_page(detail_url)
print(time.time() - t0)

In [None]:
detail_features

In [None]:
# CPU intensive process -> use multiprocessing :)
from multiprocessing import Pool

In [None]:
# typically we could set "n" to the number of cpu's
import os
os.cpu_count()

In [None]:
listings_urls = ['https://www.airbnb.com'+l['url'] for l in base_features]

In [None]:
len(listings_urls)

In [None]:
# check the ratio of empty values
def check_empty(features, expected_empty_per_listing=2):
    """Return the share of feature values that are ``'empty'``.

    Parameters
    ----------
    features : list
        One entry per listing; the first item of each entry is that
        listing's feature dict.
    expected_empty_per_listing : int
        Number of fields that are expected to be ``'empty'`` in every
        listing and therefore excluded from both counts. The default of 2
        follows the original note: 2 prices (-1) and 2 specialties (-1).

    Returns
    -------
    float
        ``empty / total`` after the per-listing correction, or ``0.0`` when
        there is nothing left to count.
    """
    cnt, cnt_empty = 0, 0
    for listing in features:
        values = list(listing[0].values())
        n_empty = sum(value == 'empty' for value in values)
        # The correction is applied to every listing, not once per run,
        # and never pushes a listing's empty count below zero.
        cnt += len(values) - expected_empty_per_listing
        cnt_empty += max(n_empty - expected_empty_per_listing, 0)

    if cnt <= 0:
        return 0.0
    return cnt_empty / cnt

In [None]:
for n_pools in [4,8]:
    t0 = time.time()

    # pool.map() returns only after every URL is processed and the with-block
    # shuts the pool down on exit, so no extra close()/join() is needed.
    with Pool(n_pools) as pool:
        result = pool.map(process_detail_page, listings_urls)

    print(f"n_pool={n_pools}\n\ttime={round(time.time() - t0, 2)}\n\tempty_ratio={round(check_empty(result), 2)}")

In [None]:
def process_detail_page(url, waiting_time=(5, 2)):
    """Scrape one detail page, using longer default waits than before.

    Parameters
    ----------
    url : str
        Detail page to open.
    waiting_time : sequence of two numbers
        Passed through to ``extract_soup_js``; defaults to the values this
        function previously hard-coded.

    Returns
    -------
    list
        A one-element list holding the feature dict: every
        ``RULES_DETAIL_PAGE`` feature plus ``'amenities'``.
    """
    soup = extract_soup_js(url, waiting_time=waiting_time)

    features = extract_page_features(soup, RULES_DETAIL_PAGE)
    features['amenities'] = extract_amenities(soup)

    return [features]

In [None]:
# and repeat for 8 pools only
for n_pools in [8]:
    t0 = time.time()

    # pool.map() returns only after every URL is processed and the with-block
    # shuts the pool down on exit, so no extra close()/join() is needed.
    with Pool(n_pools) as pool:
        result = pool.map(process_detail_page, listings_urls)

    print(f"n_pool={n_pools}\n\ttime={round(time.time() - t0, 2)}\n\tempty_ratio={round(check_empty(result), 2)}")

## Summary

In [None]:
rooms_dirty = '7 guests · 4 bedrooms · 4 beds · 3 baths'

In [None]:
rooms_dirty.split(' · ')

In [None]:
lang_responses = 'Languages: English, Deutsch**__**Response rate: 100%**__**Response time: within an hour'

In [None]:
lang_responses.split('**__**')