In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import numpy as np

In [2]:
# Search-results URL used as the entry point for the scrape. The query string
# asks for homes in Bentonville, Arkansas with adults=8, checkin=2022-05-13 and
# checkout=2022-05-15; pagination offsets are appended to it by build_urls().
airbnb_url = 'https://www.airbnb.com/s/Bentonville--Arkansas--United-States/homes?adults=8&tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Bentonville%2C%20Arkansas%2C%20United%20States&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&search_type=autocomplete_click&checkin=2022-05-13&checkout=2022-05-15&source=structured_search_input_header&place_id=ChIJSUnWTgAQyYcRAr8lJMiMgNo'

def get_listings(search_page):
    """Download one search-results page and return its listing elements.

    Args:
        search_page: URL of the search-results page to fetch.

    Returns:
        The result of ``find_all`` for ``div`` tags with the class string
        'cm4lcvy dir dir-ltr'; this notebook treats each match as one listing.

    Raises:
        Whatever ``requests.get`` raises, e.g. when the 5-second timeout expires.
    """
    # Fetch the page that was asked for, exactly once. The previous version
    # discarded this response and parsed a second, timeout-less request to the
    # global ``airbnb_url``, so every paginated URL produced page-one listings.
    answer = requests.get(search_page, timeout=5)
    soup = BeautifulSoup(answer.content, 'html.parser')

    return soup.find_all('div', 'cm4lcvy dir dir-ltr')

# Fetch the base search page and report how many listing elements were matched.
listings = get_listings(airbnb_url)

print(f'Number of listings per page: {len(listings)}\n\n')

Number of listings per page: 20




In [8]:
# Extraction rules for one listing on the search page, keyed by output feature.
# Each rule is read by extract_element():
#   'tag'   - tag name passed to find_all (required)
#   'class' - class string passed as the second find_all argument (optional)
#   'get'   - attribute to read from the element; without it the element's text is used
#   'order' - index of the match to keep when several are found (defaults to 0)
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'name': {'tag': 'span', 'class': 'ts5gl90 tl3qa0j t1nzedvd dir dir-ltr'},
    'header': {'tag': 'div', 'class': 'cuu4odx c1frjvtt dir dir-ltr'},
    # guests / rooms / beds / baths share one class and are told apart by position.
    'guests': {'tag': 'span', 'class': 'mp2hv9t dir dir-ltr', 'order': 0},
    'rooms': {'tag': 'span', 'class': 'mp2hv9t dir dir-ltr', 'order': 1},
    'beds': {'tag': 'span', 'class': 'mp2hv9t dir dir-ltr', 'order': 2},
    'baths': {'tag': 'span', 'class': 'mp2hv9t dir dir-ltr', 'order': 3},
    'facilities': {'tag': 'div', 'class': 'i1wgresd dir dir-ltr', 'order': 1},
    'badge': {'tag': 'div', 'class': 'fcg8kp6 dir dir-ltr'},
    'rating': {'tag': 'span', 'class': 'rpz7y38 dir dir-ltr'},
    'review_count': {'tag': 'span', 'class': 'r1xr6rtg dir dir-ltr'},
    'price': {'tag': 'span', 'class': '_tyxjp1'},
}


def extract_element(listing_html, params):
    """Pull a single value out of a parsed HTML fragment according to a rule.

    Args:
        listing_html: object exposing ``find_all`` (a parsed page or listing).
        params: rule dict with a required 'tag' and optional 'class', 'order'
            (index of the match to use, default 0) and 'get' (attribute name).

    Returns:
        The attribute named by 'get' when the rule has one, otherwise the
        text of the selected element.

    Raises:
        IndexError: when fewer matches exist than 'order' requires.
    """
    # Build the find_all arguments: the tag always, the class only if the rule names one.
    search_args = [params['tag']]
    if 'class' in params:
        search_args.append(params['class'])
    matches = listing_html.find_all(*search_args)

    # Pick the requested match (first one unless the rule says otherwise).
    chosen = matches[params.get('order', 0)]

    # Read either an attribute or the visible text.
    return chosen.get(params['get']) if 'get' in params else chosen.get_text()


def extract_page_features(soup, rules):
    """Apply every extraction rule to ``soup`` and collect the results.

    Args:
        soup: parsed HTML handed unchanged to ``extract_element``.
        rules: dict mapping a feature name to its rule dict.

    Returns:
        Dict with one entry per feature in ``rules``. A feature whose
        extraction fails is stored as the string 'empty'.
    """
    features_dict = {}
    for feature, rule in rules.items():
        try:
            features_dict[feature] = extract_element(soup, rule)
        except Exception:
            # A missing element is expected on many pages, so fall back to the
            # placeholder. Catching Exception rather than using a bare
            # ``except`` lets KeyboardInterrupt/SystemExit stop a long scrape.
            features_dict[feature] = 'empty'

    return features_dict

# Try the search-page rules on the first listing to check what they extract.
extract_page_features(listings[0], RULES_SEARCH_PAGE)

{'url': '/rooms/585336259071491593?adults=8&children=0&infants=0&check_in=2023-02-24&check_out=2023-02-26&previous_page_section_name=1000&federated_search_id=9a5b8dd4-e7e2-4987-9d19-121e7678c84f',
 'name': 'Pink Door ✰ Coler MTN Bike Preserve ✰ Family Flow',
 'header': 'Entire residential home in Bentonville',
 'guests': '8 guests',
 'rooms': '4 bedrooms',
 'beds': '5 beds',
 'baths': '3 baths',
 'facilities': 'Wifi · Kitchen · Free parking',
 'badge': 'SUPERHOST',
 'rating': 'empty',
 'review_count': 'empty',
 'price': '$189'}

In [10]:
##### Pagination
def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    """Create the list of paginated search URLs for one location.

    Args:
        main_url: base search URL; '&items_offset=<n>' is appended to it.
        listings_per_page: step between consecutive offsets.
        pages_per_location: how many URLs to generate.

    Returns:
        List of ``pages_per_location`` URLs with offsets 0, step, 2*step, ...
    """
    return [
        main_url + f'&items_offset={listings_per_page * page_index}'
        for page_index in range(pages_per_location)
    ]

# Build the paginated URLs with the defaults (15 pages, offset step of 20).
url_list = build_urls(airbnb_url)

url_list

['https://www.airbnb.com/s/Bentonville--Arkansas--United-States/homes?adults=8&tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Bentonville%2C%20Arkansas%2C%20United%20States&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&search_type=autocomplete_click&checkin=2022-05-13&checkout=2022-05-15&source=structured_search_input_header&place_id=ChIJSUnWTgAQyYcRAr8lJMiMgNo&items_offset=0',
 'https://www.airbnb.com/s/Bentonville--Arkansas--United-States/homes?adults=8&tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Bentonville%2C%20Arkansas%2C%20United%20States&flexible_trip_lengths%5B%5D=weekend_trip&date_picker_type=flexible_dates&search_type=autocomplete_click&checkin=2022-05-13&checkout=2022-05-15&source=structured_search_input_header&place_id=ChIJSUnWTgAQyYcRAr8lJMiMgNo&items_offset=20',
 'https://www.airbnb.com/s/Bentonville--Arkansas--United-States/homes?adults=8&tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&query=Bentonville%2C%20Arkansas%2C%20Unite

In [41]:
def process_search_pages(url_list):
    """Scrape every search page and return one feature dict per listing.

    Args:
        url_list: iterable of search-page URLs, visited in order.

    Returns:
        Flat list of the dicts produced by ``extract_page_features`` with
        ``RULES_SEARCH_PAGE``, in page order and then listing order.
    """
    return [
        extract_page_features(listing, RULES_SEARCH_PAGE)
        for page in url_list
        for listing in get_listings(page)
    ]

# Scrape every URL in url_list (all paginated search pages, not just one).

base_features = process_search_pages(url_list)

base_features

[{'url': '/rooms/585336259071491593?adults=8&children=0&infants=0&check_in=2023-01-20&check_out=2023-01-22&previous_page_section_name=1000&federated_search_id=37d5601f-55dd-4386-97d8-a75e064594eb',
  'name': 'Pink Door ✰ Coler MTN Bike Preserve ✰ Family Flow',
  'header': 'Entire residential home in Bentonville',
  'guests': '8 guests',
  'rooms': '4 bedrooms',
  'beds': '5 beds',
  'baths': '3 baths',
  'facilities': 'Wifi · Kitchen · Free parking',
  'badge': 'SUPERHOST',
  'rating': 'empty',
  'review_count': 'empty',
  'price': '$189'},
 {'url': '/rooms/47883859?adults=8&children=0&infants=0&check_in=2023-02-10&check_out=2023-02-12&previous_page_section_name=1000&federated_search_id=37d5601f-55dd-4386-97d8-a75e064594eb',
  'name': 'the PALMETTO/Near future WMHO/Greenway/8th St mkt',
  'header': 'Entire residential home in Bentonville',
  'guests': '14 guests',
  'rooms': '4 bedrooms',
  'beds': '7 beds',
  'baths': '3 baths',
  'facilities': 'Wifi · Kitchen · Free parking · Self ch

In [12]:
# Turn the list of per-listing feature dicts into a table: one row per listing,
# one column per feature key.
landing_page_df = pd.DataFrame.from_dict(base_features, orient='columns')

landing_page_df

Unnamed: 0,url,name,header,guests,rooms,beds,baths,facilities,badge,rating,review_count,price
0,/rooms/585336259071491593?adults=8&children=0&...,Pink Door ✰ Coler MTN Bike Preserve ✰ Family Flow,Entire residential home in Bentonville,8 guests,4 bedrooms,5 beds,3 baths,Wifi · Kitchen · Free parking,SUPERHOST,empty,empty,$189
1,/rooms/47883859?adults=8&children=0&infants=0&...,the PALMETTO/Near future WMHO/Greenway/8th St mkt,Entire residential home in Bentonville,14 guests,4 bedrooms,7 beds,3 baths,Wifi · Kitchen · Free parking · Self check-in,SUPERHOST,4.99,(69 reviews),$197
2,/rooms/47701526?adults=8&children=0&infants=0&...,"Pool/HotTub, Firepit Mile to Slaughter Pen & town",Entire residential home in Bentonville,9 guests,3 bedrooms,6 beds,2 baths,Wifi · Kitchen · Free parking · Self check-in,SUPERHOST,4.97,(91 reviews),$438
3,/rooms/39864150?adults=8&children=0&infants=0&...,"Pintell Trails: Spacious 4 Bed, 3 Bath w/ Garage",Entire residential home in Bentonville,8 guests,4 bedrooms,4 beds,3 baths,Wifi · Kitchen · Free parking · Self check-in,empty,empty,empty,$180
4,/rooms/583586116688149847?adults=8&children=0&...,[!!NEW!!] •The Hummingbird House•,Entire residential home in Bentonville,10 guests,3 bedrooms,6 beds,2 baths,Wifi · Kitchen · Free parking · Self check-in,empty,empty,empty,$165
...,...,...,...,...,...,...,...,...,...,...,...,...
295,/rooms/53842760?adults=8&children=0&infants=0&...,The London Calling ★ 3BD w/ POOL ★ Mins to Tra...,Entire residential home in Bentonville,9 guests,3 bedrooms,7 beds,2 baths,Wifi · Kitchen · Free parking,SUPERHOST,5.0,(4 reviews),$316
296,/rooms/51085138?adults=8&children=0&infants=0&...,T&L ⭐️ The Louise⭐️ 🚲 Right by Tristan Trail 🏠...,Entire townhouse in Bentonville,10 guests,3 bedrooms,5 beds,2 baths,Wifi · Kitchen · Free parking · Self check-in,SUPERHOST,4.92,(12 reviews),$134
297,/rooms/48225180?adults=8&children=0&infants=0&...,Bentonville Playhouse the perfect location for...,Entire residential home in Bentonville,8 guests,4 bedrooms,5 beds,2 baths,Wifi · Kitchen · Free parking · Self check-in,SUPERHOST,5.0,(40 reviews),$170
298,/rooms/42336443?adults=8&children=0&infants=0&...,MOD•HAUS,Entire residential home in Bentonville,9 guests,3 bedrooms,5 beds,2 baths,Wifi · Kitchen · Free parking · Self check-in,SUPERHOST,4.98,(88 reviews),$187


In [13]:
# Fetch one listing's detail page over plain HTTP and print the text of its
# <h2 class="_14i3z6h"> heading.
# listing name: div, _xcsyj0
detail_url = 'https://airbnb.com' + base_features[0]['url']
print(f'{detail_url}\n')

# Bound the request so a stalled connection cannot hang the cell.
answer = requests.get(detail_url, timeout=10)
print(f'{answer}\n')

# Name the parser explicitly (as the other cells do) so the result does not
# depend on which optional parser libraries happen to be installed.
detail_page_soup = BeautifulSoup(answer.content, 'html.parser')

detail_soup = detail_page_soup.find('h2', '_14i3z6h')

# some JS functions inside
if detail_soup is None:
    # find() returns None when nothing matches; iterating it would raise TypeError.
    print('No h2 with class _14i3z6h found in the static HTML')
else:
    for a in detail_soup:
        print(a.text)

https://airbnb.com/rooms/585336259071491593?adults=8&children=0&infants=0&check_in=2023-03-03&check_out=2023-03-05&previous_page_section_name=1000&federated_search_id=81494fa7-69be-49e0-8220-f98a6115a434

<Response [200]>

Entire residential home hosted by Pink Door BnB


In [35]:
# Render the detail page in Chrome so that JavaScript-generated content is
# present, then print the text of every <span class="ll4r2nl dir dir-ltr">.
ser = Service("/usr/local/bin/chromedriver")
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

try:
    #opening page
    # NOTE: driver.get() returns None, so `reviews` holds (and prints) None.
    reviews = driver.get(detail_url)

    print(reviews)
    time.sleep(10)

    # getting html
    page_detailed = driver.page_source
finally:
    # closing the driver, even if loading or reading the page raised, so no
    # browser process is left behind
    driver.quit()

# Explicit parser, consistent with the other BeautifulSoup calls in this notebook.
detail_soup = BeautifulSoup(page_detailed, 'html.parser')

detail_soup = detail_soup.find_all('span', 'll4r2nl dir dir-ltr')

for a in detail_soup:
    print(f'{a.text}\n')

None
Family Flow - named for a favorite nearby Coler Mountain Bike Trail - is a colorful, fun filled home perfect for a family or fun group of friends. Enjoy a professionally decorated space, inspired by the art and nature Bentonville is famous for. Each room is unique and certain to ensure every person in your party will be happy with their accomodations. 

Enjoy comfy, 100% cotton bedding for a great night's sleep, a fully stocked kitchen (just bring the snacks!), a private bedroom upstairs (with games and TV) for kids to have their own unique space to enjoy, 3 full bathrooms, a smart TV, a gas fireplace and a fully stocked back deck that overlooks the street entrance to Coler MTN Bike Preserve. 

Walk to the Gorgeous Airship coffee house located within the nature preserve. 5 minutes drive to Bentonville Town Square. 

Book now for Coler specific events - this home is the closest Airbnb to Coler! We are inclusive and do no discriminate - all are welcome!

This host has 251 reviews fo

In [15]:
# amenities button
# Open a new Chrome session on the detail page, wait 20 seconds for it to load,
# then click the first link whose text contains 'amenities'.
# This cell does not quit the driver: a later cell reads driver.page_source.
ser = Service("/usr/local/bin/chromedriver")
op = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=ser, options=op)

driver.get(detail_url)
time.sleep(20)
driver.find_element(by=By.PARTIAL_LINK_TEXT, value='amenities').click()

In [16]:
# Wait BEFORE taking the snapshot so content rendered after the click is
# included; sleeping after reading page_source could not affect the captured HTML.
time.sleep(20)
page_detailed = driver.page_source
# Explicit parser, consistent with the other BeautifulSoup calls in this notebook.
# NOTE(review): the Chrome session opened for the click is still running here.
detail_soup_clicked = BeautifulSoup(page_detailed, 'html.parser')

In [17]:
# Collect the text of every <div class="_gw4xx4"> in the post-click snapshot.
# No sleep is needed: detail_soup_clicked is already-parsed HTML and cannot
# change while we wait.
amenities = detail_soup_clicked.find_all('div', {'class': '_gw4xx4'})

amenities_all = [amenity.text for amenity in amenities]

amenities_all

['Hair dryer',
 'Shampoo',
 'Hot water',
 'Washer',
 'Dryer',
 'Essentials',
 'Hangers',
 'Bed linens',
 'Room-darkening shades',
 'Iron',
 'TV',
 'Pack ’n play/Travel crib',
 'Children’s books and toys',
 'High chair',
 'Air conditioning',
 'Indoor fireplace',
 'Heating',
 'Smoke alarm',
 'Carbon monoxide alarm',
 'Fire extinguisher',
 'First aid kit',
 'Wifi',
 'Dedicated workspace',
 'Kitchen',
 'Refrigerator',
 'Microwave',
 'Cooking basics',
 'Dishes and silverware',
 'Dishwasher',
 'Stove',
 'Oven',
 'Coffee maker',
 'Patio or balcony',
 'Outdoor furniture',
 'Hammock',
 'Outdoor dining area',
 'BBQ grill',
 'Free parking on premises',
 'Free street parking',
 'Pets allowed',
 'Long term stays allowed',
 'Unavailable: Security cameras on propertySecurity cameras on property',
 'Unavailable: Private entrancePrivate entrance']

In [20]:

def extract_element(listing_html, params):
    """Extract one value, or all values joined, from parsed HTML using a rule.

    This definition replaces the earlier ``extract_element`` in this notebook
    and adds the ``'order': -1`` "join everything" mode.

    Args:
        listing_html: object exposing ``find_all`` (a parsed page or listing).
        params: rule dict with a required 'tag' and optional keys:
            'class' - class string passed to ``find_all``;
            'get'   - attribute to read instead of the element text;
            'order' - index of the match to return (default 0), or -1 to
                      return every match joined with ' // '.

    Returns:
        The selected attribute/text, or the ' // '-joined string for order -1.

    Raises:
        IndexError: when nothing matches or 'order' is out of range. Callers
            such as ``extract_page_features`` rely on this to record 'empty'.
    """
    # 1. Find the right tag
    if 'class' in params:
        elements_found = listing_html.find_all(params['tag'], params['class'])
    else:
        elements_found = listing_html.find_all(params['tag'])

    # Keep "no match" a failure in every mode, including order -1, where
    # joining an empty list would otherwise silently yield ''.
    if not elements_found:
        raise IndexError('no element matches the extraction rule')

    # 2. Extract text (or the requested attribute) from these tags
    if 'get' in params:
        element_texts = [el.get(params['get']) for el in elements_found]
    else:
        element_texts = [el.get_text() for el in elements_found]

    # 3. Select a particular text or concatenate all of them
    tag_order = params.get('order', 0)
    if tag_order == -1:
        return ' // '.join(element_texts)
    return element_texts[tag_order]



In [21]:
# Attempting house rules: take the text of the first
# <div class="c1lue5su dir dir-ltr"> in the post-click snapshot.
house_rules = extract_element(detail_soup_clicked, {'tag': 'div', 'class': 'c1lue5su dir dir-ltr', 'order': 0})

house_rules

'House rulesCheck-in: FlexibleCheckout: 11:00 AMNo smokingNo parties or eventsPets are allowedShow more'

In [25]:
# Extraction rules for a listing's detail page, in the format read by
# extract_element(): 'tag' and optional 'class' select elements, 'order' picks
# one match by index, and 'order': -1 joins the text of all matches with ' // '.
# NOTE(review): the recorded run returned 'empty' for 'host_joined' and
# 'house_rules'; the house-rules cell above used class 'c1lue5su dir dir-ltr'
# rather than '_u827kd' - confirm which selector is current.
RULES_DETAIL_PAGE = {
    'location': {'tag': 'span', 'class': '_8vvkqm3'},
    'specialties_1': {'tag': 'div', 'class': '_1qsawv5', 'order': -1},
    'host_joined': {'tag': 'li', 'class': '_194e2vt2', 'order': 1},
    'house_rules': {'tag': 'div', 'class': '_u827kd', 'order': 0},
}


def extract_soup_js(listing_url, waiting_time=(3, 1)):
    """Extracts HTML from JS pages: open, wait, click, wait, extract.

    Args:
        listing_url: URL opened in a headless Chrome session.
        waiting_time: two-item sequence of seconds. Item 0 is slept after the
            page is opened, item 1 after the click attempt. Lists and tuples
            both work; the default is a tuple so it cannot be mutated
            between calls.

    Returns:
        BeautifulSoup built from the driver's page source with 'html.parser'.
    """
    # Local import: the notebook's import cell only brings in
    # NoSuchElementException, while a click can fail in other ways too.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--blink-settings=imagesEnabled=false')
    driver = webdriver.Chrome(options=options)

    try:
        driver.get(listing_url)
        time.sleep(waiting_time[0])

        driver.execute_script("window.scrollTo(0, 0);")
        try:
            # A class-name locator takes a single class, so the space-separated
            # value used before never matched. A CSS selector requiring all
            # four classes does, and the button is now actually clicked.
            driver.find_element(
                By.CSS_SELECTOR, '.b1sec48q.v7aged4.dir.dir-ltr'
            ).click()
        except WebDriverException:
            pass  # amenities button not found or not clickable: keep the page as is

        time.sleep(waiting_time[1])

        detail_page = driver.page_source
    finally:
        # Always release the browser, even when loading or scripting fails.
        driver.quit()

    return BeautifulSoup(detail_page, features='html.parser')

In [40]:
# Scrape single detail page
def process_detail_page(url):
    """Scrape one detail page and return its features wrapped in a list.

    Args:
        url: detail-page URL rendered through ``extract_soup_js``.

    Returns:
        A one-element list holding the dict from ``extract_page_features``
        (using ``RULES_DETAIL_PAGE``) plus an 'amenities' entry.
    """
    page_soup = extract_soup_js(url, waiting_time=[3, 1])

    page_features = extract_page_features(page_soup, RULES_DETAIL_PAGE)
    # The amenities come from the module-level `amenities_all`, not from the
    # page fetched for `url`.
    page_features['amenities'] = amenities_all

    return [page_features]


# Scrape the sample detail page. t0 records the start time and is only read by
# the commented-out timing print below.
t0 = time.time()
detail_features = process_detail_page(detail_url)
# print(time.time() - t0)
print(detail_features)

[{'location': 'Bentonville, Arkansas, United States', 'specialties_1': 'Pink Door BnB is a Superhost // Free cancellation before Mar 2', 'host_joined': 'empty', 'house_rules': 'empty', 'amenities': ['Hair dryer', 'Shampoo', 'Hot water', 'Washer', 'Dryer', 'Essentials', 'Hangers', 'Bed linens', 'Room-darkening shades', 'Iron', 'TV', 'Pack ’n play/Travel crib', 'Children’s books and toys', 'High chair', 'Air conditioning', 'Indoor fireplace', 'Heating', 'Smoke alarm', 'Carbon monoxide alarm', 'Fire extinguisher', 'First aid kit', 'Wifi', 'Dedicated workspace', 'Kitchen', 'Refrigerator', 'Microwave', 'Cooking basics', 'Dishes and silverware', 'Dishwasher', 'Stove', 'Oven', 'Coffee maker', 'Patio or balcony', 'Outdoor furniture', 'Hammock', 'Outdoor dining area', 'BBQ grill', 'Free parking on premises', 'Free street parking', 'Pets allowed', 'Long term stays allowed', 'Unavailable: Security cameras on propertySecurity cameras on property', 'Unavailable: Private entrancePrivate entrance']}]