In [2]:
from bs4 import BeautifulSoup
from urllib.parse import urlparse #renamed to urlib.parse in python 3

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException

from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed

import pandas as pd
import numpy as np
from pathlib import Path
import re

## Functions for scraping listings from Airbnb

In [3]:
def initDriver():
    """Create a headless Chrome WebDriver.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless`` flag.
    """
    chrome_options = Options()
    # Headless mode: Chrome runs without opening a visible browser window.
    chrome_options.add_argument("--headless")
    # chrome_options.add_argument("--incognito")
    return webdriver.Chrome(options=chrome_options)

def initSoup(driver):
    """Parse the page currently loaded in ``driver`` into a BeautifulSoup tree.

    Args:
        driver: object exposing a ``page_source`` attribute with the page HTML.

    Returns:
        A ``BeautifulSoup`` object built with the ``"html.parser"`` backend.
    """
    return BeautifulSoup(driver.page_source, "html.parser")

def getRootUrl(url):
    """Return the ``scheme://netloc`` prefix of ``url`` (no path, query or fragment).

    Args:
        url: the URL to reduce to its root.

    Returns:
        A string of the form ``"<scheme>://<netloc>"``.
    """
    parts = urlparse(url)
    return "{}://{}".format(parts.scheme, parts.netloc)

def waitForListingElements(driver, timeout=10):
    """Block until the search-results page is ready to be scraped.

    Two conditions are awaited in turn: the listing cards
    (``div[@itemprop='itemListElement']``) being present, and the pagination
    container (class ``p1j2gy66``) being clickable.

    Args:
        driver: the Selenium WebDriver showing the results page.
        timeout: maximum number of seconds to wait for *each* condition.
            Defaults to 10, the value that used to be hard-coded.

    Raises:
        TimeoutException: propagated from ``WebDriverWait.until`` when a
            condition is not met within ``timeout`` seconds.
    """
    wait = WebDriverWait(driver, timeout)

    # listing container
    wait.until(
        EC.presence_of_all_elements_located((By.XPATH, "//div[@itemprop='itemListElement']")))

    # pagination container
    wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, "p1j2gy66")))
    

def _parseListingCard(listing, root_url):
    """Turn one search-result card into a row dict; raises if any field is missing."""
    title_text = listing.find("div", {"data-testid": "listing-card-title"}).text.strip()
    # Split on the first " in " only. A plain split("in") also cuts inside words
    # such as "Tiny" or "Din Daeng" and corrupts both room type and location.
    room_type, separator, location = title_text.partition(" in ")
    if not separator:
        raise ValueError(f"Unexpected listing title format: {title_text!r}")
    room_type = room_type.strip()
    location = location.strip()
    place_type = "Room" if room_type.lower() == "room" else "Entire home"

    # The rating text is tokenised once: token 0 is the score (or "New") and
    # token 6 is read as the review count.
    rating_tokens = listing.find("div", {"class": "t1a9j9y7"}).text.strip().split()
    rating = rating_tokens[0]
    totalReviews = rating_tokens[6] if rating != "New" else 0

    price = listing.find("span", {"class": "_11jcbg2"}).text.strip().split()[0]
    price_tax = (
        listing.find("div", {"class": "_i5duul"})
        .find("div", {"class": "_10d7v0r"})
        .text.strip().split()[0]
    )
    # Reconstruct the absolute link
    link = root_url + listing.find("a", {"class": "l1ovpqvx"}).get("href")

    return {
        "Place type": place_type,
        "Room type": room_type,
        "Location": location,
        "Rating": rating,
        "Total Reviews": totalReviews,
        "Price/night (SGD)": price,
        "Total Price (SGD)": price_tax,
        "Link": link,
    }


def scrapeListings(driver, soup, root_url): # 1 page
    """Scrape every listing card on one search-results page.

    Args:
        driver: unused; kept so existing callers keep working.
        soup: parsed results page; searched for ``div[itemprop=itemListElement]``.
        root_url: prefix prepended to each listing's relative ``href``.

    Returns:
        A DataFrame with one row per successfully parsed listing and the columns
        "Place type", "Room type", "Location", "Rating", "Total Reviews",
        "Price/night (SGD)", "Total Price (SGD)" and "Link". Listings with
        missing or malformed fields are skipped.
    """
    columns = ["Place type", "Room type", "Location", "Rating", "Total Reviews",
               "Price/night (SGD)", "Total Price (SGD)", "Link"]
    rows = []
    for listing in soup.find_all("div", {"itemprop": "itemListElement"}):
        try:
            rows.append(_parseListingCard(listing, root_url))
        # Skip the current listing if it has missing info. `Exception` rather than
        # a bare except, so KeyboardInterrupt/SystemExit still stop the run.
        except Exception:
            continue
    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(rows, columns=columns)

def getnextPageURL(soup, root_url):
    """Return the absolute URL of the next results page, if there is one.

    Args:
        soup: parsed results page; searched for ``<a aria-label="Next">``.
        root_url: prefix prepended to the link's ``href``.

    Returns:
        ``root_url`` + href when a "Next" link exists; otherwise the falsy
        lookup result itself (``None`` when no such link is found).
    """
    anchor = soup.find("a", {"aria-label": "Next"})
    if not anchor:
        # Last page: hand back the falsy lookup result so callers can test it.
        return anchor
    return root_url + anchor.get("href")

def quitProgram(driver, airbnb_data, path):
    """Shut the browser down and persist the scraped data as CSV.

    Args:
        driver: object with a ``quit()`` method (the Selenium WebDriver).
        airbnb_data: DataFrame to write.
        path: destination CSV path; missing parent directories are created.

    Raises:
        Whatever ``driver.quit()`` raises is re-raised, but only after the
        data has been written.
    """
    try:
        driver.quit()
    finally:
        # Save inside `finally` so a failing browser shutdown cannot discard
        # the data that has already been scraped.
        file_path = Path(path)
        file_path.parent.mkdir(parents = True, exist_ok = True)
        airbnb_data.to_csv(file_path, index = False)
        print(f"Data saved to {file_path}...")

'''====================================================================='''

def scrapeListingDetails(url):
    """Open one listing page in its own browser and scrape its details.

    The steps are: dismiss the translation pop-up if it appears, scrape room
    contents and coordinates, open the "show all amenities" dialog and scrape
    it, then scrape the rating breakdown and the host details.

    Args:
        url: absolute URL of the listing page. It is stored under "Link" by
            the amenities step so the result can be merged with the listing table.

    Returns:
        A one-row DataFrame holding every field collected for the listing.

    Raises:
        Any exception raised while loading or scraping the page propagates to
        the caller; the browser is always quit first.
    """
    driver = initDriver()
    listing_details = {}
    # Everything after the driver is created runs inside try/finally, so a
    # failing page load or click cannot leave a Chrome process behind.
    try:
        driver.get(url)
        # close the pop-up dialog for translation
        try:
            modal = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']")))
            # The leading "." keeps the XPath relative to `modal`. A bare "//"
            # would search the whole document even when called on an element.
            close_button = modal.find_element(By.XPATH, ".//button[@aria-label='Close']")
            close_button.click()
        except TimeoutException:
            # No dialog appeared within 5 seconds: nothing to close.
            pass
        except NoSuchElementException:
            print("Couldn't find translation modal")

        # 1
        soup = initSoup(driver)
        listing_details.update(scrapeRoomContents(soup))

        # 2
        soup = initSoup(driver)
        listing_details.update(scrapeLatLong(soup))

        # 3 - only need pass the url in 1 of the dictionary for merging with the original dataframe
        # click the show all amenities button
        amentities_sect = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@data-section-id='AMENITIES_DEFAULT']")))
        amenities_button = WebDriverWait(amentities_sect, 10).until(
            EC.element_to_be_clickable((By.TAG_NAME, "button")))
        amenities_button.click()
        # Wait for the amenities dialog to be in the DOM before re-parsing the page.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//div[@role='dialog']")))
        soup = initSoup(driver)
        listing_details.update(scrapeAmenities(soup, url))

        # 4
        soup = initSoup(driver)
        listing_details.update(scrapeRatings(soup))

        # 5
        soup = initSoup(driver)
        listing_details.update(scrapeHostDetails(soup))
    finally:
        driver.quit()
    return pd.DataFrame([listing_details])

def scrapeRoomContents(soup):
    """Extract bedroom, bed and bathroom counts from a listing page.

    Args:
        soup: parsed listing page. The overview container (class ``o1kjrihn``)
            is searched for ``li`` items of class ``l7n4lsf``.

    Returns:
        A dict with keys "Bedrooms", "Beds" and "Bathrooms". A count found on
        the page is returned as the leading token of its item (a string such
        as "2" or "1.5"). A count that is not found stays at the integer 0.
        A bathroom item with no leading number (e.g. "Private attached
        bathroom") is reported as the integer 1.

    Raises:
        AttributeError: if the overview container is not present in ``soup``.
    """
    room_contents = soup.find("div", {"class":"o1kjrihn"}).find_all("li", {"class":"l7n4lsf"})
    bedrooms, beds, bathrooms = 0, 0, 0
    for content in room_contents:
        text = content.text.strip("· ")
        leading = text.split(' ')[0]
        if 'bedroom' in text:
            bedrooms = leading
        # 'bed' (not 'beds') so the singular "1 bed" is counted too. Bedrooms
        # were already handled by the branch above.
        elif 'bed' in text:
            beds = leading
        # 'bath' covers both "bathroom(s)" and "bath(s)".
        elif 'bath' in text:
            try:
                # float() rather than int() so half baths such as "1.5" are
                # kept instead of being coerced to 1.
                float(leading)
                bathrooms = leading
            except ValueError:
                # set bathrooms to 1 regardless of the desc like private etc.
                bathrooms = 1

    # Built after the loop so an empty item list still returns the defaults
    # instead of raising UnboundLocalError.
    return {
        "Bedrooms":bedrooms,
        "Beds":beds,
        "Bathrooms":bathrooms,
    }

def scrapeRatings(soup):
    """Collect the per-category rating breakdown from a listing page.

    Args:
        soup: parsed listing page; every ``div`` of class ``l925rvg`` is read
            as a (category name, score) pair taken from its first two children.

    Returns:
        A dict keyed by "Cleanliness Rating", "Accuracy Rating",
        "Check-in Rating", "Communication Rating", "Location Rating" and
        "Value Rating". Categories not found on the page stay ``np.nan``;
        found scores are returned as the text scraped from the page.
    """
    # Category label as shown on the page -> key used in the result.
    column_for = {
        "Cleanliness": "Cleanliness Rating",
        "Accuracy": "Accuracy Rating",
        "Check-in": "Check-in Rating",
        "Communication": "Communication Rating",
        "Location": "Location Rating",
        "Value": "Value Rating",
    }
    airbnb_ratings = {column: np.nan for column in column_for.values()}

    for entry in soup.find_all("div", {"class":"l925rvg"}):
        label = entry.contents[0].text
        score = entry.contents[1].text
        column = column_for.get(label)
        if column is not None:
            airbnb_ratings[column] = score

    return airbnb_ratings

def scrapeAmenities(soup, url):
    """List the amenities shown on a listing page.

    Args:
        soup: parsed listing page; every ``div`` of class ``twad414`` is read
            as one amenity entry.
        url: the listing URL, echoed back under "Link".

    Returns:
        A dict with "Amenities" (list of entry texts, excluding entries whose
        text starts with "Unavailable") and "Link" (``url``).
    """
    available = [
        entry.text
        for entry in soup.find_all("div", {"class":"twad414"})
        if not entry.text.startswith("Unavailable")
    ]
    return {
        "Amenities":available,
        "Link":url
    }

def scrapeHostDetails(soup):
    """Read the host's superhost flag, review count and rating from a listing page.

    Args:
        soup: parsed listing page.

    Returns:
        A dict with "Superhost" (True only when the ``span.s2nv573`` badge
        text is exactly "Superhost"), "Host Reviews" and "Host Rating" (the
        scraped text, or a float NaN when the element is absent).
    """
    badge = soup.find("span", {"class":"s2nv573"})
    is_superhost = True if (badge and badge.text == "Superhost") else False

    reviews_node = soup.find("span", {"data-testid":"Reviews-stat-heading"})
    rating_node = soup.find("div", {"class":"ruujrrq"})

    return {
        "Superhost":is_superhost,
        "Host Reviews":reviews_node.text if reviews_node else float(np.nan),
        "Host Rating":rating_node.text if rating_node else float(np.nan),
    }

def scrapeLatLong(soup):
    """Extract the listing's coordinates from the page's embedded script data.

    Args:
        soup: parsed listing page; the first ``<script>`` whose text contains
            both 'lat' and 'lng' is searched for a ``"lat":<n>,"lng":<n>`` pair.

    Returns:
        A dict with "Latitude" and "Longitude". Values are the matched numeric
        strings, or ``np.nan`` when no script or no coordinate pair is found.
    """
    # Defaults up front so a page without coordinates yields NaN instead of
    # raising UnboundLocalError at the return.
    latitude, longitude = np.nan, np.nan
    script = soup.find("script", string=lambda x: x and 'lat' in x and 'lng' in x)
    if script:
        # The optional leading "-" accepts southern and western coordinates.
        match = re.search(r'"lat":(-?[\d.]+),"lng":(-?[\d.]+)', script.text)
        if match:
            latitude, longitude = match.groups()
    airbnb_lat_long =  {
        "Latitude":latitude,
        "Longitude":longitude,
    }

    return airbnb_lat_long

### Scrape basic information of listings from page

In [4]:
# Search-results URL to scrape. The search parameters (check-in/check-out dates,
# number of adults, map bounds, ...) are all carried in the query string.
url = "https://www.airbnb.com.sg/s/Bangkok--Thailand/homes?refinement_paths%5B%5D=%2Fhomes&checkin=2024-10-04&checkout=2024-10-10&adults=3&tab_id=home_tab&query=Bangkok%2C%20Thailand&flexible_trip_lengths%5B%5D=one_week&monthly_start_date=2024-08-01&monthly_length=3&monthly_end_date=2024-11-01&price_filter_input_type=0&price_filter_num_nights=6&channel=EXPLORE&date_picker_type=calendar&place_id=ChIJ82ENKDJgHTERIEjiXbIAAQE&source=structured_search_input_header&search_type=user_map_move&search_mode=regular_search&ne_lat=13.810393924485789&ne_lng=100.5597668742775&sw_lat=13.71428581582692&sw_lng=100.48846329668004&zoom=12.786708747368644&zoom_level=12.786708747368644&search_by_map=true"
# scheme://host of the search URL; the helpers prepend it to the relative hrefs they scrape.
root_url = getRootUrl(url)
driver = initDriver()

# Accumulates the rows scraped from every results page.
airbnb_data = pd.DataFrame(columns=["Place type", "Room type", "Location", "Rating", "Total Reviews", "Price/night (SGD)", "Total Price (SGD)", "Link"])

try:
    driver.get(url)
    # Walk the paginated results: scrape the current page, then follow its
    # "Next" link until getnextPageURL returns a falsy value (last page).
    while True:
        waitForListingElements(driver)
        soup = initSoup(driver)
        data = scrapeListings(driver, soup, root_url)
        airbnb_data = pd.concat([airbnb_data, data], axis = 0)
        result = getnextPageURL(soup, root_url)
        if(result):
            driver.get(result)
        else:
            break
            

# Runs whether or not the loop raised, so the pages scraped so far are still
# written to disk and the browser is closed.
finally:
    path = "./dataset/airbnb.csv"
    quitProgram(driver, airbnb_data, path)


Data saved to dataset\airbnb.csv...


### Scrape additional information from each listing

In [53]:
# Load the listings saved by the search-page cell and scrape each listing's
# detail page in parallel (up to 5 browsers at a time).
data = pd.read_csv("./dataset/airbnb.csv", header=0)
listing_url = data["Link"].to_list()
count = 0
results = []
with ThreadPoolExecutor(max_workers=5) as executor:
    # Map each future back to its URL so a failure can be attributed to a listing.
    futures = {executor.submit(scrapeListingDetails, url): url for url in listing_url}
    for future in as_completed(futures):
        try:
            results.append(future.result())
        except Exception as exc:
            # One broken listing must not abort the whole batch. Keep a
            # Link-only row so the listing survives the merge with NaN detail
            # columns and can be found and re-scraped later.
            failed_url = futures[future]
            print(f"Failed to scrape {failed_url}: {exc!r}")
            results.append(pd.DataFrame([{"Link": failed_url}]))
        count += 1
        print(count)
# pd.concat raises on an empty list, so fall back to an empty frame when there
# were no listings to scrape.
final_data = pd.concat(results, axis=0) if results else pd.DataFrame(columns=["Link"])
# Attach the scraped details to the basic listing rows via the shared "Link" column.
data = data.merge(final_data, how="inner", on="Link")
# Fix the column order of the saved file. Any column missing from the merge
# result is created and filled with NaN.
data = data.reindex(columns = ["Place type", "Room type", "Location", "Latitude", "Longitude",
    "Price/night (SGD)", "Total Price (SGD)", "Bedrooms", "Beds", "Bathrooms",
    "Superhost", "Host Reviews", "Host Rating", "Total Reviews", "Rating",
    "Cleanliness Rating", "Accuracy Rating", "Check-in Rating",
    "Communication Rating", "Location Rating", "Value Rating", "Amenities", "Link"])
# NOTE: this overwrites the input file, so re-running this cell on the merged
# CSV will not reproduce the same result.
file_path = Path("./dataset/airbnb.csv")
file_path.parent.mkdir(parents = True, exist_ok = True)
data.to_csv(file_path, index = False)
print("Amenities has been successfully merged into data")

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
Amenities has be

In [30]:
# Stand-alone experiment: fetch one listing page with plain `requests` (no
# browser) and pull the first "lat"/"lng" values out of the raw HTML.
# NOTE(review): `re` is already imported in the first cell; `requests` is only
# imported here.
import requests, re

# NOTE(review): no timeout is passed, so this call can block indefinitely if
# the server never responds.
r = requests.get('https://www.airbnb.com.sg/rooms/1195697419656803553?check_out=2024-10-10&unique_share_id=F5DB2A4D-DA11-4439-AA2E-687AB8180DD5&slcid=f1247430419743e5bc6306bd76a620a0&s=13&feature=share&adults=3&check_in=2024-10-04&channel=whatsapp&slug=9O4FjOpF&source_impression_id=p3_1721802607_P3n6jNcn1igcjEQg')
# Each pattern captures a signed decimal number that follows the key and ends at a comma.
p_lat = re.compile(r'"lat":([-0-9.]+),')
p_lng = re.compile(r'"lng":([-0-9.]+),')
# [0] takes the first occurrence; raises IndexError if the page has no match.
lat = p_lat.findall(r.text)[0]
lng = p_lng.findall(r.text)[0]
print(lat,lng)

13.736341903712638 100.55670793589738


### Run this cell only if some listings have missing values

In [54]:
# Reload the merged dataset and report how many values are missing per column.
data = pd.read_csv("./dataset/airbnb.csv", header=0)
print(data.isna().sum())
# True for every row that has at least one missing value in any column.
null_mask = data.isna().any(axis=1)
# URLs of the listings with any missing field (not only amenities), e.g. to
# scrape them again.
null_url = data[null_mask]["Link"]
print(null_url)

# Drop the incomplete rows. This only changes the in-memory frame; nothing in
# the visible part of this cell writes the result back to the CSV.
data = data.dropna()
print(data.isna().sum())


Place type              0
Room type               0
Location                0
Latitude                0
Longitude               0
Price/night (SGD)       0
Total Price (SGD)       0
Bedrooms                0
Beds                    0
Bathrooms               0
Superhost               0
Host Reviews            4
Host Rating             4
Total Reviews           0
Rating                  0
Cleanliness Rating      0
Accuracy Rating         0
Check-in Rating         0
Communication Rating    0
Location Rating         0
Value Rating            0
Amenities               0
Link                    0
dtype: int64
136    https://www.airbnb.com.sg/rooms/28263438?adult...
168    https://www.airbnb.com.sg/rooms/12172154329531...
234    https://www.airbnb.com.sg/rooms/2960283?adults...
270    https://www.airbnb.com.sg/rooms/11860043030382...
Name: Link, dtype: object
Place type              0
Room type               0
Location                0
Latitude                0
Longitude               0
Price