In [11]:
import json
import os
import traceback

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from seleniumwire import webdriver
from webdriver_manager.chrome import ChromeDriverManager

from generic_methods import getRating

# pip install webdriver-manager
# pip install selenium-wire
# pip install bs4
# pip install pandas


In [7]:
# Load the location table that is indexed below as locations[<city>]["hotels"].
# The context manager closes the file handle as soon as the JSON is parsed.
with open("locations.json", "r") as locations_file:
    locations = json.load(locations_file)

# Chrome content settings: the value 2 asks Chrome to block that content type,
# so images and JavaScript are not loaded while scraping.
chrome_options = webdriver.ChromeOptions()
prefs = {
    "profile.managed_default_content_settings.images": 2,
    "profile.managed_default_content_settings.javascript": 2
}
chrome_options.add_experimental_option("prefs", prefs)

# `options=` is the supported keyword; `chrome_options=` was deprecated by
# Selenium and is rejected by current releases.
driver = webdriver.Chrome(options=chrome_options)

In [8]:
# Hotel listing page for Manila; the path fragment comes from locations.json.
url = f"https://www.tripadvisor.com.ph/{locations['Manila']['hotels']}"
NUM_PAGES = 1 # number of pages to scrape.  total hotels scraped will be NUM_PAGES * 30 
offset = 0
hotel_links = []

for page_number in range(NUM_PAGES):
    # Splice the "-oa<offset>" paging token in front of the first "-g" segment.
    split_at = url.index('-g')
    driver.get(f"{url[:split_at]}-oa{offset}{url[split_at:]}")
    listing_soup = BeautifulSoup(driver.page_source, "html.parser")

    # The first child of every result card holds the link to the hotel page.
    hotel_links.extend(
        card.contents[0].get("href")
        for card in listing_soup.find_all("div", class_="jsTLT K")
    )

    # Advance to the next page of results (30 hotels per page).
    offset += 30

print(hotel_links)


['/Hotel_Review-g298573-d483187-Reviews-New_Coast_Hotel_Manila-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d301802-Reviews-Sofitel_Philippine_Plaza_Manila-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d306018-Reviews-Diamond_Hotel_Philippines-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d304540-Reviews-The_Manila_Hotel-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d2410887-Reviews-The_Bayleaf_Intramuros-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d1230302-Reviews-Hotel_H2O-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d477891-Reviews-City_Garden_Suites-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d16817438-Reviews-Hotel_Lucky_Chinatown-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d12598952-Reviews-Rizal_Park_Hotel-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d13226014-Reviews-Red_Planet_Manila_Bay-Manila_Metro_Manila_Luzon.html', '/Hotel_Review-g298573-d306357-Reviews-Bayview_Park_Hotel_Ma

In [9]:
def getHotelInfo(page: BeautifulSoup, url):
    """Extract hotel details and the visible reviews from a parsed hotel page.

    The lookups rely on the CSS class names present in the page markup
    (presumably auto-generated by the site, so they may change -- verify when
    the scraper starts returning errors).

    Args:
        page: Parsed HTML of a single hotel detail page. All lookups are made
            on this object, so the function does not depend on a live browser.
        url: Value stored verbatim in the hotel's "url" field.

    Returns:
        A tuple ``(hotel_info, hotel_review)`` where ``hotel_info`` is a dict
        of hotel attributes and ``hotel_review`` is a list of dicts, one per
        review found on the page.

    Raises:
        AttributeError: If a required element (name, address, description,
            review count, rating, tag container, or a review field) is missing,
            because ``find`` returns ``None`` for it.
        ValueError: If the review count or rating text is not numeric.
    """
    hotel_info = {}

    # Required header fields: a missing element raises and is left to the caller.
    hotel_info["name"] = page.find("h1", "QdLfr b d Pn").contents[0]
    hotel_info["url"] = url
    hotel_info["address"] = page.find("span", "fHvkI PTrfg").contents[0]
    hotel_info["about"] = page.find("div", "fIrGe _T").contents[0]
    hotel_info["review_count"] = int(page.find("span", "qqniT").contents[0].replace(",", ""))
    hotel_info["rating"] = float(page.find("span", "uwJeR P").contents[0])
    hotel_info["rating_description"] = page.find("div", "kkzVG").contents[0]

    # Ratings: one "rating_<type>" entry per sub-rating row.
    for rating in page.find_all("div", "HXCfp"):
        rating_type = rating.find("div", "hLoRK").contents[0]
        bubble = rating.find("span", "ui_bubble_rating")
        hotel_info[f"rating_{rating_type}"] = getRating(bubble.get("class"))

    # More Info (optional): heading elements name a section and the following
    # list element supplies its comma-joined details.
    try:
        # Siblings of the first heading, i.e. the children of its parent.
        more_info = page.find("div", "aeQAp S5 b Pf ME").parent.contents
        current_info_type = ""
        for element in more_info:
            element_classes = element.get("class")
            if element_classes is None:
                continue
            class_string = " ".join(element_classes)
            if class_string == "aeQAp S5 b Pf ME":
                current_info_type = element.contents[0].replace(" ", "_")
            elif class_string == "OsCbb K":
                details = [
                    detail_element.contents[1]
                    for detail_element in element.find_all("div", "yplav f ME H3 _c")
                ]
                hotel_info[current_info_type] = ",".join(details)
    except Exception:
        # Best effort: the section is optional, so keep whatever was collected.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        pass

    # Hotel class (optional): first word of the icon's aria-label.
    try:
        hotel_info["hotel_class"] = page.find("svg", "JXZuC d H0").get("aria-label").split(" ")[0]
    except Exception:
        hotel_info["hotel_class"] = None

    # To add:
    # hotel style
    # languages spoken

    # Proximity Details (optional): fields found before a missing one are kept.
    try:
        hotel_info["walkability_score"] = page.find("span", "iVKnd fSVJN").contents[0]
        hotel_info["walkability_description"] = page.find("span", "lSyvc H3 b zpbpA").contents[0]
        hotel_info["nearby_restaurant_count"] = page.find("span", "iVKnd Bznmz").contents[0]
        hotel_info["nearby_attraction_count"] = page.find("span", "iVKnd rYxbA").contents[0]
    except Exception:
        pass

    # Tags: label elements set the current key, value elements fill it.
    tag_renames = {
        "also_known_as": "other_name",
        "formerly_known_as": "old_name",
        "number_of_rooms": "room_count",
    }
    tags_elements = page.find("div", "GFCJJ").contents[0].contents
    current_tag = ""
    for tag_element in tags_elements:
        tag_classes = tag_element.get("class")
        if tag_classes is None:
            continue
        class_string = " ".join(tag_classes)
        if class_string == "mpDVe Ci b":
            current_tag = tag_element.contents[0].replace(" ", "_").lower()
            current_tag = tag_renames.get(current_tag, current_tag)
        elif class_string == "IhqAp Ci":
            hotel_info[current_tag] = tag_element.contents[0].replace("<!-- -->", "")
    # "location" is redundant; pop() tolerates pages that never had such a tag.
    hotel_info.pop("location", None)

    # Reviews
    hotel_review = []
    for review in page.find_all("div", "YibKl MC R2 Gi z Z BB pBbQr"):
        review_info = {}
        # NOTE(review): the key says "restaurant" but holds the hotel name; it
        # is kept unchanged so the exported column name stays the same.
        review_info["restaurantName"] = hotel_info["name"]
        review_info["rating"] = getRating(review.find("span", "ui_bubble_rating").get("class"))
        review_info["ratingDate"] = review.find("div", "cRVSd").contents[0].contents[1].replace(" wrote a review ", "")
        review_info["title"] = review.find("div", "KgQgP MC _S b S6 H5 _a").contents[0].contents[0].contents[0].contents[0]
        review_info["content"] = review.find("span", "QewHA H4 _a").contents[0].contents[0]
        review_info["visitDate"] = review.find("span", "teHYY _R Me S4 H3").contents[1].strip()

        review_info["url"] = review.find("a", "Qwuub").get("href")
        hotel_review.append(review_info)

    return hotel_info, hotel_review

In [12]:
# Visit every collected hotel link, scrape it, and accumulate the results.
hotel_infos = []
hotel_reviews = []
for link in hotel_links:
    # Built outside the try block so the error message below always refers to
    # the hotel currently being processed.
    hotel_url = f"https://www.tripadvisor.com.ph{link}"
    try:
        driver.get(hotel_url)

        # Pass this hotel's own URL so the "url" field identifies the hotel
        # page rather than the listing page.
        hotel_info, hotel_review = getHotelInfo(BeautifulSoup(driver.page_source, "html.parser"), hotel_url)
    except Exception:
        # format_exc() returns the traceback text; print_exc() returns None.
        print(f"An error has occured on {hotel_url}:{traceback.format_exc()}")
        continue
    else:
        hotel_infos.append(hotel_info)
        hotel_reviews.extend(hotel_review)


# Make sure the output directory exists before writing.
# NOTE(review): a PermissionError on these paths is presumably caused by the
# CSV being open in another program or by a read-only directory -- confirm.
os.makedirs("datasets", exist_ok=True)
pd.DataFrame(hotel_infos).to_csv("datasets/hotel_infos.csv", index=False)
pd.DataFrame(hotel_reviews).to_csv("datasets/hotel_reviews.csv", index=False)

PermissionError: [Errno 13] Permission denied: 'datasets/hotel_infos.csv'