# Scraping TripAdvisor Reviews using Selenium 

In this project, we scrape hotel guest reviews from TripAdvisor for Row NYC Hotel in New York City. There are 9,040 reviews in total; for each review, the review date, review title, review text and rating are extracted from the webpage. (Note: the example run further below applies the same scraper to Ibis Budget hotels in Singapore on tripadvisor.com.sg, and also records the hotel name.)
The scraping is done with Selenium.

The project proceeds as follows:
1. Import the relevant libraries
2. Extract the individual page elements with Selenium
3. Build a web scraper that automates the scraping of all 9,040 reviews

## 1. Import relevant libraries 

In [5]:
import time
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By

from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
from datetime import datetime, timedelta



## Function to scrape reviews from tripadvisor.com.sg


In [134]:
##Function to scrape reviews

def _start_chrome():
    """Launch an incognito Chrome session using a webdriver-manager chromedriver."""
    options = webdriver.ChromeOptions()
    options.add_argument('--ignore-certificate-errors')
    options.add_argument('--incognito')
    driver_path = ChromeDriverManager().install()
    try:
        # Selenium 4 expects the chromedriver location wrapped in a Service.
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(driver_path), options=options)
    except (ImportError, TypeError):
        # Older Selenium releases only accept the executable path positionally.
        return webdriver.Chrome(driver_path, options=options)


def _month_number(token):
    """Return the month number for a name such as 'Sep', 'Sept' or 'June', else None."""
    try:
        return datetime.strptime(token.strip('.,')[:3], "%b").month
    except ValueError:
        return None


def _parse_review_date(text, today=None):
    """Turn a review byline into the first day of the review month.

    Understands bylines ending in "<Mon> <yyyy>" (e.g. 'Apr 2022', 'Sept 2021')
    and "<d> <Mon>" (e.g. '6 May'). Returns None when the text matches neither.
    """
    if today is None:
        today = datetime.now().date()

    tokens = text.split()
    if not tokens:
        return None

    last = tokens[-1].strip('.,')
    # NOTE(review): presumably very recent reviews are labelled "Today" /
    # "Yesterday" -- confirm against the live site.
    if last.lower() == 'today':
        return today.replace(day=1)
    if last.lower() == 'yesterday':
        return (today - timedelta(days=1)).replace(day=1)
    if len(tokens) < 2:
        return None

    previous = tokens[-2].strip('.,')
    try:
        if last.isdigit() and len(last) == 4:
            month = _month_number(previous)
            if month is not None:
                return datetime(int(last), month, 1).date()
        elif previous.isdigit():
            month = _month_number(last)
            if month is not None:
                # A byline without a year is assumed to be at most a year old:
                # a month later than the current one must belong to last year.
                year = today.year if month <= today.month else today.year - 1
                return datetime(year, month, 1).date()
    except ValueError:
        return None
    return None


def _count_pages(driver, pages):
    """Return how many review pages to visit for the hotel currently loaded."""
    if pages is not None:
        return pages

    counters = driver.find_elements(By.XPATH, "//span[@class='cvxmR']")
    per_page = len(driver.find_elements(By.XPATH, "//q[@class = 'XllAv H4 _a']"))
    if not counters or per_page == 0:
        return 0
    digits = counters[0].text.replace('(', '').replace(')', '').replace(',', '')
    try:
        total = int(digits)
    except ValueError:
        return 0
    # Ceiling division so a final, partially filled page is included.
    return -(-total // per_page)


def _scrape_current_page(driver):
    """Collect (hotel, date text, title, review, rating class) rows from the page."""
    reviews = [q.text for q in driver.find_elements(By.XPATH, "//q[@class = 'XllAv H4 _a']")]
    headings = driver.find_elements(By.XPATH, "//h1[@class='fkWsC b d Pn']")
    name = headings[0].text if headings else ''
    titles = [
        t.text
        for t in driver.find_elements(By.XPATH, "//div[contains(@data-test-target, 'review-title')]")
    ]
    dates = [d.text for d in driver.find_elements(By.XPATH, "//div[@class = 'bcaHz']")]

    rating_classes = []
    for div in driver.find_elements(By.XPATH, "//div[@class = 'emWez F1']"):
        for span in div.find_elements(By.TAG_NAME, 'span'):
            rating_classes.append(span.get_attribute('class'))

    if len({len(reviews), len(titles), len(dates), len(rating_classes)}) > 1:
        # zip() below truncates to the shortest list; make that visible.
        print(
            f"Warning: element counts differ on this page (reviews={len(reviews)}, "
            f"titles={len(titles)}, dates={len(dates)}, ratings={len(rating_classes)}); "
            "rows may be incomplete."
        )

    return [
        (name, date_text, title, review, rating)
        for date_text, title, review, rating in zip(dates, titles, reviews, rating_classes)
    ]


def _click_next_page(driver, page):
    """Click the pagination link that follows `page`; return True if a click happened."""
    from selenium.common.exceptions import WebDriverException

    # XPath differs between .com and .com.sg, hence the candidate component ids.
    link_index = page if page < 4 else 4
    for comp in ('13', '14', '15'):
        links = driver.find_elements(
            By.XPATH,
            f'//*//*[@id="component_{comp}"]/div/div[3]/div[8]/div/a[{link_index}]',
        )
        if not links:
            continue
        try:
            links[0].click()
        except WebDriverException as exc:
            print(f"Could not click pagination link in component_{comp}: {exc}")
            continue
        time.sleep(3)
        # Stop at the first successful click so a page is never skipped.
        return True
    return False


def get_reviews(hotel_url, pages = None, based_date = None):
    """
    Scrape TripAdvisor reviews for one or more hotels with Selenium.

    1. To scrape all reviews:
        - leave pages and based_date as None

    2. To scrape recent reviews:
        - pass pages to limit how many review pages are visited per hotel
          (if pages is left as None, every page is visited and run time is long)
        - pass based_date to drop reviews older than that month; the filter is
          applied after scraping, so it does not shorten the run by itself
        - based_date must be in "mmm yyyy" format. Eg: 'Jan 2022'

    Parameters:
        hotel_url: list of hotel review page URLs (a single URL string is accepted too)
        pages: number of review pages to scrape per hotel, or None for all
        based_date: earliest review month to keep, or None to keep everything

    Returns a df with the columns Hotel_Name, Date (monthly period), Title,
    Review and Rating. Rows whose date text cannot be parsed are kept with an
    empty Date and reported on stdout.

    Raises ValueError if based_date is not in "mmm yyyy" format.
    """
    # Validate the date before a browser is started.
    based_date_dt = None
    if based_date is not None:
        based_date_dt = datetime.strptime(based_date, "%b %Y").date()

    if isinstance(hotel_url, str):
        hotel_url = [hotel_url]

    rows = []
    driver = _start_chrome()
    try:
        # Give dynamically rendered elements up to 10s to appear on every lookup.
        driver.implicitly_wait(10)

        for url in hotel_url:
            driver.get(url)

            pages_to_scrape = _count_pages(driver, pages)
            print(f"No of pages to scrape is {pages_to_scrape}.")

            for page in range(1, pages_to_scrape + 1):
                print(f"page {page}")
                time.sleep(5)

                # Click on 'Read More' button to expand each review to get full review
                element_list = driver.find_elements(By.XPATH, '//span[(@class ="fmBIl _S Nc")]')
                if len(element_list) > 0:
                    print(f'there is an element: {element_list}')
                    driver.execute_script("arguments[0].click();", element_list[0])
                else:
                    print('theres no element')
                time.sleep(2)

                rows.extend(_scrape_current_page(driver))

                if page == pages_to_scrape:
                    break
                if not _click_next_page(driver, page):
                    # Without a next link the same page would be scraped again.
                    print(f"No next-page link found after page {page}; stopping for this hotel.")
                    break
    finally:
        # Always release the browser, even if scraping fails midway.
        driver.quit()

    kept = []
    for name, date_text, title, review, rating_class in rows:
        review_month = _parse_review_date(date_text)
        if review_month is None:
            print(f"Could not parse review date {date_text!r}; keeping the row with an empty Date.")
        elif based_date_dt is not None and review_month < based_date_dt:
            # Filter out reviews that are earlier than based_date
            continue

        rating = rating_class
        if rating is not None:
            # 'ui_bubble_rating bubble_50' -> '5'
            rating = rating.replace('ui_bubble_rating bubble_', '').replace('0', '')
        kept.append((name, review_month, title, review, rating))

    df = pd.DataFrame(kept, columns=['Hotel_Name', 'Date', 'Title', 'Review', 'Rating'])
    df['Date'] = pd.to_datetime(df['Date']).dt.to_period('M')

    return df


In [126]:
# Hotels to scrape. Other Ibis Budget Singapore pages that can be swapped in:
#   https://www.tripadvisor.com.sg/Hotel_Review-g294265-d568053-Reviews-Ibis_budget_Singapore_Emerald-Singapore.html
#   https://www.tripadvisor.com.sg/Hotel_Review-g294265-d506320-Reviews-Ibis_budget_Singapore_Joo_Chiat-Singapore.html
#   https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1089215-Reviews-Ibis_budget_Singapore_West_Coast-Singapore.html
#   https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1008216-Reviews-Ibis_Budget_Singapore_Ametrine-Singapore.html
#   https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1015609-Reviews-Ibis_budget_Singapore_Imperial-Singapore.html
hotel_url = [
    'https://www.tripadvisor.com.sg/Hotel_Review-g294265-d2439500-Reviews-Ibis_Budget_Singapore_Clarke_Quay-Singapore.html',
    'https://www.tripadvisor.com.sg/Hotel_Review-g294265-d1823538-Reviews-Ibis_budget_Singapore_Bugis-Singapore.html',
]

# Run the scraper and keep the resulting DataFrame in df_rev.
# based_date uses the "mmm yyyy" format, e.g. 'Jan 2022'.
df_rev = get_reviews(hotel_url, based_date='Jan 2022', pages=2)



Could not get version for google-chrome with the command:  powershell "$ErrorActionPreference='silentlycontinue' ; (Get-Item -Path "$env:PROGRAMFILES\Google\Chrome\Application\chrome.exe").VersionInfo.FileVersion ; if (-not $? -or $? -match $error) { (Get-Item -Path "$env:PROGRAMFILES(x86)\Google\Chrome\Application\chrome.exe").VersionInfo.FileVersion } if (-not $? -or $? -match $error) { (Get-Item -Path "$env:LOCALAPPDATA\Google\Chrome\Application\chrome.exe").VersionInfo.FileVersion } if (-not $? -or $? -match $error) { reg query "HKCU\SOFTWARE\Google\Chrome\BLBeacon" /v version } if (-not $? -or $? -match $error) { reg query "HKLM\SOFTWARE\Wow6432Node\Microsoft\Windows\CurrentVersion\Uninstall\Google Chrome" /v version }"
Current google-chrome version is UNKNOWN
Get LATEST chromedriver version for UNKNOWN google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/101.0.4951.41/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\edowi\.

No of pages to scrape is 2.
page 1
there is an element: [<selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="a358a18d-4447-4e83-8c0c-c3fbb1bfe48c")>]




page 2
there is an element: [<selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="739ba5df-a01b-4521-bcfb-b8d106da1b3a")>, <selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="0e6cfb9f-7b3b-4ae8-9d20-f4c56fd233d9")>]
No of pages to scrape is 2.
page 1
there is an element: [<selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="4b1d0460-423e-49af-b61d-95c4050930ee")>]
page 2
there is an element: [<selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="ae48d3eb-4d95-4e71-bc14-21ef893b5c12")>, <selenium.webdriver.remote.webelement.WebElement (session="59e20d274eb0ebff771fcbbf516b5076", element="86d4da84-fa74-4340-912f-de28763aa22b")>]
['cjclay5 wrote a review 6 May', 'Joyce K wrote a review Apr 2022', 'rammkhamkaew wrote a review Apr 2022', 'Ben Chua wrote a review Dec 2021', '253SA wrote a review

In [127]:
# Display the DataFrame returned by get_reviews
df_rev

Unnamed: 0,Hotel_Name,Date,Title,Review,Rating
0,Ibis Budget Singapore Clarke Quay,2022-05,A comfortable stay,The Ibis Budget in Clarke Quay is the perfect ...,5
1,Ibis Budget Singapore Clarke Quay,2022-04,Gd location and Comfortable rooms,Had a 3D 2N staycation in a superior twin bed ...,4
2,Ibis Budget Singapore Clarke Quay,2022-04,Ibis budget Clarke quay,"Good place, good location, close to mrt statio...",4
3,ibis budget Singapore Bugis,2022-02,Great value and location,No frills hotel but excellent location with ve...,4


In [129]:
# Print the column dtypes and non-null counts of the scraped reviews
df_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype    
---  ------      --------------  -----    
 0   Hotel_Name  4 non-null      object   
 1   Date        4 non-null      period[M]
 2   Title       4 non-null      object   
 3   Review      4 non-null      object   
 4   Rating      4 non-null      object   
dtypes: object(4), period[M](1)
memory usage: 288.0+ bytes


In [130]:
# Write the scraped reviews to a CSV file (no index column) for future NLP analysis
df_rev.to_csv('ibis_budget_recent_rev.csv', index=False, encoding='utf8')