In [45]:
# Import necessary libraries and modules

from selenium import webdriver 
import time 
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException, TimeoutException, ElementNotInteractableException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [46]:
# Entry point for the scraper: the TripAdvisor listing page for Thessaloniki.
url = (
    'https://www.tripadvisor.com'
    '/Restaurants-g189473-Thessaloniki_Thessaloniki_Region_Central_Macedonia.html'
)

In [47]:
# Start a Chrome session and open the TripAdvisor listing page.
driver = webdriver.Chrome()
driver.get(url)

# Wait explicitly for the cookie-consent button instead of sleeping a fixed two
# seconds: a slow page load no longer raises NoSuchElementException, and a page
# that never shows the banner no longer aborts the notebook.
try:
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
    ).click()
except TimeoutException:
    print("Cookie consent button did not appear; continuing without accepting it.")


In [48]:
def selectCheckbox1(showMore, driver):
    """
    Expand the filter list and toggle the establishment-type checkboxes.

    Clicks `showMore`, then JavaScript-clicks the <input> that is a preceding
    sibling of the "Coffee & Tea" and "Bars & Pubs" labels. For "Restaurants"
    the third matching label is used; its input has the `checked` attribute
    removed and is then clicked (intended to unselect it).

    Parameters:
        showMore: element whose click expands the filter list.
        driver: Selenium WebDriver used for lookups and script execution.
    """
    label_xpath = "//label[contains(., '{}')]"
    sibling_input_xpath = "./preceding-sibling::input"

    showMore.click()

    # The two filters to tick share the same lookup and the same one-second pause.
    for caption in ("Coffee & Tea", "Bars & Pubs"):
        time.sleep(1)
        label = driver.find_element(By.XPATH, label_xpath.format(caption))
        driver.execute_script(
            "arguments[0].click();",
            label.find_element(By.XPATH, sibling_input_xpath),
        )

    # "Restaurants" matches several labels; the third one (index 2) is the filter.
    time.sleep(3)
    restaurant_labels = driver.find_elements(By.XPATH, label_xpath.format("Restaurants"))
    restaurant_box = restaurant_labels[2].find_element(By.XPATH, sibling_input_xpath)
    driver.execute_script(
        "arguments[0].removeAttribute('checked'); arguments[0].click();",
        restaurant_box,
    )
        

In [49]:
def selectCheckbox2(showMore, driver):
    """
    Alternative checkbox selection that locates each input via `preceding::input[1]`.

    Clicks `showMore`, then for "Coffee & Tea", "Bars & Pubs" and "Restaurants"
    finds the first label containing that text and JavaScript-clicks the nearest
    <input> that precedes it in the document. For "Restaurants" the `checked`
    attribute is removed before the click (intended to unselect it).

    Parameters:
        showMore: element whose click expands the filter list.
        driver: Selenium WebDriver used for lookups and script execution.
    """
    plain_click = "arguments[0].click();"
    uncheck_then_click = "arguments[0].removeAttribute('checked'); arguments[0].click();"

    # (pause before the step in seconds, label text, script run on the input)
    steps = (
        (None, "Coffee & Tea", plain_click),
        (1, "Bars & Pubs", plain_click),
        (2, "Restaurants", uncheck_then_click),
    )

    showMore.click()

    for pause, caption, script in steps:
        if pause is not None:
            time.sleep(pause)
        label = driver.find_element(By.XPATH, "//label[contains(., '{}')]".format(caption))
        nearest_input = label.find_element(By.XPATH, "preceding::input[1]")
        driver.execute_script(script, nearest_input)


In [50]:
# Expand the establishment-type filter and apply the checkbox selection.
# Two locator/handler pairs are tried in order; the second is used only when the
# first raises NoSuchElementException (from the lookup or from inside the handler).
text_to_find = "Show more"
_show_more_strategies = (
    ("//span[contains(text(), '{}')]".format(text_to_find), selectCheckbox1),
    ("//button[contains(@aria-controls, 'filter-expando-establishmentTypes')]", selectCheckbox2),
)

for _show_more_xpath, _select_checkboxes in _show_more_strategies:
    try:
        showMore = driver.find_element(By.XPATH, _show_more_xpath)
        _select_checkboxes(showMore, driver)
        break
    except NoSuchElementException:
        continue
else:
    # Neither strategy completed.
    print("Error: Something went wrong on checkbox selection...")

time.sleep(1)

In [51]:
def retrieveInfo(review, business_name):
    """
    Collect the fields of one review element into a dictionary.

    Before reading the text, the review's own "More" link (if any) is clicked so
    the full text is shown; the click is attempted up to three times when it is
    intercepted by another element.

    Parameters:
        review: the review container element; every lookup is relative to it.
        business_name: name stored under 'Business_name' in the result.

    Returns:
        dict with the keys 'Business_name', 'Username', 'Review Date',
        'Visit Date', 'Review Title', 'Review Text' and 'Rating', in that order.
        A field whose element is not found is None. 'Rating' is the number in
        the element's `bubble_NN` class divided by 10 (e.g. bubble_45 -> 4.5).

    Exceptions other than NoSuchElementException, ElementClickInterceptedException
    and TimeoutException are not handled here and propagate to the caller.
    """
    def _find_or_none(xpath):
        """Return the first descendant of `review` matching `xpath`, or None."""
        try:
            return review.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return None

    review_info = {'Business_name': business_name}

    username_element = _find_or_none(".//div[@class='info_text pointer_cursor']")
    review_info['Username'] = None if username_element is None else username_element.text

    date_element = _find_or_none(".//span[@class='ratingDate']")
    review_info['Review Date'] = None if date_element is None else date_element.get_attribute('title')

    visit_date_element = _find_or_none(".//div[@class='prw_rup prw_reviews_stay_date_hsx']")
    if visit_date_element is None:
        visit_date = None
    else:
        visit_date = visit_date_element.text.replace("Date of visit:", "").strip()
    review_info['Visit Date'] = visit_date

    title_element = _find_or_none(".//span[@class='noQuotes']")
    review_info['Review Title'] = None if title_element is None else title_element.text

    # Expand the full review text. The leading "." keeps the search inside this
    # review; an XPath starting with "//" would match the first "More" link of
    # the whole document even when called on an element.
    max_attempts = 3
    try:
        for attempt in range(1, max_attempts + 1):
            more_button = _find_or_none(".//span[text() = 'More']")
            if more_button is None:
                break  # nothing to expand
            try:
                more_button.click()
            except ElementClickInterceptedException:
                print(f"Attempt {attempt} failed. Retrying...")
                continue
            time.sleep(2)  # give the expanded text time to render
            break
    except TimeoutException:
        print("Error: Timeout")

    # A review without a text paragraph yields None instead of raising, so one
    # odd review cannot abort the scraping further up the call chain.
    content_element = _find_or_none(".//p[@class='partial_entry']")
    if content_element is None:
        content = None
    else:
        time.sleep(1)
        content = content_element.text.strip()
    print(content)
    review_info['Review Text'] = content

    rating = None
    rating_element = _find_or_none(".//span[contains(@class, 'ui_bubble_rating')]")
    if rating_element is not None:
        time.sleep(1)
        rating_class = rating_element.get_attribute('class') or ''
        # Pick the bubble_NN token wherever it sits in the class list.
        for token in rating_class.split():
            digits = token[len('bubble_'):]
            if token.startswith('bubble_') and digits.isdigit():
                rating = int(digits) / 10
                break
    review_info['Rating'] = rating

    return review_info

In [52]:
def navigateReview(driver, review_data, business_name):
    """
    Scrape every review container on the current page into `review_data`.

    Each element with class 'review-container' is passed to retrieveInfo() and
    the resulting dictionary is appended to `review_data` (mutated in place).
    If retrieveInfo() raises ElementClickInterceptedException, it is called once
    more after a three-second pause. A StaleElementReferenceException stops the
    processing of the page and is only reported on stdout.

    Parameters:
        driver: Selenium WebDriver positioned on a business page.
        review_data: list that receives one dictionary per review.
        business_name: name forwarded to retrieveInfo().
    """
    try:
        containers = driver.find_elements(By.CLASS_NAME, 'review-container')
        time.sleep(1)

        for container in containers:
            try:
                record = retrieveInfo(container, business_name)
            except ElementClickInterceptedException:
                # Second and last attempt for this review after a short pause.
                time.sleep(3)
                record = retrieveInfo(container, business_name)
                review_data.append(record)
                print("ElementClickInterceptedException: navigateReview")
            else:
                review_data.append(record)

    except StaleElementReferenceException:
        print("StaleElementReferenceException: navigateReview")



In [53]:
def goToNextReviewPage(driver, review, business_name, counter):
    """
    Open review page number `counter` and scrape it, if such a page exists.

    Parameters:
        driver: Selenium WebDriver positioned on a business page.
        review: list collecting the review dictionaries; despite its name it is
            the whole result list, which is passed on to navigateReview().
        business_name: name forwarded to navigateReview().
        counter: page number looked up in the links' `data-page-number` attribute.

    Returns:
        "" when the page was opened and scraped, "No more pages." when no link
        for `counter` exists, the link is marked as disabled, or a
        NoSuchElementException is raised while opening or scraping the page.
    """
    try:
        # Match the page number exactly: contains() would also accept e.g.
        # "12" or "20" when looking for page 2.
        next_page_link = driver.find_element(By.XPATH, f".//a[@data-page-number='{counter}']")
        time.sleep(1)

        # Check the disabled state before clicking. A found element is always
        # truthy, so testing `if next_page_link:` first made this check unreachable.
        if 'disabled' in (next_page_link.get_attribute('class') or ''):
            return "No more pages."

        next_page_link.click()
        time.sleep(3)

        # Scrape the page that has just been opened.
        navigateReview(driver, review, business_name)

    except NoSuchElementException:
        return "No more pages."

    return ""

In [54]:
def getReviewInfo(driver, review_data):
    """
    Scrape all review pages of the business shown in the current window.

    The page that is open is scraped with navigateReview(); every further page
    is opened and scraped by goToNextReviewPage() until it reports
    "No more pages.". Each page is therefore scraped exactly once.

    On StaleElementReferenceException or ElementNotInteractableException the
    page is refreshed and the work resumes with the page number that failed,
    for at most three attempts in total. After the last attempt the function
    returns with whatever has been collected.

    Parameters:
        driver: Selenium WebDriver positioned on a business page.
        review_data: list that receives the review dictionaries (mutated in place).
    """
    max_retries = 3  # Maximum number of attempts
    retry_count = 0  # Failed attempts so far

    # Pagination state lives outside the retry loop so that a retry continues
    # where the failure happened instead of re-collecting earlier pages.
    counter = 2
    first_page_scraped = False
    first_page_start = len(review_data)

    while retry_count < max_retries:
        try:
            business_name_element = driver.find_element(By.XPATH, "//h1[@data-test-target='top-info-header']")
            time.sleep(2)
            business_name = business_name_element.text

            print(f"Business Name: {business_name}")

            if not first_page_scraped:
                navigateReview(driver, review_data, business_name)
                first_page_scraped = True

            while True:
                # goToNextReviewPage() scrapes the page it opens, so no extra
                # navigateReview() call is made here.
                msg = goToNextReviewPage(driver, review_data, business_name, counter)
                counter += 1
                if msg == "No more pages.":
                    print(msg)
                    break

            # Finished without an exception.
            break

        except (StaleElementReferenceException, ElementNotInteractableException):
            print("StaleElementReferenceException or ElementNotInteractableException occurred. Retrying...")
            retry_count += 1
            if retry_count >= max_retries:
                print("Maximum number of retries reached. Exiting...")
                break

            if not first_page_scraped:
                # Drop a partially collected first page; it is scraped again.
                del review_data[first_page_start:]

            time.sleep(2)
            driver.refresh()
            time.sleep(3)
                


In [55]:

def navigate(driver, businessCount, review_data, max_retries):
    """
    Visit the businesses of the current listing page one after another.

    Starting at index `businessCount`, the list item
    `div[data-test='<index>_list_item']` is located, its first link is clicked,
    the window that opens is scraped with getReviewInfo() and closed again.

    Parameters:
        driver: Selenium WebDriver positioned on the listing page.
        businessCount: index of the first business to visit.
        review_data: list that receives the review dictionaries.
        max_retries: number of ElementNotInteractableException occurrences
            tolerated before giving up.

    Returns:
        The index of the first business that was not processed: either the one
        whose list item was not found on this page, or the one that could not
        be opened once `max_retries` was reached.
    """
    retry_count = 0
    while True:
        print(f"business: {businessCount}")
        time.sleep(1)

        # Only a missing list item (or link) means that this page is exhausted.
        try:
            business_page = driver.find_element(By.CSS_SELECTOR, f"div[data-test='{businessCount}_list_item']")
            link_element = business_page.find_element(By.TAG_NAME, "a")
        except NoSuchElementException:
            return businessCount

        try:
            link_element.click()

            wait = WebDriverWait(driver, 10)  # Set a maximum wait time of 10 seconds
            wait.until(EC.number_of_windows_to_be(2))  # Wait until there are two open windows

            # Switch to the new window or tab
            driver.switch_to.window(driver.window_handles[1])
            try:
                time.sleep(2)
                getReviewInfo(driver, review_data)
                time.sleep(1)
            except NoSuchElementException:
                # A missing element on the business page is not the end of the
                # listing: skip this business and carry on with the next one.
                print(f"Skipping business {businessCount}: an expected page element was not found.")
            finally:
                # Always close the business window and return to the listing,
                # even when an exception is on its way out.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])

        except ElementNotInteractableException:
            print("ElementNotInteractableException occurred. Retrying...")
            retry_count += 1
            if retry_count >= max_retries:
                print("Maximum number of retries reached. Exiting...")
                # Return the index instead of falling out of the loop, which
                # handed None to the caller.
                return businessCount
            # Retry the operation after waiting for some time
            time.sleep(3)
            continue

        businessCount += 1


In [56]:
def goToPage(driver, businessCount, counter, review_data):
    """
    Open listing page `counter` and scrape its businesses with navigate().

    The page link is looked up by `aria-label` first and by `data-page-number`
    second. When neither lookup succeeds, the search is repeated after a short
    pause, three rounds in total.

    Parameters:
        driver: Selenium WebDriver positioned on the listing page.
        businessCount: index of the next business to visit.
        counter: number of the listing page to open.
        review_data: list that receives the review dictionaries.

    Returns:
        The business index returned by navigate().

    Raises:
        NoSuchElementException: when no link for page `counter` was found in
            any round. The calling loop relies on this to stop paginating.
    """
    retries = 3  # Set the maximum number of retries
    next_page_link = None

    # NOTE(review): contains() does substring matching, so page 2 also matches
    # e.g. "12"; kept because the exact attribute format is not visible here.
    locators = (
        f".//a[contains(@aria-label, {counter})]",
        f".//a[contains(@data-page-number, {counter})]",
    )

    while retries > 0 and next_page_link is None:
        for locator in locators:
            time.sleep(1)
            try:
                next_page_link = driver.find_element(By.XPATH, locator)
                break
            except NoSuchElementException:
                continue
        else:
            # Neither locator matched in this round.
            retries -= 1
            print("Element not found. Retrying...")
            time.sleep(3)

    if next_page_link is None:
        print("Max retries exceeded. Exiting goToPage().")
        raise NoSuchElementException(f"No link to listing page {counter} was found.")

    next_page_link.click()
    time.sleep(3)
    max_retries = 3  # Set the maximum number of retries
    businessCount = navigate(driver, businessCount, review_data, max_retries)

    return businessCount

In [57]:
# Scraping driver: process the businesses of the first listing page, then walk
# through the following listing pages until no further page link is found.
counter = 2        # number of the next listing page to open
businessCount = 1  # index of the next business to visit
max_entries = 3    # retry budget, used for navigate() and for each listing page
review_data = []   # one dictionary per scraped review

time.sleep(3)
businessCount = navigate(driver, businessCount, review_data, max_entries)
print(f"businessCount: {businessCount}")

failed_attempts = 0  # consecutive failures on the current listing page
while True:
    print(f"page: {counter}")
    # give the DOM time to load
    time.sleep(3)
    try:
        businessCount = goToPage(driver, businessCount, counter, review_data)

    except (ElementClickInterceptedException, StaleElementReferenceException):
        # Retry the same page after a pause, but only a limited number of
        # times; an unbounded `continue` can loop forever on a persistent error.
        failed_attempts += 1
        if failed_attempts >= max_entries:
            print(f"Giving up on page {counter} after {failed_attempts} failed attempts.")
            break
        time.sleep(4)
        continue
    except NoSuchElementException:
        # No link for this page number: all listing pages have been processed.
        break

    failed_attempts = 0
    counter += 1



# When all pages have been processed, quit the driver
# driver.quit()

# list that contains all the reviews from each business
print(review_data)

business: 1
Business Name: Albeta Mediterranean Bakery
Top quality products, exceptional customer service and tasty food. Coffee is also very good and one of my favorite choices.

A great place for breakfast! I really enjoyed the tortilla chicken and the fresh pomegranate juice.

The stuff was very helpful and kind! They also have a beg variety of pastries and the best coffee in town!

The best bakery in Thessaloniki. Highest quality of ingredients featuring baked goods to sandwiches & salads

It has everything, from delicious food to polite staff! It has variety of food, coffee and you can find kombucha (for the ones that love it and cannot find it easily in Thessaloniki). Don’t forget to check out the sweets as well :)

The best in town. Wonderful coffee, nice people, everything fresh and tasty. So nice to be there. Mornings get beautiful. 🙏🏻 💗💗💗💗

The cutest bakery located at the center of downtown Thessaloniki. Best quality and a variety of sweet and savoury pas

In [58]:
import csv
csv_file = 'reviews.csv'

# Column names: every key that occurs in any review, in first-seen order, so a
# row with an additional key cannot make DictWriter raise.
headers = list(dict.fromkeys(key for review in review_data for key in review))

if not headers:
    # Nothing was scraped; indexing review_data[0] here raised IndexError.
    print(f"No reviews were collected; {csv_file} was not written.")
else:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)

        # Write the header row and then one row per review
        writer.writeheader()
        writer.writerows(review_data)