In [62]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np

import sys
import csv
import time
import random
import logging
import traceback
import os

# default tripadvisor website of restaurant
PATH = "/usr/bin/chromedriver"

In [63]:
logging.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [69]:
def expand_reviews(locator, base_element):
    element = WebDriverWait(base_element, timeout=10).until(
        EC.presence_of_element_located(locator)
    )
    if element is not None:
        element.click()


def scrape_url(url, csvWriter, restaurant, debug=False, driver_args = None):

    # Import the webdriver
    if driver_args is not None:
        driver = webdriver.Chrome(PATH, **driver_args)
    else:
        driver = webdriver.Chrome(PATH)        
    driver.get(url)

    try:
        reviews_div = WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located((By.ID, "REVIEWS"))
        )

        try:
            num_pages_el = WebDriverWait(reviews_div, timeout=10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pageNum.last"))
            )

            num_pages = int(num_pages_el.text)
            logger.info(f"Found a maximum of {num_pages} pages")
        except Exception as e:
            logger.error(
                "Could not find maximum page. This page may only have one page of reviews")
            logger.error(e)
            num_pages = 10

        # change the value inside the range to save more or less reviews
        for i in range(0, num_pages):

            #     # expand the review
            more_span = "//span[@class='taLnk ulBlueLinks']"
            expand_locator = (By.XPATH, more_span)

            try:
                expand_reviews(expand_locator, base_element=reviews_div)
            except Exception as e:
                try:
                    expand_reviews(expand_locator, base_element=reviews_div)
                except:
                    logger.info("Could not find 'more' button in order to expand reviews")

        #     # Wait for the expansion to take place

            # Make sure that all the reviews are loaded
            time.sleep(2)
            num_reviews_per_page = 10
            container = driver.find_elements_by_xpath(
                ".//div[@class='review-container']")

            if len(container) < num_reviews_per_page and i < (num_pages - 1):
                max_num_tries = 5
                num_tries = 0

                while len(container) != num_reviews_per_page and num_tries < max_num_tries:
                    time.sleep(1)
                    try:
                        container = driver.find_elements_by_xpath(
                            ".//div[@class='review-container']")
                    except:
                        pass
                    num_tries += 1

            for j in range(len(container)):
                title = container[j].find_element_by_xpath(
                    ".//span[@class='noQuotes']").text
                date = container[j].find_element_by_xpath(
                    ".//span[contains(@class, 'ratingDate')]").get_attribute("title")
                rating = container[j].find_element_by_xpath(
                    ".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
                try:
                    review = container[j].find_element_by_xpath(
                        ".//span[@class='postSnippet']").text.replace("\n", " ")
                except:
                    review = container[j].find_element_by_xpath(
                        ".//p[@class='partial_entry']").text.replace("\n", " ")

                if debug:
                    print([restaurant, date, rating, title, review])

                csvWriter.writerow([restaurant, date, rating, title, review, url, str(i+1)])

            if i < (num_pages-1):
                # change the page
                try:
                    driver.find_element_by_xpath(
                        './/a[@class="nav next ui_button primary"]').click()
                    time.sleep(random.randint(1, 4))
                except Exception as e:
                    logger.error("Could not find next button even without beeing in the last page")
                    logger.error(e)
            else:
                logger.info("Reached end of all pages")

    except Exception as e:
        logger.error("Could not load this page")
        logger.error(e)

    driver.close()
    time.sleep(random.randint(1, 3))

In [70]:
df = pd.read_csv("../datasets/final_list_URL_TripAdvisor.csv", sep=";")
df.head()

Unnamed: 0,NYC_extract.DBA,NYC_extract.TripAdvisor.URL,NYC_extract.INSPECTION.DATE,NYC_extract.VIOLATION.DESCRIPTION,NYC_extract.SCORE,NYC_extract.GRADE
946,NOEL'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,11/02/2016,Facility not vermin proof. Harborage or condit...,14,
17922,WEST SIDE STEAKHOUSE,https://www.tripadvisor.com/Restaurant_Review-...,06/06/2017,Food not cooled by an approved method whereby ...,13,A
19584,BAKER'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,03/07/2017,Hot food item not held at or above 140Âº F.,24,
19648,BAKER'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,03/07/2017,Filth flies or food/refuse/sewage-associated (...,24,
19791,PEE DEE,https://www.tripadvisor.com/Restaurant_Review-...,04/07/2017,Facility not vermin proof. Harborage or condit...,27,


In [71]:
idxs = df[['NYC_extract.TripAdvisor.URL']].drop_duplicates().index
df = df.loc[idxs, ['NYC_extract.TripAdvisor.URL', "NYC_extract.DBA"]]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
946,https://www.tripadvisor.com/Restaurant_Review-...,NOEL'S PIZZA
17922,https://www.tripadvisor.com/Restaurant_Review-...,WEST SIDE STEAKHOUSE
19584,https://www.tripadvisor.com/Restaurant_Review-...,BAKER'S PIZZA
19791,https://www.tripadvisor.com/Restaurant_Review-...,PEE DEE
19956,https://www.tripadvisor.com/Restaurant_Review-...,ABITINO'S PIZZA
...,...,...
223934,https://www.tripadvisor.com/Restaurant_Review-...,FAMOUS ORIGINAL RAY'S PIZZA
224460,https://www.tripadvisor.com/Restaurant_Review-...,PRONTO PIZZA
226121,https://www.tripadvisor.com/Restaurant_Review-...,TWO BOOTS
227187,https://www.tripadvisor.com/Restaurant_Review-...,NICK & STEF'S STEAKHOUSE


In [72]:
logs_lines = []

for path in os.listdir("./scraping_runs/"):
    logs_lines += open(f"./scraping_runs/{path}").readlines()
    
urls_read = set(l.replace("\n","") for l in logs_lines if l.startswith("http"))
pending_urls = set(df["NYC_extract.TripAdvisor.URL"].values) - urls_read
df = df[df["NYC_extract.TripAdvisor.URL"].isin(pending_urls)]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
124568,https://www.tripadvisor.com/Restaurant_Review-...,AZALEA
124664,https://www.tripadvisor.com/Restaurant_Review-...,DELMONICOS
124708,https://www.tripadvisor.com/Restaurant_Review-...,FLAVORS OF ITALY
124771,https://www.tripadvisor.com/Restaurant_Review-...,ARTE RESTAURANT
124855,https://www.tripadvisor.com/Restaurant_Review-...,MISIRIZZI
...,...,...
223934,https://www.tripadvisor.com/Restaurant_Review-...,FAMOUS ORIGINAL RAY'S PIZZA
224460,https://www.tripadvisor.com/Restaurant_Review-...,PRONTO PIZZA
226121,https://www.tripadvisor.com/Restaurant_Review-...,TWO BOOTS
227187,https://www.tripadvisor.com/Restaurant_Review-...,NICK & STEF'S STEAKHOUSE


In [None]:
# default path to file to store data
path_to_file = "../datasets/scraped_final_list_URL_TripAdvisor.csv"

# Open the file to save the review
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

for i, row in df.iterrows():
    
    opts = Options()
    opts.add_argument('--no-sandbox')
    opts.add_argument('--headless')
    opts.add_argument('--disable-dev-shm-usage')
    driver_args = { "options" : opts }

    restaruant_name = row["NYC_extract.DBA"]
    url = row["NYC_extract.TripAdvisor.URL"]
    print(url)
    
    scrape_url(url, csvWriter, restaurant=restaruant_name, debug=False, 
               driver_args=driver_args)

https://www.tripadvisor.com/Restaurant_Review-g60763-d424558-Reviews-Azalea_Ristorante-New_York_City_New_York.html


INFO:__main__:Found a maximum of 72 pages
INFO:__main__:Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d459628-Reviews-Delmonico_s-New_York_City_New_York.html


INFO:__main__:Found a maximum of 85 pages
INFO:__main__:Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d19270049-Reviews-Flavors_of_Italy_Bistro-New_York_City_New_York.html


ERROR:__main__:Could not find maximum page. This page may only have one page of reviews
ERROR:__main__:Message: 

ERROR:__main__:Could not find net button even without beeing in the last page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)

ERROR:__main__:Could not find net button even without beeing in the last page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)

ERROR:__main__:Could not find net button even without beeing in the last page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)

ERROR:__main__:Could not find net button even without beeing in the last page
ERROR:_

https://www.tripadvisor.com/Restaurant_Review-g60763-d477310-Reviews-Trattoria_Dell_Arte-New_York_City_New_York.html


INFO:__main__:Found a maximum of 154 pages
INFO:__main__:Could not expand reviews
INFO:__main__:Could not expand reviews
INFO:__main__:Could not expand reviews
INFO:__main__:Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d12141953-Reviews-Misirizzi_Italian_Restaurant-New_York_City_New_York.html


INFO:__main__:Found a maximum of 3 pages
INFO:__main__:Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d14952452-Reviews-Attraversa_TriBeCa-New_York_City_New_York.html


INFO:__main__:Found a maximum of 2 pages
INFO:__main__:Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1887411-Reviews-LAVO_Italian_Restaurant-New_York_City_New_York.html


INFO:__main__:Found a maximum of 63 pages
