In [1]:
import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException

import pandas as pd
import numpy as np

import sys
import csv
import time
import random
import logging
import traceback
import os

# Path to the chromedriver executable that Selenium uses to launch Chrome
PATH = "/usr/bin/chromedriver"

In [2]:
# Root logging configuration: every record is prefixed with a
# "YYYY-MM-DD HH:MM:SS" timestamp and a left-aligned, 8-character level name.
logging.basicConfig(datefmt='%Y-%m-%d %H:%M:%S', format='%(asctime)s %(levelname)-8s %(message)s')

# Notebook-wide logger; INFO and more severe records are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
def expand_reviews(locator, base_element):
    """Wait for the element matching ``locator`` and click it.

    Args:
        locator: ``(by, value)`` tuple handed to
            ``EC.presence_of_element_located``.
        base_element: object the ``WebDriverWait`` polls (the caller in this
            notebook passes the reviews container element).

    Returns:
        None.

    Any exception raised by the wait (it gives up after 10 seconds) or by the
    click propagates to the caller.
    """
    wait = WebDriverWait(base_element, timeout=10)
    target = wait.until(EC.presence_of_element_located(locator))
    if target is None:
        return
    target.click()

In [4]:
def get_field(jth_container, xpath, element_name):
    """Look up a single child element of a review container by XPath.

    Args:
        jth_container: element (or driver) exposing ``find_element``.
        xpath: XPath expression used for the lookup.
        element_name: human-readable field name, used only in log messages.

    Returns:
        The element that was found, or ``None`` when the lookup raised; the
        failure is logged at ERROR level instead of being propagated.
    """
    try:
        # find_element(By.XPATH, ...) is available in both Selenium 3 and 4,
        # unlike find_element_by_xpath, which newer Selenium releases dropped.
        return jth_container.find_element(By.XPATH, xpath)
    except Exception as e:
        logger.error(f"Could not get {element_name}")
        logger.error(e)
        return None

In [5]:
def get_fields(jth_container):
    """Extract date, rating, title and review text from one review container.

    Every field is best-effort: a field that cannot be located or parsed is
    logged at ERROR level and returned as ``None``, so a single malformed
    review never aborts the scrape and never leaks a raw element object into
    the output row.

    Args:
        jth_container: element wrapping a single review.

    Returns:
        ``[date, rating, title, review]`` where each entry is a string or
        ``None``. ``rating`` is the fourth ``_``-separated token of the rating
        element's ``class`` attribute (e.g. ``"50"`` for
        ``"ui_bubble_rating bubble_50"``). Newlines in ``review`` are replaced
        by spaces.
    """
    title = get_field(jth_container, ".//span[@class='noQuotes']", "title")
    if title is not None:
        title = title.text

    date = get_field(jth_container, ".//span[contains(@class, 'ratingDate')]", "date")
    if date is not None:
        try:
            date = date.get_attribute("title")
        except Exception as e:
            logger.error("Could not load date")
            logger.error(e)
            # Do not hand the element object itself back as the "date".
            date = None

    rating = get_field(jth_container, ".//span[contains(@class, 'ui_bubble_rating bubble_')]", "rating")
    if rating is not None:
        try:
            rating = rating.get_attribute("class").split("_")[3]
        except Exception as e:
            # Missing class attribute or an unexpected class layout.
            logger.error("Could not parse rating")
            logger.error(e)
            rating = None

    # Prefer the snippet span; fall back to the partial-entry paragraph.
    review = None
    review_error = None
    for review_xpath in (".//span[@class='postSnippet']", ".//p[@class='partial_entry']"):
        try:
            review = jth_container.find_element(By.XPATH, review_xpath).text.replace("\n", " ")
            break
        except Exception as e:  # not a bare except: Ctrl+C must still interrupt
            review_error = e
    if review is None:
        logger.error("Could not get review")
        logger.error(review_error)

    return [date, rating, title, review]

In [6]:
def _click_next_page(reviews_div, max_attempts=3):
    """Click the "next" pagination link inside ``reviews_div``.

    Each attempt waits up to 5 seconds for the link to become clickable and
    clicks it. A failed attempt is logged and, unless it was the last one,
    followed by a one-second pause before retrying (a transient overlay can
    intercept the click).

    Returns:
        True as soon as a click succeeds, False when every attempt failed.
    """
    next_locator = (By.XPATH, './/a[@class="nav next ui_button primary"]')
    for attempt in range(1, max_attempts + 1):
        try:
            WebDriverWait(reviews_div, timeout=5).until(
                EC.element_to_be_clickable(next_locator)
            ).click()
            return True
        except Exception as e:
            logger.error("Could not find and click next button even without beeing in the last page and waited to load")
            logger.error(e)
            if attempt < max_attempts:
                time.sleep(1)
    return False


def scrape_url(url, csvWriter, restaurant, debug=False, driver_args = None, max_num_retries=3):
    """Scrape every review page of ``url`` and append the reviews to a CSV.

    A fresh Chrome session is started for each attempt and is always shut
    down afterwards, including when an unexpected exception propagates.

    Args:
        url: page to open.
        csvWriter: object with a ``writerow`` method; one row is written per
            review: ``[restaurant, date, rating, title, review, url, page]``
            where ``page`` is the 1-based page number as a string.
        restaurant: restaurant name stored in the first column of each row.
        debug: when True, also print every scraped review.
        driver_args: optional dict of keyword arguments forwarded to
            ``webdriver.Chrome``.
        max_num_retries: how many times the page load may time out before
            giving up on this URL.

    Returns:
        None.

    Raises:
        Exceptions other than ``TimeoutException`` are not handled here and
        propagate to the caller (after the browser has been shut down).

    If the "next" link cannot be clicked, scraping of this URL stops at the
    current page rather than re-reading the same page under later page
    numbers.
    """
    review_xpath = ".//div[@class='review-container']"
    expand_locator = (By.XPATH, "//span[@class='taLnk ulBlueLinks']")
    num_reviews_per_page = 10

    num_retries = 0
    finished = False

    while not finished and num_retries < max_num_retries:
        driver = None
        timed_out = False
        try:
            if driver_args is not None:
                driver = webdriver.Chrome(PATH, **driver_args)
            else:
                driver = webdriver.Chrome(PATH)
            driver.get(url)

            reviews_div = WebDriverWait(driver, timeout=10).until(
                EC.presence_of_element_located((By.ID, "REVIEWS"))
            )

            try:
                num_pages_el = WebDriverWait(reviews_div, timeout=10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "pageNum.last"))
                )

                num_pages = int(num_pages_el.text)
                logger.info(f"Found a maximum of {num_pages} pages")
            except Exception as e:
                logger.error(
                    "Could not find maximum page. This page may only have one page of reviews")
                logger.error(e)
                num_pages = 1

            # change the value inside the range to save more or less reviews
            for i in range(0, num_pages):

                # Expand truncated reviews: two attempts, then carry on with
                # whatever text is visible.
                for _ in range(2):
                    try:
                        expand_reviews(expand_locator, base_element=reviews_div)
                        break
                    except Exception:
                        pass
                else:
                    logger.info("Could not find 'more' button in order to expand reviews")

                # Give the expansion / page change time to render.
                time.sleep(2)
                container = driver.find_elements(By.XPATH, review_xpath)

                # Every page but the last should hold a full set of reviews;
                # poll a few more times if it does not yet.
                if len(container) < num_reviews_per_page and i < (num_pages - 1):
                    max_num_tries = 5
                    num_tries = 0

                    while len(container) != num_reviews_per_page and num_tries < max_num_tries:
                        time.sleep(1)
                        try:
                            container = driver.find_elements(By.XPATH, review_xpath)
                        except Exception as e:
                            # logger.log() needs an explicit level; use warning().
                            logger.warning(f"Could not load container with 10 reviwes, retrying {num_tries} of {max_num_tries}")
                            logger.warning(e)
                        num_tries += 1

                for review_el in container:
                    date, rating, title, review = get_fields(review_el)

                    if debug:
                        print([restaurant, date, rating, title, review])

                    csvWriter.writerow([restaurant, date, rating, title, review, url, str(i+1)])

                if i < (num_pages-1):
                    if _click_next_page(reviews_div):
                        time.sleep(random.randint(1, 3))
                    else:
                        # Staying in the loop would write the current page
                        # again for every remaining page number.
                        logger.error(f"Stopping at page {i+1} of {num_pages}: could not advance to the next page")
                        break
                else:
                    logger.info("Reached end of all pages")

            # Rows have been written, so never restart this URL from page 1
            # (also terminates the outer loop when num_pages is 0).
            finished = True

        except TimeoutException as e:
            logger.error(f"Could not load this page, retrying... {num_retries+1} of {max_num_retries}")
            logger.error(e)
            num_retries += 1
            timed_out = True
        finally:
            # quit() ends the whole browser session; driver is None when
            # webdriver.Chrome itself failed.
            if driver is not None:
                try:
                    driver.quit()
                except Exception as e:
                    logger.warning("Could not shut down the browser cleanly")
                    logger.warning(e)

        if timed_out:
            # Back off before the next attempt, after the browser is gone.
            time.sleep(random.randint(1, 60))
        time.sleep(random.randint(1, 3))

In [7]:
# Load the restaurant inspection table that carries the TripAdvisor URL of
# each restaurant, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../datasets/final_list_URL_TripAdvisor_noduplicated.csv")
df.head(n=5)

Unnamed: 0,NYC_extract.DBA,NYC_extract.TripAdvisor.URL,NYC_extract.INSPECTION.DATE,NYC_extract.VIOLATION.DESCRIPTION,NYC_extract.SCORE,NYC_extract.GRADE,NYC_initial.Latitude,NYC_initial.Longitude,NYC_list_URL_final$NYC_extract.DBA
946,NOEL'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,11/02/2016,Facility not vermin proof. Harborage or condit...,14,,40.786901,-73.942033,NOEL'S PIZZA
17922,WEST SIDE STEAKHOUSE,https://www.tripadvisor.com/Restaurant_Review-...,06/06/2017,Food not cooled by an approved method whereby ...,13,A,40.764325,-73.958543,WEST SIDE STEAKHOUSE
19584,BAKER'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,03/07/2017,Hot food item not held at or above 140Âº F.,24,,40.764257,-73.982997,BAKER'S PIZZA
19791,PEE DEE,https://www.tripadvisor.com/Restaurant_Review-...,04/07/2017,Facility not vermin proof. Harborage or condit...,27,,40.78551,-73.972895,PEE DEE
19956,ABITINO'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,04/07/2017,Cold food item held above 41Âº F (smoked fish ...,31,,40.743826,-73.999556,ABITINO'S PIZZA


In [8]:
# Keep the first row seen for each TripAdvisor URL and reduce the table to
# the two columns the scraper needs: the URL and the restaurant name.
is_first_occurrence = ~df["NYC_extract.TripAdvisor.URL"].duplicated()
idxs = df.index[is_first_occurrence]
df = df.loc[idxs, ["NYC_extract.TripAdvisor.URL", "NYC_extract.DBA"]]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
946,https://www.tripadvisor.com/Restaurant_Review-...,NOEL'S PIZZA
17922,https://www.tripadvisor.com/Restaurant_Review-...,WEST SIDE STEAKHOUSE
19584,https://www.tripadvisor.com/Restaurant_Review-...,BAKER'S PIZZA
19791,https://www.tripadvisor.com/Restaurant_Review-...,PEE DEE
19956,https://www.tripadvisor.com/Restaurant_Review-...,ABITINO'S PIZZA
...,...,...
132689,https://www.tripadvisor.com/Restaurant_Review-...,CIELO AT THE MAYFAIR
132694,https://www.tripadvisor.com/Restaurant_Review-...,ZIA MARIA LITTLE ITALY
132713,https://www.tripadvisor.com/Restaurant_Review-...,ADRIENNE'S PIZZA BAR
97439,https://www.tripadvisor.com/Restaurant_Review-...,DON PEPI PIZZA


In [10]:
# Resume support: gather every line of every log file stored under
# ./scraping_runs/ and treat each line starting with "http" as a URL that a
# previous run already picked up.
# NOTE(review): a URL appears in the output as soon as scraping starts, so a
# URL whose scrape later failed is presumably also counted as done - confirm
# that this is intended.
logs_lines = []

for path in os.listdir("./scraping_runs/"):
    log_path = os.path.join("./scraping_runs/", path)
    # Skip sub-directories (e.g. notebook checkpoint folders); open() cannot
    # read them.
    if not os.path.isfile(log_path):
        continue
    # The context manager closes each handle as soon as the file is read.
    with open(log_path) as log_file:
        logs_lines += log_file.readlines()

urls_read = set(l.replace("\n","") for l in logs_lines if l.startswith("http"))
pending_urls = set(df["NYC_extract.TripAdvisor.URL"].values) - urls_read

# Keep only the restaurants whose URL has not been seen in any log.
df = df[df["NYC_extract.TripAdvisor.URL"].isin(pending_urls)]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
81742,couldnt find - repeated,OLGA'S PIZZA


Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
81742,couldnt find - repeated,OLGA'S PIZZA


In [133]:
# default path to file to store data
path_to_file = "../datasets/scraped_final_list_URL_TripAdvisor.csv"

# Chrome options are the same for every restaurant, so build them once.
opts = Options()
opts.add_argument('--no-sandbox')
opts.add_argument('--headless')
opts.add_argument('--disable-dev-shm-usage')
driver_args = { "options" : opts }

# Append to the output file. newline="" is what the csv module expects from
# the file object, and the with-block guarantees the file is flushed and
# closed even if a scrape raises.
with open(path_to_file, 'a', encoding="utf-8", newline="") as csvFile:
    csvWriter = csv.writer(csvFile)

    for _row_label, row in df.iterrows():
        url = row["NYC_extract.TripAdvisor.URL"]
        # Skip missing values and placeholders that are not real links.
        if not isinstance(url, str) or not url.startswith("http"):
            continue
        # The printed URL doubles as the progress record of this run.
        print(url)

        restaurant_name = row["NYC_extract.DBA"]

        scrape_url(url, csvWriter, restaurant=restaurant_name, debug=False,
                   driver_args=driver_args)

        # Persist this restaurant's rows before starting the next one.
        csvFile.flush()

https://www.tripadvisor.com/Restaurant_Review-g60763-d3544467-Reviews-La_Bella_Vita-New_York_City_New_York.html


2021-03-16 16:06:19 INFO     Found a maximum of 37 pages
2021-03-16 16:06:31 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:09:41 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1318683-Reviews-Luna_Piena_Ristorante-New_York_City_New_York.html


2021-03-16 16:09:49 INFO     Found a maximum of 26 pages
2021-03-16 16:12:05 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1023214-Reviews-Quartino_Bottega_Organica-New_York_City_New_York.html


2021-03-16 16:12:13 INFO     Found a maximum of 6 pages
2021-03-16 16:12:42 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d12945910-Reviews-Mani_in_Pasta-New_York_City_New_York.html


2021-03-16 16:12:50 INFO     Found a maximum of 2 pages
2021-03-16 16:12:58 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d546786-Reviews-La_Masseria-New_York_City_New_York.html


2021-03-16 16:13:10 INFO     Found a maximum of 147 pages
2021-03-16 16:26:44 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d3737291-Reviews-Firenze_Ristorante-New_York_City_New_York.html


2021-03-16 16:26:55 INFO     Found a maximum of 9 pages
2021-03-16 16:27:45 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d521117-Reviews-Pietro_s-New_York_City_New_York.html


2021-03-16 16:27:54 INFO     Found a maximum of 16 pages
2021-03-16 16:29:15 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d5210361-Reviews-Armonie-New_York_City_New_York.html


2021-03-16 16:29:34 ERROR    Could not find maximum page. This page may only have one page of reviews
2021-03-16 16:29:34 ERROR    Message: 

2021-03-16 16:29:37 INFO     Reached end of all pages


https://www.tripadvisor.com/Attraction_Review-g60763-d5124083-Reviews-Grand_Central_Market-New_York_City_New_York.html


2021-03-16 16:29:59 ERROR    Could not find maximum page. This page may only have one page of reviews
2021-03-16 16:29:59 ERROR    Message: 

2021-03-16 16:30:20 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:30:22 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1731141-Reviews-Piccola_Cucina-New_York_City_New_York.html


2021-03-16 16:30:30 INFO     Found a maximum of 56 pages
2021-03-16 16:35:28 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d6912446-Reviews-Antica_Ristorante-New_York_City_New_York.html


2021-03-16 16:35:43 INFO     Found a maximum of 18 pages
2021-03-16 16:37:18 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1878436-Reviews-Harry_s_Italian_Pizza_Bar-New_York_City_New_York.html


2021-03-16 16:37:27 INFO     Found a maximum of 31 pages
2021-03-16 16:40:16 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d423968-Reviews-Serafina_Fabulous_Pizza_79th-New_York_City_New_York.html


2021-03-16 16:40:23 INFO     Found a maximum of 21 pages
2021-03-16 16:42:17 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d968445-Reviews-Bocca_di_Bacco-New_York_City_New_York.html


2021-03-16 16:42:26 INFO     Found a maximum of 48 pages
2021-03-16 16:46:40 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d2272739-Reviews-Giardino_D_oro-New_York_City_New_York.html


2021-03-16 16:46:47 INFO     Found a maximum of 7 pages
2021-03-16 16:47:19 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1308338-Reviews-Antonucci-New_York_City_New_York.html


2021-03-16 16:47:29 INFO     Found a maximum of 13 pages
2021-03-16 16:48:38 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1878682-Reviews-Olio_e_Piu-New_York_City_New_York.html


2021-03-16 16:48:48 INFO     Found a maximum of 205 pages
2021-03-16 16:51:08 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:54:17 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:57:12 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:57:49 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 16:59:44 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:01:39 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:03:18 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:09:26 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d477302-Reviews-Umberto_s_Clam_House-New_York_City_New_York.html


2021-03-16 17:09:37 INFO     Found a maximum of 33 pages
2021-03-16 17:12:32 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d13285712-Reviews-Mimmo-New_York_City_New_York.html


2021-03-16 17:12:41 INFO     Found a maximum of 2 pages
2021-03-16 17:12:49 INFO     Reached end of all pages


https://www.tripadvisor.com/RestaurantsNear-g60763-d479379-Angelo_s_of_Mulberry_Street-New_York_City_New_York.html


2021-03-16 17:13:07 ERROR    Could not load this page, retrying... 1 of 3
2021-03-16 17:13:07 ERROR    Message: 

2021-03-16 17:13:45 ERROR    Could not load this page, retrying... 2 of 3
2021-03-16 17:13:45 ERROR    Message: 

2021-03-16 17:14:53 ERROR    Could not load this page, retrying... 3 of 3
2021-03-16 17:14:53 ERROR    Message: 



https://www.tripadvisor.com/Restaurant_Review-g60763-d6486923-Reviews-Pisillo_Italian_Panini-New_York_City_New_York.html


2021-03-16 17:15:38 INFO     Found a maximum of 43 pages
2021-03-16 17:19:18 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:19:22 ERROR    Could not find and click next button even without beeing in the last page and waited to load
2021-03-16 17:19:22 ERROR    Message: element click intercepted: Element <a data-page-number="42" data-offset="410" class="nav next ui_button primary" onclick="widgetEvCall('handlers.paginate', event, this); widgetEvCall('handlers.trackClick', event, this, 'pagination_next', '42');" href="/Restaurant_Review-g60763-d6486923-Reviews-or410-Pisillo_Italian_Panini-New_York_City_New_York.html">...</a> is not clickable at point (702, 582). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)

2021-03-16 17:19:24 INFO     Could not f

https://www.tripadvisor.com/Restaurant_Review-g60763-d814362-Reviews-Aurora_SoHo-New_York_City_New_York.html


2021-03-16 17:19:38 INFO     Found a maximum of 26 pages
2021-03-16 17:19:44 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:21:59 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d17195500-Reviews-Marcellino_Trattoria_Pizzeria-New_York_City_New_York.html


2021-03-16 17:22:05 INFO     Found a maximum of 3 pages
2021-03-16 17:22:21 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d459100-Reviews-Trattoria_Belvedere-New_York_City_New_York.html


2021-03-16 17:22:29 INFO     Found a maximum of 17 pages
2021-03-16 17:24:01 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d15561701-Reviews-Princi_Bakery-New_York_City_New_York.html


2021-03-16 17:24:10 INFO     Found a maximum of 2 pages
2021-03-16 17:24:18 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1057341-Reviews-Nizza-New_York_City_New_York.html


2021-03-16 17:24:27 INFO     Found a maximum of 85 pages
2021-03-16 17:32:04 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d1749407-Reviews-Petrarca_Cucina_e_Vino-New_York_City_New_York.html


2021-03-16 17:32:15 INFO     Found a maximum of 14 pages
2021-03-16 17:33:29 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d7934469-Reviews-Nocciola_Ristorante-New_York_City_New_York.html


2021-03-16 17:33:37 INFO     Found a maximum of 4 pages
2021-03-16 17:33:58 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d946843-Reviews-Cacio_e_Vino-New_York_City_New_York.html


2021-03-16 17:34:08 INFO     Found a maximum of 11 pages
2021-03-16 17:35:02 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d5825911-Reviews-Cibo_e_Vino-New_York_City_New_York.html


2021-03-16 17:35:10 INFO     Found a maximum of 11 pages
2021-03-16 17:36:03 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d10934435-Reviews-Antico_Noe-New_York_City_New_York.html


2021-03-16 17:36:21 ERROR    Could not find maximum page. This page may only have one page of reviews
2021-03-16 17:36:21 ERROR    Message: 

2021-03-16 17:36:24 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d11843586-Reviews-Allora_Ristorante-New_York_City_New_York.html


2021-03-16 17:36:40 INFO     Found a maximum of 13 pages
2021-03-16 17:37:49 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d3747414-Reviews-Cielo_at_the_Mayfair-New_York_City_New_York.html


2021-03-16 17:37:58 INFO     Found a maximum of 26 pages
2021-03-16 17:40:15 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d15151690-Reviews-Zia_Maria_Little_Italy-New_York_City_New_York.html


2021-03-16 17:40:23 INFO     Found a maximum of 5 pages
2021-03-16 17:40:49 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d2397895-Reviews-Adrienne_s_Pizzabar-New_York_City_New_York.html


2021-03-16 17:41:19 INFO     Found a maximum of 56 pages
2021-03-16 17:46:22 INFO     Reached end of all pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d2362951-Reviews-Famous_Famiglia_Pizzeria-New_York_City_New_York.html


2021-03-16 17:46:30 INFO     Found a maximum of 34 pages
2021-03-16 17:49:48 INFO     Could not find 'more' button in order to expand reviews
2021-03-16 17:49:50 INFO     Reached end of all pages


In [1]:
import pandas as pd
# Load the tab-separated test dump of scraped reviews and display it.
df = pd.read_csv("../datasets/test.csv", delimiter="\t")
df

Unnamed: 0,restaurant_name,date,rating,title,review,num_likes,url,num_page,total_pages
0,test,"December 3, 2020",50,My Pal Tommy the Nose !!,Tommy is a great host of Umbertos for years. H...,1,https://www.tripadvisor.com/Restaurant_Review-...,1,
1,test,"June 9, 2020",40,The food is definitely good,The service was appropriately given to us. The...,1,https://www.tripadvisor.com/Restaurant_Review-...,1,
2,test,"March 12, 2020",50,Umberto was a great host and,Umberto was a great host and the food was exce...,0,https://www.tripadvisor.com/Restaurant_Review-...,1,
3,test,"February 7, 2020",50,Oh yeah!,"Nice bread and water to start, nice little bot...",0,https://www.tripadvisor.com/Restaurant_Review-...,1,
4,test,"February 1, 2020",50,Awesome,We were no planning to dine here but at the la...,1,https://www.tripadvisor.com/Restaurant_Review-...,1,
...,...,...,...,...,...,...,...,...,...
329,test,"March 24, 2011",30,Good Italian - Wine was old and flat,Food was good. Some of the best calamari I've ...,0,https://www.tripadvisor.com/Restaurant_Review-...,32,
330,test,"March 24, 2011",10,fugetaboutit - don't bother,meal was mediocre - at best - but what made ou...,2,https://www.tripadvisor.com/Restaurant_Review-...,33,
331,test,"September 4, 2009",20,Watch out!,"Our food was fine, but we were charged extra f...",0,https://www.tripadvisor.com/Restaurant_Review-...,33,
332,test,"August 3, 2009",40,Great late night dinner!!,After our nightmare at the Hotel Pennysylvania...,0,https://www.tripadvisor.com/Restaurant_Review-...,33,
