In [38]:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

import pandas as pd
import numpy as np

import sys
import csv
import time
import random
import logging
import traceback
import os

# default tripadvisor website of restaurant
PATH = "/usr/bin/chromedriver"

In [39]:
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [40]:
def expand_reviews(locator, base_element):
    element = WebDriverWait(base_element, timeout=10).until(
        EC.presence_of_element_located(locator)
    )
    if element is not None:
        element.click()


def scrape_url(url, csvWriter, restaurant, debug=False, driver_args = None):

    # Import the webdriver
    if driver_args is not None:
        driver = webdriver.Chrome(PATH, **driver_args)
    else:
        driver = webdriver.Chrome(PATH)        
    driver.get(url)

    try:
        reviews_div = WebDriverWait(driver, timeout=10).until(
            EC.presence_of_element_located((By.ID, "REVIEWS"))
        )

        try:
            num_pages_el = WebDriverWait(reviews_div, timeout=10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "pageNum.last"))
            )

            num_pages = int(num_pages_el.text)
            logger.info(f"Found a maximum of {num_pages} pages")
        except Exception as e:
            logger.error(
                "Could find maximum page. This page may only have one page of reviews")
            logger.error(e)
            num_pages = 10

        # change the value inside the range to save more or less reviews
        for i in range(0, num_pages):

            #     # expand the review
            more_span = "//span[@class='taLnk ulBlueLinks']"
            expand_locator = (By.XPATH, more_span)

            try:
                expand_reviews(expand_locator, base_element=reviews_div)
            except Exception as e:
                try:
                    expand_reviews(expand_locator, base_element=reviews_div)
                except:
                    logger.info("Could not expand reviews")

        #     # Wait for the expansion to take place

            # Make sure that all the reviews are loaded
            time.sleep(2)
            num_reviews_per_page = 10
            container = driver.find_elements_by_xpath(
                ".//div[@class='review-container']")

            if len(container) < num_reviews_per_page and i < (num_pages - 1):
                max_num_tries = 5
                num_tries = 0

                while len(container) != num_reviews_per_page and num_tries < max_num_tries:
                    time.sleep(1)
                    try:
                        container = driver.find_elements_by_xpath(
                            ".//div[@class='review-container']")
                    except:
                        pass
                    num_tries += 1

            for j in range(len(container)):
                title = container[j].find_element_by_xpath(
                    ".//span[@class='noQuotes']").text
                date = container[j].find_element_by_xpath(
                    ".//span[contains(@class, 'ratingDate')]").get_attribute("title")
                rating = container[j].find_element_by_xpath(
                    ".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
                try:
                    review = container[j].find_element_by_xpath(
                        ".//span[@class='postSnippet']").text.replace("\n", " ")
                except:
                    review = container[j].find_element_by_xpath(
                        ".//p[@class='partial_entry']").text.replace("\n", " ")

                if debug:
                    print([restaurant, date, rating, title, review])

                csvWriter.writerow([restaurant, date, rating, title, review, url, str(i+1)])

            if i < (num_pages-1):
                # change the page
                driver.find_element_by_xpath(
                    './/a[@class="nav next ui_button primary"]').click()
                time.sleep(random.randint(1, 4))
            else:
                logger.info("Reached end of page")

    except Exception as e:
        logger.error("Could not load this page")
        logger.error(e)

    driver.close()
    time.sleep(random.randint(1, 3))

In [41]:
df = pd.read_csv("../datasets/final_list_URL_TripAdvisor.csv", sep=";")
df.head()

Unnamed: 0,NYC_extract.DBA,NYC_extract.TripAdvisor.URL,NYC_extract.INSPECTION.DATE,NYC_extract.VIOLATION.DESCRIPTION,NYC_extract.SCORE,NYC_extract.GRADE
946,NOEL'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,11/02/2016,Facility not vermin proof. Harborage or condit...,14,
17922,WEST SIDE STEAKHOUSE,https://www.tripadvisor.com/Restaurant_Review-...,06/06/2017,Food not cooled by an approved method whereby ...,13,A
19584,BAKER'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,03/07/2017,Hot food item not held at or above 140Âº F.,24,
19648,BAKER'S PIZZA,https://www.tripadvisor.com/Restaurant_Review-...,03/07/2017,Filth flies or food/refuse/sewage-associated (...,24,
19791,PEE DEE,https://www.tripadvisor.com/Restaurant_Review-...,04/07/2017,Facility not vermin proof. Harborage or condit...,27,


In [42]:
idxs = df[['NYC_extract.TripAdvisor.URL']].drop_duplicates().index
df = df.loc[idxs, ['NYC_extract.TripAdvisor.URL', "NYC_extract.DBA"]]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
946,https://www.tripadvisor.com/Restaurant_Review-...,NOEL'S PIZZA
17922,https://www.tripadvisor.com/Restaurant_Review-...,WEST SIDE STEAKHOUSE
19584,https://www.tripadvisor.com/Restaurant_Review-...,BAKER'S PIZZA
19791,https://www.tripadvisor.com/Restaurant_Review-...,PEE DEE
19956,https://www.tripadvisor.com/Restaurant_Review-...,ABITINO'S PIZZA
...,...,...
223934,https://www.tripadvisor.com/Restaurant_Review-...,FAMOUS ORIGINAL RAY'S PIZZA
224460,https://www.tripadvisor.com/Restaurant_Review-...,PRONTO PIZZA
226121,https://www.tripadvisor.com/Restaurant_Review-...,TWO BOOTS
227187,https://www.tripadvisor.com/Restaurant_Review-...,NICK & STEF'S STEAKHOUSE


In [43]:
logs_lines = []

for path in os.listdir("./scraping_runs/"):
    logs_lines += open(f"./scraping_runs/{path}").readlines()
    
urls_read = set(l.replace("\n","") for l in logs_lines if l.startswith("http"))
pending_urls = set(df["NYC_extract.TripAdvisor.URL"].values) - urls_read
df = df[df["NYC_extract.TripAdvisor.URL"].isin(pending_urls)]
df

Unnamed: 0,NYC_extract.TripAdvisor.URL,NYC_extract.DBA
123245,https://www.tripadvisor.com/Restaurant_Review-...,LUZZO'S LA PIZZA NAPOLETANA
123263,https://www.tripadvisor.com/Restaurant_Review-...,IL CORTILE RESTAURANT
123270,https://www.tripadvisor.com/Restaurant_Review-...,GRADISCA RESTAURANT
123292,https://www.tripadvisor.com/Restaurant_Review-...,CACIO & PEPE
123301,https://www.tripadvisor.com/Restaurant_Review-...,PATRIZIAS OF BROOKLYN
...,...,...
223934,https://www.tripadvisor.com/Restaurant_Review-...,FAMOUS ORIGINAL RAY'S PIZZA
224460,https://www.tripadvisor.com/Restaurant_Review-...,PRONTO PIZZA
226121,https://www.tripadvisor.com/Restaurant_Review-...,TWO BOOTS
227187,https://www.tripadvisor.com/Restaurant_Review-...,NICK & STEF'S STEAKHOUSE


In [47]:
# default path to file to store data
path_to_file = "../datasets/scraped_final_list_URL_TripAdvisor.csv"

# Open the file to save the review
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

for i, row in df.iterrows():
    
    opts = Options()
    opts.add_argument('--no-sandbox')
    opts.add_argument('--headless')
    opts.add_argument('--disable-dev-shm-usage')
    driver_args = { "options" : opts }

    restaruant_name = row["NYC_extract.DBA"]
    url = row["NYC_extract.TripAdvisor.URL"]
    print(url)
    
    scrape_url(url, csvWriter, restaurant=restaruant_name, debug=False, 
               driver_args=driver_args)

https://www.tripadvisor.com/Restaurant_Review-g60763-d779482-Reviews-Luzzo_s-New_York_City_New_York.html


INFO:__main__:Found a maximum of 23 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d424422-Reviews-Il_Cortile_Restaurant-New_York_City_New_York.html


INFO:__main__:Found a maximum of 70 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d445197-Reviews-Gradisca-New_York_City_New_York.html


INFO:__main__:Found a maximum of 12 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d599003-Reviews-Cacio_e_Pepe-New_York_City_New_York.html


INFO:__main__:Found a maximum of 15 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: 



https://www.tripadvisor.com/Restaurant_Review-g48127-d18825582-Reviews-Patrizia_s-Maspeth_Queens_New_York.html


ERROR:__main__:Could find maximum page. This page may only have one page of reviews
ERROR:__main__:Message: 

ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d8424752-Reviews-Bono_Trattoria-New_York_City_New_York.html


INFO:__main__:Found a maximum of 5 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d2086556-Reviews-Sofia_s-New_York_City_New_York.html


INFO:__main__:Found a maximum of 36 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d423264-Reviews-Palma-New_York_City_New_York.html


INFO:__main__:Found a maximum of 38 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d17433839-Reviews-Memoria-New_York_City_New_York.html


ERROR:__main__:Could find maximum page. This page may only have one page of reviews
ERROR:__main__:Message: 

ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d479364-Reviews-Basta_Pasta-New_York_City_New_York.html


INFO:__main__:Found a maximum of 23 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: 



https://www.tripadvisor.com/Restaurant_Review-g60763-d1557580-Reviews-Trattoria_Casa_di_Isacco-New_York_City_New_York.html


INFO:__main__:Found a maximum of 23 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickCollapse',event,this);">...</span> is not clickable at point (188, 592). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d3962352-Reviews-Nica_Trattoria-New_York_City_New_York.html


INFO:__main__:Found a maximum of 5 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d423883-Reviews-Petaluma-New_York_City_New_York.html


INFO:__main__:Found a maximum of 10 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d7751468-Reviews-La_Pizza-New_York_City_New_York.html


INFO:__main__:Found a maximum of 8 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickExpand',event,this);">...</span> is not clickable at point (218, 598). Other element would receive the click: <p class="partial_entry">...</p>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d7392512-Reviews-Amata-New_York_City_New_York.html


INFO:__main__:Found a maximum of 6 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d7370236-Reviews-Gelso_Grand-New_York_City_New_York.html


INFO:__main__:Found a maximum of 49 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d534450-Reviews-Via_Italia-New_York_City_New_York.html


INFO:__main__:Found a maximum of 21 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickCollapse',event,this);">...</span> is not clickable at point (188, 592). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d457896-Reviews-Supper_Restaurant-New_York_City_New_York.html


INFO:__main__:Found a maximum of 23 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: 



https://www.tripadvisor.com/Restaurant_Review-g60763-d13393942-Reviews-Laboratorio329-New_York_City_New_York.html


INFO:__main__:Found a maximum of 3 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d423797-Reviews-Scalinatella_Restaurant-New_York_City_New_York.html


INFO:__main__:Found a maximum of 24 pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d12492884-Reviews-Fiaschetteria_Pistoia-New_York_City_New_York.html


INFO:__main__:Found a maximum of 4 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d14358488-Reviews-Joe_and_Pats_Pizzeria-New_York_City_New_York.html


INFO:__main__:Found a maximum of 3 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d459250-Reviews-Le_Zie-New_York_City_New_York.html


INFO:__main__:Found a maximum of 15 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d459109-Reviews-Via_Quadronno-New_York_City_New_York.html


INFO:__main__:Found a maximum of 25 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickCollapse',event,this);">...</span> is not clickable at point (189, 593). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d424586-Reviews-Il_Gattopardo-New_York_City_New_York.html


INFO:__main__:Found a maximum of 56 pages


https://www.tripadvisor.com/Restaurant_Review-g60763-d459365-Reviews-V_T_Pizzeria-New_York_City_New_York.html


INFO:__main__:Found a maximum of 13 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d423331-Reviews-Emilio_s_Ballato-New_York_City_New_York.html


INFO:__main__:Found a maximum of 29 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: stale element reference: element is not attached to the page document
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d459072-Reviews-Cara_Mia-New_York_City_New_York.html


INFO:__main__:Found a maximum of 45 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickCollapse',event,this);">...</span> is not clickable at point (188, 593). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d543319-Reviews-Posto-New_York_City_New_York.html


INFO:__main__:Found a maximum of 10 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: element click intercepted: Element <span class="taLnk ulBlueLinks" onclick="widgetEvCall('handlers.clickCollapse',event,this);">...</span> is not clickable at point (188, 592). Other element would receive the click: <div id="taplc_hotels_loading_box_rr_resp_0" class="ppr_rup ppr_priv_hotels_loading_box" data-placement-name="hotels_loading_box:rr_resp" style="display: block;">...</div>
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d479315-Reviews-Caffe_Buon_Gusto-New_York_City_New_York.html


INFO:__main__:Found a maximum of 8 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d2361016-Reviews-Frankies_Spuntino_570-New_York_City_New_York.html


INFO:__main__:Found a maximum of 14 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d4561725-Reviews-Original_Vincent_s-New_York_City_New_York.html


INFO:__main__:Found a maximum of 8 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: no such element: Unable to locate element: {"method":"xpath","selector":".//a[@class="nav next ui_button primary"]"}
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d424558-Reviews-Azalea_Ristorante-New_York_City_New_York.html


INFO:__main__:Found a maximum of 72 pages
ERROR:__main__:Could not load this page
ERROR:__main__:Message: stale element reference: element is not attached to the page document
  (Session info: headless chrome=89.0.4389.82)



https://www.tripadvisor.com/Restaurant_Review-g60763-d459628-Reviews-Delmonico_s-New_York_City_New_York.html


INFO:__main__:Found a maximum of 85 pages


KeyboardInterrupt: 