In [31]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException
import time
import numpy as np
import pandas as pd
import concurrent.futures
import multiprocessing
import pprint
import re
import datetime
import locale
from selenium.webdriver.chrome.options import Options 
import chromedriver_binary

In [32]:
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
chrome_options = Options()  
chrome_options.add_argument("--headless")  

In [33]:
browser = webdriver.Chrome(options=chrome_options)
browser.get("https://github.com")

In [34]:
def accessibility(browser):
    """
    if cookie banner and robot detection are present, remove them
    """
    
    # if the are you human popup appears, remove it
    try :
        robot_detection = browser.find_element_by_id('botdetect_abu_nip__overlay')
        return False
    except NoSuchElementException:
        pass
    
    # remove cookie banner because it takes almost 50% height of the page
    try:
        cookie_banner = browser.find_element_by_css_selector('#cookie_warning button')
        cookie_banner.click()
    except NoSuchElementException:
        pass
    
    return True

In [35]:
def do_query(url, query, browser):
    """
    get a page and realize a query
    """
    
    # get page
    browser.get(url)
    
    # let the JS charge a little in case you have cookie banners, 
    # robot detection scripts running
    time.sleep(2)
    robot_detection = accessibility(browser)
    
    if not robot_detection :
        return robot_detection # we have to do the query again
    
    # send query value
    search_input = browser.find_element_by_id('ss')
    search_input.send_keys(query)
    
    # btn submit
    btn_submit = browser.find_element_by_class_name('sb-searchbox__button')
    btn_submit.click()
    return True

In [36]:
def get_hotels_list_one_page(browser):
    """
    get all hotel links inside a webpage
    """
    time.sleep(3)
    
    while True:
        result = accessibility(browser)
        
        if result : #it means there is no issue upon loading, we can continue
            break
        else :
            browser.quit()
    
    # get hotellist
    hotel_list = browser.find_elements_by_css_selector('#hotellist_inner .sr_item') 
    
    hotel_links_list = []
    for i in range(len(hotel_list)):
        try:
            hotel_list = browser.find_elements_by_css_selector('#hotellist_inner .sr_item') 
            has_rating = hotel_list[i].find_elements_by_css_selector('.bui-review-score__badge')
            # print(has_rating)
            # print(len(has_rating))
            if len(has_rating) > 0 :
                # hotel_links = browser.find_elements_by_css_selector('h3 .hotel_name_link')
                link = hotel_list[i].find_element_by_css_selector('h3 .hotel_name_link').get_attribute('href')
                hotel_links_list.append(link)
        except StaleElementReferenceException as e:
            print(e)
            
    return hotel_links_list

In [37]:
# browser = webdriver.Chrome()
# browser = webdriver.Firefox(executable_path=r"/Users/DARE/Documents/geckodriver")
# browser.get('https://www.booking.com/searchresults.fr.html?aid=1610680&label=marseille-dehaav7Ex4WDoxSLFYlrjgS379617115837%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap%3Aneg%3Afi%3Atikwd-553829793%3Alp9056143%3Ali%3Adec%3Adm%3Appccp%3DUmFuZG9tSVYkc2RlIyh9YURcq_26dhSxO_kD28P4Rwg&lang=fr&sid=1f08e1d7b9a399a0cf3d0b1ac8c1b4d3&sb=1&sb_lp=1&src=city&src_elem=sb&error_url=https%3A%2F%2Fwww.booking.com%2Fcity%2Ffr%2Fmarseille.fr.html%3Faid%3D1610680%3Blabel%3Dmarseille-dehaav7Ex4WDoxSLFYlrjgS379617115837%253Apl%253Ata%253Ap1%253Ap2%253Aac%253Aap%253Aneg%253Afi%253Atikwd-553829793%253Alp9056143%253Ali%253Adec%253Adm%253Appccp%253DUmFuZG9tSVYkc2RlIyh9YURcq_26dhSxO_kD28P4Rwg%3Bsid%3D1f08e1d7b9a399a0cf3d0b1ac8c1b4d3%3Binac%3D0%26%3B&ss=Marseille&is_ski_area=0&ssne=Marseille&ssne_untouched=Marseille&city=-1449947&checkin_year=&checkin_month=&checkout_year=&checkout_month=&group_adults=2&group_children=0&no_rooms=1&b_h4u_keep_filters=&from_sf=1')
# links = get_hotels_list_one_page(browser)

In [38]:
def test_robot(url):
    browser = webdriver.Chrome()
    # browser = webdriver.Firefox(executable_path=r"/Users/DARE/Documents/geckodriver")
    
    browser.get(url)
    time.sleep(3)
    res = accessibility(browser)
    if not res :
        print('robot page')
    browser.quit()

In [39]:
def get_comment_item_value(review, col):
    """
    we optimize the process of collecting the values of reviews elements (title, name, rating, etc..)
    """
    
    date_format = "%d %B %Y"
    
    items = {
        'column': ['nom', 'pays', 'favorite', 'date', 'titre', 'bons_points', 'mauvais_points', 'note'],
        'css_selector' : ['.bui-avatar-block__title', '.bui-avatar-block__subtitle', '.c-review-block__badge', '.c-review-block__date', '.c-review-block__title', '.c-review__inner:first-child .c-review__body', '.lalala .c-review__body', '.bui-review-score__badge'],
        'value_OK': [None, None, 1, None, None, None, None, None],
        'value_NOK': ['None', 'None', 0, datetime.datetime.strptime('01 Janvier 1970', date_format), 'None', 'None', 'None', 'None']
    }
    
    try:
        # we get the index of the column we are dealing with, so we can get the css selector and values OK/NOK
        col_idx = items['column'].index(col)
        
        # we assign the correct values to variables for easier manipulation
        css = items['css_selector'][col_idx]
        valueOK = items['value_OK'][col_idx]
        valueNOK = items['value_NOK'][col_idx]
        
        # we get the value from the webpage
        item = review.find_element_by_css_selector(css).text
            
        # in case we are dealing with the favorite (Choix de l'utilisateur) column
        # we don't need the text, we just want to know that the user put it as favorite
        if valueOK is not None:
            item = valueOK
            
        # we have to get only the date inside the text
        if col == 'date':
            pattern = re.compile(r'\d{1,2}\s\w+\s\w{4}')
            result = pattern.search(item)
            date_str = result.group()
            item = datetime.datetime.strptime(date_str, date_format)
        
        return item
            
    except NoSuchElementException:
        # the item doesn't exist in the page, we put a default value
        return valueNOK

In [40]:
def details_comment(review, etablissement, cols):
    new_row = []
    
    # we stop after the 'note' column
    for col in cols[:8]:
        new_row.append(get_comment_item_value(review, col))
    
    # we collect also the data about the accomodation itself
    new_row.append(etablissement['type'])
    new_row.append(etablissement['lieu'])
    new_row.append(etablissement['note'])
    
    return new_row

In [41]:
def get_info_comments(browser, etablissement):
    """
    comments can be displayed on several pages. we want max 200 reviews per hotel
    """
    
    # create new dataframe
    cols = ['nom', 'pays', 'favorite', 'date', 'titre', 'bons_points', 'mauvais_points', 'note', 'type_etablissement', 'lieu', 'note_etablissement']
    data = pd.DataFrame([], columns=cols)
    
    count = 0
    
    while True:
        time.sleep(2) # let the DOM load
        nb_reviews = len(browser.find_elements_by_css_selector('.review_list .review_list_new_item_block'))
        
        # we show each review and call the script to get its content
        for i in range(nb_reviews): 
            reviews = browser.find_elements_by_css_selector('.review_list .review_list_new_item_block')
            review = reviews[i]
        
            # we scroll to the next review otherwise we won't be able to get the text content
            browser.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end", inline: "nearest"});', review)
        
            # we add a new comment
            try:
                data.loc[len(data.index)] = details_comment(review, etablissement, cols)
            except:
                raise
            
            count += 1
            time.sleep(1)
            
        # when we are done with a page of comments, we check if we have at least 50 comments
        # if it is the case, we go to the next accomodation
        if count >= 100:
            break 
            
        try :
            next_btn = browser.find_element_by_css_selector('#review_list_score_container .bui-pagination__next-arrow:not(.bui-pagination__item--disabled) a')
            next_btn.click()
        except NoSuchElementException:
            # there is no more comments to load
            print('no more comments to load')
            break
        except StaleElementReferenceException as e:
            print(e)
    
    try:
        backup = pd.read_csv('backup.csv')
    except FileNotFoundError:
        backup = pd.DataFrame([], columns=cols)
        
    new_backup = pd.concat([backup, data])
    new_backup.to_csv('backup.csv', index=False)
    return data

In [42]:
def get_hotel_page(url):
    """
    get hotel page and open comments section
    """
    
    browser = webdriver.Chrome()
    # browser = webdriver.Firefox(executable_path=r"/Users/DARE/Documents/geckodriver")
    browser.get(url)
    time.sleep(3) # we have to leave some time to see if the Are you a robot popup shows up
    
    while True:
        loading = accessibility(browser)
        
        if loading :
            break
        else :
            browser.quit()
            return False
    
    
    # open reviews panel
    try:
        btn_cmt = browser.find_element_by_id('show_reviews_tab')
        btn_cmt.click()
    except NoSuchElementException as e:
        print(e)
        return False
    except ElementClickInterceptedException as e:
        print('already open')
        
    time.sleep(1) 
        
    #get only french reviews
    try:
        # btn_french = browser.find_element_by_css_selector('.language_filter .bui-input-checkbutton:first-child')
        btn_french = browser.find_element_by_css_selector('.language_filter_checkbox[value="fr"] + span')
        btn_french.click()
    except NoSuchElementException :
        # there is no review - we continue with another accomodation
        print('no review')
        browser.quit()
        return 'No review'
    except ElementClickInterceptedException:
        # if the cookie btn is still here because the computer is slow, remove it
        print('still cookie btn')
        try:
            cookie_banner = browser.find_element_by_css_selector('#cookie_warning button:not([data-track-event="m_cookie_warning_closed])')
            cookie_banner.click()
        except NoSuchElementException:
            print('no cookie btn, unexpected error')
            return False
            pass
        
    
    # it has to take into account the language change
    time.sleep(2)
    
    # get info about accomodation
    try:
        etablissement = {
            'nom': browser.find_element_by_css_selector('.hp__hotel-name').text,
            'type' : browser.find_element_by_css_selector('.hp__hotel-name span').text,
            'note': browser.find_element_by_css_selector('.reviewFloater .bui-review-score__badge').get_attribute('innerHTML'), # sometimes it is hidden
            'lieu' : browser.find_element_by_css_selector('.sb-destination__input').get_attribute("value")
        }
    except NoSuchElementException as e:
        print(e, 'no etablissement listed')
        return False
        
    # get reviews_list
    data = get_info_comments(browser, etablissement)
    
    # close reviews panel
    close_btn = browser.find_element_by_css_selector('.sliding-panel-widget.is-shown .sliding-panel-widget-close-button')
    close_btn.click()
    
    browser.quit()
    return data

In [43]:
# debug the function above still some issue
# url = 'https://www.booking.com/hotel/fr/campaniletoulousecitedelespace.fr.html?label=gen173nr-1FCAEoggI46AdIDVgEaE2IAQGYAQ24AQfIAQzYAQHoAQH4AQKIAgGoAgO4AvOlvvYFwAIB;sid=de02c128049b9f98f7f79db7f5cb2658;dest_id=-1473166;dest_type=city;dist=0;group_adults=2;group_children=0;hapos=1;hpos=1;no_rooms=1;room1=A%2CA;sb_price_type=total;sr_order=popularity;srepoch=1590661883;srpvid=0fa849fd83d200e1;type=total;ucfs=1&#tab-reviews'
# url1 = 'https://www.booking.com/hotel/fr/campaniletoulousecitedelespace.fr.html'
# get_hotel_page(url1)

In [44]:
def retry(func, arg):
    while True : # we repeat the process in case we have a Are you human popup.
        comments_one_location = func(arg)
                
        if isinstance(comments_one_location, pd.core.frame.DataFrame) or comments_one_location: 
            break 
            
    return comments_one_location

In [45]:
all_hotels_links = []
# in case, there is a stop in the scrap code, I can resume from the links saved

In [46]:
def get_results(query):
    """
    get all reviews for accomadation for a query on Booking.com
    """
    
    cols = ['nom', 'pays', 'favorite', 'date', 'titre', 'bons_points', 'mauvais_points', 'note', 'type_etablissement', 'lieu', 'note_etablissement']
    all_comments = pd.DataFrame([], columns=cols)
    
    # open booking.com and make a query
    browser = webdriver.Chrome()
    # browser = webdriver.Firefox(executable_path=r"/Users/DARE/Documents/geckodriver")
    
    # if we are asked if we are human, we close the window and open a new one
    loading = None
   
    while True:
        loading = do_query('https://www.booking.com/index.fr.html', query, browser)
        
        if loading:
            break
        else :
            browser.quit()
    
    # loop over all results pages to get hotel links
    hotel_links_list = []
    while True:
        hotel_links_list = hotel_links_list + get_hotels_list_one_page(browser)
        time.sleep(1)
        
        try :
            next_btn = browser.find_element_by_css_selector('.bui-pagination__next-arrow:not(.bui-pagination__item--disabled) .bui-pagination__link')
            next_btn.click()
        except NoSuchElementException:
            print('no more results') # no more results
            break
        except StaleElementReferenceException as e:
            print(e)
    
    browser.quit() # we have collected all hotel links, we can close the browser
        
        
    # all_links = all_links + hotel_links_list # backup incase the following code doesn't go as planned
    
    # we get comments per location and concatenate with previous results
    for hotel_link in hotel_links_list:
        comments_one_location = retry(get_hotel_page, hotel_link)
            
        if isinstance(comments_one_location, pd.core.frame.DataFrame):
            try:
                all_comments = pd.concat([all_comments, comments_one_location], axis=0, ignore_index=True)
            except:
                print('unexpected error')
                
    all_comments.to_csv(f'{query}.csv', index=False)
    return all_comments # even if we have errors, we must have an exported dataset if possible

In [47]:
# get_results('Marseille', all_hotels_links)

In [48]:
# dataset

In [49]:
# multiprocessing

with multiprocessing.Pool() as pool:
    datasets = pool.map(get_results, ['Nantes', 'Lyon', 'Nice'])

Process ForkPoolWorker-7:
Process ForkPoolWorker-8:
Process ForkPoolWorker-5:
Process ForkPoolWorker-6:


KeyboardInterrupt: 

In [None]:
df_gen = pd.concat(datasets, axis=0, ignore_index=True)
df_gen.to_csv('booking.csv', index=False)