In [365]:
import chromedriver_binary
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import time
import numpy as np
import pandas as pd
import concurrent.futures
import multiprocessing
import pprint
import re
import datetime
import locale

# Switch the process to a French locale so that locale-dependent strptime
# directives such as %B match French month names
# (presumably the language of the scraped review dates - confirm).
locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [411]:
def accessibility(browser):
    """
    Dismiss the overlays that get in the way of scraping, when they are present.

    Two elements are looked up in order and clicked: the close button of the
    "are you human" modal, then the button of the cookie banner (which covers
    almost half of the page height). When one of them is not found, a short
    message is printed and processing continues.
    """
    overlays = (
        ('button [data-bui-ref="modal-close"]', 'no popup to remove'),
        ('#cookie_warning button', 'no cookie banner to remove'),
    )

    for selector, absent_message in overlays:
        try:
            browser.find_element_by_css_selector(selector).click()
        except NoSuchElementException:
            print(absent_message)

In [353]:
def do_query(url, query, browser):
    """
    Load ``url`` in ``browser``, type ``query`` into the search box and submit.

    url     -- address of the page to open
    query   -- text sent to the element whose id is 'ss'
    browser -- Selenium driver used for every interaction
    """
    browser.get(url)

    # Leave the page scripts a second to run (cookie banner, robot detection)
    # before trying to dismiss whatever overlay they produced.
    time.sleep(1)
    accessibility(browser)

    # Fill in the search field, then press the search button.
    browser.find_element_by_id('ss').send_keys(query)
    browser.find_element_by_class_name('sb-searchbox__button').click()

In [386]:
def get_hotels_list_one_page(hotel_links_list, browser):
    """
    Append the hotel links found on the current page to ``hotel_links_list``.

    The function waits two seconds, dismisses overlays, counts the result
    items of the page and then, for each position, re-reads the link elements
    and appends the ``href`` of the one at that position. A stale element is
    reported with ``print`` and skipped.

    Returns the same ``hotel_links_list`` object that was passed in.
    """
    time.sleep(2)
    accessibility(browser)

    # Only the number of result items is needed to drive the loop.
    result_count = len(browser.find_elements_by_css_selector('#hotellist_inner .sr_item'))

    position = 0
    while position < result_count:
        try:
            # The anchors are looked up again on every pass, as in the
            # original implementation.
            anchors = browser.find_elements_by_css_selector('h3 .hotel_name_link')
            hotel_links_list.append(anchors[position].get_attribute('href'))
        except StaleElementReferenceException as error:
            print(error)
        position += 1

    return hotel_links_list

In [367]:
# Column-oriented accumulator for the scraped reviews: one independent, initially
# empty list per column. Each collected review appends one value to every list.
df = {
    column: []
    for column in (
        'nom',
        'pays',
        'favorite',
        'date',
        'titre',
        'bons_points',
        'mauvais_points',
        'note',
        'type_etablissement',
        'lieu',
        'note_etablissement',
    )
}

In [271]:
def details_comment(df, review, etablissement):
    """
    Extract the fields of one review and append them to ``df`` as a new row.

    df            -- dict mapping column names to lists; mutated in place
    review        -- Selenium element wrapping a single review
    etablissement -- dict with the 'type', 'lieu' and 'note' of the accommodation

    Every value is gathered first and appended only once the whole review has
    been read, so a failure while reading leaves all the columns of ``df`` at
    the same length (a requirement for building a DataFrame from it later).

    The date is stored as a ``datetime.date``, or as ``None`` when the review's
    date text contains no "day month year" pattern. Month names are parsed with
    ``%B``, which follows the current locale.

    Raises NoSuchElementException when a mandatory element (name, country,
    date, title, score) is missing, and ValueError when the matched date text
    cannot be parsed under the current locale.
    """
    date_format = "%d %B %Y"

    def text_of(selector):
        # Text of the first element matching ``selector`` inside the review.
        return review.find_element_by_css_selector(selector).text

    def text_or_default(selector, default):
        # Same as text_of, but optional elements fall back to ``default``.
        try:
            return text_of(selector)
        except NoSuchElementException:
            return default

    row = {}
    row['nom'] = text_of('.bui-avatar-block__title')
    row['pays'] = text_of('span.bui-avatar-block__subtitle')

    # Only the presence of the badge matters, not its text.
    try:
        review.find_element_by_css_selector('.c-review-block__badge')
        row['favorite'] = 1
    except NoSuchElementException:
        row['favorite'] = 0

    # Keep only the "day month year" part of the date text.
    date_text = text_of('.c-review-block__date')
    match = re.search(r'\d{1,2}\s\w+\s\w{4}', date_text)
    if match is None:
        row['date'] = None
    else:
        row['date'] = datetime.datetime.strptime(match.group(), date_format).date()

    row['titre'] = text_of('.c-review-block__title')

    row['bons_points'] = text_or_default(
        '.c-review__inner:first-child .c-review__body',
        'Pas de commentaires positifs',
    )
    # NOTE(review): '.lalala' is presumably the class carried by the negative
    # part of the review in the page markup - confirm against the live page.
    row['mauvais_points'] = text_or_default(
        '.lalala .c-review__body',
        'Pas de commentaires négatifs',
    )

    row['note'] = text_of('.bui-review-score__badge')

    row['type_etablissement'] = etablissement['type']
    row['lieu'] = etablissement['lieu']
    row['note_etablissement'] = etablissement['note']

    # Everything was read successfully: commit the row.
    for column, value in row.items():
        df[column].append(value)

In [412]:
def get_comments(url):
    """
    Open the accommodation page at ``url`` and store its reviews in ``df``.

    A dedicated Chrome instance is started for the page and is always closed
    when the function exits, including when a lookup fails part-way through.
    Each review is handed to ``details_comment``, which appends one row to the
    module-level ``df`` dictionary.

    url -- address of the accommodation page

    Raises NoSuchElementException when one of the expected page elements
    (reviews tab, hotel name, score, language filter, close button) is missing.
    """
    browser = webdriver.Chrome()

    try:
        # get page
        browser.get(url)
        time.sleep(2)
        accessibility(browser)

        # open reviews panel
        browser.find_element_by_id('show_reviews_tab').click()

        # get info about location
        etablissement = {
            'nom': browser.find_element_by_css_selector('.hp__hotel-name').text,
            'type': browser.find_element_by_css_selector('.hp__hotel-name span').text,
            # read through innerHTML because the badge is sometimes hidden
            'note': browser.find_element_by_css_selector(
                '.reviewFloater .bui-review-score__badge'
            ).get_attribute('innerHTML'),
            'lieu': browser.find_element_by_css_selector(
                '.sb-destination__input'
            ).get_attribute("value"),
        }

        time.sleep(1)

        # keep only French reviews (first button of the language filter)
        browser.find_element_by_css_selector(
            '.language_filter .bui-input-checkbutton:first-child'
        ).click()

        # leave time for the language change to be applied
        time.sleep(2)

        review_count = len(browser.find_elements_by_css_selector('[itemprop="review"]'))

        for i in range(review_count):
            # Look the reviews up again on every pass instead of reusing
            # element references obtained earlier.
            reviews = browser.find_elements_by_css_selector('[itemprop="review"]')
            if i >= len(reviews):
                # The list shrank since it was counted: nothing left to read.
                break
            review = reviews[i]

            browser.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end", inline: "nearest"});', review)

            # collect details about the review and store them in the dictionary
            details_comment(df, review, etablissement)
            time.sleep(2)

        browser.find_element_by_css_selector(
            '.sliding-panel-widget.is-shown .sliding-panel-widget-close-button'
        ).click()
    finally:
        # Never leave a Chrome process behind, even after an error.
        browser.quit()

In [408]:
def get_results(query):
    """
    Collect the accommodation links returned by Booking.com for ``query``.

    A Chrome instance is started, the query is submitted on the home page and
    the result pages are walked through with the "next" arrow until it is no
    longer found. The browser is always closed before returning, including
    when an error interrupts the crawl.

    query -- search text typed into the Booking.com search box

    Returns the list of hotel URLs in the order they were first seen. A page
    can be read twice when clicking a stale "next" button fails, so duplicates
    are removed.
    """
    # open booking.com and make a query
    browser = webdriver.Chrome()

    try:
        do_query('https://www.booking.com/', query, browser)

        hotel_links_list = []

        while True:
            get_hotels_list_one_page(hotel_links_list, browser)
            time.sleep(1)

            try:
                next_btn = browser.find_element_by_css_selector('.bui-pagination__next-arrow:not(.bui-pagination__item--disabled) .bui-pagination__link')
                next_btn.click()
            except NoSuchElementException:
                # No enabled "next" arrow: this was the last page.
                break
            except StaleElementReferenceException as e:
                # The button was replaced under us; report it and try again.
                print(e)
    finally:
        # Never leave a Chrome process behind, even after an error.
        browser.quit()

    # dict.fromkeys drops duplicates while preserving first-seen order.
    return list(dict.fromkeys(hotel_links_list))

In [396]:
hotel_list = get_results('Paris')
# Since Booking doesn't open an accommodation in the same tab, all the links are
# collected first; the related comments are then fetched by looping over them.

no popup to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to rem

In [None]:
# Scrape the reviews of the first 10 collected links only; each call to
# get_comments appends the reviews it reads to the module-level `df` dictionary.
for link in hotel_list[:10]:
    get_comments(link)

# Display the accumulated review data.
df

no popup to remove
no popup to remove
no popup to remove
no popup to remove


In [319]:
def create_dataset(df, path='booking.csv'):
    """
    Save the collected data to a CSV file.

    df   -- mapping of column name to list of values; every list must have the
            same length, otherwise pandas raises ValueError
    path -- destination of the CSV file (defaults to 'booking.csv' in the
            current working directory)

    The index is not written to the file. Returns None.
    """
    data = pd.DataFrame(df)
    data.to_csv(path, index=False)