In [365]:
import chromedriver_binary
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import time
import numpy as np
import pandas as pd
import concurrent.futures
import multiprocessing
import pprint
import re
import datetime
import locale

locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')

'fr_FR.UTF-8'

In [411]:
def accessibility(browser):
    """
    if cookie banner and robot detection are present, remove them
    """
    
    # if are you humain popup appears, remove it
    try :
        btn_human = browser.find_element_by_css_selector('button [data-bui-ref="modal-close"]')
        btn_human.click()
    except NoSuchElementException:
         print('no popup to remove')
        
    # remove cookie banner because it takes almost 50% height of the page
    try:
        cookie_banner = browser.find_element_by_css_selector('#cookie_warning button')
        cookie_banner.click()
    except NoSuchElementException:
        print('no cookie banner to remove')

In [353]:
def do_query(url, query, browser):
    """
    get a page and realize a query
    """
    
    # get page
    browser.get(url)
    
    # let the JS charge a little in case you have cookie banners, 
    # robot detection scripts running
    time.sleep(1)
    accessibility(browser)
    
    # send query value
    search_input = browser.find_element_by_id('ss')
    search_input.send_keys(query)
    
    # btn submit
    btn_submit = browser.find_element_by_class_name('sb-searchbox__button')
    btn_submit.click()

In [453]:
def get_hotels_list_one_page(hotel_links_list, browser):
    """
    get all hotel links inside a webpage
    """
    time.sleep(2)
    accessibility(browser)
    
    # get hotellist
    hotel_list = browser.find_elements_by_css_selector('#hotellist_inner .sr_item') 
   
    for i in range(len(hotel_list)):
        try:
            hotel_links = browser.find_elements_by_css_selector('h3 .hotel_name_link')
            hotel_links_list.append(hotel_links[i].get_attribute('href'))
        except StaleElementReferenceException as e:
            print(e)
    
    return hotel_links_list

In [448]:
df = {
    'nom' : [],
    'pays' : [],
    'favorite' : [],
    'date': [],
    'titre' : [],
    'bons_points' : [],
    'mauvais_points' : [],
    'note' : [],
    'type_etablissement' : [],
    'lieu' : [],
    'note_etablissement': []
}

In [450]:
def get_comment_item_value(review, col):
    """
    we optimize the process of collecting the values of reviews elements (title, name, rating, etc..)
    """
    
    date_format = "%d %B %Y"
    
    
    items = {
        'column': ['nom', 'pays', 'favorite', 'date', 'titre', 'bons_points', 'mauvais_points', 'note'],
        'css_selector' : ['.bui-avatar-block__title', '.bui-avatar-block__subtitle', '.c-review-block__badge', '.c-review-block__date', '.c-review-block__title', '.c-review__inner:first-child .c-review__body', '.lalala .c-review__body', '.bui-review-score__badge'],
        'value_OK': [None, None, 1, None, None, None, None, None],
        'value_NOK': ['None', 'None', 0, datetime.datetime.strptime('01 Janvier 1970', date_format), 'None', 'None', 'None', 'None']
    }
    
    try:
        # we get the index of the column we are dealing with, so we can get the css selector and values OK/NOK
        col_idx = items['column'].index(col)
        
        # we assign the correct values to variables for easier manipulation
        css = items['css_selector'][col_idx]
        valueOK = items['value_OK'][col_idx]
        valueNOK = items['value_NOK'][col_idx]
        
        # we get the value from the webpage
        item = review.find_element_by_css_selector(css).text
            
        # in case we are dealing with the favorite (Choix de l'utilisateur) column
        # we don't the text, we just want to know that the user put it as favorite
        if valueOK is not None:
            item = valueOK
            
        # we have to get only the date inside the text
        if col == 'date':
            print(item)
            pattern = re.compile(r'\d{1,2}\s\w+\s\w{4}')
            result = pattern.search(item)
            date_str = result.group()
            item = datetime.datetime.strptime(date_str, DATE_FORMAT)
        
        # assign the value to the column in dataset
        df[col].append(item)
            
    except NoSuchElementException:
        # the item doesn't exist in the page, we put a default value
        df[col].append(valueNOK)

In [454]:
def details_comment(df, review, etablissement):
    columns = list(df.keys())
    
    # we stop after the 'note' column
    for col in columns[:8]:
        get_comment_item_value(review, col)
    
    # we collect also the data about the accomodation itself
    df['type_etablissement'].append(etablissement['type'])
    df['lieu'].append(etablissement['lieu'])
    df['note_etablissement'].append(etablissement['note'])

In [451]:
def get_comments(url):
    
    browser = webdriver.Chrome()
    browser.get(url)
    time.sleep(3) # we have to leave sometime to see if the Are you a robot popup shows up
    accessibility(browser)

    # open reviews panel
    try:
        btn_cmt = browser.find_element_by_id('show_reviews_tab')
        btn_cmt.click()
    except NoSuchElementException as e:
        print(e)
    
    # get info about accomodation
    try:
        etablissement = {
            'nom': browser.find_element_by_css_selector('.hp__hotel-name').text,
            'type' : browser.find_element_by_css_selector('.hp__hotel-name span').text,
            'note': browser.find_element_by_css_selector('.reviewFloater .bui-review-score__badge').get_attribute('innerHTML'), # sometimes it is hidden
            'lieu' : browser.find_element_by_css_selector('.sb-destination__input').get_attribute("value")
        }
    except NoSuchElementException as e:
        print(e)
        
    #get only french reviews
    btn_french = browser.find_element_by_css_selector('.language_filter .bui-input-checkbutton:first-child')
    btn_french.click()
    
    # it has to take into account the language change
    time.sleep(2)
    
    # get reviews_list
    nb_reviews = len(browser.find_elements_by_css_selector('[itemprop="review"]'))
    
    # we show each review and call the script to get its content
    for i in range(nb_reviews): 
        reviews = browser.find_elements_by_css_selector('[itemprop="review"]')
        review = reviews[i]
        
        # we scroll to the next review otherwise we won't be able to get the text content
        browser.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end", inline: "nearest"});', review)
        
        details_comment(df, review, etablissement)
        time.sleep(1)
    
    # close reviews panel
    close_btn = browser.find_element_by_css_selector('.sliding-panel-widget.is-shown .sliding-panel-widget-close-button')
    close_btn.click()
    
    browser.quit()
        
    # ========= DEBUG ONLY ============= #
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(df)
    data = pd.DataFrame(df)
    data.to_csv('booking.csv', index=False)
    return data
    # ========= END DEBUG ONLY ========== #

In [449]:
get_comments(hotel_list[0])

no popup to remove
Commentaire envoyé le 28 juin 2018
Commentaire envoyé le 22 mai 2020
Commentaire envoyé le 20 mai 2020
Commentaire envoyé le 19 mai 2020
Commentaire envoyé le 17 mai 2020
Commentaire envoyé le 15 mai 2020
Commentaire envoyé le 7 mai 2020
Commentaire envoyé le 1 avril 2020
Commentaire envoyé le 30 mars 2020
Commentaire envoyé le 26 mars 2020


Unnamed: 0,nom,pays,favorite,date,titre,bons_points,mauvais_points,note,type_etablissement,lieu,note_etablissement
0,Loshouarn,France,1,2018-06-28,Le confort comme à la maison !,J'ai énormément apprécié l'accueil et je me su...,,10,Hôtel,Paris,81
1,Isabelle,France,0,2020-05-22,Calme et confortable.,"Très bon accueil, Hatim le réceptionniste est ...",,80,Hôtel,Paris,81
2,Eva,Italie,0,2020-05-20,Accueil chaleureux,- Personnel irréprochable : surtout l’homme et...,- le rapport qualité prix pour la chambre supé...,80,Hôtel,Paris,81
3,François,France,0,2020-05-19,Très correct,"Merci a Hatim pour l'accueil, très appréciable...",,80,Hôtel,Paris,81
4,Fabienne,France,0,2020-05-17,Très bien,"L’accueil , la situation de l’établissement",La fenêtre qui ne fermait pas très bien,80,Hôtel,Paris,81
5,Alain,Maroc,0,2020-05-15,Très bien,"Le réceptionniste Algérien ,très compétent et ...",,80,Hôtel,Paris,81
6,Mondher,France,0,2020-05-07,Fabuleux,L’accueil,,90,Hôtel,Paris,81
7,Gilles,France,0,2020-04-01,hôtel parfait pour un court séjour,La chambre était très propre,Il y avait un peu de bruit dans les couloirs,90,Hôtel,Paris,81
8,Carine,France,0,2020-03-30,Très bien,"L'accueil, la chambre confortable, le prix (tr...",Rien,80,Hôtel,Paris,81
9,Ourdia,France,0,2020-03-26,Agréable,"On a tout aimé pendant notre séjours, l'accuei...",,80,Hôtel,Paris,81


In [408]:
def get_results(query):
    """
    get all reviews for accomadation for a query on Booking.com
    """
    # open booking.com and make a query
    browser = webdriver.Chrome()
    
    do_query('https://www.booking.com/', query, browser)
    
    hotel_links_list = []
    
    # loop over all results pages to get hotel links and comments
    while True:
        get_hotels_list_one_page(hotel_links_list, browser)
        time.sleep(1)
        
        try :
            next_btn = browser.find_element_by_css_selector('.bui-pagination__next-arrow:not(.bui-pagination__item--disabled) .bui-pagination__link')
            next_btn.click()
        except NoSuchElementException:
            break
        except StaleElementReferenceException as e:
            print(e)
            
    return hotel_links_list

In [396]:
hotel_list = get_results('Paris')
# since Booking doesn't open accomadation in the same tab, I preferred to get them all first
# then loop over the links to get the related comments

no popup to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to remove
no cookie banner to remove
no popup to rem

In [413]:
for link in hotel_list[:10]:
    get_comments(link)

df

no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove
no popup to remove


{'nom': ['Loshouarn',
  'Isabelle',
  'Eva',
  'François',
  'Fabienne',
  'Alain',
  'Mondher',
  'Gilles',
  'Carine',
  'Ourdia',
  'Loshouarn',
  'Isabelle',
  'Eva',
  'François',
  'Fabienne',
  'Alain',
  'Mondher',
  'Gilles',
  'Carine',
  'Ourdia',
  'Loshouarn',
  'Isabelle',
  'Eva',
  'François',
  'Fabienne',
  'Alain',
  'Mondher',
  'Gilles',
  'Carine',
  'Ourdia',
  'Loshouarn',
  'Isabelle',
  'Eva',
  'François',
  'Fabienne',
  'Alain',
  'Mondher',
  'Gilles',
  'Carine',
  'Ourdia',
  'Robert',
  'Peggy',
  'Thomas',
  'Marine',
  'Francois',
  'Julien',
  'Karine',
  'Lionel',
  'Francois',
  'Antoine',
  'Anne',
  'Briantais',
  'Christian',
  'Gregoire',
  'Nadine',
  'Anne-laure',
  'Marie-françoise',
  'Chloé',
  'Jean',
  'Soumia',
  'Jean-charles',
  'Sebastien',
  'Virginie',
  'Francois',
  'Chloé',
  'Jean-pierre',
  'Marie-christine',
  'Maria',
  'Eric',
  'Claude',
  'Nancy',
  'Anonyme',
  'Djamila',
  'Yoanka',
  'Diane',
  'Wefer',
  'Laura',
  'K

In [319]:
def create_dataset(df):
    # save data collected and save it to a CSV file
    data = pd.DataFrame(df)
    data.to_csv('booking.csv', index=False)