In [65]:

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException, ElementClickInterceptedException, ElementNotInteractableException
from selenium.webdriver.support.ui import Select
import time
import numpy as np
import pandas as pd
import concurrent.futures
import multiprocessing
import pprint
import re
import datetime
import locale
from selenium.webdriver.chrome.options import Options 
import chromedriver_binary
from stopit import SignalTimeout as Timeout
from stopit import TimeoutException

In [66]:
# The review dates are parsed with strptime("%d %B %Y"); "%B" follows the
# current locale, so the French locale must be active to read French month names.
locale.setlocale(category=locale.LC_ALL, locale='fr_FR.UTF-8')

'fr_FR.UTF-8'

In [67]:
# Chrome options used by every browser instance created in reload_page
chrome_options = Options()

# window geometry flags
for chrome_flag in ("--window-size=1920,1080", "--start-maximized"):
    chrome_options.add_argument(chrome_flag)

# headless mode is disabled; uncomment the next line to enable it
# chrome_options.add_argument("--headless")

## Utilities

In [68]:
def make_data_persistent(backup_link, cols, new_data):
    """
    Append rows to a csv backup file, creating the file when it does not exist.

    Parameters
    ----------
    backup_link : str or path-like
        Path of the csv file used as backup.
    cols : list of str
        Column names. Used to create an empty backup when the file is missing
        or empty, and to build a DataFrame when `new_data` is not one already.
    new_data : pandas.DataFrame or list of rows
        Rows to append. Anything that is not a DataFrame is converted with
        `pd.DataFrame(new_data, columns=cols)`.

    Raises
    ------
    ValueError
        Propagated from pandas when `new_data` cannot be shaped into `cols`.
    """

    try:
        backup = pd.read_csv(backup_link)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # no backup yet, or a zero-byte file (e.g. left by an interrupted write):
        # start from an empty table instead of crashing
        backup = pd.DataFrame([], columns=cols)

    # if new_data isn't a dataframe, we create a DataFrame from it
    if not isinstance(new_data, pd.DataFrame):
        new_data = pd.DataFrame(new_data, columns=cols)

    new_backup = pd.concat([backup, new_data])
    new_backup.to_csv(backup_link, index=False)

In [69]:
def accessibility(browser):
    """
    tell whether the loaded page can be used, closing the cookie banner on the way

    returns False as soon as the robot detection overlay is found (nothing is
    clicked in that case), otherwise clicks the cookie banner button when there
    is one and returns True
    """

    # look for the "are you human" popup
    try:
        browser.find_element_by_id('botdetect_abu_nip__overlay')
    except NoSuchElementException:
        overlay_found = False
    else:
        overlay_found = True

    if overlay_found:
        return False

    # the cookie banner takes almost 50% height of the page, so we close it
    try:
        browser.find_element_by_css_selector('#cookie_warning button').click()
    except NoSuchElementException:
        pass  # no banner on this page

    return True

In [70]:
def reload_page(url):
    """
    open `url` in a new Chrome instance and retry until the page is usable

    a page is considered usable when the site logo is found and
    accessibility() returns True. Every failed attempt quits its browser
    before a new one is started. There is no retry limit.

    Parameters
    ----------
    url : str
        address to load

    Returns
    -------
    the selenium browser showing the loaded page; the caller has to quit it
    """

    while True:
        browser = None
        page_ready = False

        try:
            browser = webdriver.Chrome(options=chrome_options)
            browser.get(url)
            time.sleep(3) # let the DOM load

            # if it can't get the logo, it means the page isn't loaded
            browser.find_element_by_css_selector('#logo_no_globe_new_logo')

            # False when the robot detection overlay is displayed
            page_ready = accessibility(browser)
        except Exception as e:
            # `Exception` (not a bare except) so that Ctrl-C still stops the loop
            print(e, 'page not loaded, retrying')
        finally:
            # quit() (not close()) so the chromedriver process is ended too;
            # `browser` is None when Chrome itself could not be started
            if not page_ready and browser is not None:
                try:
                    browser.quit()
                except Exception:
                    pass # the session is already gone, nothing left to release

        if page_ready:
            return browser

## Functions to get the hotel links

In [71]:
def get_hotels_links_by_page(browser, query):
    """
    get the links of all hotels with reviews inside a results page

    links are shortened (everything from the last '?' is dropped) and the ones
    already stored in `backup_hotel_links_{query}.csv` are skipped. The new
    links are appended to that backup file.

    Parameters
    ----------
    browser : selenium browser showing a results page
    query : str
        search query, used to name the backup file

    Returns
    -------
    list of [short_link, 0] rows, one per new hotel (0 = not scraped yet)
    """

    backup_file = f'backup_hotel_links_{query}.csv'

    # we can compare it to a previous backup if we have it
    try:
        known_links = set(pd.read_csv(backup_file)['link'])
    except Exception:
        known_links = set()
        print('no backup file')

    # we want the short version of the link to save space in the csv
    short_link_pattern = re.compile(r'(.+)\?')

    hotel_links_array = []

    # get link for each hotel in the page
    for hotel in browser.find_elements_by_css_selector('#hotellist_inner .sr_item'):

        try:
            # we only want hotel with reviews
            if not hotel.find_elements_by_css_selector('.bui-review-score__badge'):
                continue

            link = hotel.find_element_by_css_selector('h3 .hotel_name_link').get_attribute('href')
        except StaleElementReferenceException as e:
            print(e)
            continue

        if not link:
            continue # anchor without href

        # a link without query string is already in its short form
        result = short_link_pattern.match(link)
        short_link = result.group(1) if result else link

        # the backup stores short links, so the comparison has to use the short form
        if short_link in known_links:
            continue

        known_links.add(short_link) # also avoids duplicates inside the page
        hotel_links_array.append([short_link, 0])

    # persistent data
    make_data_persistent(backup_file, ['link', 'has_been_scrapped'], hotel_links_array)

    return hotel_links_array

In [72]:
def get_all_hotel_links(browser, query):
    """
    loop through all results pages to get hotel links to scrape

    Parameters
    ----------
    browser : selenium browser showing the first results page
    query : str
        search query, forwarded to get_hotels_links_by_page

    Returns
    -------
    flat list of hotel links, without duplicates, in the order they were found
    """

    all_links = []
    seen_links = set()

    while True:
        time.sleep(2) # to prevent stale elements
        try :
            # each row is [link, has_been_scrapped]; we only keep the link
            for row in get_hotels_links_by_page(browser, query):
                # after a stale element error the same page is read again,
                # so a link may come back a second time
                if row[0] not in seen_links:
                    seen_links.add(row[0])
                    all_links.append(row[0])

            next_btn = browser.find_element_by_css_selector('.bui-pagination__next-arrow:not(.bui-pagination__item--disabled) .bui-pagination__link')
            next_btn.click()
        except NoSuchElementException:
            print('no more results') # no more results
            break
        except StaleElementReferenceException as e:
            print(e) # the page is read again on the next iteration

    return all_links

## Functions to fetch comments

In [73]:
def get_value_for_comment_item(review, col):
    """
    get the value of one element of a review (title, name, rating, etc..)

    Parameters
    ----------
    review : selenium element of the review block
    col : str
        one of 'nom', 'pays', 'favorite', 'date', 'titre', 'bons_points',
        'mauvais_points', 'note'

    Returns
    -------
    - 'favorite': 1 when the badge is found with a text, 0 otherwise
    - 'date': a datetime.datetime, or 'None' when no date can be read
    - other columns: the text of the element, or 'None' when it is not found

    Raises
    ------
    ValueError
        when `col` is not one of the known columns.
    Other selenium exceptions are not caught here: they bubble up to the caller.
    """

    # NOTE(review): `.lalala` is presumably the class set on the negative points
    # row of a review - confirm against the page markup
    css_selectors = {
        'nom': '.bui-avatar-block__title',
        'pays': '.bui-avatar-block__subtitle',
        'favorite': '.c-review-block__badge',
        'date': '.c-review-block__date',
        'titre': '.c-review-block__title',
        'bons_points': '.c-review__row:not(.lalala) .c-review__body',
        'mauvais_points': '.lalala .c-review__body',
        'note': '.bui-review-score__badge',
    }

    if col not in css_selectors:
        raise ValueError(f'{col!r} is not a known review column')

    try:
        # we get the value related to the column
        item = review.find_element_by_css_selector(css_selectors[col]).text
    except NoSuchElementException: # in case the item can't be fetched
        return 0 if col == 'favorite' else 'None'

    # in case we are dealing with the Choix de l'utilisateur / favorite column
    # we don't need the text, only a 0/1 flag (never an empty string)
    if col == 'favorite':
        return 1 if item else 0

    # the date is embedded in a sentence, we only need the 'DD month YYYY' part
    if col == 'date':
        result = re.search(r'\d{1,2}\s\w+\s\d{4}', item)
        if result is None:
            return 'None'
        try:
            # %B follows the current locale
            return datetime.datetime.strptime(result.group(), "%d %B %Y")
        except ValueError: # month name unknown in the current locale
            return 'None'

    return item

In [74]:
def get_comment_data(review, etablissement, cols):
    """
    build one row of data for a review

    the row is made of the values read in the review block for the first
    8 names of `cols`, followed by the 'type', 'lieu' and 'note' entries of
    `etablissement` (the accommodation the review belongs to)
    """

    # values found in the review block itself
    review_cells = [get_value_for_comment_item(review, name) for name in cols[:8]]

    # values describing the accommodation
    etablissement_cells = [etablissement[key] for key in ('type', 'lieu', 'note')]

    return review_cells + etablissement_cells

In [75]:
def get_comments(browser, etablissement, query, link, max_comments=300):
    """
    collect the comments of one hotel and save them

    comments can be displayed on several pages: pages are read one after the
    other until `max_comments` comments are collected or there is no next page.
    The comments are appended to `backup_booking_{query}.csv`, then the hotel is
    flagged as done in `backup_hotel_links_{query}.csv`.

    Parameters
    ----------
    browser : selenium browser with the comments panel open
    etablissement : dict
        data about the accommodation, forwarded to get_comment_data
    query : str
        search query, used to name the backup files
    link : str
        link of the hotel as stored in the hotel links backup
    max_comments : int
        maximum number of comments collected for the hotel (default 300)
    """

    # create new dataframe to save the comments into a backup csv file
    cols = ['nom', 'pays', 'favorite', 'date', 'titre', 'bons_points', 'mauvais_points', 'note', 'type_etablissement', 'lieu', 'note_etablissement']
    data = pd.DataFrame([], columns=cols)

    while True:
        time.sleep(2) # wait till booking get the following comments

        # we get each review and call the script to get its content
        for review in browser.find_elements_by_css_selector('.review_list .review_list_new_item_block'):

            # stop in the middle of a page so the limit is never exceeded
            if len(data.index) >= max_comments:
                break

            # scroll to the review
            browser.execute_script('arguments[0].scrollIntoView({behavior: "smooth", block: "end", inline: "nearest"});', review)

            # we add the content of the review
            data.loc[len(data.index)] = get_comment_data(review, etablissement, cols)

        # when we are done with a section of comments, we check if we have enough comments
        if len(data.index) >= max_comments:
            break

        # otherwise, we open a new comments panel
        try :
            next_btn = browser.find_element_by_css_selector('#review_list_score_container .bui-pagination__next-arrow:not(.bui-pagination__item--disabled) a')
            next_btn.click()
        except NoSuchElementException:
            print('no more comments to load')
            break

    # we save the scraped comments in a backup csv file
    make_data_persistent(f'backup_booking_{query}.csv', cols, data)

    # update the hotel_links list of the query, so when we have to scrape again we can resume at the right spot
    links_file = f'backup_hotel_links_{query}.csv'
    try:
        backup_links = pd.read_csv(links_file)
    except FileNotFoundError as e:
        print(e)
    else:
        mask = backup_links['link'] == link
        backup_links.loc[mask, 'has_been_scrapped'] = 1
        backup_links.to_csv(links_file, index=False)

In [76]:
def open_comments_panel(browser):
    """
    open the comments section of a hotel page, keep only French comments
    and read the data about the accommodation

    Parameters
    ----------
    browser : selenium browser showing a hotel page

    Returns
    -------
    False when there is no reviews tab or no French filter on the page,
    otherwise a dict with the keys 'nom', 'type', 'note' and 'lieu'. A value
    is the string 'None' when its element is not found on the page.
    """

    # open reviews panel
    time.sleep(2)
    try:
        btn_cmt = browser.find_element_by_id('show_reviews_tab')
        btn_cmt.click()
    except NoSuchElementException as e:
        print(e, 'there is no review')
        return False
    except ElementClickInterceptedException as e:
        print(e, 'already open') # if the webpage has already been visited by us

    #get only french reviews
    time.sleep(2) # wait a little bit till the checkbox is available
    try:
        btn_french = browser.find_element_by_css_selector('.language_filter_checkbox[value="fr"] + span')
        btn_french.click()
    except NoSuchElementException as e:
        print(e, 'there is no french review')
        return False

    # get only bad comments - uncomment if you want only bad comments
    # try:
        # select = Select(browser.find_element_by_id('review_score_filter'))
        # select.select_by_index(4)
    # except NoSuchElementException as e:
        # print('no bad comment')

    # it has to take into account the language change
    time.sleep(2)

    def read_value(css, attribute=None):
        # text (or attribute) of the element, 'None' when the element is missing;
        # each value is read on its own so one missing element doesn't erase the others
        try:
            element = browser.find_element_by_css_selector(css)
        except NoSuchElementException:
            return 'None'
        if attribute is None:
            return element.text
        return element.get_attribute(attribute)

    # get info about accommodation
    return {
        'nom': read_value('.hp__hotel-name'),
        'type' : read_value('.hp__hotel-name span'),
        'note': read_value('.reviewFloater .bui-review-score__badge', 'innerHTML'), # sometimes it is hidden
        'lieu' : read_value('.sb-destination__input', 'value')
    }

## Main function

In [77]:
def connect_to_booking(query):
    """
    scrape the comments of the hotels found for one city

    1 - get all hotel links for the city: from `backup_hotel_links_{query}.csv`
        when it can be read (only the links not scraped yet are kept),
        otherwise by running the search on booking.com
    2 - for each link, get the comments (see get_comments), with at most
        4 minutes per hotel

    Parameters
    ----------
    query : str
        name of the city typed in the search box, also used to name the backup files
    """

    all_links = None

    # if it is not the first time we scrape, we resume the process with the corresponding csv file
    try :
        hotel_links = pd.read_csv(f'backup_hotel_links_{query}.csv')
        mask = hotel_links['has_been_scrapped'] == 0
        all_links = hotel_links.loc[mask, 'link']
    except Exception as e:
        print(e, 'the backup file doesnt exist')

    # a browser is only needed for the search when there is no usable backup
    if all_links is None:
        # we reload the page until we can access it
        browser = reload_page("https://booking.com")

        try:
            # send query value
            search_input = browser.find_element_by_id('ss')
            search_input.send_keys(query)

            # btn submit
            btn_submit = browser.find_element_by_class_name('sb-searchbox__button')
            btn_submit.click()

            all_links = get_all_hotel_links(browser, query)
        finally:
            # quit Chrome (quit() also ends the chromedriver process), even on error
            browser.quit()

    print('got all hotel links')

    #get comments for each hotel
    for link in all_links:

        # we reload the page until we can access it
        new_browser = reload_page(link)

        try: # 4 min to get all comments, otherwise we go to the next link
            with Timeout(240.0) as timeout_ctx:

                etablissement = open_comments_panel(new_browser)
                if etablissement: # we fetch comment only if we can open the comments panel
                    get_comments(new_browser, etablissement, query, link)

            # by default the context manager swallows the timeout exception,
            # its state is the way to know that the time limit was hit
            if timeout_ctx.state == timeout_ctx.TIMED_OUT:
                print(link, 'timeout')
        except TimeoutException as e:
            print(e, 'timeout')
        except Exception as e: # all exceptions not caught in subprocesses will be dealt here
            print(e, 'unexpected error')
        finally:
            new_browser.quit()

In [81]:
# cities to scrape; each one is used as a search query and in the backup file names
cities = [
    'Paris',
    'Nice',
    'Toulouse',
]

In [None]:
# one worker process per city is enough (each worker drives its own Chrome),
# without going over the number of CPUs that Pool() would use by default
n_workers = max(1, min(len(cities), multiprocessing.cpu_count()))

with multiprocessing.Pool(processes=n_workers) as pool:
    pool.map(connect_to_booking, cities)

# stop manually when you have collected the number of comments you want
# possible improvement: stop automatically after a fixed number of comments

In [82]:
# merge datasets
def merge_datasets(cities, output_path='booking_comments.csv'):
    """
    merge the per-city comment backups into a single csv file

    Parameters
    ----------
    cities : list of str
        cities whose `backup_booking_{city}.csv` file has to be merged
    output_path : str
        path of the merged csv file (default 'booking_comments.csv')

    A city without backup file is skipped with a message. Nothing is written
    when no backup file can be read.
    """
    list_datasets = []

    for city in cities :
        city_file = f'backup_booking_{city}.csv'
        try:
            list_datasets.append(pd.read_csv(city_file))
        except FileNotFoundError:
            # this city has not been scraped (yet)
            print(f'{city_file} not found, {city} is skipped')

    # pd.concat raises on an empty list
    if not list_datasets:
        print('no dataset to merge')
        return

    comments = pd.concat(list_datasets, axis=0)
    comments.to_csv(output_path, index=False)

In [None]:
# build booking_comments.csv from the backup file of each city
merge_datasets(cities=cities)

In [90]:
# code used to combine a previous scraping job with the negative comments collected later on (kept for reference)
# df = pd.read_csv('positive_comments.csv')
# df.head()

# we delete empty rows
# df = df.dropna(how='all')
# data = df[:30000] # and get only 30k to balance comments

# neg = pd.read_csv('negative_comments.csv')
# comments = pd.concat([data, neg], axis=0)
# shuffle comments
# comments = comments.sample(frac=1)
# comments.to_csv('booking_comments.csv', index=False)