In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
import pandas as pd
import traceback
from os import mkdir
from os.path import isdir

In [None]:
class AgodaScrapper:
    """Scrape hotel detail pages reachable from an Agoda location search.

    Constructing an instance launches Chrome, opens https://www.agoda.com,
    types ``location`` into the search box and submits the search.
    :meth:`run` then walks the result pages, opens every hotel in a new tab,
    collects one row per hotel into :attr:`df` and finally writes the frame
    to ``Raw/<location> df.csv``.

    Attributes:
        df: ``pandas.DataFrame`` accumulating one row per scraped hotel.
        location: Search term passed to the constructor; also used in the
            output file name.
        driver: The Selenium Chrome driver owned by this instance.
        wait: ``WebDriverWait`` bound to ``driver`` with a 20 second timeout.
        search_box: The search input element found on the landing page.
        search_button: The search submit element that was clicked.
        keepScrapping: Loop flag for :meth:`run`; set to ``False`` when a
            page-level error (including "No More Pages") occurs.
    """

    def __init__(self, location):
        """Start Chrome and submit a search for ``location``.

        Args:
            location: Text typed into the site's search box.

        Raises:
            Exception: Whatever Selenium raises while loading the page or
                locating/clicking the search controls. The browser is quit
                before the error propagates so no Chrome process is leaked.
        """
        self.df = pd.DataFrame()
        self.location = location
        self.keepScrapping = True
        self.driver = webdriver.Chrome()
        try:
            self.wait = WebDriverWait(self.driver, 20)
            self.driver.get("https://www.agoda.com")
            # Wait for the input instead of assuming it is already in the DOM
            # right after navigation.
            self.search_box = self.wait.until(
                EC.presence_of_element_located((By.ID, 'textInput'))
            )
            self.search_box.send_keys(location)
            # ESCAPE is sent right after typing, as in the original flow
            # (presumably to dismiss a suggestion overlay - TODO confirm).
            self.search_box.send_keys(Keys.ESCAPE)
            self.search_button = self.wait.until(
                EC.presence_of_element_located(
                    (By.CSS_SELECTOR, '[data-selenium="searchButton"]')
                )
            )
            self.search_button.click()
        except BaseException:
            # Cleanup-and-reraise: do not leave an orphaned browser behind
            # when setup fails or is interrupted.
            self.driver.quit()
            raise

    def run(self):
        """Scrape every result page, then save the data and close the browser.

        For each page: scroll down, collect the hotel cards, scrape each
        hotel in its own tab, then click "Next". Any page-level exception
        (including the "No More Pages" error from :meth:`nextPage`) is
        printed and ends the loop. The CSV is written and the driver is quit
        in ``finally`` blocks, so partial results survive an interruption
        such as ``KeyboardInterrupt``.
        """
        page = 0
        totalHotel = 0
        try:
            while self.keepScrapping:
                try:
                    print(f"TotalHotel {totalHotel}")
                    print(f"current page {page}")
                    self.scrollDown()
                    hotels = self.extractHotelsFromCurrentPage()
                    print(f"Here we have {len(hotels)} hotels")
                    for idx, hotel in enumerate(hotels):
                        self._scrapeHotel(hotel)
                        print(f"Hotel number {idx}/{len(hotels)}")
                    self.nextPage()
                    totalHotel += len(hotels)
                    page += 1
                except Exception:
                    print(traceback.format_exc())
                    self.keepScrapping = False
        finally:
            try:
                self._saveResults()
            finally:
                # The instance owns the browser; release it once scraping ends.
                self.driver.quit()
        print("Done")

    def _scrapeHotel(self, hotel):
        """Open one hotel card in a new tab, scrape it, and close the tab.

        Failures for a single hotel are reported and skipped (best effort).
        Only tabs that appeared during this call are closed, so the results
        window is never closed by mistake.
        """
        main_window = self.driver.current_window_handle
        handles_before = self.driver.window_handles
        try:
            link = hotel.find_element(by=By.TAG_NAME, value='a')
            link.send_keys(Keys.CONTROL + Keys.RETURN)
            # Make sure a new tab really exists before switching to it.
            WebDriverWait(self.driver, 20).until(
                EC.new_window_is_opened(handles_before)
            )
            new_tabs = [
                handle for handle in self.driver.window_handles
                if handle not in handles_before
            ]
            self.driver.switch_to.window(new_tabs[-1])
            WebDriverWait(self.driver, 20).until(
                EC.presence_of_element_located((By.TAG_NAME, 'body'))
            )
            self.extractData()
        except Exception as exc:
            print(f"Skipping hotel: {exc!r}")
        finally:
            # Close every tab this call opened, even after a failure.
            for handle in self.driver.window_handles:
                if handle not in handles_before:
                    self.driver.switch_to.window(handle)
                    self.driver.close()
            self.driver.switch_to.window(main_window)

    def _saveResults(self):
        """Write :attr:`df` to ``Raw/<location> df.csv``, creating ``Raw``."""
        if not isdir('Raw'):
            mkdir('Raw')
        self.df.to_csv('Raw/' + self.location + " df.csv", index=False)

    def extractHotelsFromCurrentPage(self):
        """Return the elements matching ``[data-selenium="hotel-item"]``."""
        return self.driver.find_elements(by=By.CSS_SELECTOR, value='[data-selenium="hotel-item"]')

    def extractData(self):
        """Append one row describing the currently focused hotel page to ``df``.

        Every value comes from a ``getHotel*`` method; each of those returns
        a fallback instead of raising, so a row is always produced.
        """
        # NOTE(review): getHotelFitnessCenter exists but has no column here;
        # left out to keep the CSV schema unchanged - confirm whether it
        # should be added.
        data = {
            'Name':                              self.getHotelName(),
            'Price':                             self.getHotelPrice(),
            'Stars':                             self.getHotelStars(),
            'Score':                             self.getHotelScore(),
            'Spoken Language':                   self.getHotelSpokenLanguage(),
            'Reviews':                           self.getHotelReviews(),
            'Location':                          self.getHotelLocation(),
            'Floors':                            self.getHotelFloors(),
            'Rooms':                             self.getHotelRooms(),
            'Resturents':                        self.getHotelResturents(),
            'Sparkling clean':                   self.getHotelSparklingClean(),
            'NewlyBuilt':                        self.getHotelNewlyBuilt(),
            'ExcellentView':                     self.getHotelExcellentView(),
            'Check In 24/7':                     self.getHotelCheckIn24(),
            'AirportTransfer':                   self.getHotelAirportTransfer(),
            'Front Desk':                        self.getHotelFrontDesk(),
            'Valet Parking':                     self.getHotelValetParking(),
            'Free WiFi In All Rooms':            self.getHotelFreeWiFiInAllRooms(),
            'Swimming Pool':                     self.getHotelSwimmingPool(),
            'Bar':                               self.getHotelBar(),
            'Coffee':                            self.getHotelCoffee(),
            'DailyHousekeeping':                 self.getHotelDailyHousekeeping(),
            'Elevator':                          self.getHotelElevator(),
            'Hair Dryer':                        self.getHotelHairDryer(),
            'Golf':                              self.getHotelGolfCourse(),
            'Kids club':                         self.getHotelKidsClub(),
            'Booked today':                      self.getHotelBookedToday(),
            'Real Guest Cleanlines Score':       self.getHotelGuestCleanlines(),
            'Real Guest Facilities Score':       self.getHotelGuestFacilities(),
            'Real Guest Location Score':         self.getHotelGuestLocation(),
            'Real Guest Service Score':          self.getHotelGuestService(),
            'Real Guest Value for money Score':  self.getHotelGuestValueForMoney(),
        }
        self.df = pd.concat([self.df, pd.DataFrame(data, index=[0])], ignore_index=True)

    def nextPage(self):
        """Click the "Next" pagination control.

        Raises:
            Exception: ``"No More Pages"`` when the control cannot be found
                or clicked; the underlying Selenium error is chained as the
                cause.
        """
        try:
            nextPageButton = self.driver.find_element(By.XPATH, '*//span[text()="Next"]')
            # Click through JavaScript rather than a native WebElement click.
            self.driver.execute_script("arguments[0].click()", nextPageButton)
        except Exception as exc:
            raise Exception("No More Pages") from exc

    def scrollDown(self, steps=27, pixels=1000, pause=0.5):
        """Scroll the window down in increments.

        Args:
            steps: Number of scroll increments (default 27).
            pixels: Vertical distance per increment (default 1000).
            pause: Seconds to sleep after each increment (default 0.5).
        """
        for _ in range(steps):
            self.driver.execute_script(f"window.scrollBy(0, {pixels})")
            sleep(pause)

    # ------------------------------------------------------------------
    # Shared lookups. Each helper resolves its locator inside the ``try``
    # so that any lookup problem yields the documented fallback.
    # ------------------------------------------------------------------

    def _usefulInfo(self, item):
        """Return the text of the ``item``-th useful-info entry, or ``None``."""
        try:
            return self.driver.find_element(By.XPATH, f"//*[@id='abouthotel-usefulinfo']/div[3]/div[4]/ul/li[{item}]/div/div/div[2]/div/span/div/span[2]").text
        except Exception:
            return None

    def _hasHighlight(self, label, group):
        """Return whether ``label`` is a full line of highlight block ``group``."""
        try:
            blocks = self.driver.find_elements(By.XPATH, "//div[@data-element-name='facility-highlights']")
            return label in blocks[group].text.split('\n')
        except Exception:
            return False

    def _hasFeature(self, label):
        """Return whether ``label`` occurs anywhere in the features section.

        This is a substring test on the whole section text, so e.g. "Bar"
        also matches longer words containing it.
        """
        try:
            return label in self.driver.find_element(By.XPATH, "//*[@id='abouthotel-features']").text
        except Exception:
            return False

    def _travelerGrade(self, position):
        """Return the text of the ``position``-th traveler grade, or ``False``."""
        try:
            return self.driver.find_elements(by=By.CLASS_NAME, value='Review-travelerGradeScore')[position].text
        except Exception:
            return False

    # ------------------------------------------------------------------
    # Field getters. None of them raises; each returns its fallback when
    # the element is missing.
    # ------------------------------------------------------------------

    def getHotelName(self):
        """Return the hotel name text, or ``None`` if not found."""
        try:
            return self.driver.find_element(by=By.CLASS_NAME, value='HeaderCerebrum__AdaName').text
        except Exception:
            return None

    def getHotelPrice(self):
        """Return the price text from the hotel nav bar, or ``"JMI"``.

        The ``"JMI"`` sentinel is preserved from the original code; its
        meaning is not evident from this file.
        """
        try:
            return self.driver.find_element(By.XPATH,'//*[@id="hotelNavBar"]/div/div/div/span/div/span[5]').text
        except Exception:
            return "JMI"

    def getHotelStars(self):
        """Return the rating icon's ``aria-label``, or ``None``."""
        try:
            return self.driver.find_element(by=By.CLASS_NAME, value="HeaderCerebrum__RatingIcon").get_attribute("aria-label")
        except Exception:
            return None

    def getHotelScore(self):
        """Return the compact review score text, or ``None``."""
        try:
            return self.driver.find_element(By.CLASS_NAME,"ReviewScoreCompact__section").text
        except Exception:
            return None

    def getHotelSpokenLanguage(self):
        """Return the listed languages joined with ``", "``, or ``None``."""
        try:
            languages_element = self.driver.find_element(by=By.CLASS_NAME, value="Liststyled__ListStyled-sc-ksl08h-0.iTjiYt")
            languages = languages_element.find_elements(by=By.CSS_SELECTOR, value="div.Box-sc-kv6pi1-0.fEjEnG div[data-element-name='property-feature'] span.Spanstyled__SpanStyled-sc-16tp9kb-0.gwICfd.kite-js-Span")
            return ', '.join(language.text for language in languages)
        except Exception:
            return None

    def getHotelReviews(self):
        """Return the reviews summary text, or ``None``."""
        try:
            return self.driver.find_element(by=By.CSS_SELECTOR, value="p.Typographystyled__TypographyStyled-sc-j18mtu-0.Hkrzy.kite-js-Typography").text
        except Exception:
            return None

    def getHotelLocation(self):
        """Return the address text, or ``None``."""
        try:
            return self.driver.find_element(by=By.CSS_SELECTOR, value="span.Spanstyled__SpanStyled-sc-16tp9kb-0.gwICfd.kite-js-Span.HeaderCerebrum__Address[data-selenium='hotel-address-map']").text
        except Exception:
            return None

    def getHotelFloors(self):
        """Return the third useful-info entry (used as floors), or ``None``."""
        return self._usefulInfo(3)

    def getHotelRooms(self):
        """Return the fifth useful-info entry (used as rooms), or ``None``."""
        return self._usefulInfo(5)

    def getHotelResturents(self):
        """Return the fourth useful-info entry (used as restaurants), or ``None``."""
        return self._usefulInfo(4)

    def getHotelSparklingClean(self):
        """Return whether "Sparkling clean" is in the first highlight block."""
        return self._hasHighlight("Sparkling clean", 0)

    def getHotelNewlyBuilt(self):
        """Return whether "Newly built" is in the first highlight block."""
        return self._hasHighlight("Newly built", 0)

    def getHotelExcellentView(self):
        """Return whether "Excellent view" is in the first highlight block."""
        return self._hasHighlight("Excellent view", 0)

    def getHotelCheckIn24(self):
        """Return whether "Check-in [24-hour]" is in the first highlight block."""
        return self._hasHighlight("Check-in [24-hour]", 0)

    def getHotelAirportTransfer(self):
        """Return whether "Airport transfer" is in the first highlight block."""
        return self._hasHighlight("Airport transfer", 0)

    def getHotelFrontDesk(self):
        """Return whether "Front desk [24-hour]" is in the second highlight block."""
        return self._hasHighlight("Front desk [24-hour]", 1)

    def getHotelValetParking(self):
        """Return whether "Valet parking" is in the second highlight block."""
        return self._hasHighlight("Valet parking", 1)

    def getHotelFreeWiFiInAllRooms(self):
        """Return whether "Free Wi-Fi in all rooms!" is in the second highlight block."""
        return self._hasHighlight("Free Wi-Fi in all rooms!", 1)

    def getHotelSwimmingPool(self):
        """Return whether "Swimming pool [outdoor]" is in the second highlight block."""
        return self._hasHighlight("Swimming pool [outdoor]", 1)

    def getHotelFitnessCenter(self):
        """Return whether "Fitness center" is in the second highlight block."""
        return self._hasHighlight("Fitness center", 1)

    def getHotelBar(self):
        """Return whether "Bar" occurs in the features section text."""
        return self._hasFeature("Bar")

    def getHotelCoffee(self):
        """Return whether "Coffee" occurs in the features section text."""
        return self._hasFeature("Coffee")

    def getHotelDailyHousekeeping(self):
        """Return whether "Daily housekeeping" occurs in the features section text."""
        return self._hasFeature("Daily housekeeping")

    def getHotelElevator(self):
        """Return whether "Elevator" occurs in the features section text."""
        # Previously searched for "Daily Elevator", a copy-paste slip from
        # the housekeeping getter that could never match.
        return self._hasFeature("Elevator")

    def getHotelHairDryer(self):
        """Return whether "Hair Dryer" occurs in the features section text."""
        # NOTE(review): the match is case-sensitive; confirm the page's
        # capitalisation of this label.
        return self._hasFeature("Hair Dryer")

    def getHotelGolfCourse(self):
        """Return whether "Golf course" occurs in the features section text."""
        return self._hasFeature("Golf course")

    def getHotelKidsClub(self):
        """Return whether "Kids club" occurs in the features section text."""
        return self._hasFeature("Kids club")

    def getHotelBookedToday(self):
        """Return the user-engagement count text, or ``0`` if not shown."""
        try:
            return self.driver.find_element(by=By.CSS_SELECTOR, value='span.UserEngagement__Count').text
        except Exception:
            return 0

    def getHotelGuestCleanlines(self):
        """Return the first traveler grade (used as cleanliness), or ``False``."""
        return self._travelerGrade(0)

    def getHotelGuestFacilities(self):
        """Return the second traveler grade (used as facilities), or ``False``."""
        return self._travelerGrade(1)

    def getHotelGuestLocation(self):
        """Return the third traveler grade (used as location), or ``False``."""
        return self._travelerGrade(2)

    def getHotelGuestService(self):
        """Return the fourth traveler grade (used as service), or ``False``."""
        return self._travelerGrade(3)

    def getHotelGuestValueForMoney(self):
        """Return the fifth traveler grade (used as value for money), or ``False``."""
        return self._travelerGrade(4)

In [None]:
# Scrape the search results for Koh Samui.
AS = AgodaScrapper(location='Koh Samui')
AS.run()

In [None]:
# Scrape the search results for Ko Pha-ngan.
AS = AgodaScrapper(location='Ko Pha-ngan')
AS.run()

In [None]:
# Scrape the search results for Ko Phi Phi.
AS = AgodaScrapper(location='Ko Phi Phi')
AS.run()

In [None]:
# Scrape the search results for the search term "bangkok".
AS = AgodaScrapper(location='bangkok')
AS.run()

In [None]:
# Scrape the search results for Phuket.
AS = AgodaScrapper(location='Phuket')
AS.run()