In [1]:
!pip install selenium

Defaulting to user installation because normal site-packages is not writeable


In [3]:
# from selenium import webdriver
# DRIVER_PATH = '/home/xiao/Downloads//chromedriver_linux64/chromedriver'
# driver = webdriver.Chrome(executable_path=DRIVER_PATH)
# driver.get('https://google.com')

In [4]:
import pandas as pd
from random import randint
import time
import re
# Scraping through Chrome webdriver
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys

In [5]:
# starting URL of centris.ca
centris_url = "https://www.centris.ca/en/lots~for-sale~shefford/9743866?view=Summary&uc=2"

In [218]:
class Centris:
    """
    Accessing 
    
    Attr:
    self.url - starting url for scraping process
    self.data - pandas.DataFrame object containing scraped data
    self.driver - Chrome webdriver
    self.containers - List of web-elements containing information on listings
        - eg: link to listing, price, picture, address,...
    self.links_to_listings - List of web-elements, each containing the link to a listing
    """
     
    def __init__(self, url, DRIVER_PATH): 
        self.url = url
        self.data = pd.DataFrame({\
                        'centris_No' : None,
                        'title': None,\
                        'address': None,\
                        'price': None,\
                        'lat': None,\
                        'long': None,\
                        'descriptions': None,\
                        'neighbourhood_indicators': None,\
                        'demographics': None
                    }, index=[0])
        # Path to Chromedriver
        self.DRIVER_PATH = DRIVER_PATH
        self.driver = None
        
        self.current_page = None
        self.previous_page = None
        
#         # Verification for new DOM
#         self.old_DOM = {\
#                         "title" : [],\
#                         "address": [],\
#                         "price": [],\
#                         "lat": [],\
#                         "long": [],\
#                         "descriptions": [],\
#                         'neighbourhood_top': [],\
#                         'neighbourhood_buttom': [],\
#                         "demographics_buttons": [],\
#                     }

    def append_data(self):
        """Appends data to columns.
        centrisNo. : xpath = "//span[@id='ListingDisplayId']"
        price: xpath = "//span[@id='BuyPrice']"
        title: xpath = '//span[@data-id="PageTitle"]'
        address: xpath = '//h2[@itemprop="address"]'
        lat: xpath = '//span[@id="PropertyLat"]'
        lng: xpath = '//span[@id="PropertyLng"]'
        """

        new_data = pd.DataFrame({\
                        'centris_No': self.scrape_feature("//span[@id='ListingDisplayId']"),
                        'title': self.scrape_feature('//span[@data-id="PageTitle"]'),\
                        'address': self.scrape_feature('//h2[@itemprop="address"]'),\
                        'price': self.scrape_feature("//span[@id='BuyPrice']"),\
                        'lat': self.scrape_feature('//span[@id="PropertyLat"]'),\
                        'long': self.scrape_feature('//span[@id="PropertyLng"]'),\
                        'descriptions': str(self.scrape_description()),\
                        'neighbourhood_indicators': "neighbourhood_indicators",\
                        'demographics': "demographics"\
                    }, index=[0])
        self.data = self.data.append(new_data, ignore_index=True)
             
#     def get_data(self):
#         return self.data
        
    def start_driver(self):
        """
        Starts and returns Chrome webdriver. 
        The page link in the url attribute 
        is opened in headless mode.
        """
        
        # Activate headless mode for fastest response
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-infobars"); # disabling infobars
        options.add_argument("--disable-extensions"); # disabling extensions
        options.add_argument("--disable-gpu"); # applicable to windows os only
        options.add_argument("--disable-dev-shm-usage"); # overcome limited resource problems
        options.add_argument("--no-sandbox"); # Bypass OS security model
        options.add_argument('--start-maximized') # open Browser in maximized mode
        options.add_argument('--incognito')

        # Start driver with url
        self.driver = webdriver.Chrome(executable_path=self.DRIVER_PATH)
        self.driver.get(self.url)
        self.page_update()

        
    def sort_listings(self):
        """Sorts listings in webdriver from newest to oldest."""
        
        # Click drop down menu
        drop_down = self.driver.find_element_by_xpath(\
                                    "//button[@id='dropdownSort']")
        drop_down.click()
        
        # Sort by most recent listings
        sort_by = self.driver.find_element_by_xpath("//a[@data-option-value='3']")
        sort_by.click()
        time.sleep(0.5) #######
        self.page_update()
    
    
    def goto_first_page(self):
        page = str(self.current_page).split(" / ")[0].replace(",", "")
        print(page)
        if page == 1: return
        else: 
            try:
                next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='goFirst']")
                next_page.click()
                
#                 time.sleep(0.5) #######

#                 self.current_page = self.driver.find_element_by_xpath(\
#                                     "//li[@class='pager-current']").text
        
#                 print("current page:", str(self.current_page))

                self.page_update()
            except:
                print("goFirst button not available")
            
    
    def next_page(self):
        try:
            next_page = self.driver.find_element_by_xpath(\
                                        "//li[@class='next']")
            next_page.click()
            self.page_update()
            pass
        except:
            time.sleep(0.5)
            # Try again after waiting 0.5 sec.
            try:
                next_page = self.driver.find_element_by_xpath(\
                                            "//li[@class='next']")
                next_page.click()
                pass
            except:
                print("Next-page button not found!")
     
    
    def get_last_page(self):
        '''Returns page number of last page in browser.'''
        
        last_page = self.driver.find_element_by_xpath(\
                                    "//li[@class='pager-current']").text
        
        return int(last_page.split(" / ")[1].replace(",", ""))
    
    
    
    def refresh_page(self):
        "Refreshes current webdriver page."
        self.driver.refresh()
        # Wait until page fully loaded
        time.sleep(2)
        

#     def scrape_id(self):
#         """ get Centris No. """
#         print("current page:", str(self.current_page))
#         time.sleep(2)
#         xpath = "//span[@id='ListingDisplayId']"
#         element = centris.driver.find_elements_by_xpath(xpath)
#         uniqueID = element[0].text
#         print(uniqueID)
#         return uniqueID   

    
    def scrape_feature(self, xpath):
        """ get info
        centrisNo. : xpath = "//span[@id='ListingDisplayId']"
        price: xpath = "//span[@id='BuyPrice']"
        title: xpath = '//span[@data-id="PageTitle"]'
        address: xpath = '//h2[@itemprop="address"]'
        
    
        """
        print("current page:", str(self.current_page))
#         time.sleep(2)
        xpath = xpath
        element = centris.driver.find_elements_by_xpath(xpath)
        print(element)
        info = element[0].text
        print(info)
        return info   

    
#     def scrape_Lng(self):
#         """ get Centris No. """
#         print("current page:", str(self.current_page))
#         time.sleep(2)
#         xpath = "//span[@id='PropertyLng']"
#         element = centris.driver.find_elements_by_xpath(xpath)
#         uniqueID = element[0]
#         print(uniqueID)
#         return uniqueID       
    
        
    def scrape_description(self):
        """ get description """
        print("current page:", str(self.current_page))
        time.sleep(2)
        xpath = "//div[@class='col-lg-12 description']"
        element = centris.driver.find_elements_by_xpath(xpath)
        description = element[0].text.split("\n")[3:]
        print(description)
        return description
    
    
    def scrape_neighbourhood(self):
        pass
        

    
    def page_update(self):
        """ allowed page change to finish before scrape info """
        self.current_page = self.driver.find_element_by_xpath(\
                                    "//li[@class='pager-current']").text
        
        print("current page:", str(self.current_page))
        self.previous_page = self.current_page
        print("previous page:", str(self.previous_page))
        print("page upgraded")
        
    
# Instantiate class object
centris = Centris(centris_url, DRIVER_PATH)

In [219]:
# tests
centris = Centris(centris_url, DRIVER_PATH)
centris.start_driver()
print("------start successed-----")

centris.page_update()
centris.scrape_feature("//span[@id='BuyPrice']")
centris.scrape_description()
print("------ page update successed--------")


centris.sort_listings()
centris.scrape_feature("//span[@id='BuyPrice']")
centris.scrape_description()
print("------sort successed---")


centris.goto_first_page()
centris.scrape_feature("//span[@id='BuyPrice']")
centris.scrape_description()
print("-------go to 1st successed---------")

centris.next_page()
centris.scrape_feature("//span[@id='BuyPrice']")
centris.scrape_description()
print("-------go to next successed -------")

current page: 4,518 / 44,927
previous page: 4,518 / 44,927
page upgraded
------start successed-----
current page: 4,518 / 44,927
previous page: 4,518 / 44,927
page upgraded
current page: 4,518 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="ce3339f6-307f-4c0f-b3c8-095621118b59")>, <selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="c41546a6-37ed-4281-b012-883907e3fe3a")>]
$145,000
current page: 4,518 / 44,927
['Lot area', '124,450 sqft', 'Zoning', 'Residential']
------ page update successed--------
current page: 1 / 44,927
previous page: 1 / 44,927
page upgraded
current page: 1 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="ce3339f6-307f-4c0f-b3c8-095621118b59")>, <selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="c41546a6-37ed-4281-b012-883907e3fe3a")>]
$

In [214]:
centris.scrape_feature("//span[@id='RawPrice']")

current page: 2 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="f7036465e92e6a0647548b331c3c7821", element="bdebb4e2-ec93-497a-bcda-30365ac5e76e")>]



''

In [222]:
centris.data

Unnamed: 0,centris_No,title,address,price,lat,long,descriptions,neighbourhood_indicators,demographics
0,,,,,,,,,
1,17905680.0,Duplex for sale,"4183 - 4185, boulevard Décarie, Montréal (Côte...","$899,000",,,"['Use of property', 'Residential and commercia...",neighbourhood_indicators,demographics


In [221]:
centris.append_data()

current page: 2 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="389391d7-9583-48b0-9ecd-ca551ac8d1c4")>]
17905680
current page: 2 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="4032ecfc-53fb-4559-aa9d-e6a83f63bf6b")>, <selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="dc02f4da-91b6-4746-8cc7-a09f576f7f19")>]
Duplex for sale
current page: 2 / 44,927
[<selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="bf83453f-b89d-4ba9-a25e-154489d45498")>, <selenium.webdriver.remote.webelement.WebElement (session="18afccc255c517ed92c550b62c56f5dc", element="8969f3e0-1437-4d8c-b927-ac6e14b0759c")>]
4183 - 4185, boulevard Décarie, Montréal (Côte-des-Neiges/Notre-Dame-de-Grâce), Neighbourhood Notre-Dame-de-Grâce
current page: 2 / 44,927
[<selenium.webdriver.remote.webelement

In [163]:
centris.data['descriptions']

0                                                 None
1    ['9 rooms', '3 bedrooms (1 in basement)', '2 b...
Name: descriptions, dtype: object