## Centris Web Scraping

In [1]:
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import matplotlib.pyplot as plt
%matplotlib inline
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains

### Web scraping function

In [2]:
def scrape_results_page(url):
    """Download one listing page and pull out its summary fields.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A 9-tuple of strings, in this order: realtor, firm, price,
        price with taxes ('0' when the page has no ``price-with-tax``
        span), property type, property subtype, year, address and
        neighborhood (the part of the address after "Neighbourhood ",
        or '' when that marker is absent).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        AttributeError, IndexError: If a mandatory element is missing
            from the page.
    """
    headers = {"Accept-Language": "en-US, en;q=0.5"}
    # Without an explicit timeout requests can block forever on a stalled
    # connection, which would freeze the whole scraping loop.
    response = get(url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    html_soup = BeautifulSoup(response.text, 'html.parser')

    # Strip the professional title that follows the realtor's name.
    realtor = html_soup.find('p', class_='middle').text.strip()
    for title in ('\nReal Estate Broker',
                  '\nResidential and Commercial Real Estate Broker',
                  '\nCertified Real Estate Broker',
                  '\nResidential Real Estate Broker'):
        realtor = realtor.replace(title, '')

    firm = html_soup.find('span', itemprop='legalName').text.strip()
    price = html_soup.find('span', itemprop='price').text.strip()

    # The tax-inclusive price is optional; only its absence maps to '0'.
    price_txs_tag = html_soup.find('span', class_='price-with-tax')
    price_txs = '0' if price_txs_tag is None else price_txs_tag.text.strip()

    property_type = html_soup.find('div', class_='description').h1.span.text

    features = html_soup.find_all('td', class_='last-child')
    property_subtype = features[0].span.text
    # NOTE(review): the sample output in this notebook shows values such as
    # "1,080 sqft" in the yr column, so the second cell is apparently not
    # always the construction year -- confirm against the page markup.
    property_yr = features[1].span.text

    property_address = html_soup.find('div', class_='address').h2.text
    neighborhood = property_address.partition("Neighbourhood ")[2]

    return (realtor, firm, price, price_txs, property_type,
            property_subtype, property_yr, property_address, neighborhood)

### Browser automation and result-storage function

In [3]:
def _click_each(browser, xpaths):
    """Click each element located by *xpaths*, pausing 1-3 s after every click."""
    for xpath in xpaths:
        browser.find_element(By.XPATH, xpath).click()
        sleep(randint(1, 3))


def _parse_results_count(text):
    """Return the integer spelled by the digits in *text* (0 if there are none)."""
    digits = re.sub(r'\D', '', text)
    return int(digits) if digits else 0


def launch_centris_search(url, hood):
    """Run one Centris search in Firefox and scrape every result it returns.

    The function opens *url*, selects three property-type buttons and two
    building-age options, drags the price slider handle onto the slider
    range, types *hood* into the search box and picks the suggestion menu.
    If the displayed property count is positive it submits the search,
    opens the first result and walks through the results one by one,
    scraping each page with ``scrape_results_page``.

    Args:
        url: Address of the Centris search page to open.
        hood: Neighbourhood name typed into the search box.

    Returns:
        A pandas DataFrame with one row per scraped listing and the columns
        realtor, firm, price, price_txs, type, subtype, yr, address,
        neighborhood and url. It has no rows when the search reports zero
        results.

    Raises:
        Any exception raised by Selenium or by ``scrape_results_page``
        propagates to the caller; the browser is closed first.
    """
    # Column lists that will form the result DataFrame.
    realtors, firms, prices, price_txs, types, subtypes, yrs, addresses, hoods, urls = (
        [] for _ in range(10)
    )

    browser = webdriver.Firefox()
    try:
        browser.get(url)

        # Open the search panel.
        # NOTE(review): this selector hard-codes browser-feature classes such
        # as "k-ff62", so it presumably only matches in one Firefox version.
        launch_search = browser.find_element(By.CSS_SELECTOR, 'html.k-ff.k-ff62.js.flexbox.canvas.canvastext.webgl.no-touch.geolocation.postmessage.no-websqldatabase.indexeddb.hashchange.history.draganddrop.websockets.rgba.hsla.multiplebgs.backgroundsize.borderimage.borderradius.boxshadow.textshadow.opacity.cssanimations.csscolumns.cssgradients.no-cssreflections.csstransforms.csstransforms3d.csstransitions.fontface.generatedcontent.video.audio.localstorage.sessionstorage.webworkers.applicationcache.svg.inlinesvg.smil.svgclippaths body#body.body-class-propertyresult div#site header#header div#header-wrapper.wrapper div.top-nav nav.menu-container ul.main-menu li.property-search-icon a.current')
        launch_search.click()

        # Select property types (first, second and fourth buttons of the group).
        _click_each(browser, (
            '/html/body/div[1]/header/div/div[2]/div/fieldset[2]/fieldset[1]/div/button[1]',
            '/html/body/div[1]/header/div/div[2]/div/fieldset[2]/fieldset[1]/div/button[2]',
            '/html/body/div[1]/header/div/div[2]/div/fieldset[2]/fieldset[1]/div/button[4]',
        ))

        # Select building age: new construction and less than 10 years.
        _click_each(browser, (
            '//*[@id="NEUVE"]',
            '//*[@id="-10"]',
        ))

        # Select price range by dragging the slider handle onto the range bar.
        slider = browser.find_element(By.CSS_SELECTOR, 'a.ui-slider-handle:nth-child(3)')
        slider_range = browser.find_element(By.CSS_SELECTOR, '.ui-slider-range')
        actions = ActionChains(browser)
        actions.click_and_hold(slider)
        actions.move_to_element(slider_range)
        actions.release()
        actions.perform()
        sleep(randint(1, 3))

        # Select the neighbourhood from the suggestion menu.
        search_hood = browser.find_element(By.CSS_SELECTOR, '#search')
        search_hood.send_keys(hood)
        hidden_menu = browser.find_element(By.CSS_SELECTOR, '#ui-id-2')
        # Fixed pause before clicking, presumably so the suggestions can load.
        sleep(5)
        hidden_menu.click()

        # Read the results count; strip separators so "1,234", "1234" and an
        # empty label are all handled.
        count_label = browser.find_element(By.XPATH, '//*[@id="property-count"]')
        results_count = _parse_results_count(count_label.text)

        if results_count > 0:
            # Click the search button to generate results.
            browser.find_element(By.CSS_SELECTOR, '#submit-search').click()

            # Once search results appear, open the first result's summary.
            sleep(5)
            first_summary = browser.find_element(
                By.CSS_SELECTOR,
                'div.templateListItem:nth-child(1) > a:nth-child(3)')
            first_summary.click()

            # Walk through the results and scrape each page.
            for i in range(results_count):
                current_url = browser.current_url

                (realtor, firm, price, price_tx, prop_type, subtype,
                 yr, address, neighborhood) = scrape_results_page(current_url)
                realtors.append(realtor)
                firms.append(firm)
                prices.append(price)
                price_txs.append(price_tx)
                types.append(prop_type)
                subtypes.append(subtype)
                yrs.append(yr)
                addresses.append(address)
                hoods.append(neighborhood)
                urls.append(current_url)

                # Move to the next result, except after the last one where
                # there is nothing left to visit.
                if i < results_count - 1:
                    sleep(randint(1, 3))
                    next_res = browser.find_element(
                        By.CSS_SELECTOR,
                        'div.last-child:nth-child(2) > ul:nth-child(1) > li:nth-child(4) > a:nth-child(1)')
                    next_res.click()
    finally:
        # Always close the browser exactly once, even when a step fails.
        browser.quit()

    # Store results in a DataFrame.
    return pd.DataFrame({'realtor': realtors,
                         'firm': firms,
                         'price': prices,
                         'price_txs': price_txs,
                         'type': types,
                         'subtype': subtypes,
                         'yr': yrs,
                         'address': addresses,
                         'neighborhood': hoods,
                         'url': urls})

### Results

In [4]:
# Search page opened by the browser for every neighbourhood.
url = 'https://www.centris.ca/en/properties~for-sale?uc=1&view=List'

# Neighbourhood names typed into the search box, one search per entry.
hoods = [
    "Pierrefonds-Roxboro (Montréal)",
    "Kirkland",
    "Dorval",
    "Dollard-Des Ormeaux",
    "Pointe-Claire",
    "Beaconsfield",
    "L'Île-Bizard/Sainte-Geneviève (Montréal)",
]

In [5]:
# Run one browser-driven search per neighbourhood and keep each result table.
dfs = []
for hood in hoods:
    
    # Each call opens its own Firefox session and returns a DataFrame.
    res = launch_centris_search(url,hood)
    dfs.append(res)

# Stack the per-neighbourhood tables into one DataFrame. pd.concat is called
# without ignore_index, so each table keeps its own row labels.
results = pd.concat(dfs)

In [8]:
results

Unnamed: 0,address,firm,neighborhood,price,price_txs,realtor,subtype,type,url,yr
0,"5211, Rue du Sureau, apt. 404, Pierrefonds-Rox...",GROUPE SUTTON SYNERGIE INC.,Pierrefonds/Central East,"$264,888",0,Martin Dumont\nMartin Dumont Inc.,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,"1,080 sqft"
1,"13330, boulevard de Pierrefonds, apt. A101, Pi...",RE/MAX CADIBEC INC.,Pierrefonds/Central East,"$169,820","($183,724)1",Sarah Namer,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,757 sqft
2,"18466, boulevard Gouin Ouest, apt. I-6, Pierr...",RE/MAX DU CARTIER INC.,Pierrefonds/West,"$353,000","($399,751)1",Nicolas Bel,"Two or more storey, Attached",House,https://www.centris.ca/en/houses~for-sale~pier...,"2018, New"
3,"10425, boulevard Gouin Ouest, apt. 105, Pierr...",IMMOBILIER FARLINA / FARLINA REALTY,Roxboro,"$259,000",0,Uddip Dutta,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,"1,010 sqft"
4,"10435, boulevard Gouin Ouest, apt. 107, Pierr...",RE/MAX 3000 INC.,Roxboro,"$229,900",0,Melik Melkonian\nMelik Melkonian Inc.,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,788 sqft
5,"13330, boulevard de Pierrefonds, apt. B301, Pi...",RE/MAX CADIBEC INC.,Pierrefonds/Central East,"$254,900",0,Sarah Namer,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,794 sqft
6,"14399, boulevard Gouin Ouest, apt. 603, Pierr...",KELLER WILLIAMS URBAIN,Pierrefonds/Central West,"$269,895","($302,451)1",Marie-Ève Brulotte,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,947 sqft
7,"14399, boulevard Gouin Ouest, apt. 802, Pierr...",KELLER WILLIAMS URBAIN,Pierrefonds/Central West,"$283,626","($319,360)1",Marie-Ève Brulotte,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,947 sqft
8,"420, Chemin de la Rive-Boisée, apt. 305, Pierr...",STAMINA AGENCE IMMOBILIÈRE INC.,Pierrefonds/Central East,"$334,900",0,André MacDuff AEO,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,"1,005 sqft"
9,"14399, boulevard Gouin Ouest, apt. 301, Pierr...",ENGEL & VOLKERS MONTRÉAL,Pierrefonds/Central West,"$410,000",0,Natalia Sokova,Divided,Condo,https://www.centris.ca/en/condos~for-sale~pier...,973 sqft
