In [1]:
import random
import numpy as np
import pandas as pd
import time
import selenium
from selenium import webdriver
import typing
from typing import List, Dict

In [2]:
class ElissaScraper():
    """
    Scraper that gathers the details of every advert listed on immoelissa.be.

    Calling :meth:`scrape` walks the paginated listing, visits each advert
    page and appends one dict per advert to ``self.data``.

    Attributes:
        first_page: URL of the first listing page.
        data: list of dicts, one per scraped advert.
        driver: the selenium Firefox driver used for every request.
        request_delay: (low, high) bounds in seconds of the random pause
            taken after loading a listing page.
    """

    # Locator strategy string passed to WebDriver.find_elements; this is the
    # value selenium exposes as By.TAG_NAME. find_elements(by, value) replaces
    # the find_elements_by_tag_name helper that newer selenium releases removed.
    _BY_TAG_NAME = "tag name"

    # Random pause bounds (seconds) after loading a listing page.
    request_delay = (1.0, 2.0)

    def __init__(self):
        self.first_page = 'https://immoelissa.be/immobilier/?sort=prix-c'
        self.data = []
        self.driver = webdriver.Firefox()

    def close(self) -> None:
        """
        Shut down the browser session held by this scraper.
        """
        self.driver.quit()

    def scrape(self) -> None:
        """
        Run the full scrape: readPagination collects every individual advert
        URL, then readAdPage is run on each of them and its result is
        appended to ``self.data``.
        """
        for url in self.readPagination():
            self.data.append(self.readAdPage(url))

    def properties_url(self, inp_url: str) -> List[str]:
        """
        Gather the advert links found on one listing (front) page.

        :inp_url: link to the front page
        :return: a list of every href on the page that contains "annonces",
                 in document order (duplicates are kept)
        """
        # load the page with the selenium driver, then pause for a random
        # interval before reading it
        self.driver.get(inp_url)
        time.sleep(random.uniform(*self.request_delay))

        urls = []
        for elem in self.driver.find_elements(self._BY_TAG_NAME, "a"):
            link = elem.get_attribute("href")
            # anchors without an href yield None; keep only advert links
            if link is not None and "annonces" in link:
                urls.append(link)

        return urls

    def readPagination(self) -> List[str]:
        """
        Walk the listing pages and collect the links to the individual ads.
        The first page URL is hardcoded in ``self.first_page``; afterwards the
        page number is formatted into the URL until a page yields no new ads.

        :return: a list of unique advert URLs, in the order first seen
        """
        all_advert_urls = []
        seen = set()
        page_url = self.first_page
        counter = 2

        while True:
            found_new = False
            for url in self.properties_url(page_url):
                if url not in seen:
                    seen.add(url)
                    all_advert_urls.append(url)
                    found_new = True

            # Stop on an empty page, and also on a page that only repeats
            # adverts already collected, so the loop cannot run forever if
            # out-of-range page numbers keep returning content.
            if not found_new:
                break

            page_url = f"https://immoelissa.be/immobilier/?pg={counter}&sort=prix-c"
            counter += 1

        return all_advert_urls

    def _element_lines(self, tag_name: str) -> List[str]:
        """
        Return the text lines of every element with the given tag on the
        page currently loaded in the driver, flattened into one list.
        """
        lines = []
        for element in self.driver.find_elements(self._BY_TAG_NAME, tag_name):
            lines.extend(element.text.split("\n"))
        return lines

    def readAdPage(self, inp_url: str) -> Dict[str, typing.Optional[str]]:
        """
        Read the description lists on an advert page and return them as a
        dict, using the key/value pairs shown on the website.

        :inp_url: the advert URL to gather information from
        :return: a dict mapping each <dt> text line to the <dd> text line at
                 the same position; a key with no matching value maps to None
        """
        # grab the page
        self.driver.get(inp_url)

        # split the description-list data in 2 types, keys and values
        details_keys = self._element_lines("dt")
        details_values = self._element_lines("dd")

        # Pair keys and values by position. A page with fewer value lines
        # than key lines used to raise IndexError; those keys now get None.
        details = {}
        for idx, key in enumerate(details_keys):
            details[key] = details_values[idx] if idx < len(details_values) else None

        return details

In [None]:
# The following cell runs the scraper for the full website.
# Batching and threading are not used because the dataset is small.
scraper = ElissaScraper()
try:
    scraper.scrape()
finally:
    # Always shut the Firefox session down, even if scraping fails part-way;
    # the rows gathered so far remain available in scraper.data.
    scraper.driver.quit()