In [1]:
import numpy as np
import pandas as pd
import time
import random
from random import randint
from threading import Thread
from queue import Queue
from tqdm import tqdm

In [2]:
import selenium
from selenium import webdriver

import typing
from typing import List, Dict

In [3]:
from selenium.webdriver import FirefoxOptions

opts = FirefoxOptions()
opts.add_argument("--headless")
browser = webdriver.Firefox(firefox_options=opts)

  browser = webdriver.Firefox(firefox_options=opts)


In [4]:
# Threading seems almost impossible...

In [5]:
import pandas as pd 

In [6]:
class ImmoWeb():
    """
    This class and it's scrape method can be used to gather all data from the website
    www.immoweb.be.
    """
    
    def __init__(self, driver, counter = 2, debug_mode = False, start_url="https://www.immoweb.be/fr/recherche/maison/a-vendre?countries=BE&page=1&orderBy=relevance"):
        """
        :driver: inputs which driver will be used for scraping.
        :debug_mode: Flag to trigger some print statements, useful for debugging.
        :start_url: Url of the first search front page (sorted by postal code for continuity) .
        """
        self.driver = driver
        self.start_url = start_url
        self.advert_urls = []
        self.advert_details = []
        self.all_data = []
        self.debug = debug_mode
        
        # counter, used for selecting which batch to start from.
        self.counter = counter      
        
    def scrape(self):
        """
        Scrape methods calls other methods in the class in order.
        """
        
        # work in batches of 5
        urls = self.readPagination(5)

        for url in tqdm(self.advert_urls):
            data = self.readAdPage(url)
            self.all_data.append(data)

        return [self.all_data, self.counter]
            
        
    def page_advert_urls(self, page_url, url_piece = "annonce"):
        """
        Method that gathers urls for each individual ad. 
        
        :page_url: a url that links to a page containing several urls of individual ads
        :return: a list containing all the urls to the front pages.
        """
        # Init empty list to return gathered 
        url = []
        
        # call the driver get method with a time delay.
        self.driver.get(page_url)
        time.sleep(random.uniform(0.3, 0.8))
        
        # find all the links to individual ads.
        for elem in self.driver.find_elements_by_tag_name("a"):
            link = elem.get_attribute("href")
            if not link is None and url_piece in link:
                self.advert_urls.append(link)
                url.append(link)
                
        return url
       
    def readPagination(self, batches):
        """
        readPagination gathers all pages that contain independent ads. The first one is hardcoded and 
        aferwards it formats the URL as a string until no new ads are found.
        
        :return: a list of URLs that link to each page of a search query. 
        """
        # initialize a list for storing all the seperate links to individual ads.
        batch = 0
        max_batch = batches
        
        urls = ['init']
        
        # loop until no results are found
        while batch < max_batch:
            
            page_url = f"https://www.immoweb.be/fr/recherche/maison/a-vendre?countries=BE&page={self.counter}&orderBy=postal_code"
            urls = self.page_advert_urls(page_url)
            
            adverts_amount = len(urls)
            self.counter += 1
            batch += 1
            
        return urls
        
            
    def readAdPage(self, inp_url):
        """
        Method that reads all the information on the page and writes it to a dict using the key, 
        value pairs used on the website in question. 
    
        :inp_url: get the input URL that we will be gathering information from.
        :return: a dict that stores scraped data
        """

        # initialize some variables.
        detailsKeys = []
        detailsValues = []
        details = {}

        # grab the URL page code
        self.driver.get(inp_url)

        # find the tables in the page and split data in 2 types, keys and values.
        for desc_list in self.driver.find_elements_by_tag_name("th"):
            if desc_list.text != "":
                detailsKeys.append(desc_list.text)

        for desc_list in self.driver.find_elements_by_tag_name("td"):
            if desc_list.text != "":
                if desc_list.find_elements_by_tag_name("a") == []:
                    if desc_list.find_elements_by_tag_name("span") != []:
                        detail = desc_list.text.split("\n")
                        detailsValues.extend(detail[:-1])
                    else:
                        detailsValues.append(desc_list.text)

        # build a dict containing all gathered data
        try:
            for idx, x in enumerate(detailsKeys):
                details[x] = detailsValues[idx]
        except:
            pass

        self.advert_details.append(details)

        return details
            
            

def threadedScraper(driver, first_index):
    """
    
    """
    scraper = ImmoWeb(driver = driver, debug_mode = True, counter = first_index)
    data, data_read = scraper.scrape()
    dataframe = pd.DataFrame(data)
    filename = f"immoweb/immoweb_batch_{first_index+1}_{data_read}.csv"
    dataframe.to_csv(filename)

In [None]:
# Use following two cells to run the scraper in batches, define the range of batches
# you want to gather in the cell below

In [8]:
data_read = 180 #SET STARTING BATCH HERE
max_batch = 200 #SET ENDING BATCH HERE

In [9]:
while data_read < max_batch:
    driver1 = webdriver.Firefox()
    try: 
        threadedScraper(driver1, data_read)
    except:
        print(f"FAILURE AT PAGE {data_read}")
    
    driver1.quit()
    data_read += 5

100%|██████████| 153/153 [09:46<00:00,  3.84s/it]
100%|██████████| 154/154 [08:31<00:00,  3.32s/it]
 99%|█████████▊| 153/155 [11:36<00:09,  4.55s/it]


FAILURE AT PAGE 190


 97%|█████████▋| 148/153 [08:07<00:16,  3.30s/it]

FAILURE AT PAGE 195





In [None]:
# run the following cell to run in threads. Be careful, because memory usage is very high and can
# and will cause crashes.

In [None]:
first_batch = 0 #SET THE FIRST INDEX OF BATCH YOU WANT TO RUN HERE

In [None]:
driver1 = webdriver.Firefoc()
thread1 = Thread(target=threadedScraper, args=(driver1, first_batch))
thread1.start()

driver2 = webdriver.Firefoc()
thread2 = Thread(target=threadedScraper, args=(driver2, first_batch))
thread2.start()

driver3 = webdriver.Firefoc()
thread3 = Thread(target=threadedScraper, args=(driver3, first_batch))
thread3.start()

driver4 = webdriver.Firefoc()
thread4 = Thread(target=threadedScraper, args=(driver4, first_batch))
thread4.start()

driver5 = webdriver.Firefoc()
thread5 = Thread(target=threadedScraper, args=(driver5, first_batch))
thread5.start()

thread1.join()
thread2.join()
thread3.join()
thread4.join()
thread5.join()

driver1.quit()
driver2.quit()
driver3.quit()
driver4.quit()
driver5.quit()