In [8]:
from selenium import webdriver
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import requests
import json
import time
import lxml

In [9]:
def scrape_ad(driver, url):
    driver.get(url)
    return pd.read_html(driver.page_source)

In [10]:
driver = webdriver.Chrome()

# load dataframe from existing csv file or build new one from scratch
try:
    df = pd.read_csv('data/dump_df_combine.csv').set_index('Unnamed: 0')
except:
    df = pd.DataFrame()

# set page number
n = 1

while True:
    try:
        r = requests.get(f'https://www.immoweb.be/en/search-results/house-and-apartment/for-rent?countries=BE&page={n}&orderBy=newest')
        list_ads = r.json()["results"]
        for ad in list_ads:
            data = None

            # rebuild url
            property_type = ad["property"]["type"]
            locality = ad["property"]["location"]["locality"]
            postalcode = ad["property"]["location"]["postalCode"]
            id = ad["id"]
            url = f"https://www.immoweb.be/en/classified/{property_type}/for-rent/{locality}/{postalcode}/{id}"

            # avoid duplicates
            if url in df.index:
                continue

            # scrape ad
            try:
                data = scrape_ad(driver, url)
            except:
                time.sleep(3)
                try:
                    data = scrape_ad(driver, url)
                except:
                    print(f'Failed to scrape this ad: {url}')
            
            # transform data
            if data is not None:
                data = pd.concat(data, axis=0).reset_index(drop=True)
                data = data.set_index(0)
                data.columns = [url]
                data = data[~data.index.duplicated(keep='first')]
                data = data.T
                columns = [col for col in data.columns if isinstance(col, str) and 'Insure this property' not in col]
                df = pd.concat([df, data[columns]])
        
        # store dataframe in csv file
        df.to_csv('data/dump_df_combine.csv')

        # go to next page
        n += 1
        
    except:
        break

    # test on a specific number of pages
    # if n == 4:
        # break

driver.close()
pd.read_csv('data/dump_df_combine.csv').shape

Failed to scrape this ad: https://www.immoweb.be/en/classified/HOUSE/for-rent/Gent/9050/10010384
Failed to scrape this ad: https://www.immoweb.be/en/classified/APARTMENT/for-rent/Namur/5000/10010332
Failed to scrape this ad: https://www.immoweb.be/en/classified/APARTMENT/for-rent/Woluwe-Saint-Lambert / Sint-Lambrechts-Woluwe/1200/10002386
Failed to scrape this ad: https://www.immoweb.be/en/classified/APARTMENT/for-rent/SCHAERBEEK / SCHAARBEEK/1030/9999563
Failed to scrape this ad: https://www.immoweb.be/en/classified/APARTMENT/for-rent/Woluwe-Saint-Lambert / Sint-Lambrechts-Woluwe/1200/9986342
Failed to scrape this ad: https://www.immoweb.be/en/classified/HOUSE/for-rent/Kruisem/Ouwegem/9750/9971376


  pd.read_csv('data/dump_df_combine.csv').shape


(9964, 108)

In [17]:
pd.read_csv('data/dump_df_combine.csv').isnull().sum()

  pd.read_csv('data/dump_df_combine.csv').isnull().sum()


Unnamed: 0                                                                                    0
Available date                                                                             4694
Floor                                                                                      3885
Number of floors                                                                           5274
Building condition                                                                         2833
                                                                                           ... 
Insure your purchase Your home insurance, simple, quick and complete Calculate my price    9963
Extra information                                                                          9960
Percentage rented                                                                          9963
Current monthly revenue                                                                    9962
Sea view                                