In [5]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from selenium import webdriver

In [2]:
def scrape_ad(driver, url):
    """Open an ad page in the browser and parse every HTML table on it.

    Args:
        driver: Browser driver; only ``get(url)`` and the ``page_source``
            attribute are used.
        url: Address of the ad page to load.

    Returns:
        The list of DataFrames produced by ``pandas.read_html`` for the
        rendered page source (one per table found).

    Raises:
        Any exception from ``driver.get`` or ``pandas.read_html`` is
        propagated; ``read_html`` fails rather than returning an empty list
        when the page contains no table.
    """
    driver.get(url)
    # Passing literal HTML as a plain str is deprecated since pandas 2.1;
    # a file-like wrapper is accepted by both old and new versions.
    return pd.read_html(StringIO(driver.page_source))

In [4]:
# Collect the newest rental ads: the search endpoint supplies the core fields
# of each ad, and the ad page itself (loaded with Selenium) supplies the detail
# tables. Rows are indexed by the ad URL and appended to a CSV dump so that a
# later run skips ads that were already collected.

CSV_PATH = 'data/dump_df_full.csv'

# Number of result pages to walk through (kept small for testing).
# Set to None to continue until the search endpoint returns an empty page.
MAX_PAGES = 3

# Rows of the ad-page tables that are kept as columns; everything else is dropped.
columns_to_keep = [
    'Number of floors',
    'Building condition',
    'Number of frontages',
    'Outdoor parking spaces',
    'Living room surface',
    'Kitchen type',
    'Kitchen surface',
    'Bedroom 1 surface',
    'Bedroom 2 surface',
    'Bathrooms',
    'Toilets',
    'Furnished',
    'Terrace surface',
    'Elevator',
    'Primary energy consumption',
    'Energy class',
    'Yearly theoretical total energy consumption',
    'Heating type',
    'Double glazing',
    'Dressing room',
    'Shower rooms',
    'Office',
    'Professional space',
    'Armored door',
    'Accessible for disabled people',
    'Intercom',
    'Secure access / alarm',
    'TV cable',
    'Visio phone',
    'Jacuzzi',
    'Sauna',
    'Swimming pool',
    'Internet',
    'Heat pump',
    'Photovoltaic solar panels',
    'Thermic solar panels',
    'Common water heater',
    'Type of building',
    'Construction year',
    'Surroundings type',
    'Covered parking spaces',
    'Basement',
    'Flood zone type',
    'Small pet-friendly',
    'Living room'
]

# load dataframe from existing csv file or create new one from scratch.
# Only "no usable file yet" starts a fresh frame: any other failure must
# surface, otherwise the existing dump would be silently overwritten below.
try:
    df = pd.read_csv(CSV_PATH, index_col=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = pd.DataFrame()

driver = webdriver.Chrome()
try:
    # set page number
    n = 1

    while MAX_PAGES is None or n <= MAX_PAGES:
        r = requests.get(
            f'https://www.immoweb.be/en/search-results/house-and-apartment/for-rent?countries=BE&page={n}&orderBy=newest',
            timeout=30,
        )
        r.raise_for_status()
        list_ads = r.json()["results"]

        # an empty page means we walked past the last page of results
        if not list_ads:
            break

        for ad in list_ads:

            # collect the fields delivered by the search endpoint
            location = ad["property"]["location"]
            rental = ad["transaction"]["rental"]
            features = {
                "API id": ad["id"],
                "API property type": ad["property"]["type"].lower(),
                "API subtype": ad["property"]["subtype"].lower(),
                "API number bedrooms": ad["property"]["bedroomCount"],
                "API number rooms": ad["property"]["roomCount"],
                "API region": location["region"],
                "API province": location["province"],
                "API locality": location["locality"],
                "API postalcode": location["postalCode"],
                "API street": location["street"],
                "API full address": (
                    location["street"],
                    location["number"],
                    location["postalCode"],
                    location["locality"],
                ),
                "API floor": location["floor"],
                "API net habitable surface": ad["property"]["netHabitableSurface"],
                "API rent": rental["monthlyRentalPrice"],
                "API monthly costs": rental["monthlyRentalCosts"],
            }

            # rebuild URL for indexing and scraping with Selenium
            url = f'https://www.immoweb.be/en/classified/{features["API property type"]}/for-rent/{features["API locality"]}/{features["API postalcode"]}/{features["API id"]}'

            # avoid duplicates
            if url in df.index:
                continue

            # one-row frame with the API fields, indexed by the ad URL
            api_row = pd.DataFrame.from_dict({url: features}).T

            # scrape the ad page; one retry after a short pause, then give up.
            # `except Exception` (not a bare except) keeps Ctrl-C / kernel
            # interrupt working while the loop is running.
            tables = None
            for attempt in range(2):
                try:
                    tables = scrape_ad(driver, url)
                    break
                except Exception:
                    if attempt == 0:
                        time.sleep(3)
                    else:
                        print(f'Failed to scrape this ad: {url}')

            if tables is not None:
                # stack all tables, use their first column as labels and the
                # remaining one as the values for this ad
                details = pd.concat(tables).reset_index(drop=True)
                details = details.set_index(0)
                details.columns = [url]
                # the same label can occur in several tables: keep the first
                details = details[~details.index.duplicated(keep='first')]
                details = details.T

                # filter columns
                columns = [col for col in details.columns if col in columns_to_keep]
                details = details[columns]

                # assemble API fields and scraped details into one row
                new_row = pd.concat([api_row, details], axis=1)
            else:
                # scraping failed twice: keep at least the API fields
                new_row = api_row

            # add data in df
            df = pd.concat([df, new_row])

            # save after every ad so an interrupted run loses nothing
            df.to_csv(CSV_PATH)

        # go to next page
        n += 1
finally:
    # quit() ends the whole browser session (close() only closes the window),
    # and the finally guarantees it also happens when the loop fails.
    driver.quit()

df.shape # expect 30 ads per page and 60 columns

(90, 60)

In [8]:
# Show the collected ads; pandas abbreviates long and wide frames in the output.
df

Unnamed: 0_level_0,API floor,API full address,API id,API locality,API monthly costs,API net habitable surface,API number bedrooms,API number rooms,API postalcode,API property type,...,Jacuzzi,Sauna,Swimming pool,Internet,Photovoltaic solar panels,Thermic solar panels,Common water heater,Type of building,Flood zone type,Small pet-friendly
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
https://www.immoweb.be/en/classified/apartment/for-rent/Etterbeek/1040/10015891,0.0,"('Avenue Milcamps', '4', '1040', 'Etterbeek')",10015891,Etterbeek,,60.0,1,,1040,apartment,...,,,,,,,,,,
https://www.immoweb.be/en/classified/apartment/for-rent/Macquenoise/6593/10015889,1.0,"('La Distillerie', '1', '6593', 'Macquenoise')",10015889,Macquenoise,50.0,130.0,3,,6593,apartment,...,,,,,,,,,,
https://www.immoweb.be/en/classified/apartment/for-rent/Anderlecht/1070/10015880,7.0,"(None, None, '1070', 'Anderlecht')",10015880,Anderlecht,200.0,120.0,3,,1070,apartment,...,,,,,,,,,,
https://www.immoweb.be/en/classified/apartment/for-rent/Herselt/2230/9961742,1.0,"('Dorp', '78', '2230', 'Herselt')",9961742,Herselt,25.0,150.0,3,,2230,apartment,...,No,No,No,No,No,No,No,All kind,Non flood zone,
https://www.immoweb.be/en/classified/apartment/for-rent/Mons/7000/10015876,2.0,"('Rue de La Halle', '17C/2.3', '7000', 'Mons')",10015876,Mons,,87.0,2,,7000,apartment,...,,,,Yes,,,No,Apartment building,Non flood zone,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
https://www.immoweb.be/en/classified/apartment/for-rent/Wetteren/9230/10015665,0.0,"('Mellepontweg', '9,', '9230', 'Wetteren')",10015665,Wetteren,50.0,,2,,9230,apartment,...,,,No,,,,,,,
https://www.immoweb.be/en/classified/apartment/for-rent/Etterbeek/1040/10015664,4.0,"('Rue Batonnier Braffort', '39', '1040', 'Ette...",10015664,Etterbeek,120.0,82.0,2,,1040,apartment,...,,,,,,,,,,
https://www.immoweb.be/en/classified/house/for-rent/Gistel/8470/10015662,,"('Postwegel', '2,', '8470', 'Gistel')",10015662,Gistel,,110.0,0,,8470,house,...,,,,,,,,,,
https://www.immoweb.be/en/classified/house/for-rent/Hooglede/8830/10015661,,"('Beverenstraat', '12,', '8830', 'Hooglede')",10015661,Hooglede,,150.0,3,,8830,house,...,,,,,,,,,,
