In [1]:
import json
import time
from io import StringIO

import pandas as pd
import requests
from selenium import webdriver

In [2]:
# Fetch the first page of search results and store the raw "results" list as
# JSON, so the structure of a search-result record can be inspected offline.
r = requests.get(
    "https://www.immoweb.be/en/search-results/house-and-apartment/for-rent?countries=BE&page=1&orderBy=newest",
    timeout=30,  # never hang the kernel on a stalled connection
)
r.raise_for_status()  # surface HTTP errors directly instead of as a JSON/KeyError later

# Parse BEFORE opening the file: opening with "w" truncates it, so a failed
# parse would otherwise leave an empty data/response.json behind.
results = r.json()["results"]
with open("data/response.json", "w", encoding="utf-8") as f:
    json.dump(results, f)

In [2]:
def scrape_ad(driver, url):
    """Load an ad page in the browser and parse every HTML table found on it.

    Parameters
    ----------
    driver : object
        Browser driver exposing ``get(url)`` and a ``page_source`` attribute
        (a Selenium WebDriver in this notebook).
    url : str
        Address of the ad page to load.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per ``<table>`` element in the rendered page.

    Raises
    ------
    ValueError
        Raised by ``pd.read_html`` when the page contains no table.
    """
    driver.get(url)
    # Wrap the markup in a file-like object: passing a literal HTML string to
    # read_html is deprecated in recent pandas releases.
    return pd.read_html(StringIO(driver.page_source))

In [15]:
# --- Configuration ------------------------------------------------------------
SEARCH_URL = 'https://www.immoweb.be/en/search-results/house-and-apartment/for-rent?countries=BE&page={page}&orderBy=newest'
AD_URL = 'https://www.immoweb.be/en/classified/{kind}/for-rent/{locality}/{postalcode}/{ad_id}'
DUMP_PATH = 'data/dump_df.csv'
MAX_PAGES = 15        # number of result pages to walk through (testing limit)
REQUEST_TIMEOUT = 30  # seconds before a search request is abandoned
RETRY_DELAY = 2       # seconds to wait before the second scraping attempt

# Rows of the ad page tables that are kept as columns of the final dataset.
COLUMNS_TO_KEEP = [
    'Number of floors',
    'Building condition',
    'Number of frontages',
    'Outdoor parking spaces',
    'Living room surface',
    'Kitchen type',
    'Kitchen surface',
    'Bedroom 1 surface',
    'Bedroom 2 surface',
    'Bathrooms',
    'Toilets',
    'Furnished',
    'Terrace surface',
    'Elevator',
    'Primary energy consumption',
    'Energy class',
    'Yearly theoretical total energy consumption',
    'Heating type',
    'Double glazing',
    'Dressing room',
    'Shower rooms',
    'Office',
    'Professional space',
    'Armored door',
    'Accessible for disabled people',
    'Intercom',
    'Secure access / alarm',
    'TV cable',
    'Visio phone',
    'Jacuzzi',
    'Sauna',
    'Swimming pool',
    'Internet',
    'Heat pump',
    'Photovoltaic solar panels',
    'Thermic solar panels',
    'Common water heater',
    'Construction year',
    'Surroundings type',
    'Covered parking spaces',
    'Basement',
    'Flood zone type',
    'Small pet-friendly',
    'Living room'
]


def _listing_features(ad):
    """Flatten one search-result record into the feature dict stored per ad.

    Raises KeyError, TypeError or AttributeError when the record does not have
    the expected structure (missing key, ``None`` where a dict/str is expected).
    """
    prop = ad["property"]
    location = prop["location"]
    rental = ad["transaction"]["rental"]
    return {
        "ID": ad["id"],
        "Property type": prop["type"].lower(),
        "Subtype": prop["subtype"].lower(),
        "Bedrooms": prop["bedroomCount"],
        "Rooms": prop["roomCount"],
        "Region": location["region"],
        "Province": location["province"],
        "Locality": location["locality"],
        "Postalcode": location["postalCode"],
        "Street": location["street"],
        # stored as a tuple: (street, number, postal code, locality)
        "Address": (location["street"], location["number"], location["postalCode"], location["locality"]),
        "Floor": location["floor"],
        "Habitable surface": prop["netHabitableSurface"],
        "Rent": rental["monthlyRentalPrice"],
        "Monthly costs": rental["monthlyRentalCosts"],
    }


def _scrape_with_retry(driver, url):
    """Scrape an ad page, retrying once after RETRY_DELAY seconds on failure.

    The exception of the second attempt, if any, propagates to the caller.
    """
    try:
        return scrape_ad(driver, url)
    except Exception:
        time.sleep(RETRY_DELAY)
        return scrape_ad(driver, url)


def _detail_row(tables, url):
    """Turn the tables scraped from one ad page into a single row indexed by url.

    The tables are stacked, their first column becomes the label, duplicated
    labels keep their first occurrence, and only the labels listed in
    COLUMNS_TO_KEEP survive as columns.
    """
    details = pd.concat(tables).reset_index(drop=True)
    details = details.set_index(0)
    details.columns = [url]  # fails (and the ad is skipped) unless exactly one value column remains
    details = details[~details.index.duplicated(keep='first')]
    details = details.T
    return details[[col for col in details.columns if col in COLUMNS_TO_KEEP]]


# Load the dataframe from an existing csv dump, or start from scratch.
# Only "no usable file yet" is tolerated: any other read error is raised so
# that a damaged dump is not silently replaced and overwritten.
try:
    df = pd.read_csv(DUMP_PATH, index_col=0)
except (FileNotFoundError, pd.errors.EmptyDataError):
    df = pd.DataFrame()

driver = webdriver.Chrome()
try:
    for n in range(1, MAX_PAGES + 1):
        try:
            r = requests.get(SEARCH_URL.format(page=n), timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            list_ads = r.json()["results"]
        except (requests.RequestException, ValueError, KeyError):
            print(f"Page {n} not found.")
            break

        if not list_ads:
            # an empty page means the listing is exhausted
            print(f"Page {n} has no results.")
            break

        for ad in list_ads:
            try:
                features = _listing_features(ad)
            except (KeyError, TypeError, AttributeError) as exc:
                print(f'Skipping ad with unexpected structure: {exc!r}')
                continue

            # rebuild the ad URL; it is both the scraping target and the row index
            url = AD_URL.format(
                kind=features["Property type"],
                locality=features["Locality"],
                postalcode=features["Postalcode"],
                ad_id=features["ID"],
            )

            # avoid duplicates
            if url in df.index:
                continue

            # scrape the detail tables with Selenium (best effort, one retry)
            try:
                tables = _scrape_with_retry(driver, url)
            except Exception:
                print(f'Failed to scrape this ad: {url}')
                continue

            # assemble the search-result features and the scraped details
            try:
                listing_row = pd.DataFrame.from_dict({url: features}).T
                ad_row = pd.concat([listing_row, _detail_row(tables, url)], axis=1)
            except Exception as exc:
                print(f'Failed to process this ad: {url} ({exc!r})')
                continue

            # Add the row and checkpoint straight away, so that an interrupted
            # run keeps every ad collected so far.
            df = pd.concat([df, ad_row])
            df.to_csv(DUMP_PATH)
finally:
    # quit() ends the whole browser session, and runs even when the loop is
    # interrupted or raises.
    driver.quit()

df.shape

(571, 59)