In [2]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
# --- request configuration ---------------------------------------------------
root_url = 'https://www.immoweb.be/en'
headers = {
    # browser-like User-Agent sent with every request
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/108.0.0.0 Safari/537.36'
    ),
}

# --- run bookkeeping -----------------------------------------------------------
# start_run is the printable label, start is kept as a datetime so the
# run time can be computed when the data is saved.
start_run = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
start = datetime.now()
print(f"Started at {start_run}")

# --- search scope ----------------------------------------------------------------
# The search url below lists properties of one type only, restricted to BE and
# excluding life annuity sales (hence the 2 filters in the query string),
# sorted by newest first, with the site language set to English.
# A result page seemingly always shows 30 properties.
type_of_property = 'house'  # 'house' or 'apartment'

# Number of result pages to walk through (range(333) would mean 333 pages).
# The real page count could be checked on the first page via the last 'li' tag
# with class "pagination__item" (to be confirmed).
pages = range(1)

# Possible improvement: follow the <next page> button instead of issuing a new
# request per page number. The result list changes continuously, so the
# current approach may pick up duplicates.
listing_url_pages = []
listing_url_properties = []
for i in pages:
    page = i + 1  # page numbers in the url are 1-based
    page_url = (
        f"https://www.immoweb.be/en/search/{type_of_property}/for-sale"
        f"?countries=BE&isALifeAnnuitySale=false&page={page}"
        f"&orderBy=relevance&orderBy=newest"
    )
    listing_url_pages.append(page_url)

print(f"Number of page-urls retrieved : {len(listing_url_pages)}")
# with this search url, only properties of the selected type are looked up
print(f"Type_of_property retrieved : {type_of_property}")


Started at 2024-10-11_14-59-22
Number of page-urls retrieved : 1
Type_of_property retrieved : house


In [3]:
def list_propertie_urls(page_url):
    """Return the property-urls found on one listing page of immoweb.be/en.

    Parameters:
        page_url: url of a listing (search result) page.

    Returns:
        A list with the 'href' of the title link of every result card on the
        page, in page order. Cards that carry no title link are skipped.

    Raises:
        requests.exceptions.RequestException: when the page cannot be fetched
            (including when the server does not answer within the timeout).
    """
    # A timeout keeps a single stalled connection from blocking the whole run.
    req_1_page = requests.get(page_url, headers=headers, timeout=30)
    page_soup = BeautifulSoup(req_1_page.text, 'html.parser')
    list_1_page = []
    for listing in page_soup.find_all('article', class_='card--result'):
        title_link = listing.find('a', class_='card__title-link', href=True)
        # find() returns None when the card has no matching link; indexing
        # None would raise a TypeError and lose the rest of the page.
        if title_link is not None:
            list_1_page.append(title_link['href'])
    return list_1_page

#list_propertie_urls(listing_url_pages[-1]) # test when on last page

In [4]:
# Loop over the listing pages and collect the urls of the individual properties
# shown on those pages.
# The list is rebuilt from scratch, so re-running this cell does not append the
# same urls a second time, and a url that shows up on more than one page (the
# search results can shift between requests) is kept only once.
collected_urls = []
for i in listing_url_pages:
    collected_urls += list_propertie_urls(i)
# dict.fromkeys drops duplicates while preserving first-seen order
listing_url_properties = list(dict.fromkeys(collected_urls))
Numbre_of_individual_properties = len(listing_url_properties)
print(f"nbr of individual properties (urls) retrieved : {Numbre_of_individual_properties}")

nbr of individual properties (urls) retrieved : 30


In [5]:
def list_properties_detail(property_url):
    """Return a dictionary of attributes of 1 specific property.

    Parameters:
        property_url: url of the detail page of one property.

    Returns:
        A dict holding whichever of these could be read from the page:
        "Property_ID", "Price" and the table rows listed in relevant_keys,
        plus 'Postal_Code', 'Locality_Name', 'extra' and 'Subtype_of_property'
        taken from the url itself and 'Type_of_property' taken from the
        module-level type_of_property.
        None when the url does not answer with status code 200.

    Raises:
        requests.exceptions.RequestException: when the page cannot be fetched
            (including when the server does not answer within the timeout).
    """
    # Table headers worth keeping; every other row is ignored.
    relevant_keys = {
        "Living area", "Terrace surface", "Swimming pool", "Number of frontages",
        "Bedrooms", "How many fireplaces?", "Kitchen type", "Garden surface",
        "Type of building", "Furnished",
    }

    # A timeout keeps a single stalled connection from blocking the whole run.
    req_1_house = requests.get(property_url, headers=headers, timeout=30)
    if req_1_house.status_code != 200:
        # url no longer valid: callers filter out the None
        return None

    soup = BeautifulSoup(req_1_house.text, 'html.parser')
    listing_data = {}

    ### ID ###
    code_container = soup.find('div', attrs={"class": "classified__header--immoweb-code"})
    if code_container is not None:
        # the id is whatever follows the last ':' in the container text
        listing_data["Property_ID"] = code_container.get_text().split(":")[-1].strip()

    ### PRICE ###
    price_container = soup.find('p', attrs={"class": "classified__price"})
    if price_container is not None:
        sr_only_span = price_container.find('span', attrs={"class": "sr-only"})
        if sr_only_span is not None:
            listing_data["Price"] = sr_only_span.text

    ### MISC ### everything that was under subtitles like interior, exterior, facilities, ...
    ### at same level, selected together, then filtered by the relevant attributes
    for row in soup.find_all('tr', attrs={"class": 'classified-table__row'}):
        key_field = _leading_cell_text(row.find('th', attrs={"class": 'classified-table__header'}))
        value_field = _leading_cell_text(row.find('td', attrs={"class": 'classified-table__data'}))
        # A row missing its header or data cell is skipped; otherwise the
        # key/value of the previous row would be stored a second time.
        if key_field is None or value_field is None:
            continue
        if key_field in relevant_keys:
            listing_data[key_field] = value_field

    ### MISC2 - extracted out of the input url ###
    parts = property_url.split('/')
    url_fields = (
        ('Postal_Code', 8),
        ('Locality_Name', 7),
        ('extra', 6),
        ('Subtype_of_property', 5),
    )
    for field_name, position in url_fields:
        # None instead of an IndexError when the url has fewer segments
        listing_data[field_name] = parts[position] if position < len(parts) else None
    listing_data['Type_of_property'] = type_of_property
    return listing_data


def _leading_cell_text(cell):
    """Return the stripped text of the first child of a table cell, or None when the cell is missing or empty."""
    if cell is None or not cell.contents:
        return None
    first_child = cell.contents[0]
    if isinstance(first_child, str):
        # plain text node (bs4 text nodes are str subclasses)
        return first_child.strip()
    # the first child is a nested tag: use its text instead of failing on .strip()
    return first_child.get_text(strip=True)

#list_properties_detail('https://www.immoweb.be/en/classified/villa/for-sale/braine-lalleud/1421/20237028') # for testing only

In [6]:
# Scrape every collected property url in turn. Each call produces the
# attributes of one property as a dictionary, and all of those results are
# gathered in a single list.
listing_properties_detail = []
for i, j in enumerate(listing_url_properties):
    # progress indicator: show every 100th (index, url) pair while running
    if not i % 100:
        print((i, j))
    property_attributes = list_properties_detail(j)
    listing_properties_detail.append(property_attributes)

(0, 'https://www.immoweb.be/en/classified/house/for-sale/overijse/3090/20241711')


In [7]:
### SAVING THE DATA ###
import json
import os

timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
end = datetime.now()
run_time = end-start
print(run_time)

### save data with json
# Relative path and file name with timestamp. os.path.join builds the path with
# the separator of the running OS; a literal 'data\raw\...' is only a directory
# path on Windows.
filename = os.path.join('data', 'raw', f'{timestamp}_{type_of_property}_{Numbre_of_individual_properties}.json')
# open(..., 'w') does not create missing directories
os.makedirs(os.path.dirname(filename), exist_ok=True)


def save(file_to_be_save, path=filename):
    """Write file_to_be_save as JSON.

    Parameters:
        file_to_be_save: JSON-serialisable object to store.
        path: target file; defaults to the JSON file name computed above
            (bound when this cell runs, so a later re-assignment of
            `filename` does not redirect the output).
    """
    with open(path, 'w', encoding='utf-8') as f:
        # dump the argument itself rather than a module-level variable
        json.dump(file_to_be_save, f)


save(listing_properties_detail)

0:00:08.045381


In [8]:
### save data after df conversion to csv format
import os

import pandas as pd

# list_properties_detail gives None for urls that did not answer with status
# 200; those entries are dropped before building the dataframe.
cleaned_listing_properties_detail = [item for item in listing_properties_detail if item is not None]
df = pd.DataFrame(cleaned_listing_properties_detail)
print(f"shape of the dataframe is {df.shape} with: (1)the number of properties -in rows, (2) the number of attributes -in columns")

# os.path.join builds the path with the separator of the running OS; a literal
# 'data\raw\...' is only a directory path on Windows.
filename = os.path.join('data', 'raw', f'{timestamp}_{type_of_property}_{Numbre_of_individual_properties}.csv')
# to_csv does not create missing directories
os.makedirs(os.path.dirname(filename), exist_ok=True)
df.to_csv(filename, index=False)

shape of the dataframe is (30, 16) with: (1)the number of properties -in rows, (2) the number of attributes -in columns


In [9]:
# Preview the first four rows of the dataframe.
df.head(4)

Unnamed: 0,Property_ID,Price,Number of frontages,Living area,Kitchen type,Bedrooms,Garden surface,Terrace surface,Postal_Code,Locality_Name,extra,Subtype_of_property,Type_of_property,Type of building,How many fireplaces?,Furnished
0,20241711,599500€,4,160,Hyper equipped,3,1482.0,60.0,3090,overijse,for-sale,house,house,,,
1,20241707,559000€,3,228,Installed,5,,,2950,kapellen,for-sale,house,house,,,
2,20241700,160000€,2,227,Installed,3,,,5500,dinant,for-sale,house,house,,,
3,20241699,469000€,3,225,,4,,,2570,duffel,for-sale,house,house,All kind,,


In [10]:
# Print a per-column summary of the dataframe (non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Property_ID           30 non-null     object
 1   Price                 30 non-null     object
 2   Number of frontages   25 non-null     object
 3   Living area           25 non-null     object
 4   Kitchen type          20 non-null     object
 5   Bedrooms              29 non-null     object
 6   Garden surface        10 non-null     object
 7   Terrace surface       7 non-null      object
 8   Postal_Code           30 non-null     object
 9   Locality_Name         30 non-null     object
 10  extra                 30 non-null     object
 11  Subtype_of_property   30 non-null     object
 12  Type_of_property      30 non-null     object
 13  Type of building      6 non-null      object
 14  How many fireplaces?  5 non-null      object
 15  Furnished             6 non-null      obje