#  Scraper
Our goal in this notebook is to scrape data for house sales in the city of Thessaloniki. We will scrape the data from the biggest Greek website for house sales: spitogatos.gr. First we import all the necessary libraries. We are going to use Selenium for the scraping.


In [1]:
import pandas as pd
import os
import httpx
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import time
import datetime
import logging
from fake_useragent import UserAgent

We create a Selenium WebDriver instance (a Chrome browser session).

In [2]:
# Start a Chrome browser session controlled through Selenium.
driver = webdriver.Chrome()


We create a dictionary whose keys are the years from 1952 to 2023 and whose values are the URLs of the search pages listing the houses for sale in Thessaloniki that were built in that year.

In [3]:
# Map each construction year to its search URL; the same year is used as both the
# lower ("apo") and upper ("eos") bound of the construction-year filter.
html_files_years1952_2023 = {
    build_year: f"https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-{build_year}/etos_kataskevis_eos-{build_year}?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14"
    for build_year in range(1952, 2024)
}
html_files_years1952_2023

{1952: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1952/etos_kataskevis_eos-1952?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 1953: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1953/etos_kataskevis_eos-1953?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 1954: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1954/etos_kataskevis_eos-1954?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 

The next function takes a WebDriver that is showing one results page of the spitogatos website (30 houses per page, e.g. the houses built in a specific year) and returns a list with one row per house containing the attributes location, price, area, description, floor number, number of rooms, number of bathrooms and submission date.

In [None]:
def attributes2(sel_webdriver,df_base):
    """
    Extract the attributes of every house listed on the page the driver is showing.

    Waits up to 10 seconds for the listing tiles to become visible. If that wait
    fails (presumably because a captcha is blocking the page), the user is asked
    to solve it and the wait is retried; the rows scraped after the retry are
    returned like any others.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance showing a results page.
        df_base (DataFrame): Unused; kept so existing callers keep working.

    Returns:
        list[list]: One row per listing, in the order
        [location, price, area, description, floor, rooms, bathrooms, submission_date].
        A field that cannot be read is np.nan; the other fields of that listing are kept.
    """
    ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,TimeoutException)

    # Loop (instead of recursing) until the listing tiles are visible. Only Selenium
    # errors trigger the prompt, so Ctrl-C still interrupts the scrape.
    while True:
        try:
            entries=WebDriverWait(sel_webdriver, 10,ignored_exceptions=ignored_exceptions)\
                        .until(expected_conditions.visibility_of_all_elements_located((By.XPATH, "//article[@class='ordered-element']")))
            break
        except WebDriverException:
            input("Solve the captcha and press enter to continue")

    # The XPath of each attribute was found by inspecting the html code of the website.
    houses_attributes=[]
    for i, house in enumerate(entries, start=1):
        floor=_text_or_nan(house, ".//ul[@class='tile__info']/li[1]/span/span")
        rooms=_text_or_nan(house, ".//ul[@class='tile__info']/li[2]/span/span")
        bathrooms=_text_or_nan(house, ".//ul[@class='tile__info']/li[3]/span/span")
        price=_text_or_nan(house, ".//div[@class='tile__price']/p")
        description_and_area=_text_or_nan(house, ".//div[@class='content__top']/h3[1]")
        location=_text_or_nan(house, ".//div[@class='content__top']/h3[2]")
        submission_date=_text_or_nan(house, ".//p[@class='tile__updated']/time")

        # The title reads "<description>,<area>"; tolerate a missing title or a missing comma.
        if isinstance(description_and_area, str):
            title_parts=description_and_area.split(",")
            description=title_parts[0]
            area=title_parts[1] if len(title_parts)>1 else np.nan
        else:
            description=np.nan
            area=np.nan

        houses_attributes.append([location,price,area,description,floor,rooms,bathrooms,submission_date])
        if i%10==0:
            print(f"We scraped entry number: \033[91m{i}\033[0m")
    return houses_attributes


def _text_or_nan(house, xpath):
    """Return the text of the first element under `house` matching `xpath`, or np.nan if Selenium cannot read it."""
    try:
        return house.find_element(By.XPATH, xpath).text
    except WebDriverException:
        # Missing or stale element: record the gap instead of discarding the whole row.
        return np.nan

The next function takes a webdriver element and a year and takes the webdriver element to the correct page to scrape.


In [5]:
def get_driver(sel_webdriver,year):
    """
    Navigate the browser to the search page registered for one construction year.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance to navigate.
        year: Key into ``html_files_years1952_2023`` selecting the URL to open.
    """
    target_url = html_files_years1952_2023[year]
    sel_webdriver.get(target_url)
    

The next function takes a webdriver element which is configured in the correct webpage containing 30 houses and clicks the next button so that the next 30 houses for that year are loaded.

In [6]:
def click(sel_webdriver,retries=3):
    """
    Click the last pagination arrow to load the next batch of entries.

    The arrow is looked up again on every attempt, so a retry never reuses an
    element reference from a previous (possibly stale) lookup, and a missing
    arrow counts as a failed attempt instead of raising IndexError.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance.
        retries (int): Maximum number of attempts.

    Returns:
        bool: True if a click succeeded, False if every attempt failed.
    """
    for attempt in range(1, retries + 1):
        try:
            arrows=sel_webdriver.find_elements(By.XPATH,"//ul[@class='pagination b-pagination']/li[@class='page-item page-arrow']")
            if not arrows:
                logging.warning(f"Click failed on attempt {attempt}: pagination arrow not found")
                continue
            # The last matching arrow is the one the original code clicks to advance.
            arrows[-1].click()
            return True
        except WebDriverException as e:
            logging.warning(f"Click failed on attempt {attempt}: {e}")

    logging.error("Failed to click after multiple attempts.")
    return False

        
    

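The next function takes a WebDriver, a DataFrame and, optionally, the name of a filter attribute. It scrapes the information for all house entries on every results page, starting from the page the driver is currently on, and puts it in a pandas DataFrame. When an attribute is given, it also saves the data to CSV files in batches while it runs.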

The general algorithm will work as follows:
1. First we load the first page of all houses built in a specific year. Each of the pages in this set contains 30 houses and we need to scrape all pages.
2. We find the number of pages in the html file of the webpage by doing a .find_elements command.
3. We call the attributes2 function on the driver we loaded on the first step. We append the data (including the year) to our initial dataframe.
4. If there is more than one page of houses built on that specific year we load the next page with the click function.
5. We repeat step 4 until all pages for that year have been scraped.
6. Finally we export the data as a csv file.

This concludes the scraping process for a specific year.

In [None]:
def many_page_scrape(sel_webdriver,df,attribute=None):
    """
    Scrape every results page from the current one to the last and collect the rows.

    The page the driver is showing is scraped first; the "next" arrow is then
    clicked once per remaining page. When ``attribute`` is given, the collected
    rows are written to CSV every time the loop counter is a multiple of 50
    (after which the in-memory frame is emptied) and whatever is left is written
    to ``Houses_data_lastbatch.csv`` at the end, including when there is only
    one page to scrape.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance showing a results page.
        df (DataFrame): Base DataFrame the scraped rows are appended to.
        attribute (str | None): Filter name used in the output folder name;
            None disables all CSV output.

    Returns:
        DataFrame: The rows collected since the last 50-page checkpoint
        (all rows when ``attribute`` is None).
    """
    columns=["Location", "Price","Total_area","House_type","Floor","Rooms","Bathrooms","submission_date"]
    # Evaluate the date once so a run that crosses midnight keeps writing to the same folder.
    batch_folder=f"/home/tsantaris/OneDrive/Data science and AI stuff/Project spitogatos/Houses_data_{datetime.date.today()}/Houses_data_{attribute}"

    #2. We find the number of pages in the html file of the webpage by doing a .find_elements command.
    page_num,current_page=find_pages(sel_webdriver)
    print(f"the number of total pages to scrape is:{page_num}")

    #3. We call the attributes2 function on the page that is already loaded and append the data to our initial dataframe.
    df1=pd.DataFrame(attributes2(sel_webdriver,df),columns=columns)
    df=pd.concat([df,df1],ignore_index=True)

    #4./5. Load and scrape each remaining page; the range is empty when there is nothing left to load.
    for page in range(current_page,page_num):
        print(f"We scraped page {page} of {page_num}")
        click(sel_webdriver)
        # Give the next page time to render before reading it.
        time.sleep(3)
        df1=pd.DataFrame(attributes2(sel_webdriver,df),columns=columns)
        df=pd.concat([df,df1],ignore_index=True)

        if page%50==0 and attribute is not None:
            # Checkpoint: persist what we have and start a fresh batch to bound memory and data loss.
            df=df.drop_duplicates().reset_index(drop=True)
            df.to_csv(f"{batch_folder}/Houses_data_page{page-50}_to{page}.csv")
            df=df.iloc[0:0]

    if attribute is not None:
        # Written after the loop so single-page results (or a resume on the last page) are saved too.
        df=df.drop_duplicates().reset_index(drop=True)
        df.to_csv(f"{batch_folder}/Houses_data_lastbatch.csv")
    return df

In [59]:
def find_pages(sel_webdriver):
    """
    Find the total number of result pages and the page currently shown.

    The total is read from the second-to-last ``page-link`` anchor (the last one
    is presumably the "next" arrow), and the current page from the active
    pagination item. When fewer than two page links exist, the result is
    treated as a single page; the active item is only looked up when pagination
    is present, so a page without pagination no longer raises.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance.

    Returns:
        tuple[int, int]: ``(page_num, current_page)``.
    """
    pages=sel_webdriver.find_elements(By.XPATH,"//a[@class='page-link']")
    if len(pages)<2:
        # No usable pagination: everything fits on the page already loaded.
        return 1, 1
    page_num=int(pages[-2].text)
    current_page=int(sel_webdriver.find_element(By.XPATH,"//li[@class='page-item active']").text)
    return page_num, current_page

Now we want to do the same for a whole range of years. The dictionary above covers 1952 to 2023; the cell below runs the years 2016 to 2023.

In [None]:
import os

def main(years=None):
    """
    Scrape the houses built in each year and save one CSV file per year.

    Uses the module-level ``driver``. The output folder is the same dated folder
    the CSV files are written to, and it is created before scraping starts so a
    folder problem surfaces immediately.

    Args:
        years (iterable of int | None): Construction years to scrape.
            Defaults to 2016 through 2023.
    """
    if years is None:
        years = range(2016, 2024)

    # Evaluate the date once so folder and file names agree even across midnight.
    today = datetime.date.today()
    folder_name = f"/home/tsantaris/OneDrive/Data science and AI stuff/Project spitogatos/Houses_data_{today}"

    try:
        os.makedirs(folder_name)
        print(f"Folder '{folder_name}' created successfully.")
    except FileExistsError:
        print(f"Folder '{folder_name}' already exists.")

    columns = ["Location", "Price","Total_area","House_type","Floor","Rooms","Bathrooms","submission_date"]
    for year in years:
        #1. First we load the webpage which has all houses built in a specific year. Each of its pages contains 30 houses and we need to scrape all pages.
        get_driver(driver,year)
        df = pd.DataFrame(columns=columns)
        # many_page_scrape takes (driver, dataframe); the year is added as a column below.
        df = many_page_scrape(driver, df)
        print(df.shape)

        path = f"{folder_name}/Houses build in {year}_scraped on {today}.csv"

        df = df.drop_duplicates().reset_index(drop=True)
        df["Year_of_construction"] = year
        df.to_csv(path)
        print(f"Data for year {year} saved")
     




# Run the per-year scrape; main() drives the module-level `driver`.
main()

In [8]:
# Each dictionary maps a readable filter name to the slug that is inserted
# into the search URL for that filter.

# How the heating is operated.
list_of_attributes_heating = {
    "autonomous_heating": "autonomi_thermansi",
    "central_heating": "kentriki_thermansi",
    "individual_heating": "atomiki_thermansi",
    "no_heating": "xwris_thermansi",
}

# Energy source of the heating.
list_of_attributes_kind_of_heating = {
    "petrol_heating": "thermansi_petrelaio",
    "natural_gas_heating": "thermansi_fisiko_aerio",
    "LPG_heating": "thermansi_igraerio",
    "electrical_heating": "thermansi_revma",
    "thermal_storage_heating": "thermansi_thermosisswreftis",
    "wood_headting": "thermansi_sompa",
    "pellet_heating": "thermansi_pellet",
    "heat_pump_heating": "thermansi_antlia_thermansis",
}

# Remaining amenities.
list_of_attributes = {
    "with_AC": "me_klimatismo",
    "with_storage_room": "me_apothiki",
    "with_elavator": "me_anelkistira",
    "with_solar_heater": "me_iliako_thermosifona",
    "with_fireplace": "me_tzaki",
    "Furnished": "epiplwmeno",
    "with_parking": "me_garage",
    "with_garden": "me_kipo",
    "with_pool": "me_pisina",
    "with_balcony": "me_mpalkoni",
    "last_floor": "retire",
}

# All filters in one mapping, in the order heating -> kind of heating -> amenities.
filters = {
    **list_of_attributes_heating,
    **list_of_attributes_kind_of_heating,
    **list_of_attributes,
}


In [9]:
# Map each filter name to the search URL that applies that filter
# (same bounding box and zoom as the per-year URLs).
html_files_years_filters = {
    filter_name: f"https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/{url_slug}?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14"
    for filter_name, url_slug in filters.items()
}
html_files_years_filters

{'autonomous_heating': 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/autonomi_thermansi?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 'central_heating': 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/kentriki_thermansi?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 'individual_heating': 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/atomiki_thermansi?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 'no_heating': 'https://www.spitogatos.gr/pwliseis

In [10]:
def get_driver2(sel_webdriver,attribute):
    """
    Navigate the browser to the search page registered for one filter attribute.

    Args:
        sel_webdriver (WebDriver): Selenium WebDriver instance to navigate.
        attribute: Key into ``html_files_years_filters`` selecting the URL to open.
    """
    target_url = html_files_years_filters[attribute]
    sel_webdriver.get(target_url)
    

In [47]:
# Optional user-agent override, currently disabled:
# ua = UserAgent()
# user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) HeadlessChrome/87.0.4280.88 Safari/537.36-816"
# options = webdriver.ChromeOptions()
# options.add_argument(f'user-agent={user_agent}')

# Close the browser started earlier (if any) before rebinding `driver`, so re-running
# this cell does not leave orphaned Chrome processes behind.
try:
    driver.quit()
except (NameError, WebDriverException):
    # No previous driver, or its session is already gone: nothing to clean up.
    pass
driver = webdriver.Chrome()


In [None]:

def main2(attribute,dont_start_at_first_page=False):
    """
    Scrape all listings matching one filter attribute, saving CSV batches as it goes.

    Uses the module-level ``driver``. The dated output folder and the
    per-attribute sub-folder are created before scraping starts; an error while
    creating them is raised immediately rather than surfacing later at the
    first CSV write.

    Args:
        attribute (str): Filter name; used in the output folder name and, unless
            ``dont_start_at_first_page`` is set, as the key of the URL to open.
        dont_start_at_first_page (bool): When True the browser is left on the
            page it is currently showing and scraping continues from there.

    Returns:
        DataFrame: The frame returned by ``many_page_scrape`` for this attribute.
    """
    if not dont_start_at_first_page:
        get_driver2(driver,attribute)

    folder_name=f"/home/tsantaris/OneDrive/Data science and AI stuff/Project spitogatos/Houses_data_{datetime.date.today()}"
    attribute_folder=f"{folder_name}/Houses_data_{attribute}"

    # Create the dated folder first, then the attribute folder inside it.
    for folder, label in ((folder_name, f"Folder '{folder_name}'"),
                          (attribute_folder, f"Folder Houses_data_{attribute}")):
        try:
            os.makedirs(folder)
            print(f"{label} created successfully.")
        except FileExistsError:
            print(f"{label} already exists.")

    print(f"We are scraping attribute {attribute}")

    df=pd.DataFrame(columns=["Location", "Price","Total_area","House_type","Floor","Rooms","Bathrooms","submission_date"])
    return many_page_scrape(driver,df,attribute=attribute)
    

        


In [39]:
# Continue the "autonomous_heating" scrape from the page the browser is already on
# (dont_start_at_first_page=True skips loading the filter's first page).
main2("autonomous_heating",dont_start_at_first_page=True)

Folder '/home/tsantaris/OneDrive/Data science and AI stuff/Project spitogatos/Houses_data_2025-05-26' already exists.
We are scraping attribute autonomous_heating
[91m10[0m
[91m20[0m
[91m30[0m
901
We are on page 801 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 802 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 803 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 804 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 805 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 806 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 807 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 808 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 809 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 810 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 811 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 812 of 901
[91m10[0m
[91m20[0m
[91m30[0m
We are on page 813 of 901
[91m10[0m
[91m20[0m
[9

In [50]:
# Display the combined filter-name -> URL-slug mapping.
filters

{'autonomous_heating': 'autonomi_thermansi',
 'central_heating': 'kentriki_thermansi',
 'individual_heating': 'atomiki_thermansi',
 'no_heating': 'xwris_thermansi',
 'petrol_heating': 'thermansi_petrelaio',
 'natural_gas_heating': 'thermansi_fisiko_aerio',
 'LPG_heating': 'thermansi_igraerio',
 'electrical_heating': 'thermansi_revma',
 'thermal_storage_heating': 'thermansi_thermosisswreftis',
 'wood_headting': 'thermansi_sompa',
 'pellet_heating': 'thermansi_pellet',
 'heat_pump_heating': 'thermansi_antlia_thermansis',
 'with_AC': 'me_klimatismo',
 'with_storage_room': 'me_apothiki',
 'with_elavator': 'me_anelkistira',
 'with_solar_heater': 'me_iliako_thermosifona',
 'with_fireplace': 'me_tzaki',
 'Furnished': 'epiplwmeno',
 'with_parking': 'me_garage',
 'with_garden': 'me_kipo',
 'with_pool': 'me_pisina',
 'with_balcony': 'me_mpalkoni',
 'last_floor': 'retire'}

In [57]:
# Continue the "with_parking" scrape from the page the browser is already on
# (dont_start_at_first_page=True skips loading the filter's first page).
main2('with_parking',dont_start_at_first_page=True)

Folder '/home/tsantaris/OneDrive/Data science and AI stuff/Project spitogatos/Houses_data_2025-06-05' already exists.
Folder Houses_data_with_parking already exists.
We are scraping attribute with_parking
the number of total pages to scrape is:441
We scraped entry number: [91m10[0m
We scraped entry number: [91m20[0m
We scraped entry number: [91m30[0m
We are on page 101 of 441
We scraped entry number: [91m10[0m
We scraped entry number: [91m20[0m
We scraped entry number: [91m30[0m
We are on page 102 of 441
We scraped entry number: [91m10[0m
We scraped entry number: [91m20[0m
We scraped entry number: [91m30[0m
We are on page 103 of 441
We scraped entry number: [91m10[0m
We scraped entry number: [91m20[0m
We scraped entry number: [91m30[0m
We are on page 104 of 441
We scraped entry number: [91m10[0m
We scraped entry number: [91m20[0m
We scraped entry number: [91m30[0m
We are on page 105 of 441
We scraped entry number: [91m10[0m
We scraped entry number: [91m

TimeoutException: Message: 
