#  Scraper
Our goal in this notebook is to scrape data for house sales in the city of Thesaloniki. We will scrape data from the biggest Greek website for house sales: spitogatos.gr. First we import all the necessary libraries. We are going to use selenium for the scraping.


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import httpx
from selenium import webdriver
from selenium.webdriver.common.by import By
import numpy as np
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
import asyncio

We initiate a webdriver element

In [2]:
driver=webdriver.Chrome()


We create an empty dataframe which will contain all the scraped data. 

In [3]:
df=pd.DataFrame(columns=["Location", "Price","Total_area","House_type","Floor","Rooms","Bathrooms"])
df

Unnamed: 0,Location,Price,Total_area,House_type,Floor,Rooms,Bathrooms


We create a dictionary with all the years from 1952 to 2023 as keys and all the webpages containing the houses for sale that were built in that year in Thessaloniki.

In [4]:
html_files_years1952_2023={}
for i in range(1952,2023):
    url_solo=f"https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-{i}/etos_kataskevis_eos-{i}?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14" 
    
    
    html_files_years1952_2023[i]=url_solo


#soup=BeautifulSoup(html_file,"html5lib")
html_files_years1952_2023

{1952: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1952/etos_kataskevis_eos-1952?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 1953: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1953/etos_kataskevis_eos-1953?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 1954: 'https://www.spitogatos.gr/pwliseis-katoikies/anazitisi-xarti/etos_kataskevis_apo-1954/etos_kataskevis_eos-1954?gad_source=1&gclid=Cj0KCQjw4MSzBhC8ARIsAPFOuyWdbWoq9MkzYB_9GZ1t8uEdSNmoSlUXErPyIku-g_dip54sBWRoNC0aArmUEALw_wcB&latitudeLow=40.535981&latitudeHigh=40.733210&longitudeLow=22.799034&longitudeHigh=23.120041&zoom=14',
 

The next function takes a webdriver element which is going to be one page from all houses build on a specific year (30 houses per page), in the spitogatos website, and returns a dataframe with the attributes location,price,area,description,floor number,number of rooms, number of bathrooms

In [5]:
def attributes2(sel_webdriver,df_base):
    ignored_exceptions=(NoSuchElementException,StaleElementReferenceException,TimeoutException)
    #this is a list of webdriver elements one  for each house. Note that we wait until all elements have been located.
    entries=WebDriverWait(sel_webdriver, 3,ignored_exceptions=ignored_exceptions)\
                        .until(expected_conditions.presence_of_all_elements_located((By.XPATH, "//article[@class='ordered-element']")))
    # for each house now we want to find the floor, number of rooms etc. Again we wait until all elements have been located. 
    # The exact locations for each attribute can be found by inspecting the html code of the website.
    
    for house in entries:
        try:
            floor=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//ul[@class='tile__info']/li[1]/span/span")))
            
            
            rooms=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//ul[@class='tile__info']/li[2]/span/span")))
            
            bathrooms=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//ul[@class='tile__info']/li[3]/span/span")))
            
            price=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//div[@class='tile__price']/p")))
            
            description_and_area=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//div[@class='content__top']/h3[1]")))
            
            location=WebDriverWait(house, 0.3,ignored_exceptions=ignored_exceptions)\
                            .until(expected_conditions.presence_of_all_elements_located((By.XPATH, ".//div[@class='content__top']/h3[2]")))
        
        #Each of these attributes is a WebDriver element from which we want to extract the html content.
        #Also if the scraper did not find any of these attributes we want their value as np.nan. 
            if len(description_and_area)>0:
                description=description_and_area[0].get_attribute("innerHTML").split(",")[0]
                area=description_and_area[0].get_attribute("innerHTML").split(",")[1]
            else:
                desciption=np.nan
                area=np.nan
                
            if len(floor)>0:
                floor=floor[0].get_attribute("innerHTML")
            else:
                floor=np.nan

            if len(rooms)>0:
                rooms=rooms[0].get_attribute("innerHTML")
            else:
                rooms=np.nan

            if len(bathrooms)>0:
                bathrooms=bathrooms[0].get_attribute("innerHTML")
            else:
                bathrooms=np.nan

            if len(price)>0:
                price=price[0].get_attribute("innerHTML")
            else:
                price=np.nan

            if len(location)>0:
                location=location[0].get_attribute("innerHTML")
            else:
                location=np.nan
        except: 
            description=np.nan
            area=np.nan
            floor=np.nan
            rooms=np.nan
            bathrooms=np.nan
            price=np.nan
            location=np.nan
        #Finallly we append to our empty dataframe the extracted attributes.   
        df0=pd.DataFrame({"Location":location, "Price":price,"Total_area":area,"House_type":description,"Floor":floor,"Rooms":rooms,"Bathrooms":bathrooms},index=[0])
        print(df0)
        df_base=pd.concat([df_base,df0],ignore_index=True)
    return(df_base)


The next function takes a webdriver element and a year and takes the webdriver element to the correct page to scrape.


In [6]:
async def get_driver(sel_webdriver,year):
    sel_webdriver.get(html_files_years1952_2023[year])
    await asyncio.sleep(3)

The next function takes a webdriver element which is configured in the correct webpage containing 30 houses and clicks the next button so that the next 30 houses for that year are loaded.

In [7]:
async def click(sel_webdriver):
    page1=sel_webdriver.find_elements(By.XPATH,"//ul[@class='pagination b-pagination']/li[@class='page-item page-arrow']")
    page1[-1].click()
    await asyncio.sleep(3)

The next function takes a webdriver, a year and a dataframe. It then scrapes information for all house entries build on that year and puts it in a pandas dataframe. It saves the data in a csv file once it is done.

The general algorithm will work as follows:
1. First we load the first page of all houses build for a specific year. Each of the pages in this set contains 30 houses and we need to scrape all pages. 
2. We find the number of pages in the html file of the webpage by doing a .find_elements command.
3. We call the attributes2 function on the driver we loaded on the first step. We append the data (including the year) to our initial dataframe.
4. If there is more than one page of houses built on that specific year we load the next page with the click function.
5. We repeat step 4 until all pages for that year have been scraped.
6. Finally we export the data as a csv file.

This concludes the scraping process for a specific year.

In [8]:
async def yearly_scrape(sel_webdriver,year,df):
    #1. First we load the webpage which has all houses build for a specific year. Each of the pages in this webpage contains 30 houses and we need to scrape all pages. 
    await get_driver(sel_webdriver,year)
    #2. We find the number of pages in the html file of the webpage by doing a .find_elements command.
    pages=sel_webdriver.find_elements(By.XPATH,"//ul[@class='pagination b-pagination']")

    #3. We call the attributes2 function on the driver we loaded on the first step. We append the data (including the year) to our initial dataframe.
    df1= attributes2(sel_webdriver,df)
    df1["Year_of_construction"]=year
    df=pd.concat([df,df1],ignore_index=True)
    #4. If there is more than one page of houses built on that specific year we load the next page with the click function.
    if  len(pages)>0:
        string=pages[0].get_attribute("innerHTML")
        page_num=string.count("<li")-2
        for page in range(2,page_num):
            await click(sel_webdriver)
        #5. We repeat step 4 until all pages for that year have been scraped.    
            df1=attributes2(sel_webdriver,df)
            df1["Year_of_construction"]=year
            df=pd.concat([df,df1],ignore_index=True)
    #6. Finally we export the data as a csv file.
    df.to_csv(f"Houses build in {year}")
    return(df)

Now we want to do the same for all years from 1952 to 2023.

In [10]:
for year in range(1952,2024):
    df=pd.DataFrame(columns=["Location", "Price","Total_area","House_type","Floor","Rooms","Bathrooms"])
    await yearly_scrape(driver,year,df)

                              Location       Price Total_area House_type  \
0  Λευκός Πύργος (Κέντρο Θεσσαλονίκης)  €2.700.000    170τ.μ.   Μεζονέτα   

  Floor Rooms Bathrooms  
0     7     3         2  
                            Location     Price Total_area  House_type Floor  \
0  Βαρδάρης (Βαρδάρης - Λαχανόκηποι)  €245.000     70τ.μ.  Διαμέρισμα     7   

  Rooms Bathrooms  
0     2         1  
                                Location     Price Total_area  House_type  \
0  Ιστορικό Κέντρο (Κέντρο Θεσσαλονίκης)  €245.000     70τ.μ.  Διαμέρισμα   

  Floor Rooms Bathrooms  
0     7     2         1  
                            Location     Price Total_area  House_type Floor  \
0  Βαρδάρης (Βαρδάρης - Λαχανόκηποι)  €245.000     70τ.μ.  Διαμέρισμα     7   

  Rooms Bathrooms  
0     2         1  
                           Location     Price Total_area  House_type Floor  \
0  Αγία Σοφία (Κέντρο Θεσσαλονίκης)  €290.000     71τ.μ.  Διαμέρισμα     1   

  Rooms Bathrooms  
0     2      

CancelledError: 