# Hobby project: Webscraping etuovi.com to get data for analyzing the housing market in Finland.

In [15]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page.
import pandas as pd
import itertools
from tqdm import tqdm
import time
from datetime import date

## Scraping the hrefs of all of the house adverts (cards) that were listed in Finland.


In [9]:
/**
 * @todo Make the program save backups at certain intervals e.g. every 1000 iterations
 * @body In case there are unexpected errors, the program should save backups so that hours of scraping do not go to waste.
 */

/**
 * @todo Handle ConnectionErrors.
 * @body -
 */

# Collect the relative links ("/kohde...") of every advert listed by a saved
# etuovi.com search, walking the result pages one by one.

# Saved search whose result pages are crawled; a specific page is selected by
# appending the "sivu" query parameter.
SEARCH_URL = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494"
# requests applies no timeout by default, so a stalled connection would hang
# the cell forever without one.
REQUEST_TIMEOUT_S = 30
# Pause between requests so the crawl does not overload the server.
REQUEST_DELAY_S = 0.25

# Download the first result page; it is only used to read the page count.
# A failure here is fatal on purpose: without the page count nothing can be crawled.
url = SEARCH_URL
data = requests.get(url, timeout=REQUEST_TIMEOUT_S).text

soup = BeautifulSoup(data, "html.parser")

# Firstly, let's get the number of pages we need to go through. The element
# holding the last page number is located relative to the first pagination
# column, five siblings further on.
# NOTE(review): the generated class name and the sibling hops are tied to the
# site's current markup - confirm they still match if this raises.
pagination_start = soup.find('div', class_='Pagination__Col-sc-3ydysw-1 kpHoDY')
last_page_element = (
    pagination_start
    .next_sibling.next_sibling.next_sibling.next_sibling.next_sibling
    .next_element
)
no_pages = int([child.string for child in last_page_element][0])

houses_href = []
failed_pages = []  # result pages that could not be downloaded, kept for a later retry

for page_number in tqdm(range(1, no_pages + 1)):
    url = SEARCH_URL + "&sivu=" + str(page_number)

    try:
        # Download contents in text format.
        data = requests.get(url, timeout=REQUEST_TIMEOUT_S).text
    except requests.exceptions.RequestException as error:
        # One unreachable page must not throw away the rest of the crawl.
        failed_pages.append(page_number)
        tqdm.write(f"Skipping result page {page_number}: {error}")
        time.sleep(REQUEST_DELAY_S)
        continue

    soup = BeautifulSoup(data, "html.parser")

    # Scrape all links on the page ...
    project_href = [link['href'] for link in soup.find_all('a', href=True)]

    # ... and keep only the ones that point to a house for sale.
    houses_href.extend(href for href in project_href if href.startswith('/kohde'))

    time.sleep(REQUEST_DELAY_S)  # Let's not overload the servers

100%|██████████| 1593/1593 [19:15<00:00,  1.38it/s]


In [10]:
# Drop repeated advert links, keeping the first occurrence of each one so the
# original order is preserved. The count is printed before and after.
count_before = len(houses_href)
print(count_before)

houses_href = list({link: None for link in houses_href})
count_after = len(houses_href)
print(count_after)

47769
47761


## Scraping data from the adverts
This part iterates through the adverts and extracts values such as type, address, price, size and year, then collects them into a pandas DataFrame. NOTE: Iterating over every advert is a large job that takes approximately 10 hours; the cell below is currently limited to the first 1000 adverts.

In [14]:
# Visit each advert page, extract its key figures and collect them into
# `houses_df`. Adverts that cannot be downloaded, or that lack one of the
# expected elements, are skipped and remembered in `skipped_hrefs`.

# The collected hrefs already start with "/kohde", so the base URL carries no
# trailing slash (otherwise the request URL would contain "//kohde").
ADVERT_BASE_URL = "https://www.etuovi.com"
# Number of adverts to process; set to None to process every collected href.
MAX_ADVERTS = 1000
# requests applies no timeout by default, so a stalled connection would hang
# the cell forever without one.
REQUEST_TIMEOUT_S = 30
# Pause between requests so the crawl does not overload the server.
REQUEST_DELAY_S = 0.25

# CSS class strings identifying the elements that hold each value.
# NOTE(review): these are generated class names tied to the site's current
# markup - confirm they still match if every advert ends up skipped.
SUMMARY_CLASS = 'flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-md-6__1n8OT ItemSummaryContainer__alignLeft__2IE5Z'
LOCATION_LINK_CLASS = 'MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary'
PRICE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-5__3SFMx'
SIZE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-4__2DYW-'
YEAR_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-3__1YPhN'

# What a lookup raises when an expected element is absent: iterating None
# (TypeError), following a sibling of None or reading `.string` from a plain
# string (AttributeError), or indexing past the last child (IndexError).
# Listing them explicitly - instead of a bare `except:` - keeps
# KeyboardInterrupt and genuine programming errors visible.
MISSING_FIELD_ERRORS = (TypeError, AttributeError, IndexError)


def _nth_child_string(node, index):
    """Return the ``.string`` of the ``index``-th child of ``node``.

    The result may be None, because ``.string`` is None for a child that does
    not hold exactly one string.

    Raises:
        TypeError: ``node`` is None (the element was not found).
        AttributeError: a child of ``node`` has no ``.string`` attribute.
        IndexError: ``node`` has fewer than ``index + 1`` children.
    """
    return [child.string for child in node][index]


def _parse_advert(soup):
    """Extract one advert's values from its parsed page.

    Args:
        soup: BeautifulSoup document of a single advert page.

    Returns:
        A dict with one entry per output column.

    Raises:
        TypeError, AttributeError, IndexError: an expected element is missing
            from the page (see ``MISSING_FIELD_ERRORS``).
    """
    # Type, rooms, kitchen, bathroom etc.
    house_type_etc = _nth_child_string(soup.find('div', class_=SUMMARY_CLASS), 0)

    # City, area and address are all located relative to the same link.
    location_link = soup.find('a', class_=LOCATION_LINK_CLASS)
    house_city = _nth_child_string(location_link, 0)
    house_area = _nth_child_string(location_link.next_sibling, 1)
    house_address = _nth_child_string(location_link.next_sibling.parent.next_sibling, 0)

    house_price = _nth_child_string(soup.find('div', class_=PRICE_CLASS), 1)
    house_size = _nth_child_string(soup.find('div', class_=SIZE_CLASS), 1)
    house_year = _nth_child_string(soup.find('div', class_=YEAR_CLASS), 1)

    # Take only the number, not the unit: drop the last two characters of the
    # price. A missing price must yield None for the *number*, so that the row
    # never picks up the price of a previously parsed advert.
    house_price_number = None if house_price is None else house_price[:-2]

    # Take only the number, not the unit: keep the text before the first space.
    # NOTE(review): a size written with a space as thousands separator would be
    # cut short here - confirm against real data.
    house_size_number = None if house_size is None else house_size.rsplit(' ', 100)[0]

    # "house_type_etc" is in the format: type | number of rooms + etc. + etc.
    # It is stored as one string for now and can be split up during refinement.
    house_type_etc = str(house_type_etc)

    return {
        "Tyyppi": house_type_etc,
        "Osoite": house_address,
        "Kaupunginosa": house_area,
        "Kaupunki": house_city,
        "Hinta (€)": house_price_number,
        "Pinta-ala (sqm.)": house_size_number,
        "Rakennusvuosi": house_year,
    }


houses_for_sale = []
skipped_hrefs = []  # adverts that could not be downloaded or parsed

for href in tqdm(houses_href[:MAX_ADVERTS]):

    url = ADVERT_BASE_URL + str(href)

    try:
        data = requests.get(url, timeout=REQUEST_TIMEOUT_S).text
    except requests.exceptions.RequestException:
        # One unreachable advert must not throw away hours of scraping.
        skipped_hrefs.append(href)
        time.sleep(REQUEST_DELAY_S)
        continue

    soup = BeautifulSoup(data, "html.parser")

    try:
        advert_row = _parse_advert(soup)
    except MISSING_FIELD_ERRORS:
        # Adverts lacking any of the expected elements are left out entirely.
        skipped_hrefs.append(href)
    else:
        houses_for_sale.append(advert_row)

    # Always pause, also after a skipped advert, so the server is not overloaded.
    time.sleep(REQUEST_DELAY_S)

houses_df = pd.DataFrame(houses_for_sale)


100%|██████████| 1000/1000 [12:31<00:00,  1.33it/s]


## Saving the scraped data to a .csv file for later refinement and analysis.

In [17]:
# Build a dated file name such as "Jun-29-2022-house_market_data.csv" and
# write the scraped data to it.
today = date.today()
bdy = format(today, "%b-%d-%Y")  # abbreviated month name, day of month, four-digit year
filename = "".join([bdy, '-house_market_data.csv'])
print(filename)

# "utf-8-sig" is the UTF-8 codec variant that writes a byte-order mark at the start of the file.
houses_df.to_csv(filename, encoding="utf-8-sig")

print(houses_df)

Jun-29-2022-house_market_data.csv
                                                Tyyppi  \
0                             Kerrostalo | 2H + KK + S   
1    Omakotitalo | 4h, k, takkah., s. 2 tallia, ver...   
2           Kerrostalo | 3h, k, kh, s, wc, vh, parveke   
3                Omakotitalo | 3 mh, oh, k, khh, kh, s   
4    Kerrostalo | 3-4h + kt + s + vh + khh + las.pa...   
..                                                 ...   
973                                Kerrostalo | 3h + k   
974                           Omakotitalo | 3h + k + s   
975    Omakotitalo | 3-4h, oh, k, khh, kph / wc, s, wc   
976  Kerrostalo | 2h + kt + lasitettu parveke ja as...   
977                               Kerrostalo | 1h + kt   

                     Osoite  Kaupunginosa     Kaupunki  Hinta (€)  \
0               Swingitie 8          None      Tuusula  16 712,24   
1             Karpalotie 10      Lapijoki     Eurajoki    119 000   
2            Kaivokatu 12 B      Keskusta  Hämeenlinna    155 