# Hobby project: Webscraping etuovi.com to get data for analyzing the housing market in Finland.

In [1]:
from bs4 import BeautifulSoup # this module helps with web scraping.
import requests  # this module helps us to download a web page.
import pandas as pd
import itertools
import os
from tqdm import tqdm
import threading
import time
from joblib import Parallel, delayed

## Initial part
This section started as a test that everything works as expected. I decided to keep it, because the first page of house adverts uses a different URL format (no `&sivu=` parameter) than the later pages; handling it here means the later iteration does not need a special case for it.

In [2]:
MAX_THREADS = 30

# Seconds to wait for the server before giving up. Without a timeout,
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 10

# First page of the search results (etuovi.com, Turku).
url = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494"

# Download the page and fail loudly on an HTTP error (4xx/5xx) instead of
# silently parsing an error page in the following cells.
response = requests.get(url, timeout=REQUEST_TIMEOUT)
response.raise_for_status()

# Page contents in text format.
data = response.text

In [3]:
# Parse the downloaded HTML text with Python's built-in "html.parser".
soup = BeautifulSoup(markup=data, features="html.parser")

In [4]:
# Collect the target of every anchor tag on the page that has an href.
project_href = [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Keep only the links that point to houses for sale (paths starting with '/kohde').
houses_href = [link for link in project_href if link.startswith('/kohde')]

## Scraping the hrefs of all of the house adverts (cards) that were listed in Turku.
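NOTE: The number of result pages is scraped from the pagination element of the first results page, so the final page is no longer hard-coded (it used to be fixed at `&sivu=54`). The lookup depends on specific CSS class names and on the position of the element among its siblings, so it may need adjusting if the site's markup changes.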

In [5]:
# Now let's repeat the same process for the rest of the result pages and add
# all of the advert hrefs to the same "houses_href" list.

# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10

# First, get the number of pages we need to go through. It is read from the
# pagination element of the first results page, which is still held in `soup`.
pagination_col = soup.find('div', class_='Pagination__Col-sc-3ydysw-1 kpHoDY')
if pagination_col is None:
    raise RuntimeError(
        "Pagination element not found on the first results page; "
        "the CSS class names used by the site may have changed."
    )

# The element holding the last page number is five siblings after the first
# pagination column; the number is the first child string of its next element.
last_page_holder = pagination_col
for _ in range(5):
    last_page_holder = last_page_holder.next_sibling
no_pages = int([child.string for child in last_page_holder.next_element][0])

# Pages that could not be downloaded; inspect or retry these afterwards.
failed_pages = []

# Page 1 was already scraped above, so start from page 2.
# The loop uses its own names (page_url, page_soup, ...) so that it does not
# overwrite the first-page `soup`, which keeps this cell re-runnable.
for page_number in tqdm(range(2, no_pages + 1)):
    page_url = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494&sivu=" + str(page_number)

    try:
        response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException:
        # One bad page should not abort a long scrape; remember it and move on.
        failed_pages.append(page_number)
    else:
        page_soup = BeautifulSoup(response.text, "html.parser")

        # Scrape all links on the page ...
        page_links = [anchor['href'] for anchor in page_soup.find_all('a', href=True)]

        # ... and keep only the ones that are houses for sale.
        houses_href.extend(link for link in page_links if link.startswith('/kohde'))

    time.sleep(0.1)  # Let's not overload the servers.

if failed_pages:
    print(f"Could not download {len(failed_pages)} result page(s): {failed_pages}")

  1%|          | 16/1591 [00:13<22:39,  1.16it/s]


KeyboardInterrupt: 

In [81]:
# Drop possible duplicate hrefs, printing the list size before and after.
count_with_duplicates = len(houses_href)
print(count_with_duplicates)

# dict keys are unique and keep insertion order, so the first occurrence of
# each href is kept in its original position.
houses_href = [*dict.fromkeys(houses_href)]
count_unique = len(houses_href)
print(count_unique)

69259
47706


## Scraping data from the adverts
This part iterates through all of the adverts and extracts values such as type, address, price, size and year. At the end, the collected results are converted into a pandas DataFrame. NOTE: This loop downloads every advert page one by one, so it is slow; it takes approximately 10-15 minutes.

In [82]:
# Seconds to wait for the server before giving up on a single request.
REQUEST_TIMEOUT = 10
# Pause between requests so that we do not overload the servers.
REQUEST_DELAY = 0.1

BASE_URL = "https://www.etuovi.com"

# CSS class strings that identify each field on an advert page.
TYPE_CLASS = 'flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-md-6__1n8OT ItemSummaryContainer__alignLeft__2IE5Z'
LOCATION_LINK_CLASS = 'MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary'
PRICE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-5__3SFMx'
SIZE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-4__2DYW-'
YEAR_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-3__1YPhN'

# Errors that mean "this field is not on the page": find() returned None
# (TypeError when iterated, AttributeError on attribute access) or the element
# has fewer children than expected (IndexError). Only these are caught, so
# KeyboardInterrupt and genuine programming errors are no longer swallowed.
FIELD_LOOKUP_ERRORS = (TypeError, AttributeError, IndexError)


def _child_string(element, index):
    """Return the ``.string`` of the ``index``-th child of ``element``.

    The value is converted to a plain ``str`` (``None`` stays ``None``) so the
    stored result does not keep a reference to the whole parse tree.

    Raises one of ``FIELD_LOOKUP_ERRORS`` when ``element`` is ``None`` or has
    too few children.
    """
    value = [child.string for child in element][index]
    return None if value is None else str(value)


def parse_advert(advert_soup):
    """Extract one advert's fields from its parsed page.

    Parameters
    ----------
    advert_soup : BeautifulSoup
        Parsed HTML of a single advert page.

    Returns
    -------
    dict or None
        One row for the results table, or ``None`` when any of the expected
        elements is missing from the page (the advert is then skipped).
    """
    try:
        # Type, rooms, kitchen, bathroom etc. in the format
        # "type | number of rooms + etc. + etc.".
        house_type_etc = _child_string(advert_soup.find('div', class_=TYPE_CLASS), 0)

        # City, area and address are all located relative to the same link,
        # so look it up only once.
        location_link = advert_soup.find('a', class_=LOCATION_LINK_CLASS)
        house_city = _child_string(location_link, 0)
        house_area = _child_string(location_link.next_sibling, 1)
        house_address = _child_string(location_link.next_sibling.parent.next_sibling, 0)

        house_price = _child_string(advert_soup.find('div', class_=PRICE_CLASS), 1)
        house_size = _child_string(advert_soup.find('div', class_=SIZE_CLASS), 1)
        house_year = _child_string(advert_soup.find('div', class_=YEAR_CLASS), 1)
    except FIELD_LOOKUP_ERRORS:
        return None

    # Take only the number, not the unit: drop the last two characters of the
    # price. A missing value must yield None here; otherwise the price of the
    # previously processed advert would be reused for this row.
    house_price_number = None if house_price is None else house_price[:-2]

    # Take only the number, not the unit: keep the text before the first space.
    house_size_number = None if house_size is None else house_size.split(' ', 1)[0]

    return {
        "Tyyppi": house_type_etc,
        "Osoite": house_address,
        "Kaupunginosa": house_area,
        "Kaupunki": house_city,
        "Hinta (€)": house_price_number,
        "Pinta-ala (sqm.)": house_size_number,
        "Rakennusvuosi": house_year,
    }


houses_for_sale = []
# Adverts that could not be downloaded; inspect or retry these afterwards.
failed_hrefs = []

# For a quick test, iterate over houses_href[:30] instead of the full list.
for href in tqdm(houses_href):
    # The hrefs already start with "/", so avoid building a "//" in the URL.
    advert_url = BASE_URL + "/" + str(href).lstrip("/")

    try:
        response = requests.get(advert_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException:
        # One unreachable advert should not abort a long scrape.
        failed_hrefs.append(href)
    else:
        advert = parse_advert(BeautifulSoup(response.text, "html.parser"))
        if advert is not None:
            houses_for_sale.append(advert)

    # Pause after every request, including failed or skipped adverts.
    time.sleep(REQUEST_DELAY)

if failed_hrefs:
    print(f"Could not download {len(failed_hrefs)} advert(s); see failed_hrefs.")

houses_df = pd.DataFrame(houses_for_sale)


  9%|▉         | 4389/47706 [39:35<5:56:31,  2.02it/s]  

KeyboardInterrupt: 

## Saving the scraped data to a .csv file for later refinement and analysis.

In [47]:
# Write the scraped table to disk; "utf-8-sig" adds a byte-order mark to the file.
csv_path = 'House_market_data.csv'
houses_df.to_csv(csv_path, encoding="utf-8-sig")

# Show the table that was just saved.
print(houses_df)

                                               Tyyppi  \
0   Kerrostalo | 2h + k + var + vh + lasitettu par...   
1   Kerrostalo | 3h + k + s + var + erill.wc + las...   
2       Kerrostalo | 2h + k + kph + lasitettu parveke   
3                      Kerrostalo | 1h + kt + parveke   
4            Puutalo-osake | 2-3 h + k + kph + s + wc   
5   Omakotitalo | 3mh + oh + k + rh + et + tk + vh...   
6                             Kerrostalo | 2h + k + s   
7                                 Kerrostalo | 3h + k   
8                  Omakotitalo | 3h + k + s + wc + at   
9                                  Kerrostalo | 3h, k   
10                           Erillistalo | 5h + k + s   
11          Rivitalo | 2h + k + kph + s + ulkovarasto   
12                                  Rivitalo | 3h + k   
13      Kerrostalo | 1h + kt + kph + alkovi + terassi   
14                  Omakotitalo | 5 h, k, tkh, khh, s   
15                           Erillistalo | 4H + K + S   
16                           Ke