# Hobby project: Web scraping etuovi.com to get data for analyzing the housing market in Finland.

In [1]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page.
import pandas as pd
import itertools
from tqdm import tqdm
import time
from datetime import date

## Scraping the hrefs of all of the house adverts (cards) that were listed in Finland.


In [19]:
#TODO: save hrefs to file so they can be opened when developing the next cell.

# Search-result listing on etuovi.com. Individual result pages are selected
# by appending the "sivu" query parameter.
SEARCH_URL = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494"
# Seconds to wait for a response. Without a timeout requests.get() can block
# forever on a stalled connection.
REQUEST_TIMEOUT = 30

# Download the first result page in text format.
url = SEARCH_URL
data = requests.get(url, timeout=REQUEST_TIMEOUT).text

soup = BeautifulSoup(data, "html.parser")

# Firstly, lets get the amount of pages we need to go through. The number is
# read from the first child of the element reached five siblings after the
# first pagination column. If the site's markup changes, soup.find() returns
# None and this line fails with AttributeError.
# no_pages = 5
last_page_node = soup.find('div', class_= 'Pagination__Col-sc-3ydysw-1 kpHoDY').next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_element
no_pages = int([child.string for child in last_page_node][0])
houses_href = []

for page_number in tqdm(range(1, no_pages + 1)):
    # Download one result page in text format.
    url = SEARCH_URL + "&sivu=" + str(page_number)
    try:
        data = requests.get(url, timeout=REQUEST_TIMEOUT).text
    except requests.RequestException as error:
        # One unreachable page must not discard the links gathered so far.
        tqdm.write("Skipping page " + str(page_number) + ": " + str(error))
        time.sleep(0.25)
        continue

    soup = BeautifulSoup(data, "html.parser")

    # Scrape all links on the page.
    project_href = [anchor['href'] for anchor in soup.find_all('a', href=True)]

    # Filter out links that are not houses for sale.
    houses_href.extend(link for link in project_href if link.startswith('/kohde'))

    time.sleep(0.25) #Lets not overload servers

100%|██████████| 1606/1606 [23:57<00:00,  1.12it/s]


In [20]:
# Drop repeated links, keeping the first occurrence of each one in its
# original position, and report the count before and after.
print("Amount of links before:", len(houses_href))

seen_links = set()
unique_links = []
for link in houses_href:
    if link not in seen_links:
        seen_links.add(link)
        unique_links.append(link)
houses_href = unique_links
print("Amount of links after removing duplicates:", len(houses_href))

Amount of links before: 48178
Amount of links after removing duplicates: 48165


## Scraping data from the adverts
This part iterates through all of the adverts and extracts values such as type, address, price, size and year. At the end, the collected records are turned into a pandas DataFrame. NOTE: There are tens of thousands of adverts, so this loop takes approximately 10 hours.

In [21]:

# TODO: Optimize memory usage, perhaps use batches or something? Check the types, they could also be a reason for very high memory usage. (After doing data refinement, I noticed that the types are all objects which probably causes high memory usage)
# NOTE: Failed requests (connection errors, timeouts) are reported and skipped
# so that a single network hiccup does not abort the whole run.

# The hrefs collected earlier already start with "/kohde", so the base URL
# carries no trailing slash.
ADVERT_BASE_URL = "https://www.etuovi.com"
# Seconds to wait for a response; without a timeout requests.get() can block forever.
ADVERT_REQUEST_TIMEOUT = 30

# CSS class strings that identify the elements of interest on an advert page.
SUMMARY_CLASS = 'flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-md-6__1n8OT ItemSummaryContainer__alignLeft__2IE5Z'
CITY_LINK_CLASS = 'MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary'
PRICE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-5__3SFMx'
SIZE_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-4__2DYW-'
YEAR_CLASS = 'flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-3__1YPhN'
INFO_ROW_CLASS = 'CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy'


def _child_string(node, index):
    """Return the ``.string`` of child number ``index`` of ``node``.

    Raises TypeError when ``node`` is None (the lookup matched nothing) and
    IndexError when ``node`` has too few children.
    """
    return [child.string for child in node][index]


def _parse_advert(soup):
    """Extract the fields of one advert from its parsed page.

    Returns a dict keyed by the DataFrame column names with every value
    converted to ``str``, or None when any expected element is missing
    (such adverts are skipped by the caller).
    """
    try:
        # Type, rooms, kitchen, bathroom etc.
        house_type_etc = _child_string(soup.find('div', class_=SUMMARY_CLASS), 0)

        # City, district and address are all reached from the city link.
        city_link = soup.find('a', class_=CITY_LINK_CLASS)
        house_city = _child_string(city_link, 0)
        house_area = _child_string(city_link.next_sibling, 1)
        house_address = _child_string(city_link.next_sibling.parent.next_sibling, 0)

        house_price = _child_string(soup.find('div', class_=PRICE_CLASS), 1)
        house_size = _child_string(soup.find('div', class_=SIZE_CLASS), 1)
        house_year = _child_string(soup.find('div', class_=YEAR_CLASS), 1)

        # Form of ownership: read from the third sibling after the first info row.
        ownership_row = soup.find('div', class_=INFO_ROW_CLASS).next_sibling.next_sibling.next_sibling
        house_form_of_ownership = _child_string(ownership_row, 1)
    except (TypeError, AttributeError, IndexError):
        # Only parsing failures are swallowed; KeyboardInterrupt and other
        # unexpected errors propagate instead of silently skipping the advert.
        return None

    # Take only the number, not the unit. ``.string`` is None when the element
    # has no single text child, so each number must be reset explicitly rather
    # than left over from the previous advert.
    if house_price is None:
        house_price_number = None
    else:
        # Drop the last two characters and the non-breaking spaces.
        house_price_number = house_price[:-2].replace('\xa0', '')

    if house_size is None:
        house_size_number = None
    else:
        # Keep what precedes the first space.
        house_size_number = house_size.rsplit(' ', 100)[0]

    # "house_type_etc" is in format type | number of rooms + etc. + etc.
    # It is stored as-is; splitting it into categories is left to refinement.
    return {
        "Type": str(house_type_etc),
        "Address": str(house_address),
        "District": str(house_area),
        "City": str(house_city),
        "Price (€)": str(house_price_number),
        "Area (sqm.)": str(house_size_number),
        "Year built": str(house_year),
        "Form of ownership": str(house_form_of_ownership),
    }


houses_for_sale = []
iterations = 0  # number of adverts successfully parsed so far
# for href in itertools.islice(houses_href, 0, 30): # For testing
for href in tqdm(houses_href):
    # Pause before every request, including ones that end up skipped, so the
    # server is not overloaded.
    time.sleep(0.25)

    url = ADVERT_BASE_URL + str(href)
    try:
        data = requests.get(url, timeout=ADVERT_REQUEST_TIMEOUT).text
    except requests.RequestException as error:
        tqdm.write("Skipping " + url + ": " + str(error))
        continue

    soup = BeautifulSoup(data, "html.parser")

    advert = _parse_advert(soup)
    if advert is None:
        continue

    houses_for_sale.append(advert)
    iterations += 1
    # Write a backup CSV after every 1000 parsed adverts.
    if iterations % 1000 == 0:
        houses_df = pd.DataFrame(houses_for_sale)
        filename = 'BACKUP-' + str(iterations/1000) + 'K-iterations-' + date.today().strftime("%b-%d-%Y") + '-house-market-data.csv'
        houses_df.to_csv(filename, encoding="utf-8-sig")
        print("Saved a backup of ", iterations, " iterations.")

houses_df = pd.DataFrame(houses_for_sale)


  2%|▏         | 1012/48165 [12:23<9:29:05,  1.38it/s] 

Saved a backup of  1000  iterations.


  4%|▍         | 2031/48165 [25:09<9:00:14,  1.42it/s] 

Saved a backup of  2000  iterations.


  6%|▋         | 3060/48165 [38:02<8:48:39,  1.42it/s] 

Saved a backup of  3000  iterations.


  8%|▊         | 4074/48165 [50:39<9:45:15,  1.26it/s] 

Saved a backup of  4000  iterations.


 11%|█         | 5157/48165 [1:04:16<8:36:52,  1.39it/s] 

Saved a backup of  5000  iterations.


 13%|█▎        | 6178/48165 [1:17:23<8:47:16,  1.33it/s] 

Saved a backup of  6000  iterations.


 15%|█▍        | 7197/48165 [1:30:45<8:05:26,  1.41it/s] 

Saved a backup of  7000  iterations.


 17%|█▋        | 8219/48165 [1:43:40<9:05:06,  1.22it/s] 

Saved a backup of  8000  iterations.


 19%|█▉        | 9237/48165 [1:56:25<7:57:00,  1.36it/s] 

Saved a backup of  9000  iterations.


 21%|██▏       | 10261/48165 [2:09:20<7:43:27,  1.36it/s] 

Saved a backup of  10000  iterations.


 23%|██▎       | 11289/48165 [2:22:17<7:56:36,  1.29it/s] 

Saved a backup of  11000  iterations.


 26%|██▌       | 12343/48165 [2:35:10<7:07:10,  1.40it/s] 

Saved a backup of  12000  iterations.


 28%|██▊       | 13378/48165 [2:48:11<6:43:55,  1.44it/s] 

Saved a backup of  13000  iterations.


 30%|██▉       | 14411/48165 [3:01:12<7:11:38,  1.30it/s] 

Saved a backup of  14000  iterations.


 32%|███▏      | 15435/48165 [3:14:02<7:02:59,  1.29it/s] 

Saved a backup of  15000  iterations.


 34%|███▍      | 16460/48165 [3:26:59<6:36:15,  1.33it/s] 

Saved a backup of  16000  iterations.


 36%|███▋      | 17483/48165 [3:39:44<6:29:57,  1.31it/s]

Saved a backup of  17000  iterations.


 38%|███▊      | 18513/48165 [3:52:30<5:30:29,  1.50it/s]

Saved a backup of  18000  iterations.


 41%|████      | 19589/48165 [4:05:59<6:56:25,  1.14it/s] 

Saved a backup of  19000  iterations.


 43%|████▎     | 20625/48165 [4:18:49<5:36:57,  1.36it/s]

Saved a backup of  20000  iterations.


 45%|████▍     | 21648/48165 [4:31:30<5:30:02,  1.34it/s]

Saved a backup of  21000  iterations.


 47%|████▋     | 22671/48165 [4:44:32<5:12:04,  1.36it/s]

Saved a backup of  22000  iterations.


 49%|████▉     | 23709/48165 [4:57:35<5:17:38,  1.28it/s]

Saved a backup of  23000  iterations.


 51%|█████▏    | 24732/48165 [5:10:21<5:03:20,  1.29it/s]

Saved a backup of  24000  iterations.


 54%|█████▎    | 25787/48165 [5:23:57<4:45:39,  1.31it/s]

Saved a backup of  25000  iterations.


 56%|█████▌    | 26808/48165 [5:37:03<4:32:53,  1.30it/s]

Saved a backup of  26000  iterations.


 58%|█████▊    | 27851/48165 [5:50:03<4:00:23,  1.41it/s]

Saved a backup of  27000  iterations.


 60%|█████▉    | 28889/48165 [6:02:59<4:02:12,  1.33it/s]

Saved a backup of  28000  iterations.


 62%|██████▏   | 29918/48165 [6:15:59<4:35:00,  1.11it/s]

Saved a backup of  29000  iterations.


 64%|██████▍   | 30944/48165 [6:29:16<4:18:43,  1.11it/s]

Saved a backup of  30000  iterations.


 66%|██████▋   | 31971/48165 [6:42:18<3:15:39,  1.38it/s]

Saved a backup of  31000  iterations.


 69%|██████▊   | 32996/48165 [6:55:10<3:23:37,  1.24it/s]

Saved a backup of  32000  iterations.


 71%|███████   | 34036/48165 [7:08:02<2:50:20,  1.38it/s]

Saved a backup of  33000  iterations.


 73%|███████▎  | 35081/48165 [7:21:43<2:06:07,  1.73it/s] 

Saved a backup of  34000  iterations.


 75%|███████▌  | 36140/48165 [7:35:36<2:43:38,  1.22it/s]

Saved a backup of  35000  iterations.


 77%|███████▋  | 37185/48165 [7:49:12<2:24:25,  1.27it/s]

Saved a backup of  36000  iterations.


 79%|███████▉  | 38206/48165 [8:02:47<1:58:03,  1.41it/s]

Saved a backup of  37000  iterations.


 82%|████████▏ | 39267/48165 [8:16:42<1:50:51,  1.34it/s]

Saved a backup of  38000  iterations.


 84%|████████▎ | 40325/48165 [8:30:26<1:42:12,  1.28it/s]

Saved a backup of  39000  iterations.


 86%|████████▌ | 41372/48165 [8:44:24<1:22:26,  1.37it/s]

Saved a backup of  40000  iterations.


 88%|████████▊ | 42437/48165 [8:58:24<1:13:06,  1.31it/s]

Saved a backup of  41000  iterations.


 90%|█████████ | 43516/48165 [9:12:22<59:43,  1.30it/s]  

Saved a backup of  42000  iterations.


 93%|█████████▎| 44616/48165 [9:26:20<44:58,  1.32it/s]  

Saved a backup of  43000  iterations.


 95%|█████████▍| 45688/48165 [9:40:04<33:23,  1.24it/s]  

Saved a backup of  44000  iterations.


 97%|█████████▋| 46910/48165 [9:54:25<13:31,  1.55it/s]

Saved a backup of  45000  iterations.


100%|██████████| 48165/48165 [10:08:47<00:00,  1.32it/s]


## Saving the scraped data to a .csv file for later refinement and analysis.

In [22]:
# Save the scraped adverts to a CSV file whose name starts with today's date
# (for example "Jul-10-2022-house-market-data.csv").
today = date.today()
bdy = format(today, "%b-%d-%Y")  # abbreviated month name, day, year
filename = "".join((bdy, '-house-market-data.csv'))
print(filename)
# The utf-8-sig codec writes a byte-order mark at the start of the file.
houses_df.to_csv(filename, encoding="utf-8-sig")

print(houses_df)

Jul-10-2022-house-market-data.csv
                                                    Type  \
0                              Kerrostalo | 1h, kk, kph.   
1                    Kerrostalo | 1h + kk + alk + kh + p   
2                                    Luhtitalo | 1h + kk   
3           Kerrostalo | 2h + avokeittiö + kph / wc + vh   
4      Omakotitalo | 4h, k, aula, vh, th, s-tilat, wc...   
...                                                  ...   
45948      Omakotitalo | 4h + k + ph + WC + vh + ullakko   
45949  Erillistalo | 5h + k, sauna ja  2h + k parveke...   
45950        Rivitalo | 3h + k + s + v + katettu terassi   
45951  Omakotitalo | 4mh + oh + k + s + khh + aula + ...   
45952             Kerrostalo | 3h + k + wc + kph + s + p   

                        Address      District        City Price (€)  \
0            Soukanlahdenkatu 9   Armonkallio     Tampere    169000   
1               Lepolankatu 3 A  Kivistönmäki       Lahti     39000   
2              Viljatullintie 4 