# Hobby project: Web scraping etuovi.com to get data for analyzing the housing market in Finland.

In [1]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page.
import pandas as pd
import itertools
from tqdm import tqdm
import time
from datetime import date

## Scraping the hrefs of all of the house adverts (cards) that were listed in Finland.


In [2]:
#TODO: save hrefs to file so they can be opened when developing the next cell.

# Download webpage (etuovi.com, Turku).
url = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494"
# Download contents in text format.
data = requests.get(url).text

soup = BeautifulSoup(data, "html.parser")

# Firstly, lets get the amount of pages we need to go through:
no_pages = 50
# no_pages = int([i.string for i in soup.find('div', class_= 'Pagination__Col-sc-3ydysw-1 kpHoDY').next_sibling.next_sibling.next_sibling.next_sibling.next_sibling.next_element][0])
houses_href = []

for i in tqdm(range(1,no_pages + 1)):
    # Download webpage (etuovi.com, Turku).
    url = "https://www.etuovi.com/myytavat-asunnot?haku=M1857661494&sivu=" + str(i)
    # Download contents in text format.
    data = requests.get(url).text

    soup = BeautifulSoup(data, "html.parser")

    # Scrape all links on page

    project_href = [i['href'] for i in soup.find_all('a', href=True)]
    # print(project_href)

    #Filter out links that are not houses for sale

    for i in project_href:
        if i.startswith('/kohde'):
            houses_href.append(i)

    time.sleep(0.25) #Lets not overload servers
    # print(houses_href)

100%|██████████| 50/50 [00:46<00:00,  1.07it/s]


In [5]:
# Just to make sure lets remove possible duplicates.
print("Amount of links before:", len(houses_href))

houses_href = list(dict.fromkeys(houses_href))
print("Amount of links after removing duplicates:", len(houses_href))

# print(houses_href)

Amount of links before: 1497
Amount of links after removing duplicates: 1497


## Scraping data from the adverts
This part iterates through all of the adverts and extracts values such as type, address, price, size and year of construction. At the end, the collected records are turned into a pandas DataFrame. NOTE: The loop visits every advert one by one, so a full run takes approximately 10 hours.

In [23]:

# TODO: Optimize memory usage, perhaps use batches or something? Check that types, it could also be reason for very high memory usage. (After doing data refinement, I noticed that the types are all objects which propably causes high memory usage)
# TODO: Handle ConnectionErrors.


houses_for_sale = []
iterations = 0
# for href in itertools.islice(houses_href, 0, 30): # For testing
for href in tqdm(houses_href):

    url = "https://www.etuovi.com/" + str(href)

    data = requests.get(url).text

    soup = BeautifulSoup(data, "html.parser")

    # Type, rooms, kitchen, bathroom etc.
    try:
        house_type_etc = [i.string for i in soup.find('div', class_='flexboxgrid__col-xs-12__1I1LS flexboxgrid__col-md-6__1n8OT ItemSummaryContainer__alignLeft__2IE5Z')][0]
    except:
        TypeError
        house_type_etc = None
        continue


    # City
    try:
        house_city = [i.string for i in soup.find('a', class_='MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary')][0]
    except:
        TypeError
        house_city = None
        continue


    #Area  
    try:
        house_area = [i.string for i in soup.find('a', class_='MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary').nextSibling][1]
    except:
        TypeError
        house_area = None
        continue 


    #Address
    try:
        house_address = [i.string for i in soup.find('a', class_='MuiTypography-root MuiLink-root MuiLink-underlineHover InfoSegment__noStyleLink__2e28Y MuiTypography-colorPrimary').nextSibling.parent.nextSibling][0]
    except:
        TypeError
        house_address = None
        continue 


    # Price
    try:
        house_price = [i.string for i in soup.find('div', class_='flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-5__3SFMx')][1]
    except:
        TypeError
        house_price = None
        continue

    # Take only the number, not unit.
    if house_price is None:
        house_price = None 
    else: house_price_number = house_price[0:(len(house_price)-2)]


    # Size
    try:
        house_size = [i.string for i in soup.find('div', class_='flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-4__2DYW-')][1]
    except:
        TypeError
        house_size = None
        continue

    # Take only the number, not unit.
    if house_size is None:
        house_size_number = None
    else: house_size_number = house_size.rsplit(' ', 100)[0]


    # Year
    try:
        house_year = [i.string for i in soup.find('div', class_='flexboxgrid__col-xs-4__p2Lev flexboxgrid__col-md-3__1YPhN')][1]
    except:
        TypeError
        house_year = None
        continue

    # Form of ownership
    try:
        house_form_of_ownership = [i.string for i in soup.find('div', class_='CompactInfoRow__infoRow__2hjs_ flexboxgrid__row__wfmuy').next_sibling.next_sibling.next_sibling][1]
    except:
        TypeError
        house_form_of_ownership = None
        continue


    # "house_type_etc" is now in format type | number of rooms + etc. + etc.
    # Lets divide it to own categories
    house_type_etc = str(house_type_etc)
    # house_type = house_type_etc.rsplit(' ', 100)[0]

    houses_for_sale.append({"Type":house_type_etc,"Address":house_address, "District":house_area ,"City":house_city, "Price (€)":house_price_number, "Area (sqm.)":house_size_number, "Year built" : house_year, "Form of ownership" : house_form_of_ownership})
    iterations += 1
    if iterations % 1000 == 0:
        houses_df = pd.DataFrame(houses_for_sale)
        filename = 'BACKUP-' + str(iterations/1000) + 'K-iterations-' + date.today().strftime("%b-%d-%Y") + '-house-market-data.csv'
        houses_df.to_csv(filename, encoding="utf-8-sig")
        print("Saved a backup of ", iterations, " iterations.")
    time.sleep(0.25)

houses_df = pd.DataFrame(houses_for_sale)


  2%|▏         | 1022/47833 [12:59<10:13:55,  1.27it/s]

Saved a backup of  1000  iterations.


  4%|▍         | 2035/47833 [26:03<10:47:05,  1.18it/s]

Saved a backup of  2000  iterations.


  6%|▋         | 3061/47833 [39:09<9:53:04,  1.26it/s] 

Saved a backup of  3000  iterations.


  9%|▊         | 4083/47833 [52:30<9:34:42,  1.27it/s] 

Saved a backup of  4000  iterations.


 11%|█         | 5097/47833 [1:05:37<9:35:33,  1.24it/s] 

Saved a backup of  5000  iterations.


 13%|█▎        | 6116/47833 [1:18:50<9:35:32,  1.21it/s] 

Saved a backup of  6000  iterations.


 15%|█▍        | 7147/47833 [1:33:04<8:56:47,  1.26it/s]  

Saved a backup of  7000  iterations.


 17%|█▋        | 8205/47833 [1:48:25<7:40:30,  1.43it/s]  

Saved a backup of  8000  iterations.


 19%|█▉        | 9247/47833 [2:01:39<9:13:46,  1.16it/s] 

Saved a backup of  9000  iterations.


 21%|██▏       | 10262/47833 [2:19:46<7:53:37,  1.32it/s] 

Saved a backup of  10000  iterations.


 24%|██▎       | 11299/47833 [2:32:49<7:58:34,  1.27it/s] 

Saved a backup of  11000  iterations.


 25%|██▌       | 12059/47833 [2:42:43<8:29:39,  1.17it/s] 

## Saving the scraped data to a .csv file for later refinement and analysis.

In [3]:
today = date.today()
bdy = today.strftime("%b-%d-%Y")
filename = bdy + '-house-market-data.csv'
print(filename)
houses_df.to_csv(filename, encoding="utf-8-sig")

print(houses_df)

Jun-30-2022-house-market-data.csv


NameError: name 'houses_df' is not defined