# Scraping finn for houses for sale, Data analysis, Price prediction

### Project Setup

In [3]:
import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import sys
import time
from pprint import pprint as pp
from deep_translator import GoogleTranslator
# from geopy.geocoders import Nominatim
# from matplotlib import pyplot as plt

Setting up selenium for properly scraping. I first tried scraping with requests package but it turns out finn renders the information in some pages with javascript on the client side. To get past that I had to do this bit much work to setup browser automation so selenium can work properly

In [4]:
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=chrome_options)

### Scraping Finn

1.   First I will be scraping only the add codes(finncode) of the house listing from the search page https://www.finn.no/realestate/homes/search.html
2.   Then I will be scraping information of the house with those finncodes.
The details about each of the add can be found with https://www.finn.no/realestate/homes/ad.html?finnkode={} link where the {} can be replaced with a finncode to get to the details page of the house.

There are also some ads which contain details about multiple houses usually from a real estate company. Im excluding those houses because some of those houses are in development and are not complete. In this project I will be focusing on the houses that finn did not place in the new houses category.



#### Scraping Phase 1: Grabbing the Finn Codes from the listing page

This function goes through each page of the house listing and grabs all the ad codes present in each page. I have noticed that finn do not show more than 50 pages. In the future I will run the scraping on a raspberry pi and run the scraping cron job once a day and collect the newly published house data. That way over the year I will have a lot of data.

In [None]:
# Url for Homes: "https://www.finn.no/realestate/homes/search.html?page="
# Url for Cars: "https://www.finn.no/car/used/search.html?page="
# Url for Jobs: "https://www.finn.no/job/fulltime/search.html?page="

def get_all_codes(url, driver, npages=100, verbose=1):
  all_codes = []
  
  for page_no in range(1, npages):
    page_url = url.format(page_no)

    driver.get(page_url)
    soup = bs(driver.page_source, 'html.parser')
    ad_links = soup.find_all('a', attrs={'class':'ads__unit__link'})
    ad_codes = [int(ad['id']) for ad in ad_links]
    
    if len(ad_codes) < 2:
      break
    if verbose == 1:
      print(f'Indexed: {len(ad_codes)} from {page_url}')
    
    all_codes.extend(ad_codes)
    time.sleep(0.2)

  return list(set(all_codes))

In [None]:
homes_listing_url = "https://www.finn.no/realestate/homes/search.html?page={}"
finn_codes_home = get_all_codes(homes_listing_url, driver, 55)
len(finn_codes_home)

Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=39
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=40
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=41
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=42
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=43
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=44
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=45
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=46
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=47
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=48
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=49
Indexed: 51 from https://www.finn.no/realestate/homes/search.html?page=50


2526

#### Scraping Phase 2: Scraping the House info pages from the finn codes

This is the main function that collect all the data from an ad detail page and returns a dictionary with collected features.

In [None]:
def parse_housing_page(finn_code, driver):
  ## Getting the raw page
  detail_page = "https://www.finn.no/realestate/homes/ad.html?finnkode={}".format(finn_code)
  driver.get(detail_page)
  soup = bs(driver.page_source, 'html.parser')
  

  ad_main = soup.find('div', attrs={"class":"u-word-break"})

  try:
    price = ad_main.find(lambda tag: tag.name == 'span' and tag.has_attr('class') and tag['class'] == ['u-t3']).string
  except AttributeError:
    # print(f'No house details found with {finn_code} finncode')
    return

  price = int(''.join(price.split()[:-1]))

  ad_dict = {
      "id" : finn_code,
    "title": ad_main.find('h1').string,
    "address": ad_main.find('p', attrs={'class':'u-caption'}).string,
    "price": price
  }

  ## Parsing number of rooms, bed rooms, total space and other info
  other_features = ad_main.find_all('dl', attrs={'class':'definition-list'})
  for feature_vec in other_features:
    feature_vec = feature_vec.get_text().split('\n')
    
    ## basic data cleaning
    feature_vec = [''.join(x.strip()) for x in feature_vec if len(x) > 0]

    if len(feature_vec) < 2:
      continue
    for k,v in zip(feature_vec[0::2], feature_vec[1::2]):
      if len(k) == 0 or len(v) == 0:
        continue
      ad_dict[k] = v
  
  facilities = []
  try:
    for item in ad_main.find_all('ul', attrs={'class':'list'}):
      flts = item.get_text().strip().split('\n')
      for item in flts:
        item = item.split('/')
        facilities.extend(item)
  except AttributeError:
    pass
    # ad_dict['facilities'] = None
  else:
    ad_dict['facilities'] = list(set(facilities))

  ## Parsing textual data in the ad body
  more_info = []
  try:
    for paragraph in ad_main.find('div', attrs={"data-controller":"moreKeyInfo"}).find_all('p'):
      for lines in paragraph.get_text().strip():
        more_info.extend(lines.strip())
  except AttributeError:
    pass

  try:
    for paragraph in ad_main.find('div', attrs={"id":"collapsableTextContent"}).find_all('p'):
      for lines in paragraph.get_text().strip().split('\n'):
        more_info.extend(lines.strip())
  except AttributeError:
    pass
  
  if len(more_info) > 5:
    ad_dict['description'] = ''.join(more_info)

  try:    # Get the latitude and longitude from the attached map in the post
    mapinfo = soup.find('a', attrs={"data-controller":"trackMap"})
    lat = mapinfo['href'].split('&')[1].split('=')[1]
    lon = mapinfo['href'].split('&')[2].split('=')[1]
  except AttributeError:
    # print('cant find map')
    pass
  else:
    ad_dict['lat'] = lat
    ad_dict['lon'] = lon

  return ad_dict

Its a sample of data thats been collected from a finn code. I have cleaned a bit when scraping the data. But a thorough cleaning would be done when I process the whole dataset.

In [None]:
finn_code = 200661038
pp(parse_housing_page(finn_code, driver))

{'Boligselgerforsikring': 'Ja',
 'Boligtype': 'Hytte',
 'Bruksareal': '107 m²',
 'Bruttoareal': '118 m²',
 'Byggeår': '2014',
 'Eieform bolig': 'Eier (Selveier)',
 'Energimerking': 'C - gul',
 'Kommunale avg.': '8\xa0203 kr per år',
 'Omkostninger': '106\xa0892 kr',
 'Primærrom': '101 m²',
 'Soverom': '4',
 'Tomteareal': '2345 m² (eiet)',
 'Totalpris': '3\xa0756\xa0892 kr',
 'address': 'Havåsen 9, 4187 Ombo',
 'description': 'Entre/gang,4soverom,bad,stue/kjøkken,wc,vaskerom.3650000,-(Prisantydning)Omkostninger2800,-(BoligkjøperforsikringPluss(valgfritttillegg))172,-(Pantattestkjøper)585,-(Tingl.gebyrpantedokument)585,-(Tingl.gebyrskjøte)11500,-(BoligkjøperforsikringHELP(valgfritt))91250,-(Dokumentavgift(forutsattsalgssum:3650000,-))--------------------------------------------------------106892,-(Omkostningertotalt)--------------------------------------------------------3756892,-(Totalprisinkl.omkostninger)NB:Regnestykketforutsetteratdetkuntinglysesettpantedokumentogateiendommenselgesti

Scraping all the collected finn codes and making a dataset from them.

In [None]:
houses_dataset = []
for finn_code in tqdm(finn_codes_home):
  house_dict = parse_housing_page(finn_code, driver)
  if house_dict is None:
    print(f'\texcluded:{finn_code}')
    continue
  # print(f'\tAdded:{finn_code}')
  houses_dataset.append(house_dict)

### Data Analysis
Sample of the dataset

#### Data Cleaning

In [None]:
df = pd.DataFrame.from_dict(list(filter(None,houses_dataset)))
df.sample(5)

Unnamed: 0,id,title,address,price,Fellesgjeld,Omkostninger,Totalpris,Felleskost/mnd.,Boligtype,Eieform bolig,Soverom,Primærrom,Bruksareal,Etasje,Byggeår,Energimerking,Rom,Tomteareal,Bruttoareal,Fellesformue,Formuesverdi,facilities,description,lat,lon,Kommunale avg.,Boligselgerforsikring,Verditakst,Lånetakst,Tomt,Festeavgift,Pris med fellesgjeld,Renovert år,Låneverdi,Grunnflate,Felleskost/mnd. etter avdragsfri periode,Sikringsordning,Festeår,Utleiedel
1478,200408117,Sofienberg 2-roms oppussingsobjekt med et bra...,"Trondheimsveien 95, 0565 Oslo",3300000,93 000 kr,15 385 kr,3 408 385 kr,3 866 kr,Leilighet,Andel,1.0,50 m²,53 m²,3,1941,F - rød,2.0,9283 m² (eiet),58 m²,,959 906 kr,"[Kabel-TV, vektertjeneste, Vaktmester-, Felles...","entrè,kjøkken,stue,badogsoverom.jFelleskostnad...",59.924236,10.774315,,Ja,,,,,,,,,,,,
1681,200327120,"Delikat og innbydende 2,5 roms med stor sydves...","Tokerudberget 10, 0986 Oslo",2990000,158 000 kr,80 072 kr,3 228 072 kr,3 790 kr,Leilighet,Eier (Selveier),1.0,79 m²,81 m²,2,1972,E - rød,2.0,28203 m²,90 m²,27 690 kr,814 248 kr,"[Ingen gjenboere, Kabel-TV, Offentlig vann, Ro...","Varmtvann,kabel-tv/internett(grunnpakke),vaktm...",59.9649,10.9262,,,,,,,,,,,,,,
1071,199849055,Lekker og praktisk toppleilighet med solrik ba...,"Leif Aunes vei 5 D, 8012 Bodø",1920000,169 976 kr,960 kr,2 090 936 kr,2 627 kr,Leilighet,Andel,,32 m²,39 m²,3,1968,G - rød,1.0,12241 m² (eiet),,17 056 kr,386 420 kr,"[Offentlig vann, Kabel-TV, Rolig, Balkong, Par...",Felleskostnaderinkludererbetjeningavandelfelle...,67.2993,14.412008,,,1 920 000 kr,,,,,,,,,,,
304,196601244,Fyllingsdalen - Fantastisk 3-roms leilighet m/...,"Løvåsbakken 33, 5145 Fyllingsdalen",890000,2 598 933 kr,5 660 kr,3 494 593 kr,13 501 kr,Leilighet,Andel,2.0,77 m²,80 m²,2,2008,C - rød,,16122 m² (eiet),80 m²,11 774 kr,781 933 kr,[],Vedtekbestemtforkjøpsrettsomvilbliavklartiette...,60.355522,5.2844105,,Ja,,,,,,,,,,,,
824,199856052,"Pen, attraktiv og innbydende endeleilighet med...","Plogveien 15, 0679 Oslo",3400000,92 000 kr,15 385 kr,3 507 385 kr,3 190 kr,Leilighet,Andel,1.0,53 m²,53 m²,2,1958,F - rød,2.0,28865 m² (eiet),59 m²,9 477 kr,774 555 kr,"[Kabel-TV, Ingen gjenboere, Rolig, vektertjene...","entre,stue,kjøkken,bad/wc,1soveromjaVarmtvann,...",59.898865,10.808667,,Ja,,,,,,,,,,,,


In [None]:
df.shape

(1694, 39)

Saving the raw dataset, as backup.

In [None]:
df.to_csv('Norway_houses.csv')

Lets checkout the amount of missing values the dataset has

In [None]:
df.isnull().sum().sort_values(ascending = False)

Utleiedel                                   1693
Festeår                                     1692
Sikringsordning                             1690
Grunnflate                                  1682
Felleskost/mnd. etter avdragsfri periode    1680
Låneverdi                                   1677
Pris med fellesgjeld                        1669
Lånetakst                                   1664
Renovert år                                 1662
Festeavgift                                 1649
Tomt                                        1614
Verditakst                                  1415
Boligselgerforsikring                       1303
Fellesformue                                1051
Fellesgjeld                                  992
Kommunale avg.                               962
Rom                                          636
Felleskost/mnd.                              571
Etasje                                       528
Bruttoareal                                  374
Energimerking       

Lets translate the columns to english.

In [None]:
def n2e(norsk_text):
  return GoogleTranslator(source='no', target='en').translate(norsk_text)

df.columns = list(map(n2e, df.columns))

In [None]:
missing_vals = df.isnull().sum().sort_values(ascending = False)
missing_vals[missing_vals>0]

Rental part                                        1693
Party year                                         1692
Security scheme                                    1690
Base surface                                       1682
Felleskost / mnd. after installment-free period    1680
Loan value                                         1677
Price with joint debt                              1669
Loan rate                                          1664
Renovated year                                     1662
Fixing fee                                         1649
Empty                                              1614
Valuation                                          1415
Home seller insurance                              1303
Common property                                    1051
Joint debt                                          992
Municipal avg.                                      962
Room                                                636
Felleskost / mnd.                               

There are some column with a lot of missing values, In close inspection lot of these columns are not that important either. I will be removing the columns with more than 50% missing values.

In [None]:
cutoff_pct = 50
cols_to_remove = missing_vals[missing_vals > df.shape[0] * cutoff_pct / 100].keys()
df = df.drop(columns=list(cols_to_remove))
df.sample(5)

Unnamed: 0,id,title,address,price,Costs,Total price,Felleskost / mnd.,Housing type,Own form of housing,Bedroom,Primary room,Usable area,Story,Year of construction,Energy labeling,Room,Land area,Gross area,Property value,facilities,description,lazy,lon
1107,200537320,Sjelden 3-roms toppleilighet med nydelig utsik...,"Heggdalsringen 65, 7049 Trondheim",5295000,140 440 kr,5 435 440 kr,2 189 kr,Leilighet,Eier (Selveier),2,69 m²,73 m²,5.0,2019,C - mørkegrønn,,15357 m² (eiet),,,[],"Felleskostnadenstipulertdekkerdriftavsameiet,f...",63.40735,10.449953
1555,200646058,Meget pen og tiltalende halvpart av tomannsbol...,"Narntegata 27, 1636 Gamle Fredrikstad",3500000,103 170 kr,3 603 170 kr,,Tomannsbolig,Eier (Selveier),3,87 m²,87 m²,,1917,,,577 m² (eiet),,614 394 kr,"[P-plass, Turterreng, Kabel-TV, Offentlig vann...","2.etg:Bad,vaskerom,gang,kjøkken,stue,soverom1....",59.217472,10.971416
1140,199964033,Fin enebolig i rolige og barnevennlige omgivel...,"Råbygdveien 71, 7105 Stadsbygd",1590000,40 920 kr,1 630 920 kr,,Enebolig,Eier (Selveier),4,128 m²,177 m²,,1964,G - oransje,5.0,1188 m² (eiet),192 m²,364 638 kr,"[Barnevennlig, Rolig, Ildsted, Peis]","kr1590000,-(Prisantydning)Omkostninger:kr585,-...",63.521866,10.00205
1401,184408739,"Innholdsrik familiebolig med stor tomt, 2-3 ga...","Ekeliveien 13, 3961 Stathelle",3390000,98 770 kr,3 488 770 kr,,Enebolig,Eier (Selveier),4,134 m²,186 m²,,1967,G - rød,,924 m² (eiet),201 m²,691 375 kr,"[P-plass, Fiskemulighet, Turterreng, Kabel-TV,...","3390000,-(Prisantydning)Omkostninger84750,-(Do...",59.035664,9.711189
516,200616425,RØYKEN - GLEINÅSEN - Nytt og flott rekkehus me...,"Brudeberget 19, 3440 Røyken",1700000,5 232 kr,4 988 232 kr,7 522 kr,Rekkehus,Andel,3,118 m²,124 m²,,2019,B - oransje,,,142 m²,,[],"20dagersomdekkerbla:renovasjon,TV/bredbånd,byg...",59.74256,10.417895


In [None]:
missing_vals = df.isnull().sum().sort_values(ascending = False)
missing_vals[missing_vals>0]

Room                    636
Felleskost / mnd.       571
Story                   528
Gross area              374
Energy labeling         371
Property value          330
Land area               101
Costs                    77
Total price              76
Bedroom                  55
Year of construction     28
Primary room             17
Usable area              11
dtype: int64

Now lets checkout some of the columns and what values they contain.

In [None]:
df['Own form of housing'].value_counts()

Eier (Selveier)    1115
Andel               540
Aksje                38
Obligasjon            1
Name: Own form of housing, dtype: int64

I see this column represent who owned the house. I will rename the column and values.

In [None]:
df.rename(columns={'Own form of housing':'owner'}, inplace=True)
df.replace({'owner':{'Eier (Selveier)':'private','Andel':'share','Aksje':'auction','Obligasjon':'bond'}}, inplace=True)
df['owner'].value_counts()

private    1115
share       540
auction      38
bond          1
Name: owner, dtype: int64

I see there is a lot of missing values in the 'Story' column. Lets see the type of house they are.

In [None]:
df[df['Story'].isna()]['Housing type'].value_counts()

Enebolig               290
Rekkehus                79
Leilighet               75
Tomannsbolig            65
Gårdsbruk/Småbruk        8
Andre                    5
Tomter                   3
Annet fritid             2
Produksjon/Industri      1
Name: Housing type, dtype: int64

They are mostly Detached, Semi Detached, farm houses. although there are some apartments as well But my guess is they are on the ground floor thats why they probably did not think its important to mention the number of stories. We can consider these properties as ground floors.

In [None]:
df['Story'].fillna(1, inplace = True)
missing_vals = df.isnull().sum().sort_values(ascending = False)
missing_vals[missing_vals>0]

Room                    636
Felleskost / mnd.       571
Gross area              374
Energy labeling         371
Property value          330
Land area               101
Costs                    77
Total price              76
Bedroom                  55
Year of construction     28
Primary room             17
Usable area              11
dtype: int64

Lets explore some questions about the houses.



Some columns like gross area is represented in other columns as well but have a lot of missing values. These columns need to be removed for predictions.

In [None]:
df.drop(columns=['Room', 'Felleskost / mnd.', 'Property value', 'Land area', 'Costs', 'Total price'], inplace=True)
df.sample(5)

Unnamed: 0,id,title,address,price,Housing type,owner,Bedroom,Primary room,Usable area,Story,Year of construction,Energy labeling,Gross area,facilities,description,lazy,lon
1145,200398217,Super barnevennlig beliggenhet på beste Bønest...,"Bønesskogen 389, 5154 Bønes",3550000,Enebolig,share,3,105 m²,114 m²,1,1984,E - oransje,125 m²,"[P-plass, Turterreng, Kabel-TV, Barnevennlig, ...","1.etasje:Entré,hall,2soverom,badogtoalettrom.2...",60.334713,5.297718
218,199967754,Lys og romslig 2-roms topp-/endeleilighet med ...,"Refstad allé 14, 0586 Oslo",2780000,Leilighet,share,1,55 m²,55 m²,4,1957,E - lysegrønn,61 m²,"[Kabel-TV, Offentlig vann, vektertjeneste, Vak...",Raskavklaringavforkjøpsretten.MeldefristforOBO...,59.94149,10.80319
1155,199898563,STOR OG TILTALENDE 4-ROMS ENDELEILIGHET I 2. E...,"Fjordveien 67A, 1363 Høvik",7100000,Leilighet,private,3,99 m²,99 m²,2,1961,G - oransje,109 m²,"[P-plass, Turterreng, Ingen gjenboere, Barneve...",Alleromeritilstandsrapportenmedregnetiprimærea...,59.89179,10.571858
531,200395336,Enebolig i kjede beliggende på enden - Solvend...,"Glassbegerveien 271, 4032 Stavanger",2350000,Enebolig,share,2,87 m²,87 m²,3,2004,D - rød,99 m²,"[P-plass, Kabel-TV, Offentlig vann, Rolig, Bal...",1Etasje:Bruksareal:1.etasje:37kvmPrimærrom:1.e...,58.9017,5.7236
386,199771988,"SØRREISA - Koselig leilighet, gode solforhold ...","Borgveien 31, 9310 Sørreisa",1200000,Leilighet,private,1,64 m²,69 m²,1,1995,E - oransje,73 m²,"[Peis, Rolig, Utsikt, Ildsted]","Forsikring,brøyting/strøing,samtforefallendeve...",69.1404,18.1206


Lets rename some of the columns for ease of use

In [None]:
df.columns = [''.join(col.lower().split(' ')) for col in list(df.columns)]
df.rename(columns={
    'housingtype':'type',
    'bedroom':'broom',
    'primaryroom':'proom',
    'yearofconstruction':'year',
    'energylabeling':'energy',
    'lazy':'lat'
    }, inplace=True)

df.sample(5)

Unnamed: 0,id,title,address,price,type,owner,broom,proom,usablearea,story,year,energy,grossarea,facilities,description,lat,lon
151,200508099,Eksklusiv leilighet med gjennomgående planløsn...,"Dyna brygge 3, 0252 Oslo",17000000,Leilighet,private,2,122 m²,122 m²,2,2014,C - mørkegrønn,134 m²,"[Vaktmester-, Lademulighet, Utsikt, Ildsted, B...","Entré/gang,2soverom,dusjbad,hovedbad,stue,åpen...",59.907555,10.719667
1203,198145724,Sjarmerende enebolig med landlig beliggenhet. ...,"Håkestadveien 275, 3280 Tjodalyng",3100000,Enebolig,private,2,195 m²,265 m²,2,1967,F - gul,284 m²,"[P-plass, Ingen gjenboere, Rolig, Balkong, Hag...","EneboligBruksareal:1.etasje:156kvmEntre,2xgang...",59.0779,10.1137
1556,199884205,VISN: 07/12 kl.16.30-17.30! (Påmelding) Innhol...,"Bassenggata 13, 6516 Kristiansund N",2790000,Enebolig,private,4,185 m²,196 m²,1,1976,E - oransje,222 m²,"[P-plass, Offentlig vann, Rolig, Balkong, Hage...",SokkeletasjeBruksareal:Kjeller:92kvmTrappegang...,63.1111,7.7541
1445,197229472,Enebolig Minkvegen 10,"MINKVEGEN 10, 2030 Nannestad",3300000,Enebolig,private,2,153 m²,172 m²,2,1965,G - rød,,"[P-plass, Offentlig vann, Balkong, Sentralt, k...","NeiDokumentavgift,tinglysingavskjøte,tinglysin...",60.222977,11.028235
1131,199890271,Innholdsrik og pen 3-roms leilighet med to alt...,"Hjalmar Brantings vei 127, 5143 Fyllingsdalen",2890000,Leilighet,share,2,92 m²,92 m²,2,1973,,101 m²,"[Kabel-TV, Offentlig vann, Rolig, Balkong, Par...","Bruksareal:2.etasje:92kvmGang,stue/kjøkken,bad...",60.3468,5.2915


bed room, primary room, usable area, gross area looks to be correlated. Lets make a smaller dataset without Missing values to see how they correlate.

In [None]:
df_areas = df[['broom', 'proom', 'usablearea', 'grossarea']]

clean_area = lambda x: float(x.split(' ')[0])

df_areas = df_areas[df_areas.isnull().sum(axis=1) == 0]
df_areas['proom'] = df_areas['proom'].apply(clean_area)
df_areas['broom'] = df_areas['broom'].apply(clean_area)
df_areas['usablearea'] = df_areas['usablearea'].apply(clean_area)
df_areas['grossarea'] = df_areas['grossarea'].apply(clean_area)

df_areas.corr().style.background_gradient(cmap='coolwarm', axis=None)

Unnamed: 0,broom,proom,usablearea,grossarea
broom,1.0,0.825387,0.791842,0.791427
proom,0.825387,1.0,0.946089,0.946891
usablearea,0.791842,0.946089,1.0,0.996301
grossarea,0.791427,0.946891,0.996301,1.0


As we can see usable area is highly correlated with gross area and primary room size. So we can remove those columns as they do not add a lot of information for prediction model.

In [None]:
df.drop(columns=['proom', 'grossarea'], inplace=True)
missing_vals = df.isnull().sum().sort_values(ascending = False)
missing_vals[missing_vals>0]

energy        371
broom          55
year           28
usablearea     11
dtype: int64

In [None]:
df = df[df.isnull().sum(axis=1) == 0]

Unnamed: 0,id,title,address,price,type,owner,broom,usablearea,story,year,energy,facilities,description,lat,lon
0,200548355,PEN OG INNBYDENDE LEILIGHET I 2. ETASJE PÅ NYB...,"Tampereveien 4 B, 7020 Trondheim",1690000,Leilighet,share,0,31 m²,2,1960,E - rød,"[P-plass, Turterreng, Kabel-TV, Offentlig vann...","Bruksareal:2.etasje:31kvmGang,bad,stue,kjøkken...",63.4142,10.3496
3,181444617,"Åros, enebolig med solrik og sentral bel. kun ...","Hurumveien 53A, 3474 Åros",4260000,Enebolig,private,3,162 m²,1,1990,D - gul,"[P-plass, Fiskemulighet, Turterreng, Offentlig...","U.etg:Vindfang,trapperom,1soverom,kjellerstue,...",59.704712,10.51447
5,176406543,Flott bolig med alt på ett plan - Gode solforh...,"Rygjaveien 29 a, 4020 Stavanger",6850000,Enebolig,private,3,201 m²,1,1968,F - oransje,"[P-plass, Offentlig vann, Rolig, Balkong, Park...","Bruksareal:1.etasje:149kvmEntre/hall,WC,mellom...",58.9268,5.7255
6,200294416,"Lekker familiebolig med 3 soverom, 2 bad og 2 ...","Grannesstubben 28, 4044 Hafrsfjord",4990000,Rekkehus,private,3,134 m²,1,2011,B - rød,"[P-plass, Turterreng, Offentlig vann, Balkong,...",Bruksareal:1.etasje:66kvm2.etasje:68kvmPrimærr...,58.9308,5.6916
7,197926933,Tidligere småbruk med idyllisk beliggenhet ved...,"Strandvegen 70, 2430 Jordet",1900000,Gårdsbruk/Småbruk,private,4,157 m²,1,1912,G - oransje,[],Bruksareal:Kjeller:13kvm1.etasje:102kvm2.etasj...,61.4209,12.1463
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1689,200638453,Oppussingsobjekt: Stort selveiende enderekkehu...,"Kiplehaugen 43, 5179 Godvik",3590000,Rekkehus,private,2,165 m²,1,1969,F - gul,[],"3590000,-(Prisantydning)Omkostninger2800,-(Bol...",60.37144,5.2012715
1690,200499190,Lys og hyggelig leilighet over 2 plan med nærh...,"Laskenveien 2C, 3214 Sandefjord",2100000,Leilighet,share,2,104 m²,2,1999,D - oransje,"[P-plass, Turterreng, Offentlig vann, Rolig, B...",Laskenveien2cBruksareal:2.etasje:40kvmGang/tra...,59.134,10.2047
1691,200531964,Innholdsrik enebolig fra 2014 med egen inngang...,"Solbergliveien 9C, 0682 Oslo",10800000,Enebolig,private,3,164 m²,1,2014,B - rød,"[P-plass, Turterreng, Kabel-TV, Offentlig vann...","Kjeller:Bad,vaskerom,gang,kjellerstue.1.etasje...",59.908833,10.837808
1692,200515582,Steinkjer - Enebolig på Byafossen. Oppusset 1 ...,"Klepparvegen 6, 7716 Steinkjer",1490000,Enebolig,private,3,232 m²,1,1973,G - oransje,[],"1490000,-(Prisantydning)Omkostninger172,-(Pant...",64.03574,11.548495
