In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import math

# Intro

In this notebook I'm cleaning the CSV created in "2. Sthlm..."

In [125]:
# Load the raw CSV that this notebook cleans (see Intro)
data = pd.read_csv(filepath_or_buffer="sthlm_raw.csv")

In [126]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58169 entries, 0 to 58168
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   adress          58169 non-null  object 
 1   omrade          57442 non-null  object 
 2   kvm             57966 non-null  object 
 3   rum             58151 non-null  object 
 4   maklare         58169 non-null  object 
 5   avgift          57957 non-null  float64
 6   slutpris        58169 non-null  int64  
 7   datum           58169 non-null  object 
 8   prisförändring  58169 non-null  int64  
 9   gata_id_lst     58169 non-null  int64  
 10  gata_lst        56908 non-null  object 
 11  stockholm_lst   58169 non-null  object 
dtypes: float64(1), int64(3), object(8)
memory usage: 5.3+ MB


# Drop rows where "kvm" or "rum" is missing

Start with dropping for efficiency (avoiding the risk of spending time fixing rows I'll later drop)

kvm and rum are two of the most important values, and I prefer dropping rows where they are missing over any other approach.

I feel OK with dropping them since I'm only losing approx. 220 of 58,169 rows, i.e. about 0.4% of the total sample

In [127]:
# Keep only the rows that have a value for rum
data = data.loc[data.rum.notna()]

In [128]:
# Keep only the rows that have a value for kvm.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in the following cells cannot trigger SettingWithCopyWarning.
data = data.loc[data.kvm.notna()].copy()

In [129]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57949 entries, 0 to 58168
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   adress          57949 non-null  object 
 1   omrade          57224 non-null  object 
 2   kvm             57949 non-null  object 
 3   rum             57949 non-null  object 
 4   maklare         57949 non-null  object 
 5   avgift          57737 non-null  float64
 6   slutpris        57949 non-null  int64  
 7   datum           57949 non-null  object 
 8   prisförändring  57949 non-null  int64  
 9   gata_id_lst     57949 non-null  int64  
 10  gata_lst        56693 non-null  object 
 11  stockholm_lst   57949 non-null  object 
dtypes: float64(1), int64(3), object(8)
memory usage: 5.7+ MB


# Adapting numerical decimals to pandas

In [130]:
# Changing , to . for kvm
# regex=False: "," is a literal character here, independent of the pandas default
data["kvm"] = data["kvm"].str.replace(",", ".", regex=False)

In [131]:
# Changing , to . for rum
# regex=False: "," is a literal character here, independent of the pandas default
data["rum"] = data["rum"].str.replace(",", ".", regex=False)

# Data-types

## kvm

In [132]:
# Cast kvm from object (text) to float
data["kvm"] = data["kvm"].astype("float64")

## rum

In [133]:
# Cast rum from object (text) to float
data["rum"] = data["rum"].astype("float64")

## datum

In [134]:
# Need to remove and replace text before converting to datetime-type

In [135]:
# Remove the text "Såld" from all rows
# regex=False: plain substring removal, independent of the pandas default
data["datum"] = data["datum"].str.replace("Såld", "", regex=False)

In [136]:
# Function to change text for month to numerical value for month

# Swedish month names mapped to their month number (as text), in calendar order.
# The order matters: the first name found in the string is the one replaced.
_MANADER = {
    "januari": "1",
    "februari": "2",
    "mars": "3",
    "april": "4",
    "maj": "5",
    "juni": "6",
    "juli": "7",
    "augusti": "8",
    "september": "9",
    "oktober": "10",
    "november": "11",
    "december": "12",
}


def manader_to_num(datum):
    """Replace the Swedish month name in a date string with its month number.

    Parameters
    ----------
    datum : str
        Text that may contain a lower-case Swedish month name.

    Returns
    -------
    str or float
        ``datum`` with the first matching month name (checked in calendar
        order) replaced by its number. ``np.nan`` if no month name is found,
        or if ``datum`` is not a string (e.g. a missing value), instead of
        raising ``TypeError`` on the substring test.
    """
    if not isinstance(datum, str):
        return np.nan
    for namn, nummer in _MANADER.items():
        if namn in datum:
            return datum.replace(namn, nummer)
    return np.nan

In [137]:
# Swap the month name for its number in every datum value
data["datum"] = data["datum"].apply(manader_to_num)

In [138]:
# Convert datum from object to datetime; dayfirst=True because the day
# is written before the month
data["datum"] = pd.to_datetime(data["datum"], dayfirst=True)

In [139]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57949 entries, 0 to 58168
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   adress          57949 non-null  object        
 1   omrade          57224 non-null  object        
 2   kvm             57949 non-null  float64       
 3   rum             57949 non-null  float64       
 4   maklare         57949 non-null  object        
 5   avgift          57737 non-null  float64       
 6   slutpris        57949 non-null  int64         
 7   datum           57949 non-null  datetime64[ns]
 8   prisförändring  57949 non-null  int64         
 9   gata_id_lst     57949 non-null  int64         
 10  gata_lst        56693 non-null  object        
 11  stockholm_lst   57949 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(5)
memory usage: 5.7+ MB


# NaN in avgift: Future update (maybe?)

## Description for future update

I'm not really interested in the values in this column, so I don't think I'll bother filling the missing values.

When there are values missing it's simply because that info has been left out from my data-source (Hemnet)

# NaNs in gata_lst: Future update
## Description for future update: gata_lst - fillna from street name in adress column

In [140]:
# Inspecting rows where gata_lst is NaN.
# Not all special cases were picked up when the data source for gata_lst
# was created, e.g. street names containing "é".
data[data["gata_lst"].isna()]

Unnamed: 0,adress,omrade,kvm,rum,maklare,avgift,slutpris,datum,prisförändring,gata_id_lst,gata_lst,stockholm_lst
3635,Tegnérgatan 5,"Norrmalm,",92.000000,3.000000,Behrer & Partners,3746.000000,11700000,2020-09-08,14,475246,,Stockholms kommun
3636,Tegnérgatan 5A,"Vasastan - Östermalm,",34.000000,2.000000,Fastighetsbyrån Stockholm - Vasastan,1691.000000,3800000,2020-09-01,12,475246,,Stockholms kommun
3637,Tegnérgatan 10,"Vasastan,",66.000000,2.000000,BlumenthalHoffman Fastighetsmäkleri,2794.000000,6900000,2020-08-30,15,475246,,Stockholms kommun
3638,"Tegnérgatan 5 A, 2 tr","Vasastan / Norrmalm,",57.000000,2.000000,Eklund Stockholm New York,2352.000000,5890000,2020-07-02,7,475246,,Stockholms kommun
3639,"Tegnérgatan 55A, 2tr","Vasastan/Norrmalm,",35.500000,2.000000,Innerstadsspecialisten S AB,2284.000000,3900000,2020-06-24,16,475246,,Stockholms kommun
...,...,...,...,...,...,...,...,...,...,...,...,...
57050,"Lugnets Allé 67, 5tr","Hammarby Sjöstad,",66.000000,2.000000,HusmanHagberg Södermalm/Gamla Stan,4388.000000,3195000,2013-04-18,0,476302,,Stockholms kommun
57051,Lugnets Allé 41,"Sickla Kaj,",73.500000,3.000000,Fastighetsbyrån Stockholm - Hammarby Sjöstad,4269.000000,3450000,2013-04-10,6,476302,,Stockholms kommun
57052,Lugnets Allé 40,"Hammarby Sjöstad,",85.500000,3.000000,Fastighetsbyrån Stockholm - Hammarby Sjöstad,4489.000000,3725000,2013-03-22,0,476302,,Stockholms kommun
57053,Lugnets Allé 65,"Hammarby Sjöstad,",52.000000,2.000000,Fastighetsbyrån Stockholm - Hammarby Sjöstad,3755.000000,2225000,2013-03-14,-3,476302,,Stockholms kommun


# Outliers

## kvm

In [141]:
data.kvm.describe()

count   57949.000000
mean       57.793430
std        26.884555
min        10.000000
25%        38.000000
50%        52.500000
75%        73.000000
max       405.000000
Name: kvm, dtype: float64

### min

In [142]:
data.kvm.nsmallest()

7394    10.000000
22859   10.000000
41199   11.000000
2309    12.000000
2310    12.000000
Name: kvm, dtype: float64

In [143]:
# Values look reasonable 
data.loc[7394]

adress                           Hantverkargatan 77, 5 tr
omrade                          Kungsholmen/Fridhemsplan,
kvm                                             10.000000
rum                                              1.000000
maklare           Fastighetsbyrån Stockholm - Kungsholmen
avgift                                         534.000000
slutpris                                          1900000
datum                                 2017-06-22 00:00:00
prisförändring                                         27
gata_id_lst                                        475403
gata_lst                                  Hantverkargatan
stockholm_lst                           Stockholms kommun
Name: 7394, dtype: object

https://www.hemnet.se/salda/lagenhet-1rum-kungsholmen-fridhemsplan-stockholms-kommun-hantverkargatan-77,-5-tr-696839

Found the object and the information seems to be correct

### max

In [144]:
data.kvm.nlargest()

37355   405.000000
29779   334.000000
29388   309.000000
30120   308.000000
31530   279.000000
Name: kvm, dtype: float64

In [145]:
# value for kvm is not in sync with other values
# almost sure that it should be 40.5 kvm
data.loc[37355]

adress                  Svartensgatan 28 A, 2 tr ö.g
omrade                           Södermalm Katarina,
kvm                                       405.000000
rum                                         1.500000
maklare           HusmanHagberg Södermalm/Gamla Stan
avgift                                   1836.000000
slutpris                                     4060000
datum                            2015-12-03 00:00:00
prisförändring                                    33
gata_id_lst                                   475963
gata_lst                               Svartensgatan
stockholm_lst                      Stockholms kommun
Name: 37355, dtype: object

In [146]:
# Changing the value for index = 37355: the recorded 405 kvm does not fit the
# row's other values (1.5 rum), so it is corrected to the presumed 40.5 kvm
data.loc[37355, "kvm"] = 40.5

In [147]:
# value for kvm is in sync with other values
data.loc[29779]

adress             Eriksbergsgatan 13
omrade                     Östermalm,
kvm                        334.000000
rum                          9.000000
maklare                    Lagerlings
avgift                   10755.000000
slutpris                     51000000
datum             2019-12-19 00:00:00
prisförändring                      4
gata_id_lst                    475717
gata_lst              Eriksbergsgatan
stockholm_lst       Stockholms kommun
Name: 29779, dtype: object

In [148]:
# value for kvm in sync with other values
data.loc[29388]

adress                           Karlavägen 83
omrade              Östermalm - vid Karlaplan,
kvm                                 309.000000
rum                                   9.000000
maklare           Fredegårds Fastighetsbyrå AB
avgift                            10671.000000
slutpris                              33000000
datum                      2015-10-12 00:00:00
prisförändring                              -5
gata_id_lst                             475695
gata_lst                            Karlavägen
stockholm_lst                Stockholms kommun
Name: 29388, dtype: object

In [149]:
# moving on...

## rum

In [150]:
# Looks reasonable
data.rum.describe()

count   57949.000000
mean        2.234689
std         0.971506
min         1.000000
25%         1.500000
50%         2.000000
75%         3.000000
max         9.000000
Name: rum, dtype: float64

In [151]:
# Looks reasonable
data.rum.value_counts()

2.000000    23448
3.000000    11326
1.000000    11291
4.000000     4174
1.500000     3468
2.500000     1887
5.000000      964
3.500000      839
4.500000      251
6.000000      210
5.500000       46
7.000000       29
6.500000       10
9.000000        3
8.000000        2
7.500000        1
Name: rum, dtype: int64

## Slutpris

In [152]:
# Display floats in fixed-point notation ('{:f}')
# NOTE(review): outputs of earlier cells already use this format, so the option
# was evidently set before they ran; consider moving it to a setup cell at the top.
pd.set_option("display.float_format", "{:f}".format)

# Inspecting column slutpris:
# max seems reasonable, min is wrong
data["slutpris"].describe()

count      57949.000000
mean     4728759.432259
std      2393844.647700
min            8.000000
25%      3150000.000000
50%      4100000.000000
75%      5650000.000000
max     60000000.000000
Name: slutpris, dtype: float64

### min

In [153]:
data.slutpris.nsmallest()

56366          8
40973     405000
51623    1180000
31124    1250000
57832    1265000
Name: slutpris, dtype: int64

In [154]:
# looking at nsmallest(1)
# can't infer appropriate value from other values
# removing this from data-set
data.loc[56366]

adress            Hammarby Allé 3, 8tr
omrade               Hammarby Sjöstad,
kvm                          83.500000
rum                           3.000000
maklare                          Notar
avgift                     4494.000000
slutpris                             8
datum              2016-02-05 00:00:00
prisförändring                    -100
gata_id_lst                     476284
gata_lst                           NaN
stockholm_lst        Stockholms kommun
Name: 56366, dtype: object

In [155]:
# Removing the row with slutpris = 8 (reassign instead of mutating in place)
data = data.drop(index=56366)

In [156]:
# looking at nsmallest(2)
# Probably missed a 0 for slutpris
# More efficient to just remove this row from the data-set
data.loc[40973]

adress                  Ringvägen 122, 5 tr
omrade                     Södermalm-Sofia,
kvm                               74.000000
rum                                3.000000
maklare           Södermäklarna Mariatorget
avgift                          3039.000000
slutpris                             405000
datum                   2014-04-07 00:00:00
prisförändring                          -89
gata_id_lst                          475989
gata_lst                          Ringvägen
stockholm_lst             Stockholms kommun
Name: 40973, dtype: object

In [157]:
# Removing the row with the suspiciously low slutpris of 405000
# (reassign instead of mutating in place)
data = data.drop(index=40973)

In [158]:
# looking at nsmallest(3)
# values seem reasonable
data.loc[51623]

adress            Sjöbjörnsvägen 11, 1tr
omrade                          Gröndal,
kvm                            20.000000
rum                             1.000000
maklare                            Notar
avgift                       1269.000000
slutpris                         1180000
datum                2013-10-02 00:00:00
prisförändring                         8
gata_id_lst                       476175
gata_lst                  Sjöbjörnsvägen
stockholm_lst          Stockholms kommun
Name: 51623, dtype: object

### max

In [159]:
# values seem reasonable
data.slutpris.nlargest()

31530    60000000
31532    60000000
29779    51000000
31228    43800000
31534    41000000
Name: slutpris, dtype: int64

## Datum

In [162]:
# Looks good
data.datum.describe()

count                   57947
unique                   2726
top       2015-05-13 00:00:00
freq                       89
first     2011-02-09 00:00:00
last      2020-09-12 00:00:00
Name: datum, dtype: object

## Prisförändring

In [163]:
# Inspecting prisförändring

# min-value is suspicious
# max-value is suspicious
data.prisförändring.describe()

count       57947.000000
mean         2254.564309
std        540041.959270
min           -58.000000
25%             3.000000
50%            10.000000
75%            17.000000
max     129999900.000000
Name: prisförändring, dtype: float64

### min

In [164]:
data.prisförändring.nsmallest()

54559   -58
19734   -49
29244   -49
12467   -43
43896   -41
Name: prisförändring, dtype: int64

In [165]:
# looks strange but not impossible
data.loc[54559]

adress                        Rosenlundsgatan 28B
omrade                   Södermalm / Mariatorget,
kvm                                    101.000000
rum                                      3.000000
maklare           Mäklarhuset Stockholm Innerstan
avgift                                3173.000000
slutpris                                  3304815
datum                         2016-06-12 00:00:00
prisförändring                                -58
gata_id_lst                                476244
gata_lst                          Rosenlundsgatan
stockholm_lst                   Stockholms kommun
Name: 54559, dtype: object

https://www.hemnet.se/salda/lagenhet-3rum-sodermalm-mariatorget-stockholms-kommun-rosenlundsgatan-28b-531405 

Info seems to be correct

### max

In [166]:
# The two largest values, nlargest(1) and (2), are clearly wrong
data.prisförändring.nlargest()

50144    129999900
32232        10897
45062          523
14701          374
7192            85
Name: prisförändring, dtype: int64

In [167]:
# Removing the two rows with the largest (erroneous) prisförändring in one call;
# reassign instead of mutating in place
data = data.drop(index=[50144, 32232])

In [168]:
# unreasonable
data.loc[45062]

adress                               Lundagatan 46
omrade                                  Södermalm,
kvm                                      36.000000
rum                                       2.000000
maklare           Erik Olsson Fastighetsförmedling
avgift                                 1880.000000
slutpris                                   3200000
datum                          2017-10-06 00:00:00
prisförändring                                 523
gata_id_lst                                 476072
gata_lst                                Lundagatan
stockholm_lst                    Stockholms kommun
Name: 45062, dtype: object

In [169]:
# Removing the row with the unreasonable prisförändring of 523
# (reassign instead of mutating in place)
data = data.drop(index=45062)

In [170]:
# possible but not likely to be correct
data.loc[14701]

adress                     Alströmergatan 8 A, 3tr ög
omrade                                   Kungsholmen,
kvm                                         64.000000
rum                                          3.000000
maklare           Innerstadsspecialisten Stockholm AB
avgift                                    2601.000000
slutpris                                      4150000
datum                             2014-06-13 00:00:00
prisförändring                                    374
gata_id_lst                                    475473
gata_lst                               Alströmergatan
stockholm_lst                       Stockholms kommun
Name: 14701, dtype: object

In [171]:
# Removing the row with the unlikely prisförändring of 374
# (reassign instead of mutating in place)
data = data.drop(index=14701)

In [172]:
data.prisförändring.nlargest()

7192     85
7177     69
11328    69
55033    63
12298    62
Name: prisförändring, dtype: int64

In [173]:
# possible 
data.loc[7192]

adress                                  Skillinggränd 5
omrade               Kungsholmstorg - Norr Mälarstrand,
kvm                                           43.000000
rum                                            2.000000
maklare           SkandiaMäklarna Stockholm Kungsholmen
avgift                                      2685.000000
slutpris                                        3700000
datum                               2016-06-16 00:00:00
prisförändring                                       85
gata_id_lst                                      475401
gata_lst                                  Skillinggränd
stockholm_lst                         Stockholms kommun
Name: 7192, dtype: object

## avgift

In [174]:
# min-value: 1, i.e. some apartments have a practically zero fee
# max-value is not unreasonable
data.avgift.describe()

count   57731.000000
mean     2769.866051
std      1230.804214
min         1.000000
25%      1886.000000
50%      2539.000000
75%      3472.000000
max     12945.000000
Name: avgift, dtype: float64

In [175]:
data.avgift.nlargest()

30351   12945.000000
31530   11651.000000
23426   11438.000000
42435   11192.000000
27805   11137.000000
Name: avgift, dtype: float64

In [176]:
# reasonable
data.loc[30351]

adress                  Brahegatan 23
omrade                     Östermalm,
kvm                        223.000000
rum                          6.000000
maklare                    Lagerlings
avgift                   12945.000000
slutpris                     22500000
datum             2019-09-27 00:00:00
prisförändring                      2
gata_id_lst                    475740
gata_lst                   Brahegatan
stockholm_lst       Stockholms kommun
Name: 30351, dtype: object

# Final inspection

In [177]:
# I'm satisfied with the data-types
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57943 entries, 0 to 58168
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   adress          57943 non-null  object        
 1   omrade          57218 non-null  object        
 2   kvm             57943 non-null  float64       
 3   rum             57943 non-null  float64       
 4   maklare         57943 non-null  object        
 5   avgift          57731 non-null  float64       
 6   slutpris        57943 non-null  int64         
 7   datum           57943 non-null  datetime64[ns]
 8   prisförändring  57943 non-null  int64         
 9   gata_id_lst     57943 non-null  int64         
 10  gata_lst        56688 non-null  object        
 11  stockholm_lst   57943 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(3), object(5)
memory usage: 8.2+ MB


In [178]:
# No values seem unreasonable now
data.describe()

Unnamed: 0,kvm,rum,avgift,slutpris,prisförändring,gata_id_lst
count,57943.0,57943.0,57731.0,57943.0,57943.0,57943.0
mean,57.785147,2.23461,2769.866051,4728775.267625,10.933918,475749.097648
std,26.844243,0.971472,1230.804214,2393242.8441,9.863888,329.646886
min,10.0,1.0,1.0,1180000.0,-58.0,475079.0
25%,38.0,1.5,1886.0,3150000.0,3.0,475473.0
50%,52.5,2.0,2539.0,4100000.0,10.0,475691.0
75%,73.0,3.0,3472.0,5650000.0,17.0,476062.0
max,334.0,9.0,12945.0,60000000.0,85.0,476350.0


# Resetting index before writing to csv

In [179]:
# Resetting to a fresh 0..n-1 index and discarding the outdated one
# (reassign instead of mutating in place)
data = data.reset_index(drop=True)

In [180]:
# inspecting index, looks correct
data

Unnamed: 0,adress,omrade,kvm,rum,maklare,avgift,slutpris,datum,prisförändring,gata_id_lst,gata_lst,stockholm_lst
0,"Gamla Brogatan 25, 2tr","Vasastan - City/Norrmalm,",114.000000,3.500000,Fastighetsbyrån Stockholm - Vasastan,6769.000000,7600000,2018-04-13,-5,475079,Gamla Brogatan,Stockholms kommun
1,"Gamla Brogatan 25, 2 tr","Vasastan- City/ Norrmalm,",71.000000,2.000000,Mäklarhuset Stockholm Innerstan,4696.000000,5050000,2016-06-23,4,475079,Gamla Brogatan,Stockholms kommun
2,Gamla Brogatan 25,"Vasastan- City/ Norrmalm,",102.000000,4.000000,Mäklarhuset Stockholm Innerstan,6519.000000,6950000,2016-04-29,1,475079,Gamla Brogatan,Stockholms kommun
3,"Gamla Brogatan 25, 2tr","Vasastan - City/Norrmalm,",107.000000,4.000000,Fastighetsbyrån Stockholm - Vasastan,6713.000000,7150000,2015-11-26,2,475079,Gamla Brogatan,Stockholms kommun
4,"Drottninggatan 114 A, 3 tr","Vasastan - Norrmalm,",90.000000,3.000000,Bostadsrättsspecialisten,3822.000000,8900000,2020-08-13,0,475084,Drottninggatan,Stockholms kommun
...,...,...,...,...,...,...,...,...,...,...,...,...
57938,"Lindevägen 56, 3 tr","Enskede Gård,",74.500000,3.000000,Svensk Fastighetsförmedling,5002.000000,2740000,2014-04-10,19,476350,Lindevägen,Stockholms kommun
57939,Lindevägen 56,"Enskede Gård,",63.000000,2.000000,Svensk Fastighetsförmedling,3960.000000,1905000,2014-02-28,12,476350,Lindevägen,Stockholms kommun
57940,"Lindevägen 44, 2tr","Enskede Gård,",107.000000,4.000000,Fastighetsbyrån Enskede,6977.000000,3720000,2013-08-22,6,476350,Lindevägen,Stockholms kommun
57941,Lindevägen 50,"Enskede Gård,",61.500000,2.000000,Mäklarhuset Enskede,4221.000000,2100000,2013-03-07,11,476350,Lindevägen,Stockholms kommun


# Clean data to_csv

In [181]:
# Write the cleaned frame to disk without the index column
data.to_csv(path_or_buf="sthlm_raw_clean.csv", index=False)