# KV Data Cleanup

## Starting up

In [2]:
%pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Kasutaja\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [53]:
# Load the raw rental-listing export into a DataFrame.
# NOTE(review): the `web-scraper-*` columns suggest the file was produced by the
# Web Scraper tool; the date in the file name is presumably the scrape date — confirm.
data = pd.read_csv("kv-rent-data-16-11-2024.csv")

In [25]:
# Inspect dtypes and non-null counts of the raw frame.
# The situation is pretty bad by default: most columns (including price and area)
# are loaded as generic `object`, and several columns have missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 25 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   web-scraper-order       2676 non-null   object 
 1   web-scraper-start-url   2676 non-null   object 
 2   listing-link            2676 non-null   object 
 3   listing-link-href       2676 non-null   object 
 4   address                 2676 non-null   object 
 5   price                   2676 non-null   object 
 6   rooms                   2667 non-null   float64
 7   area                    2670 non-null   object 
 8   floor_out_of_floors     2494 non-null   object 
 9   build_year              2081 non-null   float64
 10  condition               2456 non-null   object 
 11  energy_grade            2311 non-null   object 
 12  summary                 2675 non-null   object 
 13  description             2633 non-null   object 
 14  bedrooms                1848 non-null   

In [26]:
# The Web Scraper bookkeeping columns say nothing about the listings themselves,
# so discard them and preview the remaining frame.
scraper_columns = ['web-scraper-order', 'web-scraper-start-url']
data = data.drop(columns=scraper_columns)
data.head()

Unnamed: 0,listing-link,listing-link-href,address,price,rooms,area,floor_out_of_floors,build_year,condition,energy_grade,...,ownership-form,katastrinumber,description-header,description-footer,prepayment,summer-winter,owner-or-broker-banner,registriosa-number,images-link,images-link-href
0,"Harjumaa, Tallinn, Vanalinn, Voorimehe tn 1",https://www.kv.ee/uurile-anda-2toaline-korter-...,"Anda üürile korter, 2 tuba - Voorimehe tn 1, V...",750 € 17.8 €/m²,2.0,42.1 m²,01-Mar,1807.0,Renoveeritud,-,...,,,STUUDIOKORTER RAEKOJA PLATSIL!,Kuulutuse link Kopeeri Kopeeri,,,Ats Pihl Realis OÜ,,Kõik pildid (29),https://www.kv.ee/object/images/3475992
1,"Tartumaa, Tartu, Kesklinn, Kivi 25",https://www.kv.ee/mobleeritud-paikeseline-3toa...,"Anda üürile korter, 3 tuba - Kivi 25, Kesklinn...",595 € 10.1 €/m²,3.0,58.7 m²,03-May,2005.0,Valmis,Puudub,...,Korteriomand,79514:018:0010,"Möbleeritud rõduga korter, parkimiskoht.",Kuulutuse link Kopeeri Kopeeri,,,Kristel Leetsi Kaanon Kinnisvarabüroo OÜ,,Kõik pildid (14),https://www.kv.ee/object/images/3696581
2,"Tartumaa, Tartu, Tartu linn, Kesklinn, Oru 2",https://www.kv.ee/uurile-anda-kesklinnas-asuv-...,"Anda üürile korter, 2 tuba - Oru 2, Kesklinn, ...",670 € 9.31 €/m²,2.0,72 m²,04-Apr,,Renoveeritud,-,...,Korteriomand,,,Kuulutuse link Kopeeri Kopeeri,,,Rait Sinimäe Raar Kinnisvara,,Kõik pildid (19),https://www.kv.ee/object/images/1543892
3,"Pärnumaa, Pärnu, Pärnu linn, Rannarajoon, Papl...",https://www.kv.ee/uus-hind-uurile-anda-kohesel...,"Anda üürile korter, 2 tuba - Papli 20, Rannara...",450 € 9.38 €/m²,2.0,48 m²,01-Feb,2003.0,Valmis,Puudub,...,Korteriomand,,Pikaajalise üürile terrassiga korter rannarajo...,Kuulutuse link Kopeeri Kopeeri,,,Krista Järv Lahe Kinnisvara,,Kõik pildid (14),https://www.kv.ee/object/images/3691815
4,"Harjumaa, Tallinn, Haabersti, Pikaliiva tn 5",https://www.kv.ee/korter-vabaneb-alates-01-11-...,"Anda üürile korter, 1 tuba - Pikaliiva tn 5, H...",550 € 22.9 €/m²,1.0,24 m²,03-Apr,2023.0,Uus,B,...,Korteriomand,78401:101:6960,"Super Pakkumine! Parkimiskoht, panipaik hinnas!",Kuulutuse link Kopeeri Kopeeri,550 €,,Stanislav Kostõljov ByPro OÜ,,Kõik pildid (38),https://www.kv.ee/object/images/3637834


## floor_out_of_floors deserves its own chapter - cell formatting is annoying

In [54]:
# floor_out_of_floors seems to have had a bit of a mishap:
# the values were automatically transformed to dates, but they were actually
# 1/5, 3/5, 1/2 etc. (floor / total floors).
# We want to separate all the columns that are easily separable
# (price, floor_out_of_floors), so first let's check the unique values.
# Most of them are pretty clear - Day-Month corresponds to FLOOR-TOTAL_FLOORS
# (e.g. '01-Mar' is floor 1 of 3).
# Some, however, aren't clear: -0.25, -0.333333333, -0.2.
# These need to be checked individually.
data['floor_out_of_floors'].unique()

array(['01-Mar', '03-May', '04-Apr', '01-Feb', '03-Apr', '01-Apr',
       '02-Mar', '05-May', '01-May', '04-May', '03-Mar', nan, '02-May',
       '06-Jun', '03-Jun', '03-Jul', '04-Feb', '05-Jun', '02-Apr',
       '02-Feb', '07-Sep', '04-Jun', '04-Sep', '06-Aug', '09-Sep',
       '03-Aug', '04-Jul', '02-Jun', '08-Aug', '05-Jul', '01-Jun',
       '01-Aug', '08-Dec', 'Dec-14', '03-Sep', '06-Jul', '01-Jan',
       '08-Sep', '04-Aug', '13/30', 'May-14', '01-Sep', 'Apr-14',
       '05-Oct', '18/23', '06-Sep', '15/30', '07-Aug', '05-Aug', 'Aug-15',
       '05-Sep', '-0.25', '04-Oct', '02-Jan', '02-Jul', '10-Nov',
       '02-Sep', 'Jul-16', 'Jun-14', 'Oct-19', 'Jun-13', '09-Dec',
       'Feb-14', '10-Dec', '24/30', '15/16', 'Dec-13', '02-Aug', 'May-13',
       '07-Nov', '14/20', 'Sep-14', 'Oct-17', '07-Jul', '09-Oct',
       '03-Oct', '08-Oct', '03-Feb', '06-Dec', 'Jul-20', 'Nov-19',
       'Jul-19', 'Jul-14', '14/14', 'Apr-22', '09-Nov', 'Jun-16',
       'Dec-16', 'Apr-13', 'Dec-15', 'Aug-19'

In [66]:
#data[data['floor_out_of_floors'] == '-0.25'].iloc[0]['listing-link-href'] # -0.25 is actually a basement floor -1/4
#data[data['floor_out_of_floors'] == '-0.25'].iloc[1]['listing-link-href'] # -0.25 is actually a basement floor -1/4
# ^ those two listings are actually the same apartment listed twice;
# the links really are different, though.
#data[data['floor_out_of_floors'] == '-0.333333333'].iloc[0]['listing-link-href'] # -0.333333333 is actually a basement floor -1/3
#data[data['floor_out_of_floors'] == '-0.2'].iloc[0]['listing-link-href'] # -0.2 is actually a basement floor -1/5 

'https://www.kv.ee/tallinna-vanalinnas-laial-tanaval-1850-aastal-valm-3613092.html'

In [None]:
# The plan is the following (not the most efficient, but it definitely won't break anything):
# 1. map -0.25, -0.333333333, -0.2 to -1/4, -1/3, -1/5
# 2. map the month names [Jan, Feb, Mar, ...] to [1, 2, 3, ...] in each string
# 3. map "-" to "/" - DANGER: replace only once, starting FROM THE RIGHT, otherwise the minus sign of negative floor numbers will be affected
# 4. check all unique values
# 5. if the unique values are all good, split on "/" and cast to int

In [None]:
# Split "750 € 17.8 €/m²" into the monthly price and the price per square metre.
data[['price', 'price_per_m2']] = data['price'].str.split('€ ', n=1, expand=True)

# floor_out_of_floors was mangled by spreadsheet date auto-formatting, so a plain
# split on '-' is wrong: '13/30' would not be split at all, '-0.25' would become
# ('', '0.25') and month names would stay as text. Undo the mangling first.
# NOTE(review): 'Day-Mon' (e.g. '01-Mar') is read as floor 1 of 3; 'Mon-YY'
# (e.g. 'Dec-14') is presumably floor 12 of 14 - spot-check a few listings.
MONTH_NUMBERS = {
    'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4', 'May': '5', 'Jun': '6',
    'Jul': '7', 'Aug': '8', 'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}
# Basement listings were evaluated as fractions; checked by hand against the listings.
BASEMENT_FLOORS = {'-0.25': '-1/4', '-0.333333333': '-1/3', '-0.2': '-1/5'}

floor_text = (
    data['floor_out_of_floors']
    .replace(BASEMENT_FLOORS)
    .str.replace(
        r'[A-Z][a-z]{2}',
        lambda match: MONTH_NUMBERS.get(match.group(0), match.group(0)),
        regex=True,
    )
)

# The optional leading minus belongs to the floor number, so only the separator
# between the two numbers ('-' or '/') is consumed by the pattern.
floor_parts = floor_text.str.extract(r'^\s*(-?\d+)\s*[-/]\s*(\d+)\s*$')

# Nullable integers keep listings without floor information as <NA>.
data['floor'] = pd.to_numeric(floor_parts[0]).astype('Int64')
data['total_floors'] = pd.to_numeric(floor_parts[1]).astype('Int64')

# Sanity check: raw values that could not be parsed (should be an empty array).
data.loc[floor_text.notna() & floor_parts[0].isna(), 'floor_out_of_floors'].unique()

In [38]:
# Remove the raw floor_out_of_floors column from the frame.
data = data.drop(columns='floor_out_of_floors')

## area and price_per_m2

In [50]:
# Let's strip the unit suffixes: "m²" from area and "€/m²" from price_per_m2.
# str.strip(' m²') treats its argument as a *set of characters* to remove from
# both ends, not as a suffix, so it only worked by accident. An anchored regex
# removes exactly the unit (plus surrounding whitespace, including non-breaking
# spaces) and nothing else. Values stay strings; missing values stay missing.
data['area'] = (
    data['area']
    .str.replace(r'\s*m²\s*$', '', regex=True)
    .str.strip()
)
data['price_per_m2'] = (
    data['price_per_m2']
    .str.replace(r'\s*€/m²\s*$', '', regex=True)
    .str.strip()
)