# KV Data Cleanup

## Starting up

In [1]:
%pip install matplotlib

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
import re
from fractions import Fraction

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
# Raw scrape export; every later cell rebinds or mutates `data`,
# so re-run the notebook from this cell after a kernel restart.
RAW_CSV_PATH = "kv-rent-data-16-11-2024.csv"
data = pd.read_csv(RAW_CSV_PATH)

In [4]:
# Hyphens in column names get in the way (e.g. attribute access), so switch to underscores.
data = data.rename(columns=lambda name: name.replace('-', '_'))
# First look at the raw frame: lots of `object` columns and missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web_scraper_order      2676 non-null   object 
 1   web_scraper_start_url  2676 non-null   object 
 2   listing_link           2676 non-null   object 
 3   listing_link_href      2676 non-null   object 
 4   address                2676 non-null   object 
 5   price                  2676 non-null   object 
 6   rooms                  2667 non-null   float64
 7   area                   2670 non-null   object 
 8   floor_out_of_floors    2494 non-null   object 
 9   build_year             2081 non-null   float64
 10  condition              2456 non-null   object 
 11  energy_mark            2311 non-null   object 
 12  summary                2675 non-null   object 
 13  description            2633 non-null   object 
 14  bedrooms               1848 non-null   float64
 15  owne

In [5]:
# Columns removed here:
# - the two Web Scraper bookkeeping columns;
# - address: it holds the same info as listing_link but with more bloat,
#   and listing_link is easier to filter and split;
# - cadastre number, registry number and ownership_form: they either show
#   no correlation or are mostly NaN.
unused_columns = [
    'web_scraper_order', 'web_scraper_start_url', 'address',
    'katastrinumber', 'registriosa_number', 'ownership_form',
]
data = data.drop(columns=unused_columns)

## floor_out_of_floors deserves its own chapter - cell formatting is annoying

In [6]:
# floor_out_of_floors was mangled on the way in: values such as 1/5, 3/5 or 1/2
# were automatically turned into dates.
# Before splitting the easily separable columns (price, floor_out_of_floors),
# look at the distinct values first.
# Most are clear: Day-Month corresponds to FLOOR-TOTAL_FLOORS.
# A few are not: -0.25, -0.333333333, -0.2. Those need to be checked individually.
pd.unique(data['floor_out_of_floors'])

array(['01-Mar', '03-May', '04-Apr', '01-Feb', '03-Apr', '01-Apr',
       '02-Mar', '05-May', '01-May', '04-May', '03-Mar', nan, '02-May',
       '06-Jun', '03-Jun', '03-Jul', '04-Feb', '05-Jun', '02-Apr',
       '02-Feb', '07-Sep', '04-Jun', '04-Sep', '06-Aug', '09-Sep',
       '03-Aug', '04-Jul', '02-Jun', '08-Aug', '05-Jul', '01-Jun',
       '01-Aug', '08-Dec', 'Dec-14', '03-Sep', '06-Jul', '01-Jan',
       '08-Sep', '04-Aug', '13/30', 'May-14', '01-Sep', 'Apr-14',
       '05-Oct', '18/23', '06-Sep', '15/30', '07-Aug', '05-Aug', 'Aug-15',
       '05-Sep', '-0.25', '04-Oct', '02-Jan', '02-Jul', '10-Nov',
       '02-Sep', 'Jul-16', 'Jun-14', 'Oct-19', 'Jun-13', '09-Dec',
       'Feb-14', '10-Dec', '24/30', '15/16', 'Dec-13', '02-Aug', 'May-13',
       '07-Nov', '14/20', 'Sep-14', 'Oct-17', '07-Jul', '09-Oct',
       '03-Oct', '08-Oct', '03-Feb', '06-Dec', 'Jul-20', 'Nov-19',
       'Jul-19', 'Jul-14', '14/14', 'Apr-22', '09-Nov', 'Jun-16',
       'Dec-16', 'Apr-13', 'Dec-15', 'Aug-19'

In [7]:
# Manual checks of the unclear values (kept for reference; columns use underscores after the rename above).
#data[data['floor_out_of_floors'] == '-0.25'].iloc[0]['listing_link_href'] # -0.25 is actually a basement floor -1/4
#data[data['floor_out_of_floors'] == '-0.25'].iloc[1]['listing_link_href'] # -0.25 is actually a basement floor -1/4
# ^ those two listings are actually the same apartment listed twice;
# the links really are different, though.
#data[data['floor_out_of_floors'] == '-0.333333333'].iloc[0]['listing_link_href'] # -0.333333333 is actually a basement floor -1/3
#data[data['floor_out_of_floors'] == '-0.2'].iloc[0]['listing_link_href'] # -0.2 is actually a basement floor -1/5 

In [8]:
# The plan is the following (not the most optimal, but definitely won't break anything):
# map -0.25, -0.333333333, -0.2 to -1/4, -1/3, -1/5
# map [Jan, Feb, Mar,...] to [1,2,3,...] in each string
# somehow this needs to apply to substrings. 
# map - to / DANGER, map only once FROM RIGHT, otherwise negative floor numbers will be affected
# check all unique values
# if unique values are all good, then split from / and cast to int
## The following is created with help from Claude.ai.
def transform_floor_numbers(value):
    """Repair one mangled ``floor/total_floors`` cell into a ``"F/T"`` string.

    The raw column was auto-formatted by a spreadsheet, so a cell can be:

    * a date-like string such as ``"01-Mar"`` or ``"Dec-14"``: the month
      abbreviation is replaced by its number and the last ``-`` becomes
      ``/`` (``"01/3"``, ``"12/14"``);
    * a negative decimal such as ``"-0.25"``: a basement fraction that was
      evaluated as a division; it is turned back into ``"-1/4"``;
    * already correct (``"13/30"``): returned unchanged;
    * missing: returned unchanged.

    Parameters
    ----------
    value : str or float
        One raw cell of ``floor_out_of_floors``.

    Returns
    -------
    str or float
        The repaired ``"floor/total"`` string, or the original missing value.
    """
    if pd.isna(value):
        return value

    text = str(value)

    # Basement floors ("-1/4") were evaluated to decimals ("-0.25").
    # Recover the simplest matching fraction instead of listing every decimal by
    # hand, so unlisted values like "-0.5" no longer become the malformed "/0.5".
    # NOTE: only the reduced fraction can be recovered ("-2/4" reads as "-1/2").
    if re.fullmatch(r'-0\.\d+', text):
        ratio = Fraction(text).limit_denominator(100)
        if ratio != 0:
            return f'{ratio.numerator}/{ratio.denominator}'

    # Month abbreviations -> month numbers, e.g. "Dec-14" -> "12-14".
    month_map = {
        'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4',
        'May': '5', 'Jun': '6', 'Jul': '7', 'Aug': '8',
        'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    for month, num in month_map.items():
        text = text.replace(month, num)

    # Turn the separator into "/", but only the rightmost "-" so that a leading
    # minus sign (negative floor) is left alone.
    if '-' in text and '/' not in text:
        head, tail = text.rsplit('-', 1)
        text = f'{head}/{tail}'

    return text

# Repair every cell of the column with the helper defined above.
data['floor_out_of_floors'] = data['floor_out_of_floors'].map(transform_floor_numbers)

In [9]:
# Split "floor/total" into two numeric columns.
# n=1 plus reindex guarantees exactly two pieces, so one odd value (extra or
# missing "/") yields NaN for that row instead of failing the whole assignment.
floor_parts = (
    data['floor_out_of_floors']
    .str.split('/', n=1, expand=True)
    .reindex(columns=[0, 1])
)
data['floor'] = pd.to_numeric(floor_parts[0], errors='coerce')
data['total_floors'] = pd.to_numeric(floor_parts[1], errors='coerce')

## Elementary column transformations - extracting and cleaning up easy values

In [10]:
# price -> price, price_per_m2
# Remove all whitespace first (this also covers non-breaking spaces). The
# vectorised .str.replace keeps a missing price as NaN, whereas
# apply(re.sub) raises a TypeError on it.
price_text = data['price'].str.replace(r'\s', '', regex=True)
# The text before the first euro sign is the price; the rest is the price per m².
data[['price', 'price_per_m2']] = price_text.str.split('€', n=1, expand=True)
data['price'] = pd.to_numeric(data['price'], errors='coerce')
data['price_per_m2'] = pd.to_numeric(
    data['price_per_m2'].str.replace('€/m²', '', regex=False),
    errors='coerce',
)

In [11]:
# quick check that price is numeric now
data['price']

0       750.0
1       595.0
2       670.0
3       450.0
4       550.0
        ...  
2671    790.0
2672    459.0
2673    500.0
2674    419.0
2675    650.0
Name: price, Length: 2676, dtype: float64

In [12]:
# area -> float64: remove the "\xa0m²" unit suffix, trim, then parse (unparseable -> NaN)
area_text = data['area'].str.replace('\xa0m²', '').str.strip()
data['area'] = pd.to_numeric(area_text, errors='coerce')

In [13]:
# energy_mark -> ordinal number: A is the highest value (8), H the lowest (1);
# 'Puudub' and '-' become NaN.
energy_rank = {
    'A': 8, 'B': 7, 'C': 6, 'D': 5, 'E': 4, 'F': 3, 'G': 2, 'H': 1,
    'Puudub': np.nan, '-': np.nan,
}
data['energy_mark'] = data['energy_mark'].map(energy_rank)

In [14]:
# condition -> 5 numeric categories (the grouping is an arbitrary choice);
# labels not listed here become NaN.
condition_rank = {
    'Vajab san. remonti': 1, 'Vajab renoveerimist': 1,
    'San. remont tehtud': 2, 'Keskmine': 2,
    'Valmis': 3, 'Heas korras': 3,
    'Renoveeritud': 4,
    'Uus': 5, 'Uusarendus': 5,
}
data['condition'] = data['condition'].map(condition_rank)


In [15]:
# get copy_not_allowed and broker_not_allowed from the footer
# used Copilot to simplify this
def map_description_footer(df):
    """Derive two boolean flags from the free-text ``description_footer`` column.

    Adds ``copy_not_allowed`` and ``no_broker_allowed``: each is True when the
    corresponding fixed phrase occurs in the footer. A missing footer counts
    as "phrase absent" (False) rather than raising a TypeError.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description_footer`` column; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two flag columns added.
    """
    footer = df['description_footer']
    # regex=False: the phrases are literal text; na=False: NaN footer -> False.
    df['copy_not_allowed'] = footer.str.contains(
        'Ei luba enda kuulutust kopeerida', regex=False, na=False)
    df['no_broker_allowed'] = footer.str.contains(
        'Maakleritel palun mitte tülitada', regex=False, na=False)
    return df

data = data.pipe(map_description_footer)

In [16]:
# from images_link, we can get the number of pictures attached to the post. that seems like a worthwhile data point to have
# we can use a regex function for that. used Copilot to check the correct regex function in python
def extract_images_attached(df):
    """Add ``images_attached``: the first parenthesised number in ``images_link``.

    Rows without a ``(<digits>)`` group, or with a missing link text, get NaN.
    The frame is modified in place and also returned.
    """
    image_counts = df['images_link'].str.extract(r'\((\d+)\)', expand=False)
    df['images_attached'] = pd.to_numeric(image_counts, errors='coerce')
    return df

data = data.pipe(extract_images_attached)

In [17]:
# The poster is the owner when the owner/broker banner mentions 'Omanik'.
# Rows with a missing banner stay NaN here and are handled further down.
owner_banner = data['owner_or_broker']
data['is_owner'] = owner_banner.str.contains('Omanik')

In [18]:
# prepayment -> numeric: drop the euro sign, trim, then parse (unparseable -> NaN)
prepayment_text = data['prepayment'].str.replace('€', '').str.strip()
data['prepayment'] = pd.to_numeric(prepayment_text, errors='coerce')

In [19]:
# All source columns that have been converted above are dropped together, in this last cell.
converted_columns = ['images_link', 'description_footer', 'floor_out_of_floors', 'owner_or_broker']
data = data.drop(columns=converted_columns)

## Removing some extreme outliers

In [20]:
# Two rows list 29 rooms. They are actually "shared housing" listings, and no
# room really costs 200 - the cheapest is 350. Drop those rows.
shared_housing_rows = data.index[data['rooms'] == 29]
data = data.drop(index=shared_housing_rows)

In [21]:
#data[data.rooms.isna()] # 9 rows, let's leave those in for "comparison with no room nr"
#data[data.rooms == 8] # 2 rows, seem correct
#data[data.rooms == 6] # 5 rows
#data[data.rooms == 5] # 22 rows, that's good enough

In [22]:
# Row 2145 lists a price of 1 € and an area of 1 m²; not worth investigating, drop it.
data = data.drop(index=2145)

In [23]:
# price is the target we want to predict and area is a strong indicator of it,
# so rows missing either of them are dropped.
missing_price_or_area = data['price'].isna() | data['area'].isna()
data = data.drop(index=data.index[missing_price_or_area])

In [24]:
# One listing has a build year above 20000; it is actually supposed to be 2022.
# Select it by that condition instead of by a hard-coded row label:
# .loc[<missing label>, col] = value would silently append a new, empty row
# if that row had been dropped earlier or the scrape were refreshed.
data.loc[data['build_year'] > 20000, 'build_year'] = 2022

In [25]:
# 3 rows have a price above 15000: either sale listings or bad typos, not worth figuring out.
overpriced_rows = data.index[data['price'] > 15000]
data = data.drop(index=overpriced_rows)

In [26]:
# 1 post is a "combo posting" of 5 apartments: the area is for one apartment but
# the price is for all five. Not worth untangling; it shows up as price_per_m2 > 60.
combo_rows = data.index[data['price_per_m2'] > 60]
data = data.drop(index=combo_rows)

In [27]:
# The 3 listings whose owner/broker banner is NaN (rows 2449, 2479, 2572) were
# manually verified to be posted by the owner, so NaN is replaced with True.
# Selecting by isna() instead of by the three labels avoids a KeyError if any
# of those rows has been dropped earlier.
# data[data.is_owner.isna() == True]
data.loc[data['is_owner'].isna(), 'is_owner'] = True

In [28]:
# make the flag a real boolean column
data['is_owner'] = data['is_owner'].astype(bool)

In [29]:
# One listing (area of 3 m²) is actually for a storage room.
storage_room_rows = data.index[data['area'] == 3]
data = data.drop(index=storage_room_rows)

## Extract summer and winter utility costs from utility_costs

In [30]:
# peek at the tail of the distinct raw values to see the format
data['utility_costs'].unique()[390:]

array(['30 € / -', '86 € / 162 €', '70 € / 200 €', '75 € / 140 €',
       '45 € / 85 €', '81 € / 138 €', '88 € / -', '220 € / 250 €',
       '80 € / 280 €'], dtype=object)

In [31]:
# utility_costs looks like "86 € / 162 €": the first part goes to utility_summer,
# the second to utility_winter. Anything that does not parse as a number
# (e.g. "-") becomes NaN.
season_columns = ['utility_summer', 'utility_winter']
data[season_columns] = (
    data['utility_costs']
    .str.replace('€', '')
    .str.split('/', n=1, expand=True)
)
for season in season_columns:
    data[season] = pd.to_numeric(data[season], errors='coerce')
# the raw text column is no longer needed
data = data.drop(columns=['utility_costs'])

In [32]:
# When only one of the two seasonal costs is known, reuse it for the other season.
# Not a perfect solution, but it gives us about 30 extra rows to work with.
data['utility_summer'] = data['utility_summer'].fillna(data['utility_winter'])
data['utility_winter'] = data['utility_winter'].fillna(data['utility_summer'])

In [33]:
# 2 posts are badly formatted and very difficult to parse; drop them.
badly_formatted_rows = [510, 1693]
data = data.drop(index=badly_formatted_rows)

In [34]:
# 1 post (2133) had a utility bill of over 100 000 € for summer/winter; replace it with NaN.
# (2 other posts have utility bills equal to the rent while saying the bills are fixed.)
data.loc[2133, ['utility_summer', 'utility_winter']] = np.nan

In [35]:
# Checkpoint after the outlier and utility-cost cleanup: row count, dtypes, non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2655 entries, 0 to 2675
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   listing_link        2655 non-null   object 
 1   listing_link_href   2655 non-null   object 
 2   price               2655 non-null   float64
 3   rooms               2646 non-null   float64
 4   area                2655 non-null   float64
 5   build_year          2066 non-null   float64
 6   condition           2435 non-null   float64
 7   energy_mark         1207 non-null   float64
 8   summary             2654 non-null   object 
 9   description         2612 non-null   object 
 10  bedrooms            1836 non-null   float64
 11  description_header  2266 non-null   object 
 12  prepayment          560 non-null    float64
 13  images_link_href    2653 non-null   object 
 14  floor               2475 non-null   float64
 15  total_floors        2475 non-null   float64
 16  price_per_m

## Extracting address fields from listing_link

In [36]:
# Number of comma-separated fields per address; the values seen here are 4, 5, 3, 6 and 7.
data['listing_link'].str.split(',').map(len).unique()

array([4, 5, 3, 6, 7])

In [37]:
# example addresses that consist of exactly 5 comma-separated fields
has_five_fields = data['listing_link'].str.split(',').map(len).eq(5)
data.loc[has_five_fields, 'listing_link']

2            Tartumaa, Tartu, Tartu linn, Kesklinn, Oru 2
3       Pärnumaa, Pärnu, Pärnu linn, Rannarajoon, Papl...
8           Tartumaa, Tartu, Tartu linn, Karlova, Turu 29
10       Harjumaa, Tallinn, Kristiine, Kristiine, Kotka 1
13         Tartumaa, Tartu, Tartu linn, Kesklinn, Riia 26
                              ...                        
2665    Tartumaa, Tartu, Tartu linn, Kesklinn, Väike-T...
2668    Tartumaa, Tartu, Tartu linn, Kesklinn, Vanemui...
2670    Tartumaa, Tartu, Tartu linn, Tammelinn, Soinas...
2673    Harjumaa, Tallinn, Kesklinn, Juhkentali, Liiva...
2675    Harjumaa, Rae vald, Rae, Rae küla, Dolomiidi t...
Name: listing_link, Length: 880, dtype: object

In [38]:
# COUNTY, MUNICIPALITY?, CITY, (DISTRICT)*, STREET_ADDRESS
# I think we will try to go with the first two fields for county and municipality or city, then take the last field for a street address
def extract_address_components(address):
    """Split a comma-separated address into (county, mun_or_city, street_adr).

    The first field is the county, the second the municipality or city and the
    last one the street address; any fields in between are ignored.

    * ' vald' (municipality) is removed from the second field: fewer unique
      values at the cost of a small loss of information that the street
      address often makes up for.
    * ' tn' (abbreviation for street) is removed from the last field, which
      makes it more suitable for APIs such as OpenStreetMap.

    Raises IndexError when the address has fewer than two fields.
    """
    fields = address.split(',')
    return (
        fields[0].strip(),
        fields[1].replace(' vald', '').strip(),
        fields[-1].replace(' tn', '').strip(),
    )

# Build the three address columns from the per-row tuples in one go. This
# avoids constructing a pd.Series for every row, which is needlessly slow.
address_columns = ['county', 'mun_or_city', 'street_adr']
address_parts = pd.DataFrame(
    data['listing_link'].map(extract_address_components).tolist(),
    index=data.index,
    columns=address_columns,
)
data[address_columns] = address_parts
# the raw address text is no longer needed
data = data.drop(columns=['listing_link'])

In [39]:
# eyeball the first 100 distinct street addresses
data['street_adr'].unique()[:100]

array(['Voorimehe 1', 'Kivi 25', 'Oru 2', 'Papli 20', 'Pikaliiva 5',
       'Väike-Patarei 1/3', 'Pebre 3', 'Õismäe tee 175', 'Turu 29',
       'Mahla 67', 'Kotka 1', 'Aasa 5', 'Mere pst 4', 'Riia 26',
       'Keskuse 12', 'Pärnu mnt 32', 'Ravila 48', 'Kesk 11', 'Koidula 26',
       'Suur-Lossi 16', 'Tõnismägi 11a', 'Pikk 40', 'Pikk 33',
       'Koidu 62--', 'Hane 4-7', 'Valli 4', 'Meierei 30', 'Randla 13',
       'Raua 34', 'Asunduse 9', 'Ahtme mnt 57', 'Kuklase 3', 'Raadiku 14',
       'Pilve 4-16', 'Aiandi 5/3', 'J. Sütiste tee 39', 'Ümera',
       'Sõmera 4a', 'Mäealuse 9', 'Riia 9', 'Tamme 24', 'Dunkri 2',
       'Vanemuise', 'Pallasti 33', 'Talli 2', 'Pargi 1', 'Rüütli 22',
       'A. Puškini 51', 'Puiestee', 'Kangelaste prospekt 10a',
       'F. R. Faehlmanni 8', 'F. R. Faehlmanni 6', 'Kalda tee 14',
       'Kungla 18', 'J. V. Jannseni 4', 'Pille 7/4', 'Uus 13c',
       'Hariduse 12', 'Tildri 7', 'Pikk 5', 'Partisani 9', 'Pärnu mnt 33',
       'Lutsu 14', 'Kooli 6a', 'Vallikraav

## Extracting info from summary                 

In [40]:
# Column overview after the address fields have been extracted.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2655 entries, 0 to 2675
Data columns (total 25 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   listing_link_href   2655 non-null   object 
 1   price               2655 non-null   float64
 2   rooms               2646 non-null   float64
 3   area                2655 non-null   float64
 4   build_year          2066 non-null   float64
 5   condition           2435 non-null   float64
 6   energy_mark         1207 non-null   float64
 7   summary             2654 non-null   object 
 8   description         2612 non-null   object 
 9   bedrooms            1836 non-null   float64
 10  description_header  2266 non-null   object 
 11  prepayment          560 non-null    float64
 12  images_link_href    2653 non-null   object 
 13  floor               2475 non-null   float64
 14  total_floors        2475 non-null   float64
 15  price_per_m2        2655 non-null   float64
 16  copy_not_al

In [41]:
# Show the cleaned frame (the rendering is truncated to the first and last rows/columns).
data

Unnamed: 0,listing_link_href,price,rooms,area,build_year,condition,energy_mark,summary,description,bedrooms,...,price_per_m2,copy_not_allowed,no_broker_allowed,images_attached,is_owner,utility_summer,utility_winter,county,mun_or_city,street_adr
0,https://www.kv.ee/uurile-anda-2toaline-korter-...,750.0,2.0,42.1,1807.0,4.0,,"Kivimaja\r\n\r\nKöök: elektripliit, avatud köö...",Üürile anda 2-toaline korter Raekoja platsil.\...,,...,17.80,False,False,29.0,False,,,Harjumaa,Tallinn,Voorimehe 1
1,https://www.kv.ee/mobleeritud-paikeseline-3toa...,595.0,3.0,58.7,2005.0,3.0,,"Korteriomand, kivimaja\r\n\r\nKöök: keraamilin...",Möbleeritud päikeseline 3-toaline rõduga korte...,2.0,...,10.10,False,False,14.0,False,,,Tartumaa,Tartu,Kivi 25
2,https://www.kv.ee/uurile-anda-kesklinnas-asuv-...,670.0,2.0,72.0,,4.0,,"Korteriomand, kivimaja\r\n\r\nKöök: gaasipliit...","Üürile anda kesklinnas asuv 2- toaline, 72m² s...",1.0,...,9.31,False,False,19.0,False,,,Tartumaa,Tartu,Oru 2
3,https://www.kv.ee/uus-hind-uurile-anda-kohesel...,450.0,2.0,48.0,2003.0,3.0,,"Korteriomand, kivimaja\r\n\r\nKöök: keraamilin...",UUS HIND!\r\n Üürile anda koheselt kahetoaline...,1.0,...,9.38,False,False,14.0,False,,,Pärnumaa,Pärnu,Papli 20
4,https://www.kv.ee/korter-vabaneb-alates-01-11-...,550.0,1.0,24.0,2023.0,5.0,7.0,"Korteriomand, kivimaja\r\n\r\nLisainfo: parkim...",Korter vabaneb alates 01.11.2024\r\n\r\nÜürile...,1.0,...,22.90,False,False,38.0,False,,,Harjumaa,Tallinn,Pikaliiva 5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2671,https://www.kv.ee/sind-ootab-maitsekalt-sisust...,790.0,2.0,45.8,2022.0,5.0,7.0,"Korteriomand, kivimaja\r\n\r\nSanruum: dušš, p...",Sind ootab maitsekalt sisustatud mugava planee...,1.0,...,17.30,False,False,43.0,False,,,Harjumaa,Tallinn,Pagi 5
2672,https://www.kv.ee/suur-tanu-huvi-tundmast-maju...,459.0,1.0,21.0,2019.0,5.0,,"Kivimaja\r\n\r\nKöök: elektripliit, avatud köö...",Suur tänu huvi tundmast majutusvõimaluse vastu...,,...,21.90,False,False,19.0,False,,,Harjumaa,Tallinn,Liimi 1b
2673,https://www.kv.ee/uurnikke-asub-otsima-keskmis...,500.0,1.0,32.9,1994.0,5.0,4.0,"Korteriomand, paneelmaja\r\n\r\nKöök: elektrip...",Üürnikke asub otsima keskmisest suurem 1-toali...,1.0,...,15.20,False,False,19.0,False,50.0,90.0,Harjumaa,Tallinn,Liivamäe 2
2674,https://www.kv.ee/suur-tanu-huvi-tundmast-maju...,419.0,1.0,16.0,2019.0,5.0,,"Kivimaja\r\n\r\nKöök: elektripliit, avatud köö...",Suur tänu huvi tundmast majutusvõimaluse vastu...,,...,26.20,False,False,15.0,False,,,Harjumaa,Tallinn,Liimi 1b


In [42]:
# TODO

## Extracting info from description

In [43]:
# TODO

## Extract info from description header - e.g. is it in all caps (bool)? How many `!` / `?` characters does it contain?

In [44]:
# TODO

## Determine whether to try to analyse pictures from images_link_href and assign a rating to those

In [45]:
# TODO

## Remove links (listing_link_href, images_link_href) before fitting

In [46]:
# TODO