# KV Data Cleanup

## Starting up

In [3]:
%pip install matplotlib

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [5]:
data = pd.read_csv("kv-rent-data-16-11-2024.csv")

In [6]:
# underscores are easier to work with than hyphens.
data.columns = data.columns.str.replace('-', '_')
# As we can see, the situation is pretty bad by default.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2676 entries, 0 to 2675
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web_scraper_order      2676 non-null   object 
 1   web_scraper_start_url  2676 non-null   object 
 2   listing_link           2676 non-null   object 
 3   listing_link_href      2676 non-null   object 
 4   address                2676 non-null   object 
 5   price                  2676 non-null   object 
 6   rooms                  2667 non-null   float64
 7   area                   2670 non-null   object 
 8   floor_out_of_floors    2494 non-null   object 
 9   build_year             2081 non-null   float64
 10  condition              2456 non-null   object 
 11  energy_mark            2311 non-null   object 
 12  summary                2675 non-null   object 
 13  description            2633 non-null   object 
 14  bedrooms               1848 non-null   float64
 15  owne

In [7]:
# Let's remove the Web Scraper bookkeeping columns (web_scraper_order, web_scraper_start_url).
# listing_link and address carry the same info, but address has more bloat;
# listing_link is easier to filter and split, so address is removed as well.
# The cadastre number, registry number and ownership_form have either no correlation or are mostly NaN values.
data = data.drop(['web_scraper_order', 'web_scraper_start_url', 'address', 'katastrinumber', 'registriosa_number', 'ownership_form'], axis=1)

## floor_out_of_floors deserves its own chapter - cell formatting is annoying

In [9]:
# floor_out_of_floors seems to have had a bit of a mishap. 
# they were automatically transformed to a date, but they were actually 1/5, 3/5, 1/2 etc.
# let's separate all the columns that are easily separable - price, floor_out_of_floors
# first, let's check the unique values.
data['floor_out_of_floors'].unique()
# most of them are pretty clear - Day-Month corresponds to FLOOR-TOTAL_FLOORS
# some, however aren't clear: -0.25, -0.333333333, -0.2. These need to be checked individually.

array(['01-Mar', '03-May', '04-Apr', '01-Feb', '03-Apr', '01-Apr',
       '02-Mar', '05-May', '01-May', '04-May', '03-Mar', nan, '02-May',
       '06-Jun', '03-Jun', '03-Jul', '04-Feb', '05-Jun', '02-Apr',
       '02-Feb', '07-Sep', '04-Jun', '04-Sep', '06-Aug', '09-Sep',
       '03-Aug', '04-Jul', '02-Jun', '08-Aug', '05-Jul', '01-Jun',
       '01-Aug', '08-Dec', 'Dec-14', '03-Sep', '06-Jul', '01-Jan',
       '08-Sep', '04-Aug', '13/30', 'May-14', '01-Sep', 'Apr-14',
       '05-Oct', '18/23', '06-Sep', '15/30', '07-Aug', '05-Aug', 'Aug-15',
       '05-Sep', '-0.25', '04-Oct', '02-Jan', '02-Jul', '10-Nov',
       '02-Sep', 'Jul-16', 'Jun-14', 'Oct-19', 'Jun-13', '09-Dec',
       'Feb-14', '10-Dec', '24/30', '15/16', 'Dec-13', '02-Aug', 'May-13',
       '07-Nov', '14/20', 'Sep-14', 'Oct-17', '07-Jul', '09-Oct',
       '03-Oct', '08-Oct', '03-Feb', '06-Dec', 'Jul-20', 'Nov-19',
       'Jul-19', 'Jul-14', '14/14', 'Apr-22', '09-Nov', 'Jun-16',
       'Dec-16', 'Apr-13', 'Dec-15', 'Aug-19'

In [10]:
#data[data['floor_out_of_floors'] == '-0.25'].iloc[0]['listing-link-href'] # -0.25 is actually a basement floor -1/4
#data[data['floor_out_of_floors'] == '-0.25'].iloc[1]['listing-link-href'] # -0.25 is actually a basement floor -1/4
# ^ those two listings are actually the same apartment listing two times, 
# the links really are different, though.
#data[data['floor_out_of_floors'] == '-0.333333333'].iloc[0]['listing-link-href'] # -0.333333333 is actually a basement floor -1/3
#data[data['floor_out_of_floors'] == '-0.2'].iloc[0]['listing-link-href'] # -0.2 is actually a basement floor -1/5 

In [11]:
# The plan is the following (not the most optimal, but definitely won't break anything):
# map -0.25, -0.333333333, -0.2 to -1/4, -1/3, -1/5
# map [Jan, Feb, Mar,...] to [1,2,3,...] in each string
# somehow this needs to apply to substrings. 
# map - to / DANGER, map only once FROM RIGHT, otherwise negative floor numbers will be affected
# check all unique values
# if unique values are all good, then split from / and cast to int
## The following is created with help from Claude.ai.
def transform_floor_numbers(value):
    """Normalise one raw ``floor_out_of_floors`` cell to a ``'FLOOR/TOTAL'`` string.

    The raw column was mangled by automatic cell formatting, so three shapes
    have to be undone:

    * date-like text such as ``'01-Mar'`` or ``'Dec-14'``: the month name is
      replaced by its number and the rightmost ``-`` becomes ``/``
      (``'01/3'``, ``'12/14'``);
    * negative decimals such as ``'-0.25'``: a basement floor whose
      ``-1/4`` was evaluated as a division; the fraction is recovered
      (``'-1/4'``);
    * values that already contain ``/`` (``'13/30'``) are returned unchanged.

    Parameters
    ----------
    value : str, float or NaN
        One raw cell of the column.

    Returns
    -------
    str or NaN
        The normalised string; missing input is returned as-is.
    """
    # Function-scope import so the cell does not depend on an extra import cell.
    from fractions import Fraction

    if pd.isna(value):
        return value

    value = str(value)

    # Basement floors: recover the fraction from ANY negative decimal instead of
    # a fixed lookup of three values, so e.g. '-0.5' becomes '-1/2' rather than
    # falling through to the '-' -> '/' rule below and ending up as '/0.5'.
    # The denominator (total floors) is capped at 100 so that rounded decimals
    # like '-0.333333333' snap to the nearest small fraction.
    if re.fullmatch(r'-\d+\.\d+', value):
        ratio = Fraction(value).limit_denominator(100)
        return f'{ratio.numerator}/{ratio.denominator}'

    # Month names stand for numbers ("Dec-14" -> "12-14").
    month_map = {
        'Jan': '1', 'Feb': '2', 'Mar': '3', 'Apr': '4',
        'May': '5', 'Jun': '6', 'Jul': '7', 'Aug': '8',
        'Sep': '9', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }
    for month, num in month_map.items():
        value = value.replace(month, num)

    # Convert only the RIGHTMOST '-' to '/', so a leading minus sign of a
    # negative floor number is left untouched.
    if '-' in value and '/' not in value:
        head, tail = value.rsplit('-', 1)
        value = f'{head}/{tail}'

    return value

# Apply the transformation
data['floor_out_of_floors'] = data['floor_out_of_floors'].apply(transform_floor_numbers)

In [12]:
# Split "FLOOR/TOTAL" into two numeric columns.
# n=1 guarantees at most two result columns (matching the other two-column
# splits in this notebook); without it a single value containing a second '/'
# would produce three columns and make the assignment fail.
floor_parts = data['floor_out_of_floors'].str.split('/', n=1, expand=True)
data[['floor', 'total_floors']] = floor_parts
# Anything that is not a clean number becomes NaN instead of raising.
data['floor'] = pd.to_numeric(data['floor'], errors='coerce')
data['total_floors'] = pd.to_numeric(data['total_floors'], errors='coerce')

## Elementary column transformations - extracting and cleaning up easy values

In [14]:
# price -> price, price_per_m2
# Remove every whitespace character first (the regex \s also covers
# non-breaking spaces), using the vectorised .str accessor so a missing
# price stays NaN instead of raising TypeError inside re.sub.
price_compact = data['price'].str.replace(r'\s', '', regex=True)
# The text before the first '€' is the rent; the remainder is the per-m² price.
price_parts = price_compact.str.split('€', n=1, expand=True)
data['price'] = pd.to_numeric(price_parts[0].str.strip(), errors='coerce')
data['price_per_m2'] = pd.to_numeric(
    price_parts[1].str.replace('€/m²', '', regex=False).str.strip(),
    errors='coerce'
)

In [15]:
data.price

0       750.0
1       595.0
2       670.0
3       450.0
4       550.0
        ...  
2671    790.0
2672    459.0
2673    500.0
2674    419.0
2675    650.0
Name: price, Length: 2676, dtype: float64

In [16]:
# area -> float64 area with filtering
data['area'] = data['area'].str.replace('\xa0m²', '').str.strip()
data['area'] = pd.to_numeric(data['area'], errors='coerce')

In [17]:
# energy_mark to numeric, A is highest, H is lowest value
data.energy_mark = data.energy_mark.map({
    'Puudub': np.nan, '-': np.nan,
    'H':1, 'G':2, 'F':3, 'E':4, 'D':5, 'C':6, 'B':7, 'A':8
})

In [18]:
# arbitrary condition mapping to 5 numeric categories
data['condition'] = data['condition'].map({
    'Uus': 5, 'Uusarendus': 5,
    'Renoveeritud': 4,
    'Valmis': 3, 'Heas korras': 3,
    'San. remont tehtud': 2, 'Keskmine': 2,
    'Vajab san. remonti': 1, 'Vajab renoveerimist': 1
})


In [19]:
# get copy_not_allowed and broker_not_allowed from the footer
# used Copilot to simplify this
def map_description_footer(df):
    """Derive two boolean flags from the ``description_footer`` column.

    Adds, in place, and returns the same frame with:

    * ``copy_not_allowed``  - footer contains 'Ei luba enda kuulutust kopeerida'
    * ``no_broker_allowed`` - footer contains 'Maakleritel palun mitte tülitada'

    A missing footer yields ``False`` for both flags (a plain ``'text' in x``
    would raise TypeError on NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``description_footer`` text column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    footer = df['description_footer']
    # regex=False: plain substring test, same semantics as the `in` operator.
    df['copy_not_allowed'] = footer.str.contains(
        'Ei luba enda kuulutust kopeerida', regex=False, na=False
    )
    df['no_broker_allowed'] = footer.str.contains(
        'Maakleritel palun mitte tülitada', regex=False, na=False
    )
    return df

data = map_description_footer(data)

In [20]:
# from images_link, we can get the number of pictures attached to the post. that seems like a worthwhile data point to have
# we can use a regex function for that. used Copilot to check the correct regex function in python
def extract_images_attached(df):
    """Add a numeric ``images_attached`` column parsed from ``images_link``.

    The value is the first run of digits found inside parentheses in the
    link text; rows without such a group (or with a missing link) get NaN.
    The frame is modified in place and also returned.
    """
    count_text = df['images_link'].str.extract(r'\((\d+)\)', expand=False)
    df['images_attached'] = pd.to_numeric(count_text, errors='coerce')
    return df

data = extract_images_attached(data)


     images_attached
50              21.0
51               8.0
52              36.0
53              14.0
54              17.0
55              24.0
56              12.0
57              14.0
58              19.0
59              21.0
60              20.0
61               9.0
62              20.0
63              17.0
64              35.0
65              27.0
66              13.0
67               7.0
68              13.0
69              11.0
70              29.0
71               3.0
72              22.0
73              16.0
74               6.0
75              15.0
76               8.0
77              37.0
78               9.0
79              20.0
80               9.0
81              13.0
82              19.0
83              12.0
84              13.0
85              20.0
86              30.0
87              45.0
88              12.0
89              21.0
90              10.0
91               8.0
92              16.0
93              11.0
94              17.0
95              14.0
96           

Index(['listing_link', 'listing_link_href', 'price', 'rooms', 'area',
       'floor_out_of_floors', 'build_year', 'condition', 'energy_mark',
       'summary', 'description', 'bedrooms', 'description_header',
       'description_footer', 'prepayment', 'utility_costs', 'owner_or_broker',
       'images_link', 'images_link_href', 'floor', 'total_floors',
       'price_per_m2', 'copy_not_allowed', 'no_broker_allowed',
       'images_attached'],
      dtype='object')

In [22]:
# extract whether poster is the owner depending on the owner-broker banner
data['is_owner'] = data['owner_or_broker'].str.contains('Omanik')

In [23]:
# convert prepayment to numeric
data['prepayment'] = pd.to_numeric(data['prepayment'].str.replace('€', '').str.strip(), errors='coerce')

In [24]:
# let's have all the column dropping in the last cell.
data = data.drop(['images_link', 'description_footer', 'floor_out_of_floors', 'owner_or_broker'], axis=1)

## Removing some extreme outliers

In [26]:
# two rows have 29 rooms listed.
# those are actually "shared housing" listing & no rooms cost 200 - the cheapest is 350. Let's drop those rows.
data = data.drop(data[data.rooms == 29].index)

In [27]:
#data[data.rooms.isna()] # 9 rows, let's leave those in for "comparison with no room nr"
#data[data.rooms == 8] # 2 rows, seem correct
#data[data.rooms == 6] # 5 rows
#data[data.rooms == 5] # 22 rows, that's good enough

In [28]:
# row with index 2145 has price 1€ and area as 1 m2. Not going to bother finding out what's happening.
data = data.drop(2145)

In [29]:
# since price is the feature we're trying to predict and area is a good indication of price,
# let's drop all rows where price or area is NaN
#data = data.drop(data[data.rooms == 29].index)
data = data.drop(data[(data['price'].isna()) | (data['area'].isna())].index)

In [30]:
# one of the objects has a build year above 20000, that's actually supposed to be 2022.
# Select the row by its value instead of a hard-coded index label: assigning through
# `.loc[label, column]` silently APPENDS a new row when the label is no longer in the
# index (e.g. after an earlier drop), whereas a boolean mask simply matches nothing.
data.loc[data['build_year'] > 20000, 'build_year'] = 2022

In [31]:
# 3 rows have a price above 15000. those are either sale listings or bad typos, not worth figuring out.
data = data.drop(data[data.price > 15000].index)

In [32]:
# 1 post is a "combo posting" of 5 apartments where area is only 1 apt, but price is 5 apt. Not going to figure all the details out
data = data.drop(data[data.price_per_m2 > 60].index)

In [33]:
# manually verified that the 3 is_owner banners that are NaN are truly posted by the owner 2449 2479 2572
# replace is_owner NaN with True
# data[data.is_owner.isna() == True]
data.loc[[2449, 2479, 2572], 'is_owner'] = True

In [34]:
data.is_owner = data.is_owner.astype(bool)

In [35]:
# one listing is actually for a storage room
data = data.drop(data[data.area == 3].index)

## Extract summer and winter utility costs from utility_costs

In [37]:
data.utility_costs.unique()[390:]

array(['30 € / -', '86 € / 162 €', '70 € / 200 €', '75 € / 140 €',
       '45 € / 85 €', '81 € / 138 €', '88 € / -', '220 € / 250 €',
       '80 € / 280 €'], dtype=object)

In [38]:
# replace € with '', strip()
data['utility_costs'] = data['utility_costs'].str.replace('€', '')
# split from /
data[['utility_summer', 'utility_winter']] = data['utility_costs'].str.split('/', n=1, expand=True)
# to_numeric
data['utility_summer'] = pd.to_numeric(data['utility_summer'], errors='coerce')
data['utility_winter'] = pd.to_numeric(data['utility_winter'], errors='coerce')
# drop utility_costs
data = data.drop(['utility_costs'], axis=1)

In [39]:
# if one exists and other doesn't, copy value from one to the other.
# this is not a perfect solution, but still gives us about 30 extra rows to work with.
data.loc[(data['utility_summer'].isna()) & (data['utility_winter'].notna()), 'utility_summer'] = data['utility_winter']
data.loc[(data['utility_winter'].isna()) & (data['utility_summer'].notna()), 'utility_winter'] = data['utility_summer']

In [40]:
# 2 posts are badly formatted and very difficult to parse
data = data.drop([510, 1693])

In [41]:
# 1 post (2133) had an utility bill of over 100 000€ summer/winter. let's replace that with NaN
# 2 posts have utility bills equal to the rent, while saying that utility bills are fixed
data.loc[2133, 'utility_summer'] = np.nan
data.loc[2133, 'utility_winter'] = np.nan

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2655 entries, 0 to 2675
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   listing_link        2655 non-null   object 
 1   listing_link_href   2655 non-null   object 
 2   price               2655 non-null   float64
 3   rooms               2646 non-null   float64
 4   area                2655 non-null   float64
 5   build_year          2066 non-null   float64
 6   condition           2435 non-null   float64
 7   energy_mark         1207 non-null   float64
 8   summary             2654 non-null   object 
 9   description         2612 non-null   object 
 10  bedrooms            1836 non-null   float64
 11  description_header  2266 non-null   object 
 12  prepayment          560 non-null    float64
 13  images_link_href    2653 non-null   object 
 14  floor               2475 non-null   float64
 15  total_floors        2475 non-null   float64
 16  price_per_m

## Extracting address fields from listing_link

In [44]:
data['listing_link'].str.split(',').apply(lambda x: len(x)).unique() # number of comma-separated address fields per listing; the output below shows 4, 5, 3, 6 and 7

array([4, 5, 3, 6, 7], dtype=int64)

In [45]:
data[data['listing_link'].str.split(',').apply(lambda x: len(x) == 5)].listing_link

2            Tartumaa, Tartu, Tartu linn, Kesklinn, Oru 2
3       Pärnumaa, Pärnu, Pärnu linn, Rannarajoon, Papl...
8           Tartumaa, Tartu, Tartu linn, Karlova, Turu 29
10       Harjumaa, Tallinn, Kristiine, Kristiine, Kotka 1
13         Tartumaa, Tartu, Tartu linn, Kesklinn, Riia 26
                              ...                        
2665    Tartumaa, Tartu, Tartu linn, Kesklinn, Väike-T...
2668    Tartumaa, Tartu, Tartu linn, Kesklinn, Vanemui...
2670    Tartumaa, Tartu, Tartu linn, Tammelinn, Soinas...
2673    Harjumaa, Tallinn, Kesklinn, Juhkentali, Liiva...
2675    Harjumaa, Rae vald, Rae, Rae küla, Dolomiidi t...
Name: listing_link, Length: 880, dtype: object

In [46]:
# COUNTY, MUNICIPALITY?, CITY, (DISTRICT)*, STREET_ADDRESS
# I think we will try to go with the first two fields for county and municipality or city, then take the last field for a street address
def extract_address_components(address):
    """Split a comma-separated listing address into three parts.

    Returns a ``(county, mun_or_city, street_adr)`` tuple built from the
    first, second and last comma-separated field respectively; any fields
    in between are ignored.
    """
    fields = address.split(',')
    first, second, last = fields[0], fields[1], fields[-1]
    return (
        first.strip(),
        # dropping " vald" (municipality) reduces the number of unique values;
        # the small information loss is usually covered by the street address
        second.replace(' vald', '').strip(),
        # dropping " tn" (street abbreviation) gives a form better suited to
        # address-lookup APIs such as OpenStreetMap, should we use one
        last.replace(' tn', '').strip(),
    )

data[['county', 'mun_or_city', 'street_adr']] = data['listing_link'].apply(lambda x: pd.Series(extract_address_components(x)))
data = data.drop(['listing_link'], axis=1)

In [47]:
data.street_adr.unique()[:100]

array(['Voorimehe 1', 'Kivi 25', 'Oru 2', 'Papli 20', 'Pikaliiva 5',
       'Väike-Patarei 1/3', 'Pebre 3', 'Õismäe tee 175', 'Turu 29',
       'Mahla 67', 'Kotka 1', 'Aasa 5', 'Mere pst 4', 'Riia 26',
       'Keskuse 12', 'Pärnu mnt 32', 'Ravila 48', 'Kesk 11', 'Koidula 26',
       'Suur-Lossi 16', 'Tõnismägi 11a', 'Pikk 40', 'Pikk 33',
       'Koidu 62--', 'Hane 4-7', 'Valli 4', 'Meierei 30', 'Randla 13',
       'Raua 34', 'Asunduse 9', 'Ahtme mnt 57', 'Kuklase 3', 'Raadiku 14',
       'Pilve 4-16', 'Aiandi 5/3', 'J. Sütiste tee 39', 'Ümera',
       'Sõmera 4a', 'Mäealuse 9', 'Riia 9', 'Tamme 24', 'Dunkri 2',
       'Vanemuise', 'Pallasti 33', 'Talli 2', 'Pargi 1', 'Rüütli 22',
       'A. Puškini 51', 'Puiestee', 'Kangelaste prospekt 10a',
       'F. R. Faehlmanni 8', 'F. R. Faehlmanni 6', 'Kalda tee 14',
       'Kungla 18', 'J. V. Jannseni 4', 'Pille 7/4', 'Uus 13c',
       'Hariduse 12', 'Tildri 7', 'Pikk 5', 'Partisani 9', 'Pärnu mnt 33',
       'Lutsu 14', 'Kooli 6a', 'Vallikraav

## Extracting info from summary                 

In [49]:
# house types: take the first case-insensitive match of one of the listed
# building-type words in the summary; rows without a match stay NaN
data['house_type'] = data['summary'].str.extract(r'(?i)(Kivimaja|Paneelmaja|Puitmaja|Maja)', expand=False)

# normalise the casing of the matched word (e.g. "kivimaja" -> "Kivimaja"), then preview a slice of rows
data['house_type'] = data['house_type'].str.capitalize()
print(data[['house_type']].iloc[50:101])

     house_type
50     Kivimaja
52     Kivimaja
53   Paneelmaja
54     Puitmaja
55     Kivimaja
56     Kivimaja
57          NaN
58          NaN
59     Kivimaja
60          NaN
61          NaN
62     Kivimaja
63     Kivimaja
64         Maja
65     Kivimaja
66     Kivimaja
67     Kivimaja
68     Kivimaja
69   Paneelmaja
70   Paneelmaja
71          NaN
72   Paneelmaja
73     Puitmaja
74          NaN
75          NaN
76     Kivimaja
77          NaN
78     Puitmaja
80     Kivimaja
81          NaN
82     Kivimaja
83     Kivimaja
84     Puitmaja
85     Kivimaja
86          NaN
87          NaN
88          NaN
89     Kivimaja
90          NaN
91     Kivimaja
92     Kivimaja
93   Paneelmaja
94     Kivimaja
95     Kivimaja
96   Paneelmaja
97          NaN
98     Kivimaja
99     Kivimaja
100         NaN
101    Kivimaja
102    Kivimaja


In [50]:
# TODO
print(data['summary'].iloc[9])

Korteriomand

Köök: keraamiline pliit, avatud köök, külmik 
Sanruum: dušš, pesumasin 
Lisainfo: köök, mööbel, tsentraalne vesi, ühistransport, parkett, garderoob, seinakapp 
Side ja turvalisus: trepikoda lukus


In [51]:
#which types of heating are there
#is it okay that some have more than 1??
data['heating'] = data['summary'].str.extract(r'Küte ja ventilatsioon:\s*(.*?)(?:\r\n|\r|\n|$)', expand=False).str.strip()

print(data['heating'].head(10))

0                                     elektriküte
1                                        keskküte
2                                 keskküte, kamin
3    Õhk-vesisoojuspump, põrandaküte, elektriküte
4                                             NaN
5                                        keskküte
6                         õhksoojuspump, ahjuküte
7                                        keskküte
8                                        keskküte
9                                             NaN
Name: heating, dtype: object


In [52]:
# does the apartment come with furniture or not? BOOLEAN
# The pattern matches the whole word "mööbel" and also the phrase "mööbli võimalus".
# NOTE(review): "mööbli võimalus" appears to mean furniture is possible/optional rather
# than included - confirm it should count as furnished.
data['furnished'] = data['summary'].str.contains(r'\bmööbel\b|\bmööbli võimalus\b', case=False, regex=True)

# Check the results
print(data['furnished'].head(19))

0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8     False
9      True
10     True
11    False
12     True
13     True
14     True
15     True
16     True
17    False
18     True
Name: furnished, dtype: object


In [53]:
# is there a washing machine in the apartment? (whole-word, case-insensitive match of "pesumasin" in the summary)
data['washing_machine'] = data['summary'].str.contains(r'\bpesumasin\b', case=False, regex=True)
print(data['washing_machine'].head(19))

0      True
1      True
2      True
3      True
4     False
5      True
6      True
7      True
8     False
9      True
10     True
11    False
12     True
13     True
14     True
15     True
16     True
17    False
18     True
Name: washing_machine, dtype: object


In [54]:
# is there a balcony? (whole-word, case-insensitive match of "rõdu" in the summary)
data['balcony'] = data['summary'].str.contains(r'\brõdu\b', case=False, regex=True)
print(data['balcony'].head(19))

0     False
1      True
2      True
3     False
4     False
5      True
6     False
7      True
8     False
9     False
10    False
11     True
12     True
13    False
14     True
15    False
16    False
17     True
18    False
Name: balcony, dtype: object


In [55]:
# TODO
# is there a yard instead? boolean value (whole-word, case-insensitive match of "hoov" in the summary)
data['yard'] = data['summary'].str.contains(r'\bhoov\b', case=False, regex=True)
print(data['yard'].head(19))

0     False
1     False
2      True
3      True
4     False
5     False
6      True
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18     True
Name: yard, dtype: object


In [56]:
# TODO
# is there a terrace instead? boolean value (whole-word, case-insensitive match of "terrass" in the summary)
data['terrace'] = data['summary'].str.contains(r'\bterrass\b', case=False, regex=True)
print(data['terrace'].head(19))

0     False
1     False
2     False
3      True
4     False
5      True
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
Name: terrace, dtype: object


In [57]:
# is parking free? Only the exact phrase "parkimine tasuta" is matched, so the
# `parking` column means "free parking mentioned", not "any parking available".
data['parking'] = data['summary'].str.contains(r'\bparkimine tasuta\b', case=False, regex=True)
print(data['parking'].head(19))

0     False
1      True
2      True
3      True
4      True
5     False
6      True
7     False
8     False
9     False
10     True
11     True
12     True
13     True
14     True
15    False
16    False
17     True
18     True
Name: parking, dtype: object


In [58]:
# is there internet, a security door, video surveillance? all boolean values
# (each is a whole-word, case-insensitive match in the summary)
data['has_internet'] = data['summary'].str.contains(r'\bInternet\b', case=False, regex=True)
data['has_security_door'] = data['summary'].str.contains(r'\bTurvauks\b', case=False, regex=True)
data['has_video_surveillance'] = data['summary'].str.contains(r'\bVideovalve\b', case=False, regex=True)

print(data[['has_internet', 'has_security_door', 'has_video_surveillance']].head(10))

  has_internet has_security_door has_video_surveillance
0         True              True                  False
1         True              True                  False
2         True              True                  False
3         True             False                  False
4        False             False                  False
5        False             False                  False
6         True              True                  False
7         True              True                   True
8        False             False                  False
9        False             False                  False


In [133]:
# does the summary mention a storage room ("panipaik")? boolean value

data['has_storage'] = data['summary'].str.contains(r'\bPanipaik\b', case=False, regex=True)
print(data['has_storage'])

0       False
1       False
2        True
3       False
4        True
        ...  
2671     True
2672     True
2673    False
2674     True
2675     True
Name: has_storage, Length: 2655, dtype: object


In [59]:
data['summary'].iloc[2]

'Korteriomand, kivimaja\r\n\r\nKöök: gaasipliit, avatud köök, külmik, köögimööbel \r\nSanruum: dušš, uus torustik, vann, pesumasin \r\nKüte ja ventilatsioon: keskküte, kamin \r\nLisainfo: rõdu 18\xa0m², parkett, pakettaknad, mööbel, TV, katusekorter, kinnine hoov, uus elektrijuhtmestik, seinakapp, panipaik, parkimine tasuta \r\nSide ja turvalisus: Internet, telefon, kaabelTV, valvesüsteem paigaldatud, turvauks, trepikoda lukus, naabrivalve \r\nÜmbrus: ümbruses eramud ja korterelamud, teed heas seisukorras, asub keskuses'

## Extracting info from description

In [157]:
data['description'].iloc[100]

'KORTER\r\nAnname üürile renoveeritud ja möbleeritud korteri Paldiski linnas.\r\nKorteri koosseisu kuulub avar elutuba koos köögiga, osaliselt eraldatud magamistuba, esik ning WC koos duširuumiga. \r\n\r\nTegemist on väga sooja, hubase ja valgusküllase koduga. \r\nPakutav korter asub esimesel korrusel, seega sobib hästi ka väikse lapsega perekonnale või vanemale inimesele.\r\n\r\nMAJA/PARKIMINE\r\nRae 36 kortermaja on täielikult renoveeritud 15 aastat tagasi. Kõik tehnosüsteemid on uued. Maja ümbrus ja trepikoda näeb kena välja ja on kogu aeg hooldatud.\r\nParkimine maja taga kinnises hoovis, kindel parkimiskoht. Hooviala avaneb telefoniga helistades.\r\nLisaks on olemas turvaline panipaik hoone 0-korrusel. \r\n\r\nÜürilepingu sõlmimisel tuleb tasuda:\r\n- Esimese kuu üüri ettemaks 350€\r\n- Tagatisraha 350€\r\n- Lepingutasu 350€'

In [62]:

data.columns

Index(['listing_link_href', 'price', 'rooms', 'area', 'build_year',
       'condition', 'energy_mark', 'summary', 'description', 'bedrooms',
       'description_header', 'prepayment', 'images_link_href', 'floor',
       'total_floors', 'price_per_m2', 'copy_not_allowed', 'no_broker_allowed',
       'images_attached', 'is_owner', 'utility_summer', 'utility_winter',
       'county', 'mun_or_city', 'street_adr', 'house_type', 'heating',
       'furnished', 'washing_machine', 'balcony', 'yard', 'terrace', 'parking',
       'has_internet', 'has_security_door', 'has_video_surveillance'],
      dtype='object')

In [183]:
# Flag descriptions that mention a one-month payment up front.
# The alternation uses a NON-capturing group (?:...): str.contains only tests
# for a match, and a capturing group makes pandas emit a
# "pattern ... has match groups" UserWarning.
data['has_advance_payment'] = data['description'].str.contains(
    r'1 kuu (?:ettemaks|üür)',  # Match phrases like "1 kuu ettemaks" or "1 kuu üür"
    case=False,
    regex=True
)

print(data['has_advance_payment'].head(20))

0     False
1      True
2     False
3      True
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19     True
Name: has_advance_payment, dtype: object


  data['has_advance_payment'] = data['description'].str.contains(


In [185]:
data['description'].iloc[5]


'ÜÜRILE ANDA HEA ASUKOHAGA VÄGA ILUS KAHETOALINE KORTER!\r\n \r\n Rendile anda stiilne ja kena kahetoaline korter Põhja-Tallinnas!\r\n Korter on vaba vaatamiseks ja üleandmiseks alates 14. november.\r\n Kõik eluks vajalik on koheselt olemas!\r\n Täismöbleeritud!\r\n \r\n Põrandaid katab puitparkett.\r\n Rendile anda koos kogu sisustusega.\r\n Magamistoas on suur walk-in garderoob ja paigaldatud on kaasaaegne köögimööbel koos tehnikaga.\r\n Ruumilahendus on hästi läbi mõeldud.\r\n Korteril on mõnus terrass!\r\n \r\n Küttesüsteemid ja kommunikatsioonid:\r\n Korteris on keskküte.\r\n Kõikjal on vesipõrandaküte.\r\n Soojustagastusega ventilatsioonisüsteem.\r\n \r\n Minimaalne üürimisperiood on 12 kuud.\r\n \r\n Lepingu sõlmimisel küsitakse esimene üür, tagatisraha ühe kuu üüri ulatuses ning lepingutasu ühe kuu üürisumma + käibemaks.\r\n \r\n Igakuisele üürisummale 800 eurot lisanduvad igakuised kommunaalkulud! \r\n \r\n Helista ja küsi lisa!'

In [169]:
data['description'].iloc[3]

'UUS HIND!\r\n Üürile anda koheselt kahetoaline terrassiga korter Pärnu rannarajoonis!\r\n Korter on kevadel remonditud.\r\n \r\n Korter asub elamu 1. korrusel ning koosneb avatud köögiosast, elutoast, magamistoast, wc-duširuumist, esikuosast ja lisaks suurest terrassist hommikupäikese poole.\r\n Olemas vajalik sisustus, kodumasinad- nõudepesumasin, pesumasin, külmkapp koos sügavkülmaga, veekeetja, televiisor, õhksoojuspump, mikrolaineahi.\r\n \r\n Auto saab mugavalt tasuta parkida elamu ette üks autokoht.\r\n Küte korteris: elekter + õhksoojuspump.\r\n \r\n Üüritakse välja koheselt ja pikemaks perioodiks. \r\n Üürisumma on 450 eurot kuus, millele lisanduvad kommunaalmaksud.\r\n \r\n Tasumine: \r\n 1 kuu ettemaks 450 eurot\r\n tagatisraha 450 eurot\r\n maakleritasu 400 eurot\r\n \r\n Helista ja kirjuta lisainfo saamiseks!\r\n Kohtumiseni!'

## Extracting info from description_header - is it all caps (bool), how long is it, and possibly how many "!" / "?" it contains

In [147]:
# TODO
# Is the header written entirely in capital letters?
# A missing header is treated as an empty string, for which str.isupper() is
# False, so the result is a clean bool column with no NaN.
# (This previously read a column named 'is_all_caps' that no cell creates,
# which raises KeyError on a fresh kernel.)
data['description_header_is_all_caps'] = data['description_header'].fillna('').str.isupper()

print(data['description_header_is_all_caps'])


0        True
1       False
2       False
3       False
4       False
        ...  
2671     True
2672    False
2673     True
2674    False
2675    False
Name: description_header_is_all_caps, Length: 2655, dtype: bool


In [149]:
# Maybe the length tells us something too: how long or short the header is, counted in characters.
# A missing header counts as length 0.
# (This previously read a column named 'header_length' that no cell creates,
# which raises KeyError on a fresh kernel.)
data['description_header_length'] = data['description_header'].str.len().fillna(0)

print(data['description_header_length'])

0       30.0
1       40.0
2        0.0
3       50.0
4       47.0
        ... 
2671    41.0
2672    50.0
2673    38.0
2674    50.0
2675    38.0
Name: description_header_length, Length: 2655, dtype: float64


In [145]:
print(data['description_header'])

0                          STUUDIOKORTER RAEKOJA PLATSIL!
1                Möbleeritud rõduga korter, parkimiskoht.
2                                                     NaN
3       Pikaajalise üürile terrassiga korter rannarajo...
4         Super Pakkumine! Parkimiskoht, panipaik hinnas!
                              ...                        
2671            UUS HIND! TULE KLIENDIPÄEVALE 19.11.2024!
2672    Lepingutasu al 149€ |Tasuta WiFi|Depovaba võim...
2673               UUS REMONT, UUS MÖÖBEL! OLE 1. ELANIK!
2674    Lepingutasu al 149€ |Tasuta WiFi|Depovaba võim...
2675               A-energiaklassi kodu! Sisustus hinnas!
Name: description_header, Length: 2655, dtype: object


## Determine whether to try to analyse pictures from images_link_href and assign a rating to those

In [66]:
# TODO

## Remove links (listing_link_href, images_link_href) before fitting

In [68]:
# TODO