In [1]:
# Requests for handling HTTP get and other requests
import requests
import time # import for playing nice and not getting blocked
import pandas as pd
# from BeautifulSoup4 import BeatifulSoup if installed through pip install BeautifulSoup4
# 
from bs4 import BeautifulSoup

In [None]:
# if bs4 not found install it with
#  !pip install beatifulsoup4
# https://www.crummy.com/software/BeautifulSoup/

In [2]:
centrs = "https://www.ss.com/lv/real-estate/flats/riga/centre/sell/"

In [3]:
req = requests.get(centrs)
req.status_code

200

In [4]:
req.text[:200]

'<!DOCTYPE html>\r\n<HTML><HEAD>\r\n<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Pārdod - Sludinājumi</title>\r\n<meta http-equiv="Content-Type" CONTENT="text/html; charset=UTF-8">\r\n<meta name="viewport" c'

In [5]:
soup = BeautifulSoup(req.text, 'lxml') # lxml is improved parser, a little bit more lenient
soup.title

<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Pārdod - Sludinājumi</title>

In [6]:
type(soup)

bs4.BeautifulSoup

In [7]:
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all
tables = soup.find_all("table") # finds ALL elements matching our filter
len(tables) 

7

In [8]:
# find finds just one match
headline = soup.find("tr", {"id":"head_line"}) # this is a shorter way of finding by tr AND this element having particular id
headline

<tr id="head_line">
<td class="msg_column" colspan="3" width="70%">
<span style="float:left;"> Sludinājumi
</span>
<span align="right" class="msg_column" style="float:right;text-align:right;padding-right:3px;">
<noindex>
<a class="a19" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4S.html" rel="nofollow">datums</a></noindex></span>
</td>
<td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4SFDwT.html" rel="nofollow" title="">Iela</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4SelM=.html" rel="nofollow" title="">Ist.</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4QelM=.html" rel="nofollow" title="">m2</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4XelM=.html" rel="nofollow" title

In [9]:
type(headline)

bs4.element.Tag

In [10]:
headtds = headline.find_all("td")
# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/td
headtds

[<td class="msg_column" colspan="3" width="70%">
 <span style="float:left;"> Sludinājumi
 </span>
 <span align="right" class="msg_column" style="float:right;text-align:right;padding-right:3px;">
 <noindex>
 <a class="a19" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4S.html" rel="nofollow">datums</a></noindex></span>
 </td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4SFDwT.html" rel="nofollow" title="">Iela</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4SelM=.html" rel="nofollow" title="">Ist.</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4QelM=.html" rel="nofollow" title="">m2</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/centre/sell/fDgSeF4XelM=.html" rel="nofollow" title="

In [12]:
len(headtds)

8

In [14]:
headline.text

'\n\n\xa0Sludinājumi\r\n\n\n\ndatums\n\nIelaIst.m2StāvsSērijaCena, m2Cena'

In [13]:
headcolumns = [el.text for el in headtds[1:]] #.text gets us content even from children and grandchildren
headcolumns

['Iela', 'Ist.', 'm2', 'Stāvs', 'Sērija', 'Cena, m2', 'Cena']

In [15]:
# lets combine the above cells into a function which will always get us columns
def getColList(soup):
    column_list = ["description","url"] # we decided to that we need these two column names no matter the html
    headline = soup.find("tr", {"id":"head_line"})
    headtds = headline.find_all("td")
    headcolumns = [el.text for el in headtds[1:]] # this will get all column names starting with 2nd in HTML
    column_list += headcolumns
    return column_list

In [16]:
column_names = getColList(soup)
column_names

['description',
 'url',
 'Iela',
 'Ist.',
 'm2',
 'Stāvs',
 'Sērija',
 'Cena, m2',
 'Cena']

In [17]:
trows = soup.find_all('tr')
len(trows)


39

In [18]:
# hardest part in this 
# how to filter only specific rows
# we need to find something that is unique to our apartment rows but not to the extra rows
# remember we id is not guaranteed
apt_rows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
len(apt_rows)

30

In [19]:
apt_rows[-1]

<tr id="tr_47253842"><td class="msga2 pp0"><input id="c47253842" name="mid[]" type="checkbox" value="47253842_1106_0"/></td><td class="msga2"><a href="/msg/lv/real-estate/flats/riga/centre/bxbeg.html" id="im47253842"><img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/4/613/153165/30632889.th2.jpg"/></a></td><td class="msg2"><div class="d1"><a class="am" data="JTk3JTg5JThDaCU5QSVBNCVFMiU4NSU4RWclOTglQTQlOUUlODclOEFvJTk4JUEwJTlCJTg3JTkxaA==|fTY7gm" href="/msg/lv/real-estate/flats/riga/centre/bxbeg.html" id="dm_47253842">Cenā autostāvvieta, mēbelēts. 
Saulains dzīvoklis rekonstruētā </a></div></td><td c="1" class="msga2-o pp6" nowrap="">Valdemāra 37</td><td c="1" class="msga2-o pp6" nowrap="">3</td><td c="1" class="msga2-o pp6" nowrap="">75</td><td c="1" class="msga2-o pp6" nowrap="">5/6</td><td c="1" class="msga2-o pp6" nowrap="">Renov.</td><td c="1" class="msga2-o pp6" nowrap="">2,396 €</td><td c="1" class="msga2-o pp6" nowrap="">179,700  €</td></tr>

In [20]:
# lets make a function from the above doodle and make it work on most pages on SS
def getRowList(soup):
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    return aprows

In [21]:
row_tds = apt_rows[0].find_all('td')
len(row_tds)

10

In [22]:
row_tds

[<td class="msga2 pp0"><input id="c47931160" name="mid[]" type="checkbox" value="47931160_1106_0"/></td>,
 <td class="msga2"><a href="/msg/lv/real-estate/flats/riga/centre/boejl.html" id="im47931160"><img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/4/662/165490/33097893.th2.jpg"/></a></td>,
 <td class="msg2"><div class="d1"><a class="am" data="dyVBMSU5Q2V6JTg4JUMxZSU5Q3clOUQlOUZpeiU4QXZrJTk5eSU5QiU5OGlxJTg0dQ==|Ahh3ATE4g" href="/msg/lv/real-estate/flats/riga/centre/boejl.html" id="dm_47931160"><b>В самом сердце Риги продаётся квартира
 
 Квартира после капитал</b></a></div></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>Kalniņa 1</b></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>1</b></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>26</b></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>4/5</b></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>Renov.</b></td>,
 <td c="1" class="msga2-o pp6" nowrap=""><b>3,038</b> €</td>,
 <td c="1" class="msga2-o pp6" nowrap

In [23]:
row_tds[0].text

''

In [24]:
row_tds[1].text

''

In [25]:
row_tds[1].attrs

{'class': ['msga2']}

In [26]:
img = row_tds[1].find("img")
img

<img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/4/662/165490/33097893.th2.jpg"/>

In [27]:
img.get("src")

'https://i.ss.com/gallery/4/662/165490/33097893.th2.jpg'

In [28]:
row_tds[2].text

'В самом сердце Риги продаётся квартира\r\n\r\nКвартира после капитал'

In [29]:
row_tds[-1].text

'79,000  €'

In [30]:
a = row_tds[1].find('a')
a

<a href="/msg/lv/real-estate/flats/riga/centre/boejl.html" id="im47931160"><img alt="" class="isfoto foto_list" src="https://i.ss.com/gallery/4/662/165490/33097893.th2.jpg"/></a>

In [None]:
a.attrs

In [31]:
a.get('href')

'/msg/lv/real-estate/flats/riga/centre/boejl.html'

In [32]:
def getRow(row,colist=column_names):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) <3: # a little sanity check
        print("Hmm bad row")
        return rowDict
    
    rowDict[colist[0]] = row_tds[2].text # so the big assumption is that we always get description in 3rd column
    rowDict[colist[1]] = "https://ss.com" + row_tds[1].find('a').get('href')
    for td,key in zip(row_tds[3:],colist[2:]): 
        rowDict[key] = td.text
    return rowDict

In [33]:
getRow(apt_rows[0])

{'description': 'В самом сердце Риги продаётся квартира\r\n\r\nКвартира после капитал',
 'url': 'https://ss.com/msg/lv/real-estate/flats/riga/centre/boejl.html',
 'Iela': 'Kalniņa 1',
 'Ist.': '1',
 'm2': '26',
 'Stāvs': '4/5',
 'Sērija': 'Renov.',
 'Cena, m2': '3,038 €',
 'Cena': '79,000  €'}

In [34]:
def getRows(rowlist,colist=column_names):
    return [getRow(row) for row in rowlist]


In [35]:
row_ads = getRows(apt_rows)
row_ads[-3:]

[{'description': 'Īpašnieks piedāvā 4-istabu dzīvokli 1912. gadā celtā renovācijas',
  'url': 'https://ss.com/msg/lv/real-estate/flats/riga/centre/bcnjdn.html',
  'Iela': 'Alūksnes 1',
  'Ist.': '4',
  'm2': '97',
  'Stāvs': '3/5',
  'Sērija': 'Renov.',
  'Cena, m2': '1,254 €',
  'Cena': '121,625  €'},
 {'description': 'Šobrīd tiek sakopta un sakārtota 1912. gadā celta ēka Alūksnes i',
  'url': 'https://ss.com/msg/lv/real-estate/flats/riga/centre/ageik.html',
  'Iela': 'Alūksnes 1',
  'Ist.': '1',
  'm2': '24',
  'Stāvs': '2/6',
  'Sērija': 'Renov.',
  'Cena, m2': '1,371 €',
  'Cena': '32,900  €'},
 {'description': 'Cenā autostāvvieta, mēbelēts. \r\nSaulains dzīvoklis rekonstruētā ',
  'url': 'https://ss.com/msg/lv/real-estate/flats/riga/centre/bxbeg.html',
  'Iela': 'Valdemāra 37',
  'Ist.': '3',
  'm2': '75',
  'Stāvs': '5/6',
  'Sērija': 'Renov.',
  'Cena, m2': '2,396 €',
  'Cena': '179,700  €'}]

In [36]:
dtemp = pd.DataFrame(row_ads, columns=column_names)
dtemp.shape

(30, 9)

In [37]:
dtemp.head()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
0,В самом сердце Риги продаётся квартира\r\n\r\n...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kalniņa 1,1,26,4/5,Renov.,"3,038 €","79,000 €"
1,"Kрасивая квартира в самом центре Риги, по адре...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Merķeļa 17,2,60,2/4,P. kara,"2,667 €","160,000 €"
2,Pārdodam elegantu 5 istabu dzīvokli pilsētas c...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Marijas 16,5,172,1/7,P. kara,"1,599 €","275,000 €"
3,"Gaumīgs, mēbelēts divstāvu dzīvoklis ar balkon...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Pulkv. Brieža 7,3,104,6/7,Renov.,"2,423 €","252,000 €"
4,Īpašnieks pārdod dzīvokli ar pēlēko apdari jau...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kungu 25,1,24,5/6,Jaun.,"2,083 €","49,990 €"


In [38]:
def getDFfromURL(url):
    # print("getting data from", url)
    req = requests.get(url)
    if req.status_code != 200:
        print("Request Fail with", req.status_code)
        return None # maybe return empty dataframe here
    soup = BeautifulSoup(req.text, 'lxml')
    column_names = getColList(soup)
    rowlist = getRowList(soup)
    rows = getRows(rowlist)
    return pd.DataFrame(rows, columns=column_names)
    

In [39]:
df = getDFfromURL(centrs)

In [40]:
df.shape

(30, 9)

In [41]:
df.head()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
0,В самом сердце Риги продаётся квартира\r\n\r\n...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kalniņa 1,1,26,4/5,Renov.,"3,038 €","79,000 €"
1,"Kрасивая квартира в самом центре Риги, по адре...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Merķeļa 17,2,60,2/4,P. kara,"2,667 €","160,000 €"
2,Pārdodam elegantu 5 istabu dzīvokli pilsētas c...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Marijas 16,5,172,1/7,P. kara,"1,599 €","275,000 €"
3,"Gaumīgs, mēbelēts divstāvu dzīvoklis ar balkon...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Pulkv. Brieža 7,3,104,6/7,Renov.,"2,423 €","252,000 €"
4,Īpašnieks pārdod dzīvokli ar pēlēko apdari jau...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kungu 25,1,24,5/6,Jaun.,"2,083 €","49,990 €"


In [42]:

# https://developer.mozilla.org/en-US/docs/Web/HTML/Element/a
anchors = soup.find_all("a")
len(anchors)

98

In [43]:
anchors[:5]

[<a href="/" title="Sludinājumi"><img alt="Sludinājumi" border="0" class="page_header_logo" src="https://i.ss.com/img/p.gif"/></a>,
 <a class="a_menu" href="/lv/real-estate/flats/new/" title="Iesniegt Sludinājumu">Iesniegt Sludinājumu</a>,
 <a class="a_menu" href="/lv/login/" title="Mani Sludinājumi">Mani Sludinājumi</a>,
 <a class="a_menu" href="/lv/real-estate/flats/riga/centre/search/" title="Meklēt sludinājumus">Meklēšana</a>,
 <a class="a_menu" href="/lv/favorites/" title="Memo">Memo</a>]

In [44]:
navs = [anchor for anchor in anchors if anchor.get("name") == "nav_id"] # get will get us None if no "name" exists
len(navs)

10

In [45]:
navs[0]

<a class="navi" href="/lv/real-estate/flats/riga/centre/sell/page26.html" name="nav_id" rel="prev"><img border="0" height="5" src="https://i.ss.com/img/s_left.png" style="padding-bottom:2px;" width="9"/> Iepriekšējie</a>

In [46]:
navs[0].attrs

{'name': 'nav_id',
 'rel': ['prev'],
 'class': ['navi'],
 'href': '/lv/real-estate/flats/riga/centre/sell/page26.html'}

In [47]:
navs[0]['href']

'/lv/real-estate/flats/riga/centre/sell/page26.html'

In [49]:
# could use regular expression here but we can use split 
afterpage = navs[0]['href'].split("/page")[-1] # -1 might be safer than 1 in case there is region with name page
afterpage

'26.html'

In [50]:
beforedot = afterpage.split(".html")[0]
beforedot

'26'

In [51]:
lastpage = int(beforedot)
lastpage

26

In [52]:
def getAllLocalUrls(url):
    """Get a list of all urls including paginated pages"""
    results = [url] # default is just the url if no extra pages found
    req = requests.get(url)
    if req.status_code != 200:
        print(f"Bad response! {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    # we just need a one element
    prevanchor = soup.find('a', {"rel":"prev"})
    if prevanchor == None:
        return results
    href = prevanchor.attrs.get('href')
    lastPageNum = int(href.split('page')[-1].split('.')[0])
    print("Last page is",lastPageNum)
    nurls = [f"{url}page{n}.html" for n in range(2,lastPageNum+1)]
    results += nurls
    return results

In [53]:
centrs

'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/'

In [54]:
local_urls = getAllLocalUrls(centrs)
len(local_urls),local_urls[:3],local_urls[-1]

Last page is 26


(26,
 ['https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page2.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page3.html'],
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page26.html')

In [55]:
dlast = getDFfromURL(local_urls[-1])
dlast.shape

(21, 9)

In [56]:
dlast

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
0,Māja nodota ekspluatācijā 2020.gada aprīlī. Pā...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Klijānu 16,3,43,3/9,Jaun.,"1,558 €","67,000 €"
1,Kičīgs divlīmeņu mansarda dzīvoklis ar balkonu...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Klusā 11,3,68,5/5,Renov.,"2,187 €","148,720 €"
2,Īpašnieks pārdod 2 istabu dzīvokli ar atsevišķ...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Brīvības 147,2,60,4/7,P. kara,"1,067 €","64,000 €"
3,"Pārdodu 1 istabas dzīvokli klusā vietā, tuvu p...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Duntes 24a,1,24,2/2,P. kara,370 €,"8,890 €"
4,Piedāvājam iegādāties 3. istabu dzīvokli rekon...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Miera 105,3,71,6/6,Renov.,"1,756 €","124,700 €"
5,"3D tūre, izstaigā dzīvokli, esot mājās. Links ...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Krasta 11,3,67,6/6,Renov.,"1,537 €","103,000 €"
6,"Izolētas istabas - 25, 20 un 12 kv. m. \r\nLie...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Artilērijas 1,3,96,3/6,P. kara,917 €,"88,000 €"
7,Elegants dzīvoklis projektā Parker's. \r\n\r\n...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Melngaiļa 8,2,43,3/7,Jaun.,"3,263 €","140,300 €"
8,Elegants dzīvokļu ēku projekts Parker's. \r\n\...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Valdemāra 41,5,153,4/7,Jaun.,"3,570 €","546,200 €"
9,"Kompakts, daļēji labiekārtots 3 istabu dzīvokl...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Blaumaņa 12,3,62,3/6,Renov.,"2,613 €","161,980 €"


In [57]:
def get_all_ads_df(start_url, save_excel_path=None):
    df_list=[]
    local_urls = getAllLocalUrls(start_url)
    for url in local_urls:
        print(f"Gathering data from {url}")
        df_list.append(getDFfromURL(url))
        time.sleep(0.3) # we need this to play nice!
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.concat.html
    big_df = pd.concat(df_list)
    if save_excel_path:
        big_df.to_excel(save_excel_path)
    return big_df
    

In [58]:
centrs

'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/'

In [59]:
d = get_all_ads_df(centrs, "centrs_06_10.xlsx")

Last page is 26
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page2.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page3.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page4.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page5.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page6.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page7.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page8.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page9.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page10.html
Gathering data from https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page11.html
Gathering data from http

In [60]:
d.shape

(771, 9)

In [61]:
d.head()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
0,В самом сердце Риги продаётся квартира\r\n\r\n...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kalniņa 1,1,26,4/5,Renov.,"3,038 €","79,000 €"
1,"Kрасивая квартира в самом центре Риги, по адре...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Merķeļa 17,2,60,2/4,P. kara,"2,667 €","160,000 €"
2,Pārdodam elegantu 5 istabu dzīvokli pilsētas c...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Marijas 16,5,172,1/7,P. kara,"1,599 €","275,000 €"
3,"Gaumīgs, mēbelēts divstāvu dzīvoklis ar balkon...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Pulkv. Brieža 7,3,104,6/7,Renov.,"2,423 €","252,000 €"
4,Īpašnieks pārdod dzīvokli ar pēlēko apdari jau...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kungu 25,1,24,5/6,Jaun.,"2,083 €","49,990 €"


In [62]:
d.tail()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena
16,В продаже двухкомнатная квартира с эксклюзивно...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Klijānu 16,2,54,3/9,Jaun.,"1,244 €","67,200 €"
17,Remontējams divistabu dzīvoklis vēsturiskā ēkā...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Valdemāra 69,2,37,3/6,Renov.,"1,594 €","58,990 €"
18,Продаётся солнечная четырёхкомнатная квартира ...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Brīvības 114,4,124,3/6,P. kara,"1,293 €","160,342 €"
19,Mēbelēts divistabu dzīvoklis renovētā vēsturis...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Valdemāra 69,2,37,2/6,P. kara,"2,270 €","83,990 €"
20,Piedāvājumā 4 istabu dzīvoklis vēsturiskā namā...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Čaka 36,4,84,3/5,P. kara,"1,821 €","152,935 €"


In [63]:
d.shape

(771, 9)

In [64]:
d[['Floor','MaxFloor']] = d.Stāvs.str.split("/",expand=True) 
d.head()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,Floor,MaxFloor
0,В самом сердце Риги продаётся квартира\r\n\r\n...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kalniņa 1,1,26,4/5,Renov.,"3,038 €","79,000 €",4,5
1,"Kрасивая квартира в самом центре Риги, по адре...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Merķeļa 17,2,60,2/4,P. kara,"2,667 €","160,000 €",2,4
2,Pārdodam elegantu 5 istabu dzīvokli pilsētas c...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Marijas 16,5,172,1/7,P. kara,"1,599 €","275,000 €",1,7
3,"Gaumīgs, mēbelēts divstāvu dzīvoklis ar balkon...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Pulkv. Brieža 7,3,104,6/7,Renov.,"2,423 €","252,000 €",6,7
4,Īpašnieks pārdod dzīvokli ar pēlēko apdari jau...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Kungu 25,1,24,5/6,Jaun.,"2,083 €","49,990 €",5,6


In [66]:
d.sort_values(by="Floor",ascending=False).head()

Unnamed: 0,description,url,Iela,Ist.,m2,Stāvs,Sērija,"Cena, m2",Cena,Floor,MaxFloor
0,Piedāvājam iegādāties lielisku 2 istabu dzīvok...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Klusā 18,2,35,9/9,Jaun.,"1,629 €","57,000 €",9,9
27,Предлагаем арендопокупку. \r\n\r\nПросторная и...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Brīvības 230,3,74,9/10,Jaun.,"1,865 €","138,000 €",9,10
17,"Kapitālais remonts, Specprojekts, 43 m² platīb...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Brīvības 162/2,2,43,8/8,Specpr.,"1,395 €","60,000 €",8,8
12,Мансарда с верандой на крыше\r\n\r\nУникальная...,https://ss.com/msg/lv/real-estate/flats/riga/c...,Pulkv. Brieža 13,2,100,7/7,Renov.,"1,950 €","195,000 €",7,7
26,"Plaša terase un balkons. Ar pilnu apdari, indi...",https://ss.com/msg/lv/real-estate/flats/riga/c...,Stabu 100,2,122,7/7,Jaun.,"1,694 €","206,700 €",7,7


# TODO 
# Try with different starting address not only centrs
## Maybe combine regions
## See how it would work with maybe cars
## Data engineering make new columns based on existing ones, clean some columns
### Changing floors 2/6 to columns 2 and 6, clean up Euro signs
## Sorting, Describing, Grouping by regions etc