In [1]:
# Requests for handling HTTP get and other requests
import requests
import time
import pandas as pd
# from BeautifulSoup4 import BeatifulSoup if installed through pip install BeautifulSoup4
# 
from bs4 import BeautifulSoup


In [None]:
!pip install beatifulsoup4

In [2]:
maskavas = "https://www.ss.com/lv/real-estate/flats/riga/maskavas-priekshpilseta/sell/"
centrs = "https://www.ss.com/lv/real-estate/flats/riga/centre/sell/"

In [None]:
dflist = pd.read_html(maskavas) # pandas attempts to read all tables on the url so dflist
type(dflist)

In [None]:
for df in dflist:
    print(df.shape)

In [None]:
df = dflist[4]
df.head()

In [None]:
lvcitylist = pd.read_html("https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_Latvia")
len(lvcitylist)

In [None]:
for citydf in lvcitylist:
    print(citydf.shape)

In [None]:
lvcitydf = lvcitylist[1]
lvcitydf.head()

In [None]:
req = requests.get(maskavas)
req.status_code # https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

In [None]:
req.text[:200]

In [None]:
req.text.index("tr_47482426") # we could fine the line that interests us manually using Python string index command

In [None]:
req.text[14907:15100]
# we could attempt to parse this manually, but there is no need for HTML since we have Beautiful Soup which does it for us

In [None]:
soup = BeautifulSoup(req.text, 'lxml') 
# lxml is a type of parser, a bit better than standard
# soup = BeautifulSoup(req.text) 
type(soup)

In [None]:
allrows = soup.find_all('tr') # so we want to find all table rows (no matter the table)
len(allrows)
# so what would be common to our rows which contain an advertisement

In [None]:
allrows[12]

In [None]:
allrows[12].get('id')

In [None]:
allrows[12]

In [None]:
aprows = [row for row in allrows if "id" in row.attrs and "tr_" in row.get('id')] #full check
len(aprows)

In [None]:
aprows = [row for row in allrows if "tr_" in row.get('id',[])] # we pass empty list as fallback when id is not found
len(aprows)

In [None]:
aprows = [row for row in allrows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ] # this is more precise
# of course above could have been done with regular expression for more difficult cases
len(aprows)

In [None]:
aprows[-1]

In [None]:
firstrow = aprows[0]
firstrow

In [None]:
len(firstcells)

In [None]:
firstcells[-1].text

In [None]:
anchor = firstcells[1].find('a')
anchor

In [None]:
anchor.get('href')

In [None]:
firstcells[2].find('a').text

In [None]:
firstcells = firstrow.find_all('td')
firstcells

In [None]:
def getRowDict(row):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) != 10: # a little sanity check
        print("Hmm bad row")
        return rowDict
    rowDict['url'] = "https://ss.com" + row_tds[1].find('a').get('href')
    rowDict['desc'] = row_tds[2].text
    rowDict['street'] = row_tds[3].text
    rowDict['rooms'] = row_tds[4].text
    rowDict['sqr_m'] = row_tds[5].text
    rowDict['floor'] = row_tds[6].text
    rowDict['build_type'] = row_tds[7].text
    rowDict['price_sqm'] = row_tds[8].text
    rowDict['price_raw'] = row_tds[9].text
    rowDict['currency'] = row_tds[9].text.split()[1] # currency is always after multiple spaces in price
    rowDict['price'] = row_tds[9].text.split()[0].replace(',','') # we do need comma
    return rowDict
    

In [None]:
mydatadict = getRowDict(aprows[7])
mydatadict

In [None]:
myrowlist = []
for row in aprows:
    myrowlist.append(getRowDict(row))
myrowlist[:5]

In [45]:
def processAptRows(rows):
    rowlist = []
    for row in rows:
        rowlist.append(getRowDict(row))
    return rowlist

In [None]:
myrows = processAptRows(aprows)
myrows[:2]

In [None]:
df = pd.DataFrame(myrowlist)
df.head()

In [None]:
df.to_excel('maskavas_pardod.xlsx')

In [None]:
maskavas

In [None]:
maskavas.split("/")[-3]

In [46]:
# lets put this in one big function
def getExcelFromUrl(url):
    req = requests.get(url)
    if req.status_code != 200:
        print("Bad request", req.status_code)
        return None
    soup = BeautifulSoup(req.text, 'lxml')
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    myrows = processAptRows(aprows)
    df = pd.DataFrame(myrows)
    print(f"Got DF in shape {df.shape}")
    df.to_excel(f"{url.split('/')[-3]}.xlsx") #we just need a unique and descriptive name for our sheet
    

In [None]:
getExcelFromUrl(maskavas)

In [None]:
getExcelFromUrl("https://www.ss.com/lv/real-estate/flats/riga/imanta/sell/")

In [None]:
# aprows = aprows[:-1] # if we do not need the last one, so we just take it off


In [3]:
# now we need to find a list of all pages for certain region
req = requests.get(centrs)
req.status_code

200

In [4]:
soup = BeautifulSoup(req.text, 'lxml') 
soup.title

<title>SS.COM Dzīvokļi - Rīga - Centrs, Cenas, Pārdod - Sludinājumi</title>

In [5]:
navs = soup.find_all(name="nav_id")
len(navs)

0

In [6]:
len(soup.find_all(name=True))

873

In [7]:
anchors = soup.find_all("a")
len(anchors)

98

In [14]:
type(anchors)

bs4.element.ResultSet

In [9]:
anchors[:5]

[<a href="/" title="Sludinājumi"><img alt="Sludinājumi" border="0" class="page_header_logo" src="https://i.ss.com/img/p.gif"/></a>,
 <a class="a_menu" href="/lv/real-estate/flats/new/" title="Iesniegt Sludinājumu">Iesniegt Sludinājumu</a>,
 <a class="a_menu" href="/lv/login/" title="Mani Sludinājumi">Mani Sludinājumi</a>,
 <a class="a_menu" href="/lv/real-estate/flats/riga/centre/search/" title="Meklēt sludinājumus">Meklēšana</a>,
 <a class="a_menu" href="/lv/favorites/" title="Memo">Memo</a>]

In [15]:
alist = [anchor for anchor in anchors] #list comprehension would conver anchors, same as list(anchors)
# list(anchors)
len(alist),type(alist)

(98, list)

In [17]:
navs = [anchor for anchor in anchors if anchor.get("name") == "nav_id"] # get will get us None if no "name" exists
len(navs)

10

In [12]:
navs[0]

<a class="navi" href="/lv/real-estate/flats/riga/centre/sell/page23.html" name="nav_id" rel="prev"><img border="0" height="5" src="https://i.ss.com/img/s_left.png" style="padding-bottom:2px;" width="9"/> Iepriekšējie</a>

In [18]:
navs[0].attrs

{'name': 'nav_id',
 'rel': ['prev'],
 'class': ['navi'],
 'href': '/lv/real-estate/flats/riga/centre/sell/page23.html'}

In [13]:
navs[0]['href']

'/lv/real-estate/flats/riga/centre/sell/page23.html'

In [19]:
navs[0]['href'].split("/page")[1]

'23.html'

In [21]:
navs[0]['href'].split("/page")[0]

'/lv/real-estate/flats/riga/centre/sell'

In [22]:
lastpage = navs[0]['href'].split("/page")[1].split(".")[0]
lastpage, type(lastpage)

('23', str)

In [26]:
lastpage = int(lastpage)
lastpage

23

In [24]:
centrs

'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/'

In [27]:
pagelist = [centrs]
extrapages = [f"{centrs}" for n in range(2,lastpage+1)]
extrapages[:5]

['https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/']

In [29]:
pagelist = [centrs]
extrapages = [f"{centrs}/page{n}.html" for n in range(2,lastpage+1)]
extrapages[:5],extrapages[-5:]

(['https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page2.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page3.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page4.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page5.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page6.html'],
 ['https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page19.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page20.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page21.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page22.html',
  'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page23.html'])

In [30]:
pagelist += extrapages # I add extrapages list to pagelist (flat)
pagelist[:5]

['https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page2.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page3.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page4.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page5.html']

In [40]:
def getLocalPageList(url):
    localPageList = [url]
    req = requests.get(url)
    if req.status_code != 200:
        print("Bad Status Code", req.status_code)
        return
    soup = BeautifulSoup(req.text, 'lxml') 
    anchors = soup.find_all("a")
    navs = [anchor for anchor in anchors if anchor.get("name") == "nav_id"] # get will get us None if no "name" exists
    if len(navs) == 0:
        return localPageList # means we only have the first page and that is it
    lastpage = navs[0]['href'].split("/page")[1].split(".")[0]
    lastpage = int(lastpage) # could have done in the previous page
    extrapages = [f"{centrs}page{n}.html" for n in range(2,lastpage+1)]
    localPageList += extrapages
    return localPageList

In [35]:
getLocalPageList(centrs)

['https://www.ss.com/lv/real-estate/flats/riga/centre/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page2.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page3.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page4.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page5.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page6.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page7.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page8.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page9.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page10.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page11.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page12.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page13.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page14.html

In [33]:
getLocalPageList("https://www.ss.com/lv/real-estate/flats/riga/imanta/sell/")

['https://www.ss.com/lv/real-estate/flats/riga/imanta/sell/',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page2.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page3.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page4.html',
 'https://www.ss.com/lv/real-estate/flats/riga/centre/sell//page5.html']

In [41]:
getLocalPageList(maskavas)

['https://www.ss.com/lv/real-estate/flats/riga/maskavas-priekshpilseta/sell/']

In [37]:
maskavas

'https://www.ss.com/lv/real-estate/flats/riga/maskavas-priekshpilseta/sell/'

In [43]:
allpages = getLocalPageList("https://www.ss.com/lv/real-estate/flats/riga/all/")
len(allpages)

166

In [44]:
166*30 # should be a bit more than 4963 (at this particular time)

4980

In [42]:
url = "https://www.ss.com/lv/real-estate/flats/riga/"
baseurl = "https://ss.com"
postfix = "sell/"

In [47]:
# task get column information
req = requests.get("https://www.ss.com/lv/real-estate/flats/riga/all/")
req.status_code

200

In [48]:
soup = BeautifulSoup(req.text, 'lxml')
soup.title

<title>SS.COM Dzīvokļi - Rīga, Cenas - Visi sludinājumi</title>

In [50]:
headline = soup.find("tr", {"id":"head_line"}) # this is a shorter way of finding by tr AND this element having particular id
headline

<tr id="head_line">
<td class="msg_column" colspan="3" width="70%">
<span style="float:left;"> Sludinājumi
</span>
<span align="right" class="msg_column" style="float:right;text-align:right;padding-right:3px;">
<noindex>
<a class="a19" href="/lv/real-estate/flats/riga/all/fDgSeF4S.html" rel="nofollow">datums</a></noindex></span>
</td>
<td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4bEFV8FQ==.html" rel="nofollow" title="">Pagasts</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4SelM=.html" rel="nofollow" title="">Ist.</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4QelM=.html" rel="nofollow" title="">m2</a></noindex></td><td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4XelM=.html" rel="nofollow" title="">Stāvs</a></noindex></td><td c

In [54]:
headtds = headline.find_all("td")
len(headtds)

7

In [55]:
headtds

[<td class="msg_column" colspan="3" width="70%">
 <span style="float:left;"> Sludinājumi
 </span>
 <span align="right" class="msg_column" style="float:right;text-align:right;padding-right:3px;">
 <noindex>
 <a class="a19" href="/lv/real-estate/flats/riga/all/fDgSeF4S.html" rel="nofollow">datums</a></noindex></span>
 </td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4bEFV8FQ==.html" rel="nofollow" title="">Pagasts</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4SelM=.html" rel="nofollow" title="">Ist.</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4QelM=.html" rel="nofollow" title="">m2</a></noindex></td>,
 <td class="msg_column_td" nowrap=""><noindex><a class="a18" href="/lv/real-estate/flats/riga/all/fDgSeF4XelM=.html" rel="nofollow" title="">Stāvs</a></noindex></td>,
 <td 

In [56]:
headcolumns = [el.text for el in headtds[1:]] #.text gets us content even from children and grandchildren
headcolumns

['Pagasts', 'Ist.', 'm2', 'Stāvs', 'Sērija', 'Cena']

In [57]:
def getColList(soup):
    column_list = ["description","url"]
    headline = soup.find("tr", {"id":"head_line"})
    headtds = headline.find_all("td")
    headcolumns = [el.text for el in headtds[1:]]
    column_list += headcolumns
    return column_list

In [58]:
mycolist = getColList(soup)
mycolist

['description', 'url', 'Pagasts', 'Ist.', 'm2', 'Stāvs', 'Sērija', 'Cena']

In [60]:
carcols = getColList(BeautifulSoup(requests.get("https://www.ss.com/lv/transport/cars/volkswagen/").text, 'lxml'))
carcols

['description', 'url', 'Modelis', 'Gads', 'Tilp.', 'Nobrauk.', 'Cena']

In [67]:
def getRow(row,colist):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) <3: # a little sanity check
        print("Hmm bad row")
        return rowDict
    
    rowDict[colist[0]] = row_tds[2].text
    rowDict[colist[1]] = "https://ss.com" + row_tds[1].find('a').get('href')
    for td,key in zip(row_tds[3:],colist[2:]): 
        rowDict[key] = td.text
    return rowDict
    

In [62]:
soup.title

<title>SS.COM Dzīvokļi - Rīga, Cenas - Visi sludinājumi</title>

In [63]:
trows = soup.find_all('tr')
aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]

In [64]:
getRows(aprows[0], mycolist)

{'description': 'Īpašnieki piedāvā iegādāties skaistu 3 istabas dzīvokli, 89 kv. ',
 'url': 'https://ss.com/msg/lv/real-estate/flats/riga/agenskalns/ankni.html',
 'Pagasts': 'ĀgenskalnsNometņu 18',
 'Ist.': '3',
 'm2': '89',
 'Stāvs': '1/4',
 'Sērija': 'Renov.',
 'Cena': '80,000  €'}

In [65]:
carsoup = BeautifulSoup(requests.get("https://www.ss.com/lv/transport/cars/volkswagen/").text, 'lxml')
carsoup.title

<title>SS.COM Vieglie auto - Volkswagen, Cenas - Sludinājumi</title>

In [66]:
trows = carsoup.find_all('tr')
aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
getRows(aprows[0], carcols)

{'description': 'Sakarā ar cita auto iegādi tiek tirgots tehniski labā stāvoklī auto. Protams',
 'url': 'https://ss.com/msg/lv/transport/cars/volkswagen/sharan/cbnpj.html',
 'Modelis': 'Sharan',
 'Gads': '2006',
 'Tilp.': '1.9D',
 'Nobrauk.': '284 tūkst.',
 'Cena': '3,200  €'}

In [68]:
soup.title

<title>SS.COM Dzīvokļi - Rīga, Cenas - Visi sludinājumi</title>

In [72]:
def getRowList(url):
    req = requests.get(url)
    if req.status_code != 200:
        print("Bad request", req.status_code)
        return [] # more likely to work even if one bad request goes through
    soup = BeautifulSoup(req.text, 'lxml')
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    return aprows

In [73]:
def getAdList(myurl):
    soup = BeautifulSoup(requests.get(myurl).text, 'lxml') # no status check
    adlist = []
    colist = getColList(soup)
    urlist = getLocalPageList(myurl)
    for url in urlist[:3]: #FIXME  for url in urlist: 
        print(f"Getting Rows From {url}")
        rowlist = getRowList(url)
        for row in rowlist:
            adlist.append(getRow(row, colist))
        print(f"Finished Processing {url}")
        time.sleep(0.3) # good practice to add a little pause
    return adlist
    

In [70]:
allRiga = "https://www.ss.com/lv/real-estate/flats/riga/all/sell/"

In [74]:
adlist = getAdList(allRiga)

Getting Rows From https://www.ss.com/lv/real-estate/flats/riga/all/sell/
Finished Processing https://www.ss.com/lv/real-estate/flats/riga/all/sell/
Getting Rows From https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page2.html
Finished Processing https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page2.html
Getting Rows From https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page3.html
Finished Processing https://www.ss.com/lv/real-estate/flats/riga/centre/sell/page3.html


In [75]:
len(adlist)

90

In [76]:
adlist[:2]

[{'description': 'Только закончен ремонт. \r\n1-к квартира, 4/4 этаж, находится в Ке',
  'url': 'https://ss.com/msg/lv/real-estate/flats/riga/kengarags/dbeek.html',
  'Pagasts': 'ĶengaragsMaskavas 387',
  'Ist.': '1',
  'm2': '31',
  'Stāvs': '4/4',
  'Sērija': 'Hrušč.',
  'Cena': '25,500  €'},
 {'description': 'Продается двухкомнатная квартира в хорошем развитом районе, треб',
  'url': 'https://ss.com/msg/lv/real-estate/flats/riga/imanta/gkfxn.html',
  'Pagasts': 'ImantaSlokas 195',
  'Ist.': '2',
  'm2': '39',
  'Stāvs': '1/5',
  'Sērija': 'LT proj.',
  'Cena': '36,500  €'}]

In [77]:
df = pd.DataFrame(adlist)
df.head()

Unnamed: 0,Cena,Ist.,Pagasts,Stāvs,Sērija,description,m2,url
0,"25,500 €",1,ĶengaragsMaskavas 387,4/4,Hrušč.,"Только закончен ремонт. \r\n1-к квартира, 4/4 ...",31,https://ss.com/msg/lv/real-estate/flats/riga/k...
1,"36,500 €",2,ImantaSlokas 195,1/5,LT proj.,Продается двухкомнатная квартира в хорошем раз...,39,https://ss.com/msg/lv/real-estate/flats/riga/i...
2,"50,000 €",4,ĀgenskalnsKuldīgas 48,1/2,P. kara,Pārdodu dzīvokli Āgenskalna priežu rajonā Kuld...,54,https://ss.com/msg/lv/real-estate/flats/riga/a...
3,"69,500 €",3,Maskavas priekšpilsētaJēkabpils 2,2/5,Specpr.,Plašs un gaišs trīs istabu dzīvoklis pašā Rīga...,73,https://ss.com/msg/lv/real-estate/flats/riga/m...
4,"69,500 €",3,centrsSadovņikova 29,2/5,Specpr.,Plašs un gaišs trīs istabu dzīvoklis pašā Rīga...,73,https://ss.com/msg/lv/real-estate/flats/riga/c...


In [78]:
df.to_excel("allRiga.xlsx")

In [None]:
def getRegionList(url, baseurl = "https://ss.com", postfix = "sell/"):
    req = requests.get(url)
    if req.status_code != 200:
        print(f"Bad response! {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    allanchors = soup.find_all('a')
    acats = [el for el in allanchors if 'class' in el.attrs\
             and 'a_category' in el.attrs['class']]
    regionlist = [baseurl + el.attrs['href'] + postfix for el in acats]
#     regiontuples = [(el.text, baseurl + el.attrs['href'] + postfix)\
#                     for el in acats]
    # we do not want the last one with all
    return regionlist[:-1]
    

In [None]:
myregions = getRegionList(url)
len(myregions)

In [None]:
def getAllLocalUrls(url):
    """Get a list of all urls including paginated pages"""
    results = [url] # default is just the url if no extra pages found
    req = requests.get(url)
    if req.status_code != 200:
        print(f"Bad response! {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    # we just need a one element
    prevanchor = soup.find('a', {"rel":"prev"})
    if prevanchor == None:
        return results
    href = prevanchor.attrs.get('href')
    lastPageNum = int(href.split('page')[-1].split('.')[0])
    print(lastPageNum)
    nurls = [f"{url}page{n}.html" for n in range(2,lastPageNum+1)]
    results += nurls
    return results

In [None]:
myresults = getAllLocalUrls(myregions[0])
myresults

In [None]:
mysplit = href.split(".")
mysplit

In [None]:
splitbypage = mysplit[0].split("page")
splitbypage

In [None]:
int(splitbypage[1]),splitbypage[1],type(splitbypage[1])

In [None]:
def getAllUrls(url="https://www.ss.com/lv/real-estate/flats/riga/"):
    urlist = []
    reglist = getRegionList(url)
    for region in reglist:
        print(f"Working on {region}")
        regurls = getAllLocalUrls(region)
        urlist += regurls
        time.sleep(0.3)
    return urlist

In [None]:
urlist = getAllUrls()
len(urlist)

In [None]:
def getAllRows(url):
    req = requests.get(url)
    if req.status_code != 200:
        print(f"Bad response! {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    allrows = soup.find_all('tr')
    adrows = [el for el in allrows\
              if 'id' in el.attrs and 'tr_' in el.attrs['id']]
    return adrows[:-1] # since the last one is not an ad

In [None]:
other = getAllRows("https://ss.com/lv/real-estate/flats/riga/other/sell/")
len(other)

In [None]:
def getRow(row):
    # get all table data cells
    tds = row.find_all('td')
    adlist = []
    adlist.append(tds[1].find('a').attrs['href'])
    for td in tds[2:]:
        adlist.append(td.text)
    return tuple(adlist)

In [None]:
getRow(other[0])

In [None]:
def processAllUrls(urlist):
    results = []
    for url in urlist:
        print(f"Processing {url}")
        rows = getAllRows(url)
        for row in rows:
            results.append(getRow(row))
        time.sleep(0.3)
    return results

In [None]:
myrooms = processAllUrls(urlist)
len(myrooms)

In [None]:
myrooms[:5]

In [None]:
len(myrooms)/9

In [None]:
# originally we messed up and used += instead of mylist.append(newitem)
fixmyrooms = [tuple(myrooms[n*9:n*9+9]) for n in range(len(myrooms)//9)]
len(fixmyrooms)

In [None]:
fixmyrooms[:5]

In [None]:
base  = "https://www.ss.com"
fixmyrooms = [tuple([base+el[0]] + list(el[1:])) for el in myrooms]
fixmyrooms[:3]

In [None]:
df = pd.DataFrame(fixmyrooms)
df.head()

In [None]:
import datetime

In [None]:
now = datetime.datetime.now()
now

In [None]:
now.month

In [None]:
now.hour

In [None]:
now.minute

In [None]:
now.second

In [None]:
now = datetime.datetime.now()
df.to_csv(f'apartments\
_{now.month}_{now.hour}_{now.minute}_{now.second}.csv')

In [None]:
df.to_excel('apartments.xlsx')

In [None]:
def getColumnNames(url, colNames = ['URL', 'Description']):
    result = []
    result += colNames
    req = requests.get(url)
    if req.status_code != 200:
        print(f"Bad response! {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    columnrow = soup.find('tr', {'id':'head_line'})
    tds = columnrow.find_all('td')
    for td in tds[1:]:
        result.append(td.text)
    return result
    

In [None]:
columnNames = getColumnNames("https://www.ss.com/lv/real-estate/flats/riga/other/sell/")

In [None]:
columnNames

In [None]:
columnNames

In [None]:
df.columns = columnNames
df.head()

In [None]:
%%writefile util.py
def getFileName(prefix="apartments", postfix="xlsx", hasYear=False):
    now = datetime.datetime.now()
    if hasYear:
        return f"{prefix}_{now.year}_{now.month}_{now.hour}\
_{now.minute}_{now.second}.{postfix}"
    else:
        return f"{prefix}_{now.month}_{now.hour}\
_{now.minute}_{now.second}.{postfix}"

In [None]:
import util

In [None]:
util.getFileName()

In [None]:
getFileName("rooms", postfix="txt",hasYear=True)

In [None]:
getFileName("cars", postfix="xlsx")

In [None]:
now = datetime.datetime.now()
df.to_csv(getFileName())

In [None]:
df.to_excel('apartments.xlsx')

In [None]:
df.to_excel(getFileName())

In [None]:
from IPython.display import HTML

def create_download_link( df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

In [None]:
import base64

In [None]:
create_download_link(df)