In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time 

In [3]:
def getUrlList(url, prefix='https://www.ss.com', postfix='sell/', tag='a', class_='a_category'):
    """Collect the listing URLs linked from a category index page.

    Downloads `url`, looks up every `tag` element matching `class_` and
    builds one URL per element by plain concatenation:
    `prefix + href + postfix`.

    Args:
        url: Address of the index page to download.
        prefix: Text placed in front of each href.
        postfix: Text appended after each href.
        tag: Name of the HTML element to search for.
        class_: Passed as the second positional argument of `find_all`;
            Beautiful Soup treats a string there as a CSS class filter.

    Returns:
        A list of URL strings in document order. The list is empty when the
        response status is not 200. Elements with a missing or empty href
        are skipped.
    """
    req = requests.get(url)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return []  # return early so callers always receive a list
    soup = BeautifulSoup(req.text, 'lxml')
    urls = []
    for el in soup.find_all(tag, class_):
        href = el.get('href')
        if not href:
            # el['href'] would raise KeyError for an element without href,
            # aborting the whole crawl because of one malformed link
            continue
        urls.append(prefix + href + postfix)
    return urls

In [4]:
def processRow(row, baseurl='https://www.ss.com'):
    """Turn one listing table row into a flat list of values.

    Args:
        row: Element exposing `find_all('td')`; the returned cells are read
            through their `.text` attribute and, for the second cell,
            through `.a['href']`.
        baseurl: Text prepended to the href found in the second cell.

    Returns:
        A list laid out as
        `[link, description, <text of cells 3..n-2>, amount, currency]`,
        where `amount` is the first whitespace-separated token of the last
        cell converted to int (commas removed) and `currency` is the second
        token of that cell.
    """
    cells = row.find_all('td')
    link = baseurl + cells[1].a['href']
    # drop surrounding whitespace and any line breaks inside the description
    description = cells[2].text.strip().replace('\r', '').replace('\n', '')
    middle = [cell.text for cell in cells[3:-1]]
    price_tokens = cells[-1].text.split()
    amount = int(price_tokens[0].replace(',', ''))
    currency = price_tokens[1]
    return [link, description, *middle, amount, currency]

In [5]:
def processRows(rows):
    """Apply processRow to every element of `rows`.

    Returns:
        A list with one processRow result per input element, in input order.
    """
    return [processRow(row) for row in rows]

In [11]:
def getRows(url):
    """Download a listing page and return its advertisement rows.

    Args:
        url: Address of the page to download.

    Returns:
        A list of the <tr> elements whose `id` attribute contains 'tr_',
        with the last such element removed. The list is empty when the
        response status is not 200, so callers can always iterate the result.
    """
    req = requests.get(url)
    if req.status_code != 200:
        # status_code is an int: format it instead of concatenating to a str,
        # which would raise TypeError on the very path meant to report an error
        print(f"Bad Request {req.status_code}")
        # an empty list (not None) keeps processRows/processPages working
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    print("Processing: "+ str(soup.title))
    # this will be specific to ss.lv and ss.com
    rows = [
        el for el in soup.find_all('tr')
        if 'id' in el.attrs and 'tr_' in el.attrs['id']
    ]
    # we do not need the last matching row
    return rows[:-1]

In [7]:
def processPage(url):
    """Fetch the rows of one listing page and convert them to value lists.

    Returns:
        Whatever processRows produces for the rows that getRows(url) yields.
    """
    return processRows(getRows(url))

In [8]:
def processPages(urls):
    """Process several listing pages and merge their rows into one list.

    Each URL is announced with a printed line, handed to processPage, and
    followed by a 0.1 second pause before the next one.

    Returns:
        One flat list holding the processPage results of all URLs, in order.
    """
    collected = []
    for page_url in urls:
        print("Processing: "+page_url)
        collected.extend(processPage(page_url))
        time.sleep(0.1)  # brief pause between consecutive requests
    return collected

In [9]:
# Index page of the real-estate "wood" category; the crawl starts from here
url = "https://www.ss.com/lv/real-estate/wood/"

In [12]:
# Collect the listing URLs linked from the index page, scrape each of them,
# then show how many rows were gathered in total
mylist = processPages(getUrlList(url))
len(mylist)

Processing: https://www.ss.com/lv/real-estate/wood/riga-region/sell/
Processing: <title>SS.COM Mežs - Rīgas rajons, Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/aizkraukle-and-reg/sell/
Processing: <title>SS.COM Mežs - Aizkraukle un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/aluksne-and-reg/sell/
Processing: <title>SS.COM Mežs - Alūksne un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/balvi-and-reg/sell/
Processing: <title>SS.COM Mežs - Balvi un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/bauska-and-reg/sell/
Processing: <title>SS.COM Mežs - Bauska un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/cesis-and-reg/sell/
Processing: <title>SS.COM Mežs - Cēsis un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/daugav

136

In [13]:
# Spot-check a single parsed record
mylist[10]

['https://www.ss.com/msg/lv/real-estate/wood/aizkraukle-and-reg/daudzeses-pag/jelin.html',
 'Pārdod nekustamo īpašumu - meža platības "Pērkoni 1" 6Ha Daudzeses pag. , Jaungelgav',
 'Daudzeses pag.',
 '6 ha.',
 7800,
 '€']

In [14]:
# Build a DataFrame from the list of row lists; no column names are passed,
# so the columns are labelled with integers
df = pd.DataFrame(mylist)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,https://www.ss.com/msg/lv/real-estate/wood/rig...,Pārdod pieauguša meža audzi. Daļēji izstrādāts...,Ropažu nov.Ropaži,10 ha.,12500,€
1,https://www.ss.com/msg/lv/real-estate/wood/rig...,"Pārdod cirsmu 80m3 Priedes, īpašums ceļa malā,...",Stopiņu nov.Upeslejas,80 m²,3500,€
2,https://www.ss.com/msg/lv/real-estate/wood/rig...,"Īpašnieks pārdod cirsmu. Var izstrādāt priedi,...",Carnikavas nov.Garciems,1 ha.,12000,€
3,https://www.ss.com/msg/lv/real-estate/wood/rig...,AS Privatbank meitas uzņēmuma īpašums. Zemes g...,Garkalnes nov.Makstenieki,3.45 ha.,95000,€
4,https://www.ss.com/msg/lv/real-estate/wood/rig...,Saimnieks pārdod neizcirstu mežu ar zemi glezn...,Krimuldas pag.Turaida,9.20 ha.,39000,€


In [15]:
# Write the table to an Excel workbook; the relative path resolves against
# the current working directory
df.to_excel('forests.xlsx')