In [8]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

In [1]:
def processRow(row, baseurl='https://www.ss.com'):
    """Flatten one table row into a list of strings.

    Args:
        row: Object exposing ``find_all('td')`` (a BeautifulSoup ``<tr>`` tag in
            this notebook). The cell at index 1 must wrap an ``<a>`` carrying
            an ``href``; the cell at index 2 supplies the text column.
        baseurl: Prefix prepended to the link found in the row.

    Returns:
        list: ``[url, text, <raw text of every cell from index 3 up to the
        last>, price, currency]``. ``price`` is the first whitespace-separated
        token of the last cell with commas removed, ``currency`` is the second
        token. Either one is ``'N/A'`` when the last cell does not contain it.

    Raises:
        IndexError: If the row has fewer than three cells.
    """
    tds = row.find_all('td')
    ritems = [
        baseurl + tds[1].a['href'],
        tds[2].text.strip().replace('\r', '').replace('\n', ''),
    ]
    ritems.extend(td.text for td in tds[3:-1])
    # Split the last cell once; an empty/whitespace-only cell yields no tokens,
    # which previously crashed on ``split()[0]``.
    price_parts = tds[-1].text.split()
    ritems.append(price_parts[0].replace(',', '') if price_parts else 'N/A')
    ritems.append(price_parts[1] if len(price_parts) > 1 else 'N/A')
    return ritems

In [2]:
def processRows(rows):
    """Run ``processRow`` on each element of ``rows`` and return the results, in order, as a list."""
    return [processRow(row) for row in rows]

In [3]:
def getRows(url):
    """Download a page and return the ``<tr>`` elements that look like listing rows.

    Args:
        url: Address of the page to fetch with ``requests.get``.

    Returns:
        list: Every ``<tr>`` whose ``id`` attribute contains ``'tr_'``, in
        document order, with the last match removed. An empty list when the
        response status is not 200.
    """
    req = requests.get(url)
    if req.status_code != 200:
        # status_code is an int: format it rather than concatenating it to a
        # str, which raised TypeError instead of reporting the failure.
        print(f"Bad Request {req.status_code}")
        return []
    soup = BeautifulSoup(req.text, 'lxml')
    print("Processing: "+ str(soup.title))
    # this will be specific to ss.lv and ss.com
    rows = [el for el in soup.find_all('tr')
            if 'id' in el.attrs and 'tr_' in el.attrs['id']]
    if rows:
        rows.pop()  # the last matching row is not needed, so it is discarded
    return rows

In [4]:
def processPage(url):
    """Fetch the rows of the page at ``url`` via ``getRows`` and return them converted by ``processRows``."""
    return processRows(getRows(url))

In [5]:
def processPages(urls, delay=0.2):
    """Process several pages one after another and concatenate their rows.

    Args:
        urls: Iterable of page addresses, each handed to ``processPage``.
        delay: Seconds to pause after each page (default 0.2, the value that
            used to be hard-coded). Pass 0 to disable the pause.

    Returns:
        list: The rows of all pages, in the order the pages were visited.
    """
    results = []
    for url in urls:
        print("Processing: "+url)
        results.extend(processPage(url))
        time.sleep(delay)  # pause between consecutive requests
    return results

In [6]:
def getUrlList(url, prefix='https://www.ss.com', postfix='sell/',
               tag='a', class_='a_category'):
    """Collect the links found on a page and turn them into full URLs.

    Args:
        url: Address of the page to download.
        prefix: Text placed in front of every ``href`` value.
        postfix: Text appended after every ``href`` value.
        tag: Name of the elements to search for.
        class_: CSS class the elements must carry.

    Returns:
        list: ``prefix + href + postfix`` for every matching element that has a
        non-empty ``href``, in document order. An empty list when the response
        status is not 200.
    """
    req = requests.get(url)
    if req.status_code != 200:
        print(f'Unexpected status code {req.status_code}. Stopping parse')
        return [] #return early and often principle
    soup = BeautifulSoup(req.text, 'lxml') # could skip soup variable as well but keeping for readability
    # A matching element without an href used to raise KeyError, and an empty
    # href produced a useless prefix+postfix URL; both are skipped now.
    return [prefix + el['href'] + postfix
            for el in soup.find_all(tag, class_)
            if el.get('href')]
    # What else could we pass as argument? How could our return fail?

In [7]:
def getExcel(url, fname = "sellers_"):
    """Scrape every page linked from ``url`` and write the rows to an Excel file.

    The links come from ``getUrlList(url)`` and the rows from ``processPages``.
    The output file is named ``<fname><YYYYmmdd-HHMMSS>.xlsx`` and is written
    to the current working directory. Nothing is returned.
    """
    records = processPages(getUrlList(url))
    frame = pd.DataFrame(records)
    # The timestamp is taken only after all pages are processed.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    frame.to_excel(f'{fname}{timestr}.xlsx')

In [9]:
# Process every category link found on the real-estate/wood page and save the
# collected rows to a timestamped .xlsx file (performs live HTTP requests).
getExcel('https://www.ss.com/lv/real-estate/wood')

Processing: https://www.ss.com/lv/real-estate/wood/riga-region/sell/
Processing: <title>SS.COM Mežs - Rīgas rajons, Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/aizkraukle-and-reg/sell/
Processing: <title>SS.COM Mežs - Aizkraukle un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/aluksne-and-reg/sell/
Processing: <title>SS.COM Mežs - Alūksne un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/balvi-and-reg/sell/
Processing: <title>SS.COM Mežs - Balvi un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/bauska-and-reg/sell/
Processing: <title>SS.COM Mežs - Bauska un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/cesis-and-reg/sell/
Processing: <title>SS.COM Mežs - Cēsis un raj., Cenas, Pārdod - Sludinājumi</title>
Processing: https://www.ss.com/lv/real-estate/wood/daugav