In [1]:
import requests
import time 
import pandas as pd
 
from bs4 import BeautifulSoup

In [2]:

url = "https://www.ss.com/en/agriculture/agricultural-machinery/tractors/"
url

'https://www.ss.com/en/agriculture/agricultural-machinery/tractors/'

In [3]:
req = requests.get(url) 
req.status_code

200

In [4]:
soup = BeautifulSoup(req.text, 'lxml') 

soup.title

<title>SS.COM Agricultural machinery - Tractors, Prices - Advertisements</title>

In [5]:
def getColList(soup):
    column_list = ["description","url"] 
    headline = soup.find("tr", {"id":"head_line"})
    headtds = headline.find_all("td")
    headcolumns = [el.text for el in headtds[1:]] 
    column_list += headcolumns
    return column_list

In [6]:
column_names = getColList(soup)
column_names

['description', 'url', 'Brand', 'Year', 'Price']

In [7]:
def getRowList(soup):
    trows = soup.find_all('tr')
    aprows = [row for row in trows if row.get('id',"").startswith("tr_") and not row.get('id',"").startswith("tr_bnr") ]
    return aprows

In [8]:
def getRow(row, colist=column_names):
    row_tds = row.find_all('td')
    rowDict = {}
    if len(row_tds) < 3: # a little sanity check
        print("Hmm bad row")
        return rowDict
    
    rowDict[colist[0]] = row_tds[2].text # so the big assumption is that we always get description in 3rd column
    rowDict[colist[1]] = "https://ss.com" + row_tds[1].find('a').get('href')
    for td,key in zip(row_tds[3:],colist[2:]): 
        rowDict[key] = td.text
    return rowDict

In [9]:
def getRows(rowlist,colist=column_names):
    return [getRow(row, colist=colist) for row in rowlist]

In [10]:
def getDFfromURL(url):
    
    req = requests.get(url)
    if req.status_code != 200:
        print("Request Fail with", req.status_code)
        return None 
    soup = BeautifulSoup(req.text, 'lxml')
    column_names = getColList(soup)
    rowlist = getRowList(soup)
    rows = getRows(rowlist,colist=column_names)
    return pd.DataFrame(rows, columns=column_names)

In [11]:
tractors = "https://www.ss.com/en/agriculture/agricultural-machinery/tractors/"
idf = getDFfromURL(tractors)
idf.head()

Unnamed: 0,description,url,Brand,Year,Price
0,Transporta pakalpojumi līdz 8 t. Hidrauliski n...,https://ss.com/msg/en/agriculture/agricultural...,-,-,30 €
1,Nopirkšu mazlietotu traktoru T16 MG originālā ...,https://ss.com/msg/en/agriculture/agricultural...,T16MgT16Mg,-,buy
2,Pārdod Volvo BM frontālo iekrāvēju ar kausu 2m...,https://ss.com/msg/en/agriculture/agricultural...,VolvoBm frontalais,-,"6,500 €"
3,Fastraks 14t pašmassa. Atrums pa ceļõu 80 km ....,https://ss.com/msg/en/agriculture/agricultural...,Jsb7270,2012,"60,000 €"
4,Netto cena 25000 Eur. \r\nPārdodu ļoti labu tr...,https://ss.com/msg/en/agriculture/agricultural...,CaseMx135,2000,"30,250 €"


In [14]:
# encode data to utf-8
idf.to_csv("tractors.csv", encoding="utf-8")
# idf.to_csv("traktoru_pardosana.csv")  # ... and here I get error in opening the created .csv file and as a result error for the further step

In [15]:
traktori = pd.read_csv('tractors.csv')
traktori

Unnamed: 0.1,Unnamed: 0,description,url,Brand,Year,Price
0,0,Transporta pakalpojumi līdz 8 t. Hidrauliski n...,https://ss.com/msg/en/agriculture/agricultural...,-,-,30 €
1,1,Nopirkšu mazlietotu traktoru T16 MG originālā ...,https://ss.com/msg/en/agriculture/agricultural...,T16MgT16Mg,-,buy
2,2,Pārdod Volvo BM frontālo iekrāvēju ar kausu 2m...,https://ss.com/msg/en/agriculture/agricultural...,VolvoBm frontalais,-,"6,500 €"
3,3,Fastraks 14t pašmassa. Atrums pa ceļõu 80 km ....,https://ss.com/msg/en/agriculture/agricultural...,Jsb7270,2012,"60,000 €"
4,4,Netto cena 25000 Eur. \r\nPārdodu ļoti labu tr...,https://ss.com/msg/en/agriculture/agricultural...,CaseMx135,2000,"30,250 €"
5,5,"Pārdod uzņēmums, eur 6900 + Pvn. 12 cilindru m...",https://ss.com/msg/en/agriculture/agricultural...,KirovecK-701,1991,"8,349 €"
6,6,Netto cena 35000Eur. \r\nPārdodu ļoti labu tra...,https://ss.com/msg/en/agriculture/agricultural...,ValtraT151,2009,"42,350 €"
7,7,Pārdodu normālu traktoru darba kārtībā. Kādam ...,https://ss.com/msg/en/agriculture/agricultural...,BelarusMtz,1995,"2,500 €"
8,8,Pārdodu labu Belīti darba kārtībā . Pielec uz ...,https://ss.com/msg/en/agriculture/agricultural...,BelarusMtz,1992,"2,250 €"
9,9,"Kravu pārvadājumi pa Latviju, Igauniju, Lietuv...",https://ss.com/msg/en/agriculture/agricultural...,KravuPārvadājumi,-,1 €


In [16]:
skis = "https://www.ss.com/lv/entertainment/sports/ski/skis/sell/filter/riga_f/fDgSeF4QEFF8FQ==.html"
idf = getDFfromURL(skis)
idf.head()

Unnamed: 0,description,url,Ražotājs,Modelis,Gads,Garums,Stāv.,Cena
0,"Pilnīgi jaunas, iegādāties var Rīgas centrā. A...",https://ss.com/msg/lv/entertainment/sports/ski...,Salomon,Xdr,2021,160,jaun.,220 €
1,Pārdodu ļoti labā stāvoklī freestyle slēpes K2...,https://ss.com/msg/lv/entertainment/sports/ski...,K2,Domain,2012,174,lietota,300 €
2,"В комплекте ботинки Salomon размер 44-45, палк...",https://ss.com/msg/lv/entertainment/sports/ski...,Head,Extreme,2010,150,lietota,50 €
3,"Pārdod Fischer The Curv (2017 g. , 178cm/sidec...",https://ss.com/msg/lv/entertainment/sports/ski...,Fischer,The Curv,2017,178,lietota,250 €
4,Pussezonu brauktas Fis slaloma slēpes. \r\nGar...,https://ss.com/msg/lv/entertainment/sports/ski...,Atomic,Redster S9,2022,165,lietota,700 €


In [18]:
idf.shape

(30, 8)