In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import warnings
import re

warnings.filterwarnings("ignore")

def findData(child):
    """Extract one listing's fields from a search-result card.

    Parameters
    ----------
    child : object exposing ``find(name, class_=...)``
        In this notebook, an element selected from the results page.

    Returns
    -------
    dict
        One entry per ``name: value`` item found under ``ul.tags``, plus the
        keys ``'link'`` (the ``href`` of ``a.result``), ``'price'`` (the text
        of ``span.price`` before the first ``€``) and ``'title'`` (stripped
        text of ``span.title``).

    Raises
    ------
    AttributeError, TypeError
        If the title, price or link element is missing from the card.
    """
    title = child.find("span", class_="title")
    price = child.find("span", class_="price")
    link = child.find("a", class_="result")
    tags = child.find("ul", class_="tags")

    tags_dict = {}
    # A card without a tag list simply contributes no tag entries.
    for tag in (tags if tags is not None else ()):
        # partition() splits on the FIRST colon only, so values that contain
        # a colon themselves are kept whole.
        name, separator, value = tag.text.partition(":")
        if not separator:
            # Whitespace-only nodes and items that are not "name: value".
            continue
        tags_dict[name.strip()] = value.strip()

    tags_dict['link'] = link['href']
    tags_dict['price'] = price.text.split("€")[0]
    tags_dict['title'] = title.text.strip()
    return tags_dict


# Search-results URL for page 1. Built from adjacent string literals so that
# no newline or indentation leaks into the query string.
url = (
    "https://www.index.hr/oglasi/osobni-automobili/gid/27"
    "?pojamZup=-2&tipoglasa=1&sortby=1&elementsNum=100&grad=0&naselje=0"
    "&cijenaod=0&cijenado=10000000&num=1"
)
rows = []

for i in range(1, 50):
    # The page number is the trailing `num=` query parameter.
    page = re.sub('num=1$', f'num={i}', url)
    try:
        # NOTE(review): verify=False (kept from the original) disables TLS
        # certificate verification; confirm it is still required.
        response = requests.get(page, verify=False, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        # Report and move on instead of silently swallowing the failure.
        print(f"page {i}: request failed, skipping ({error})")
        continue

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")
    children = soup.select("div.OglasiRezHolder:not(.oglasiHolderBanners)")

    for child in children:
        # Guard each card separately: one malformed listing must not discard
        # the remaining listings on the same page.
        try:
            rows.append(findData(child))
        except Exception as error:
            print(f"page {i}: skipping a listing that could not be parsed ({error!r})")

# Build the frame and write the CSV once, after all pages were collected.
df = pd.DataFrame(rows).rename(columns={'Godište': 'year', 'Starost': 'condition'})
df.to_csv('auti.csv')

df.head()

Unnamed: 0,year,km,condition,link,price,title,kW,Gorivo,Godina modela
0,2008,270.0,Rabljeno,https://www.index.hr/oglasi/fiat-punto-14i/oid...,2.35,Fiat Punto 14i,,,
1,2016,157.259,Rabljeno,https://www.index.hr/oglasi/volvo-v40-2-0-d2-n...,12.2,Volvo V40 2.0 D2*Navi*Pdc*Od 1. vlasnika*Izvrs...,88.0,,
2,2017,100.0,Rabljeno,https://www.index.hr/oglasi/mercedes-benz-glc-...,34.5,"Mercedes-Benz GLC 220d AMG ***BURMESTER, 360ka...",125.0,,
3,2006,,Rabljeno,https://www.index.hr/oglasi/bmw-serija-5-e60-5...,9.0,BMW serija 5 E60 530d,,,
4,2011,230.0,Rabljeno,https://www.index.hr/oglasi/peugeot-207-1-4-hd...,3.6,Peugeot 207 1.4 hdi,50.0,,


In [4]:
def parse_one_car(url):
    """Download one listing page and collect its feature table.

    Parameters
    ----------
    url : str
        Address of the listing's detail page.

    Returns
    -------
    dict
        Maps the stripped text of the first ``<li>`` of every ``<ul>`` found
        inside an element with class ``features-wrapper`` to the stripped
        text of the second ``<li>``. Empty when no such element exists.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (including the 30 s timeout).
    """
    # A timeout keeps a single stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    # Explicit parser: the result must not depend on optional parser libraries.
    soup = BeautifulSoup(response.text, "html.parser")
    car_dict = {}

    for element in soup.find_all(class_="features-wrapper"):
        for child in element.find_all('ul'):
            li_elements = child.find_all('li')
            # A label/value pair needs two items; skip lists that do not have
            # both instead of failing with IndexError.
            if len(li_elements) < 2:
                continue
            label = li_elements[0].text.strip()
            value = li_elements[1].text.strip()
            car_dict[label] = value
    return car_dict

# Fetch the detail page of every listing in `df` and join the extra fields
# column-wise onto the search-result data.
rows = []
for car in df["link"]:
    try:
        row = parse_one_car(car)
    except requests.RequestException as error:
        # Keep an empty record so the rows still line up one-to-one with `df`,
        # and do not lose everything fetched so far because of one bad request.
        print(f"skipping details for {car}: {error}")
        row = {}
    rows.append(row)

extra_df = pd.DataFrame(rows)
final_df = pd.concat([df, extra_df], axis=1)
final_df.to_csv('auti_test.csv')

In [5]:
final_df.head()

Unnamed: 0,year,km,condition,link,price,title,kW,Gorivo,Godina modela,Tip:,...,Snaga motora kW,Marka:,Model:,Prosječna potrošnja goriva l/100km,Radni obujam cm3,Garancija za vozilo,Godina prve registracije,Gorivo.1,Očuvanost vozila,Naplata
0,2008,270.0,Rabljeno,https://www.index.hr/oglasi/fiat-punto-14i/oid...,2.35,Fiat Punto 14i,,,,14i,...,,,,,,,,,,
1,2016,157.259,Rabljeno,https://www.index.hr/oglasi/volvo-v40-2-0-d2-n...,12.2,Volvo V40 2.0 D2*Navi*Pdc*Od 1. vlasnika*Izvrs...,88.0,,,2.0 D2,...,88.0,,,,,,,,,
2,2017,100.0,Rabljeno,https://www.index.hr/oglasi/mercedes-benz-glc-...,34.5,"Mercedes-Benz GLC 220d AMG ***BURMESTER, 360ka...",125.0,,,"220d AMG ***BURMESTER, 360kamere, KUKA, 20"", L...",...,125.0,Mercedes-Benz,GLC,7.0,2.143,,,,,
3,2006,,Rabljeno,https://www.index.hr/oglasi/bmw-serija-5-e60-5...,9.0,BMW serija 5 E60 530d,,,,E60 530d,...,,,,10.0,2.998,,,,,
4,2011,230.0,Rabljeno,https://www.index.hr/oglasi/peugeot-207-1-4-hd...,3.6,Peugeot 207 1.4 hdi,50.0,,,1.4 hdi,...,50.0,,,5.0,1.4,,,,,


In [21]:
import pandas as pd

# Reload the scraped data and discard the columns that are not used further.
# errors="ignore": some of these columns only exist when at least one scraped
# listing carried them, so a fresh scrape may legitimately lack a few.
df = pd.read_csv("auti_test.csv").drop(
    columns=[
        "Naplata", "Gorivo", "Unnamed: 0",
        "Garancija za vozilo", "Godina prve registracije",
        "Načini plaćanja", "Razlog prodaje", "Ostali podaci o vozilu",
        "Godina modela", "Gorivo.1", "Očuvanost vozila",
    ],
    errors="ignore",
)


In [33]:
# Number of missing values in each column.
df.isna().sum(axis=0)

year                                     0
km                                     371
condition                              172
link                                     0
price                                    0
title                                    0
kW                                     373
Tip:                                   973
Motor                                  534
Stanje vozila                          689
Prijeđeni kilometri                    379
Godina proizvodnje                       8
Godina modela.1                        888
Prodavač                                 8
Registriran do                        1326
Boja vozila                            433
Broj stupnjeva na mjenjaču             687
Broj vrata                              64
Oblik karoserije                       168
Ovjes                                  557
Starost                                180
Vlasnik                                889
Vrsta pogona                           375
Vrsta mjenj

In [34]:
# Reassign instead of dropping in place, and tolerate an already-removed
# column, so that re-running this cell does not raise KeyError.
df = df.drop(columns=["Prosječna potrošnja goriva l/100km"], errors="ignore")