# Imports

In [1]:
import io
import pickle
import time

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm 

from bs4 import BeautifulSoup  # for processing html structure of website

import selenium  # Python Selenium
from selenium import webdriver  # for specifying webdriver
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.support import expected_conditions as EC  # for checking visibility of an element
from selenium.webdriver.support.ui import WebDriverWait  # this three enable waiting until sth is displayed on website
from selenium.webdriver.common.by import By  # for checking element visibility by XPath
from webdriver_manager.chrome import ChromeDriverManager  # chromedriver for automatized access to Chrome
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Request URL

In [4]:
# Parameters Assumed:

# Opłata na Start - 10%
# Okres Umowy (miesiące) - 36
# Roczny limit km - 20 tys.

stan = "uzywane"
period = "36"
oplata_na_start = "10"
mileage_limit = "20000"

In [5]:
website_url = f"https://automarket.pl/samochody/{stan}/leasing?is_company=0&period={period}&ow_proc={oplata_na_start}&mileage_limit={mileage_limit}&"

# Drivers

In [None]:
if not hasattr(ChromeDriverManager, 'installed'):
    chromepath = ChromeDriverManager().install()
    print(chromepath)

In [6]:
service_chrome = Service(executable_path = chromepath) 
options_chrome = webdriver.ChromeOptions()
driver_chrome = webdriver.Chrome(service = service_chrome, options = options_chrome) # opens Chrome

In [7]:
driver_chrome.maximize_window() # maximizes browser's window
driver_chrome.get(website_url) # opens a website

# Accept Politykę Prywatności

In [8]:
xpath_polityka_prywtnosci_button = '''//*[@id="onetrust-accept-btn-handler"]'''

WebDriverWait(driver_chrome, 120).until(EC.visibility_of_element_located((By.XPATH, xpath_polityka_prywtnosci_button))) 

polityka_prywtnosci_button = driver_chrome.find_element("xpath", xpath_polityka_prywtnosci_button) # finds the button
polityka_prywtnosci_button.click() # clicks the button

# Collecting Links

## Select '90' for 'Ofert na Stronie'

In [9]:
xpath_oferty_na_stronie_90_button = '''//*[@id="__nuxt"]/div/main/div/div/div/div/div[2]/div[2]/div[4]/div[1]/div/div[3]'''

WebDriverWait(driver_chrome, 30).until(EC.visibility_of_element_located((By.XPATH, xpath_oferty_na_stronie_90_button))) 

oferty_na_stronie_90_button = driver_chrome.find_element("xpath", xpath_oferty_na_stronie_90_button) # finds the button
oferty_na_stronie_90_button.click() # clicks the button

In [54]:
# add some time to load the page 
time.sleep(30)

## Collect Links - By Going Through Each Page

### Finding the number of maximum Pages Button

In [10]:
xpath_page_number_button = '//*[@id="__nuxt"]/div/main/div/div/div/div/div[2]/div[2]/div[4]/div[2]/nav'

WebDriverWait(driver_chrome, 30).until(EC.visibility_of_element_located((By.XPATH, xpath_page_number_button)))
pagination_nav = driver_chrome.find_element(By.XPATH, xpath_page_number_button)

page_buttons = pagination_nav.find_elements(By.XPATH, ".//a[@aria-current='page']")

In [None]:
# We use try/except because some buttons are not numbers - ValueError: invalid literal for int() with base 10: ''
for button in page_buttons:
    try:
        page_numbers = int(button.text)
        if page_numbers > 1:
            max_page = page_numbers
    except ValueError:
        continue

print(page_numbers)

### Saving links by itterating through all pages

In [None]:
page_links = {}

for i in range(1, max_page + 1):
    # Extract links from the current page
    xpath_link = '''//*[@id="__nuxt"]/div/main/div/div/div/div/div[2]/div[2]'''
    WebDriverWait(driver_chrome, 30).until(EC.visibility_of_element_located((By.XPATH, xpath_link)))

    # Div containing the car listings
    car_listing_div = driver_chrome.find_element(By.XPATH, xpath_link)
    
    # All <a> tags inside the div container with href containing "/oferta/"
    car_links = car_listing_div.find_elements(By.XPATH, './/a[contains(@href, "/oferta/")]')

    page_i_links = [link.get_attribute('href') for link in car_links]

    page_links[i] = page_i_links

    print(f"Page Number '{i}' - DONE")
    
    if i < max_page:  # Only click the next button if we are not at the last page
        next_button_xpath = "//a[contains(@name, 'Następna strona')]"
        
        WebDriverWait(driver_chrome, 30).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
        next_button = driver_chrome.find_element(By.XPATH, next_button_xpath)
        next_button.click()

        # Wait for page to load
        time.sleep(10)

In [17]:
df_page_links = pd.DataFrame([(page_number, link) for page_number, links in page_links.items() for link in links], columns=['Page Number', 'Links'])

In [36]:
df_page_links.to_pickle('Outputs/df_page_links.pkl')

# Access Links and Collect Data

In [33]:
data = []

for i, link in enumerate(df_page_links['Links']):
    
    webpage_html = requests.get(link)  # request for a webpage structure
    soup = BeautifulSoup(webpage_html.text, 'html.parser')  # one need to format it into a BeautifulSoup object before proceeding
    
    # Numer Oferty, Price, Lokalizacja
    numer_oferty = soup.find('p', class_='text-base text-pko-secondary mt-5 text-center mb-8 md:ml-5 md:my-0 leading-4').text.split(': ')[1]
    
    try:
        price = soup.find('p', class_='text-2xl lg:text-[32px] leading-8 font-semibold text-pko-primary') or soup.find('p', class_='text-2xl lg:text-[32px] leading-8 font-semibold !text-pko-red-500 text-pko-primary')
        if price:
            price = price.text.replace('\xa0', ' ')
        else:
            price = None  # Handle the case where neither class is found
    except Exception as e:
        price = None  # Handle any exceptions that occur
    
    lokalizacja_samochodu = soup.find('p', class_='text-sm leading-sm py-3 md:py-0 text-pko-grey-1000 leading-[14px]').find('span').text.strip()

    # Car Model
    whole_model = soup.find('h1')
    car_model = whole_model.find_all('span')[0].text.strip()
    car_model_type = whole_model.find_all('span')[1].text.strip()

    # Informajce Podstawowe
    informacje_podstawowe = soup.find('h2').find_next().find_all('span')
    rok_produkcji = informacje_podstawowe[1].text
    przebieg = informacje_podstawowe[3].text
    naped = informacje_podstawowe[5].text
    skrzynia_biegow = informacje_podstawowe[7].text

    # More Informajce Podstawowe
    attributes = ["Pojemność silnika", "Moc silnika", "Kolor", "Typ nadwozia", "Liczba miejsc", "Kraj pochodzenia"]
    pojazd_info = {attr: soup.find(string=lambda text: text and attr in text).find_next().text for attr in attributes}

    pojemnosc_silnika = pojazd_info["Pojemność silnika"]
    moc_silnika = pojazd_info["Moc silnika"]
    kolor = pojazd_info["Kolor"]
    typ_nadwozia = pojazd_info["Typ nadwozia"]
    liczba_miejsc = pojazd_info["Liczba miejsc"]
    kraj_pochodzenia = pojazd_info["Kraj pochodzenia"]

    # Wymiary
    dimensions = ["Długość", "Szerokość", "Wysokość", "Rozstaw osi", "Masa całkowita"]
    dimension_values = {dim: soup.find(string=lambda text: text and dim in text).find_next().text for dim in dimensions}

    dlugosc = dimension_values["Długość"]
    szerokosc = dimension_values["Szerokość"]
    wysokosc = dimension_values["Wysokość"]
    rozstaw_osi = dimension_values["Rozstaw osi"]
    masa_calkowita = dimension_values["Masa całkowita"]

    # Append the collected data to the list
    data.append({
        'Numer Oferty': numer_oferty,
        'Price': price,
        'Lokalizacja': lokalizacja_samochodu,
        'Car Model': car_model,
        'Car Model Type': car_model_type,
        'Rok Produkcji': rok_produkcji,
        'Przebieg': przebieg,
        'Naped': naped,
        'Skrzynia Biegow': skrzynia_biegow,
        'Pojemnosc Silnika': pojemnosc_silnika,
        'Moc Silnika': moc_silnika,
        'Kolor': kolor,
        'Typ Nadwozia': typ_nadwozia,
        'Liczba Miejsc': liczba_miejsc,
        'Kraj Pochodzenia': kraj_pochodzenia,
        'Dlugosc': dlugosc,
        'Szerokosc': szerokosc,
        'Wysokosc': wysokosc,
        'Rozstaw Osi': rozstaw_osi,
        'Masa Calkowita': masa_calkowita
    })

    print(f"link_{i} DONE")

# Create a DataFrame from the collected data
df_car_scrapped_data = pd.DataFrame(data)

link_0 DONE
link_1 DONE
link_2 DONE
link_3 DONE
link_4 DONE
link_5 DONE
link_6 DONE
link_7 DONE
link_8 DONE
link_9 DONE
link_10 DONE
link_11 DONE
link_12 DONE
link_13 DONE
link_14 DONE
link_15 DONE
link_16 DONE
link_17 DONE
link_18 DONE
link_19 DONE
link_20 DONE
link_21 DONE
link_22 DONE
link_23 DONE
link_24 DONE
link_25 DONE
link_26 DONE
link_27 DONE
link_28 DONE
link_29 DONE
link_30 DONE
link_31 DONE
link_32 DONE
link_33 DONE
link_34 DONE
link_35 DONE
link_36 DONE
link_37 DONE
link_38 DONE
link_39 DONE
link_40 DONE
link_41 DONE
link_42 DONE
link_43 DONE
link_44 DONE
link_45 DONE
link_46 DONE
link_47 DONE
link_48 DONE
link_49 DONE
link_50 DONE
link_51 DONE
link_52 DONE
link_53 DONE
link_54 DONE
link_55 DONE
link_56 DONE
link_57 DONE
link_58 DONE
link_59 DONE
link_60 DONE
link_61 DONE
link_62 DONE
link_63 DONE
link_64 DONE
link_65 DONE
link_66 DONE
link_67 DONE
link_68 DONE
link_69 DONE
link_70 DONE
link_71 DONE
link_72 DONE
link_73 DONE
link_74 DONE
link_75 DONE
link_76 DONE
link_77 D

In [34]:
df_car_scrapped_data

Unnamed: 0,Numer Oferty,Price,Lokalizacja,Car Model,Car Model Type,Rok Produkcji,Przebieg,Naped,Skrzynia Biegow,Pojemnosc Silnika,Moc Silnika,Kolor,Typ Nadwozia,Liczba Miejsc,Kraj Pochodzenia,Dlugosc,Szerokosc,Wysokosc,Rozstaw Osi,Masa Calkowita
0,207347,982 zł,Swarzędz,OPEL Corsa,Corsa 1.2 Edition S&S,2023,40 087 km,PB,Manualna,1199 cm3,75 KM,Czerwony,Hatchback,5,PL,406 cm,176.5 cm,143.3 cm,253.8 cm,1055 kg
1,144362,982 zł,Swarzędz,CITROEN C3,C3 1.2 PureTech GPF Feel,2022,23 781 km,PB,Manualna,1199 cm3,82 KM,Czerwony,Hatchback,5,PL,399.6 cm,174.9 cm,147.4 cm,254 cm,980 kg
2,210467,1 040 zł,Klaudyn,HYUNDAI i30,i30 1.0 T-GDI Classic + DCT,2021,62 698 km,PB,Automatyczna,998 cm3,120 KM,grafitowy,Hatchback,5,pl,434 cm,179.5 cm,145.5 cm,265 cm,1246 kg
3,181402,1 094 zł,Dziekanów Leśny,CITROEN C3,C3 1.2 PureTech Shine,2023,29 960 km,PB,Manualna,1199 cm3,110 KM,Czerwony,Hatchback,5,PL,399.6 cm,174.9 cm,147.4 cm,254 cm,1090 kg
4,145997,1 138 zł,Pęcice,KIA Rio,Rio 1.2 M,2021,78 080 km,PB,Manualna,1197 cm3,84 KM,Niebieski,Hatchback,5,PL,406.5 cm,172.5 cm,145 cm,258 cm,1045 kg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1651,207302,10 049 zł,Białystok,JAGUAR F-Pace,F-Pace 2.0 P250 mHEV AWD R-Dynamic HSE,2024,8 500 km,PB,Automatyczna,1997 cm3,249 KM,Niebieski,SUV,5,PL,473.1 cm,207 cm,165.1 cm,287.4 cm,1897 kg
1652,205534,13 526 zł,Katowice,OPEL Astra,Astra VI 1.2 T GS Line S&S aut,2023,27 267 km,PB,Automatyczna,1199 cm3,130 KM,Srebrny (jasnoszary),Hatchback,5,PL,437.4 cm,186 cm,144.1 cm,267.5 cm,1371 kg
1653,192253,13 677 zł,Dziekanów Leśny,OPEL Astra,Astra VI 1.2 T GS Line S&S aut,2023,35 953 km,PB,Automatyczna,1199 cm3,130 KM,Niebieski,Hatchback,5,PL,437.4 cm,186 cm,144.1 cm,267.5 cm,1371 kg
1654,192286,13 677 zł,Dziekanów Leśny,OPEL Astra,Astra VI 1.2 T GS Line S&S aut,2023,39 425 km,PB,Automatyczna,1199 cm3,130 KM,Niebieski,Hatchback,5,PL,437.4 cm,186 cm,144.1 cm,267.5 cm,1371 kg


In [35]:
df_car_scrapped_data.to_pickle('Outputs/df_car_scrapped_data.pkl')