In [4]:
pip install webdriver_manager

Collecting webdriver_manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Installing collected packages: webdriver_manager
Successfully installed webdriver_manager-4.0.2
Note: you may need to restart the kernel to use updated packages.


In [25]:
import time
import re
import random
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from pymongo import MongoClient

# --- CONFIGURATION ---
# Listing index for passenger cars; the scraper appends "?page=N" to it.
BASE_URL = "https://www.otomoto.pl/osobowe"

# Brand list (long names at the beginning, short ones at the end).
# The order matters to the scraper: it stops at the first brand the listing
# title starts with, so do not sort this list.
KNOWN_BRANDS = [
    # multi-word / hyphenated names
    "Mercedes-Benz",
    "Alfa Romeo",
    "Land Rover",
    "Aston Martin",
    "Great Wall",
    # single-word names
    "Cupra", "Tesla", "Audi", "BMW", "Chevrolet",
    "Citroën", "Citroen",  # both spellings are listed on purpose
    "Dacia", "Dodge", "Fiat", "Ford", "Honda",
    "Hyundai", "Infiniti", "Jaguar", "Jeep", "Kia",
    "Lexus", "Mazda", "Mini", "Mitsubishi", "Nissan",
    "Opel", "Peugeot", "Porsche", "Renault", "Seat",
    "Skoda", "Smart", "Subaru", "Suzuki", "Toyota",
    "Volkswagen", "Volvo", "SsangYong",
]

# Keywords signalling that the model name consists of two words,
# e.g. "Seria" -> take "Seria" + "3".
TWO_WORD_PREFIXES = [
    "Seria",
    "Klasa",
    "Class",
    "Series",
    "Range",
    "Grand",
]

def setup_driver():
    """Create a Chrome WebDriver with the scraper's fixed command-line flags.

    The chromedriver binary is resolved through ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    # Flags are applied in this order; they pin the window size and user agent
    # and (presumably) make the automated browser less conspicuous.
    chrome_flags = (
        '--no-sandbox',
        '--disable-dev-shm-usage',
        "window-size=1920,1080",
        "--disable-blink-features=AutomationControlled",
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    chrome_options = webdriver.ChromeOptions()
    for flag in chrome_flags:
        chrome_options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    chrome_service = Service(chromedriver_path)
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# A number as it appears in the whitespace-normalised listing text: either
# space-separated thousands groups ("45 900") or a plain run of digits.
# The look-behind stops a match from starting in the middle of another number,
# so "2019 45 900 PLN" yields 45 900 instead of the merged 201945900.
_NUMBER = r'(?<![\d.,])(\d{1,3}(?: \d{3})+|\d+)'

_PRICE_RE = re.compile(_NUMBER + r'\s*(PLN|EUR)')
_MILEAGE_RE = re.compile(_NUMBER + r'\s*km')       # lowercase "km" = kilometres
_CAPACITY_RE = re.compile(_NUMBER + r'\s*cm3')
_HORSEPOWER_RE = re.compile(_NUMBER + r'\s*KM')    # uppercase "KM" = horsepower
# Four-digit year 19xx/20xx that is not part of a longer digit run.
_YEAR_RE = re.compile(r'(?<!\d)(?:19|20)\d{2}(?!\d)')

_FUEL_TYPES = ("Benzyna", "Diesel", "Elektryczny", "Hybryda", "LPG", "Wodór")
_TRANSMISSIONS = ("Automatyczna", "Manualna")
_NON_TITLE_LINES = ("Wyróżnione", "Nowe")


def _match_to_int(match):
    """Return group 1 of ``match`` as an int (spaces removed), or None if no match."""
    if match is None:
        return None
    return int(match.group(1).replace(" ", ""))


def _extract_year(lines, clean_one_line):
    """Return the production year found in a listing, or None.

    A line consisting solely of a year is preferred; otherwise the first
    boundary-delimited year anywhere in the text is used. This keeps numbers
    in the title (e.g. a model called "2008") from being taken as the year
    whenever the card shows the year on its own line.
    """
    for line in lines:
        if _YEAR_RE.fullmatch(line):
            return int(line)
    match = _YEAR_RE.search(clean_one_line)
    return int(match.group(0)) if match else None


def _extract_text_fields(lines):
    """Return (fuel, transmission, seller, city, voivodeship) scanned from the lines.

    Each field keeps the value from the last matching line; unmatched fields
    stay None.
    """
    fuel = transmission = seller = city = voivodeship = None
    for line in lines:
        if line in _FUEL_TYPES:
            fuel = line
        elif line in _TRANSMISSIONS:
            transmission = line
        elif "Firma" in line:
            seller = "Firma"
        elif "Prywatny" in line:
            seller = "Prywatny sprzedawca"
        elif "(" in line and ")" in line and len(line) < 50:
            # Short line of the form "City (Region)".
            parts = line.split("(")
            city = parts[0].strip()
            voivodeship = parts[1].replace(")", "").strip()
    return fuel, transmission, seller, city, voivodeship


def _extract_title_brand_model(lines, city):
    """Return (title, brand, model) derived from the first title-like line.

    A line is skipped when it contains "/", is a badge ("Wyróżnione", "Nowe")
    or contains the already-detected city. Defaults are "Nieznany" / "Inna" /
    "Inny" when nothing can be determined.
    """
    title, brand, model = "Nieznany", "Inna", "Inny"

    title_line = next(
        (
            line for line in lines
            if "/" not in line
            and line not in _NON_TITLE_LINES
            and not (city and city in line)
        ),
        None,
    )
    if title_line is None:
        return title, brand, model

    title = title_line
    lowered_title = title.lower()
    for known_brand in KNOWN_BRANDS:
        if not lowered_title.startswith(known_brand.lower()):
            continue
        brand = known_brand
        # Drop the brand from the title, e.g. "BMW Seria 3 320d" -> ["Seria", "3", "320d"].
        remainder_words = title[len(known_brand):].split()
        if remainder_words:
            first_word = remainder_words[0]
            if first_word in TWO_WORD_PREFIXES and len(remainder_words) > 1:
                # Two-word model name such as "Seria 3" or "Range Rover".
                model_candidate = f"{first_word} {remainder_words[1]}"
            else:
                model_candidate = first_word
            model = model_candidate.replace(",", "").replace(".", "")
        break  # first matching brand wins (list order matters)

    return title, brand, model


def _parse_listing(raw_text):
    """Turn the visible text of one listing card into a record dict.

    Returns None when the text is empty or mentions neither "PLN" nor "EUR"
    (such cards are not treated as listings). The ``link`` and ``scraped_at``
    keys are added by the caller.
    """
    if not raw_text:
        return None

    clean_one_line = " ".join(raw_text.split())
    if "PLN" not in clean_one_line and "EUR" not in clean_one_line:
        return None
    lines = [line.strip() for line in raw_text.split('\n') if line.strip()]

    price_match = _PRICE_RE.search(clean_one_line)
    fuel, transmission, seller, city, voivodeship = _extract_text_fields(lines)
    title, brand, model = _extract_title_brand_model(lines, city)

    return {
        "title": title,
        "brand": brand,
        "model": model,
        "price": _match_to_int(price_match),
        "currency": price_match.group(2) if price_match else None,
        "year": _extract_year(lines, clean_one_line),
        "mileage_km": _match_to_int(_MILEAGE_RE.search(clean_one_line)),
        "engine_capacity_cm3": _match_to_int(_CAPACITY_RE.search(clean_one_line)),
        "horsepower_hp": _match_to_int(_HORSEPOWER_RE.search(clean_one_line)),
        "fuel_type": fuel,
        "transmission": transmission,
        "seller_type": seller,
        "location_city": city,
        "location_region": voivodeship,
    }


def _accept_cookies(driver):
    """Best-effort click on the cookie-consent button; a missing banner is ignored."""
    time.sleep(2)
    try:
        accept_btn = driver.find_element(By.ID, "onetrust-accept-btn-handler")
        driver.execute_script("arguments[0].click();", accept_btn)
    except Exception:
        # Deliberately best-effort. `Exception` (not a bare except) so that
        # Ctrl+C still interrupts the run.
        pass


def scrape_otomoto_loop(num_pages=3):
    """Scrape listing pages 1..num_pages and store the records in MongoDB.

    Every page's listings are parsed from the text of its ``<article>``
    elements and inserted into ``otomoto_project.raw_listings`` on the local
    MongoDB server. Progress is printed per page. A random 3-6 s pause
    follows every page that contained listings.

    Args:
        num_pages: Number of result pages to visit, starting from page 1.

    Returns:
        None. If MongoDB is unreachable the function prints an error and
        returns without starting the browser.
    """
    client = None
    try:
        # MongoClient connects lazily, so construct it with a short server
        # selection timeout and ping once; otherwise an unreachable server
        # would only surface at the first insert.
        client = MongoClient('mongodb://localhost:27017/', serverSelectionTimeoutMS=5000)
        client.admin.command('ping')
        collection = client['otomoto_project']['raw_listings']
        print("Połączono z MongoDB.")
    except Exception as e:
        print("Błąd połączenia z MongoDB!")
        print(f"Szczegóły: {e}")
        if client is not None:
            client.close()
        return

    try:
        driver = setup_driver()
    except Exception:
        # Driver start-up errors still propagate, but do not leak the client.
        client.close()
        raise

    try:
        for page in range(1, num_pages + 1):
            url = f"{BASE_URL}?page={page}"
            print(f"\n--- SCRAPOWANIE STRONY: {page}/{num_pages} ---")
            driver.get(url)

            if page == 1:
                _accept_cookies(driver)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.TAG_NAME, "article"))
                )
            except Exception:
                print(f"Brak ogłoszeń na stronie {page}.")
                continue

            page_data = []
            skipped = 0
            last_error = None

            for article in driver.find_elements(By.TAG_NAME, "article"):
                try:
                    raw_text = driver.execute_script("return arguments[0].innerText;", article)
                    item = _parse_listing(raw_text)
                    if item is None:
                        continue

                    try:
                        link = article.find_element(By.TAG_NAME, "a").get_attribute("href")
                    except Exception:
                        link = ""

                    item["link"] = link
                    item["scraped_at"] = time.strftime("%Y-%m-%d %H:%M:%S")
                    page_data.append(item)
                except Exception as e:
                    # One broken card must not abort the page, but it should
                    # not vanish silently either.
                    skipped += 1
                    last_error = e

            if page_data:
                collection.insert_many(page_data)
                print(f"-> Zapisano {len(page_data)} aut ze strony {page}.")
            if skipped:
                print(f"   Pominięto {skipped} ogłoszeń (ostatni błąd: {last_error!r}).")

            # Randomised pause between page loads.
            time.sleep(random.uniform(3, 6))

    except Exception as e:
        print(f"Błąd krytyczny: {e}")
    finally:
        driver.quit()
        client.close()

# Script entry point: run the scraper when this module is executed directly.
if __name__ == "__main__":
    scrape_otomoto_loop(
        num_pages=1000,  # upper bound on the number of result pages to visit
    )

Połączono z MongoDB.

--- SCRAPOWANIE STRONY: 1/1000 ---
-> Zapisano 40 aut ze strony 1.

--- SCRAPOWANIE STRONY: 2/1000 ---
-> Zapisano 32 aut ze strony 2.

--- SCRAPOWANIE STRONY: 3/1000 ---
-> Zapisano 32 aut ze strony 3.

--- SCRAPOWANIE STRONY: 4/1000 ---
-> Zapisano 32 aut ze strony 4.

--- SCRAPOWANIE STRONY: 5/1000 ---
-> Zapisano 32 aut ze strony 5.

--- SCRAPOWANIE STRONY: 6/1000 ---
-> Zapisano 32 aut ze strony 6.

--- SCRAPOWANIE STRONY: 7/1000 ---
-> Zapisano 32 aut ze strony 7.

--- SCRAPOWANIE STRONY: 8/1000 ---
-> Zapisano 32 aut ze strony 8.

--- SCRAPOWANIE STRONY: 9/1000 ---
-> Zapisano 32 aut ze strony 9.

--- SCRAPOWANIE STRONY: 10/1000 ---
-> Zapisano 32 aut ze strony 10.

--- SCRAPOWANIE STRONY: 11/1000 ---
-> Zapisano 32 aut ze strony 11.

--- SCRAPOWANIE STRONY: 12/1000 ---
-> Zapisano 32 aut ze strony 12.

--- SCRAPOWANIE STRONY: 13/1000 ---
-> Zapisano 32 aut ze strony 13.

--- SCRAPOWANIE STRONY: 14/1000 ---
-> Zapisano 32 aut ze strony 14.

--- SCRAPOWANIE 