In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
import time

import pandas as pd
import os
from datetime import datetime

In [2]:
class FileAppender:
    """
    Saves tabular data to a timestamped CSV file, appending to it when it already exists.
    """

    @staticmethod
    def append_data_as_csv(data, name: str, current_datetime: datetime):
        """
        Converts data to a DataFrame and appends it to ``data/{name}_{current_datetime}.csv``.
        The header row is written only when the target file does not exist yet, so
        repeated calls with the same name and datetime produce one header followed by
        all appended rows. The ``data`` folder is created when missing.
        :param data: Data to be saved (anything accepted by ``pd.DataFrame``).
        :param name: Name of file without extension and without the datetime suffix.
        :param current_datetime: Datetime used in the file name; ``None`` means "now".
        """
        if current_datetime is None:
            current_datetime = datetime.now()

        data_df = pd.DataFrame(data)
        # Nothing to write when there are no columns (e.g. an empty list of rows);
        # writing it anyway would only add a blank line to the file.
        if data_df.shape[1] == 0:
            return

        formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
        file_path = f'data/{name}_{formatted_datetime}.csv'

        # The target folder may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(file_path), exist_ok=True)

        # Check the very file that is written below; checking any other path would
        # make every append emit the header again.
        is_file_exists = os.path.isfile(file_path)

        data_df.to_csv(file_path, mode='a', header=not is_file_exists, index=False)

In [3]:
class OtoDomScrapper:
    """
    Handles all operations for scraping the 'otodom.pl' website.

    Usage: create an instance, call ``execute()`` and read the collected rows
    with ``get_list()``. All run state is kept per instance, so separate
    instances never share results.
    """
    # Not referenced by any method of this class; pages are saved under the name "apartments".
    __file_name = "test_data"

    def __init__(self):
        # Driver is created lazily in execute(); None means "no browser open".
        self.__webdriver = None
        self.__reset_run_state()

    def __reset_run_state(self):
        """
        Resets everything that belongs to a single scraping run.
        """
        self.__apartments_list = []
        self.__lp = 1
        self.__last_page_number = 0
        # One timestamp per run, so all pages of a run land in the same CSV file.
        self.__file_date = datetime.now()

    def execute(self):
        """
        Runs a full scrape: opens the browser, accepts cookies on the first page,
        reads the number of result pages and collects the listings of every page.
        Each call starts from a clean state. The browser is always closed at the
        end, also when an error or interrupt occurs; rows already collected by
        then stay available through ``get_list()``.
        """
        self.__reset_run_state()
        try:
            self.__initialize_driver()

            self.__go_to_website(1)
            # Fixed wait so the cookie banner has time to appear before it is clicked.
            time.sleep(3)
            self.__accept_cookies()
            self.__set_last_page_number()
            self.__apartments_list.extend(self.__get_data_from_site())

            for i in range(2, self.__last_page_number + 1):
                self.__go_to_website(i)
                self.__apartments_list.extend(self.__get_data_from_site())

        finally:
            # The driver may be missing when its creation failed; calling quit()
            # unconditionally would hide the original error behind an AttributeError.
            driver, self.__webdriver = self.__webdriver, None
            if driver is not None:
                driver.quit()

    def __initialize_driver(self):
        """
        Initializes the Chrome webdriver.
        """
        download_service = Service()
        self.__webdriver = webdriver.Chrome(service=download_service)

    def __go_to_website(self, page: int):
        """
        Opens the given page of the Warsaw apartments-for-sale listing.
        :param page: Number of page to go.
        """
        print(f"Going to page {page}")
        self.__webdriver.get(f"https://www.otodom.pl/pl/wyniki/sprzedaz/mieszkanie/mazowieckie/warszawa/warszawa/warszawa?viewType=listing&page={page}")

    def __accept_cookies(self):
        """
        Finds the accept button on the website.
        Clicks on the accept button to accept cookies.
        """
        accept_button = self.__webdriver.find_element(By.ID, "onetrust-accept-btn-handler")
        accept_button.click()

    def __set_last_page_number(self):
        """
        Reads the highest page number shown in the pagination and stores it on the instance.
        Falls back to 1 when no numeric pagination item is found (single page of results).
        """
        pagination_items = self.__webdriver.find_elements(By.CSS_SELECTOR, 'ul[data-cy="frontend.search.base-pagination.nexus-pagination"] li')

        # default=1: max() raises ValueError on an empty sequence.
        last_page_number = max(
            (int(item.text) for item in pagination_items if item.text.isdigit()),
            default=1,
        )

        print(f"Last page number: {last_page_number}")
        self.__last_page_number = last_page_number

    def __get_data_from_site(self):
        """
        Gets data about apartments from the currently opened listing page.
        Every successfully parsed advertisement becomes one dict; advertisements
        that fail to parse are reported and skipped. The rows of the page are
        also appended to the CSV file of the current run.
        :return: List of apartment dicts found on the page.
        """
        advertisements = self.__webdriver.find_elements(By.CSS_SELECTOR, 'article[data-cy="listing-item"]')
        apartments_list = []

        for advertisement in advertisements:
            try:
                # 'Title' probably will be not necessary
                title = advertisement.find_element(By.CSS_SELECTOR, '[data-cy="listing-item-title"]').text
                print(f"Gets data from advertisement: {self.__lp}. {title}")
                # NOTE(review): the '.css-…' selectors look like generated class names
                # and may change when the site is redeployed; verify before relying on them.
                price = advertisement.find_element(By.CSS_SELECTOR, '.css-2bt9f1').text
                localization = advertisement.find_element(By.CSS_SELECTOR, '.css-42r2ms').text
                link = advertisement.find_element(By.CSS_SELECTOR, 'a[data-cy="listing-item-link"]').get_attribute('href')

                # Gets details about rooms, area, floor: <dt> holds the label, <dd> the value
                detail_labels = advertisement.find_elements(By.CSS_SELECTOR, '.css-12dsp7a dt')
                detail_values = advertisement.find_elements(By.CSS_SELECTOR, '.css-12dsp7a dd')

                details_dict = {
                    label.text: value.text
                    for label, value in zip(detail_labels, detail_values)
                }

                rooms_number = details_dict.get("Liczba pokoi", "Brak informacji")
                area = details_dict.get("Powierzchnia", "Brak informacji")
                floor = details_dict.get("Piętro", "Brak informacji")

                apartment = {
                    "lp": self.__lp,
                    "title": title,
                    "link": link,
                    "localization": localization,
                    "price": price,
                    "rooms_number": rooms_number,
                    "area": area,
                    "floor": floor,
                }

                # The counter advances only for advertisements that parsed completely.
                self.__lp += 1
                apartments_list.append(apartment)

            except Exception as e:
                # Best effort: one broken advertisement must not abort the whole page.
                print(f"Błąd podczas przetwarzania ogłoszenia: {e}")

        # Skip the write for a page without any parsed rows; there is nothing to append.
        if apartments_list:
            FileAppender.append_data_as_csv(apartments_list, "apartments", self.__file_date)
        return apartments_list

    def get_list(self):
        """
        Returns apartments list collected by this instance.
        :return: Apartments list.
        """
        return self.__apartments_list

In [4]:
# Create the scraper object; the Chrome driver is only started later, inside execute().
otodom_scrapper: OtoDomScrapper = OtoDomScrapper()

In [5]:
# Run the scrape: opens Chrome, visits every result page and appends each page's rows to a CSV in data/.
# The browser is closed in a `finally` block, so interrupting this cell still quits the driver.
otodom_scrapper.execute()

Going to page 1
Last page number: 586
Gets data from advertisement: 1. Stylowe 3 pokoje z tarasem, garażem i komórką.
Gets data from advertisement: 2. Piękne mieszkanie okolice Blue City
Gets data from advertisement: 3. 3-pokoje Rossiniego/cena z garażem /Zielona Dolina
Gets data from advertisement: 4. Kompaktowe Mieszkanko Do Remontu
Gets data from advertisement: 5. Do sprzedania 3 pokoje na Ursynowie metro Kabaty
Gets data from advertisement: 6. ✅Zobacz>>Ustawne 4 pokoje przy metrze Bródno
Gets data from advertisement: 7. ✅Sprawdź>>> ustawne 3 pokojowe mieszkanie
Gets data from advertisement: 8. QBIK dwupoziomowy apartament z widokiem na zieleń
Gets data from advertisement: 9. 2 pokoje, po remoncie, metro, Ursynów, BEZPOŚREDNI
Gets data from advertisement: 10. Apartament | Umeblowany | Klimatyzacja
Gets data from advertisement: 11. 3-pokojowe mieszkanie 59m2 + ogródek
Gets data from advertisement: 12. 4-pokojowe mieszkanie 68m2 + 2 balkony
Gets data from advertisement: 13. 2-pokojowe

KeyboardInterrupt: 

In [16]:
# Show the apartments collected by the scraper.
# NOTE(review): this renders the whole list; for a large scrape consider a short preview,
# e.g. pd.DataFrame(otodom_scrapper.get_list()).head().
display(otodom_scrapper.get_list())

[{'lp': 1,
  'title': '2 pokoje, Śródmieście, świeżo wyremontowane',
  'link': 'https://www.otodom.pl/pl/oferta/2-pokoje-srodmiescie-swiezo-wyremontowane-ID4u2pm',
  'localization': 'ul. Karmelicka, Muranów, Śródmieście, Warszawa, mazowieckie',
  'price': '1 140 000 zł',
  'rooms_number': '2 pokoje',
  'area': '47.4 m²',
  'floor': '3 piętro'},
 {'lp': 2,
  'title': 'Mieszkanie w dzielnicy Woli/Centrum',
  'link': 'https://www.otodom.pl/pl/oferta/mieszkanie-w-dzielnicy-woli-centrum-ID4tYd8',
  'localization': 'ul. Żelazna, Mirów, Wola, Warszawa, mazowieckie',
  'price': '650 000 zł',
  'rooms_number': '2 pokoje',
  'area': '38 m²',
  'floor': '7 piętro'},
 {'lp': 3,
  'title': '3-pokoje Rossiniego/cena z garażem /Zielona Dolina',
  'link': 'https://www.otodom.pl/pl/oferta/3-pokoje-rossiniego-cena-z-garazem-zielona-dolina-ID4rmLz',
  'localization': 'ul. Gioacchino Rossiniego, Kobiałka, Białołęka, Warszawa, mazowieckie',
  'price': '775 000 zł',
  'rooms_number': '3 pokoje',
  'area': '