In [None]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [33]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm.auto import tqdm
from selenium.webdriver.chrome.options import Options
import numpy as np
import pandas as pd
from multiprocessing import Pool, Manager
import random
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

In [None]:
def parse_flat(driver):
    """Scrape the offer page currently loaded in ``driver`` into a dict.

    Every single-element lookup is best-effort: when the element cannot be
    located the corresponding field is stored as ``None`` instead of raising.

    Args:
        driver: object exposing ``find_element`` / ``find_elements`` (a
            Selenium WebDriver positioned on an offer page).

    Returns:
        dict with the fixed keys ``okrug``, ``raion``, ``metro``,
        ``text_info``, ``price``, ``desc``, ``rooms``, ``living_complex``,
        ``flat_details`` and ``offer_details`` (inserted in that order),
        followed by one extra key per ``OfferFactItem`` label found.
    """

    def text_of(by, value):
        """Text of the first matching element, or None if the lookup fails."""
        try:
            element = driver.find_element(by, value)
            return element.text
        except Exception:
            return None

    def all_of(by, value):
        """All matching elements, or an empty list if the lookup fails."""
        try:
            return driver.find_elements(by, value)
        except Exception:
            return []

    def pair_up(rows):
        """Map rows[0]->rows[1], rows[2]->rows[3], ...; an unpaired last row is dropped."""
        return dict(zip(rows[::2], rows[1::2]))

    def labelled_block(class_name, skip):
        """Read a newline-separated label/value block, ignoring its first ``skip`` lines."""
        raw = text_of(By.CLASS_NAME, class_name)
        if not raw:
            return None
        return pair_up(raw.split('\n')[skip:]) or None

    record = {}

    # Address line is comma-separated; the 2nd and 3rd parts are kept.
    address = text_of(By.CLASS_NAME, 'a10a3f92e9--address-line--GRDTb')
    pieces = address.split(',') if address else []
    record['okrug'] = pieces[1].strip() if len(pieces) > 1 else None
    record['raion'] = pieces[2].strip() if len(pieces) > 2 else None

    # Each metro element: first text line is the dict key, the leading integer
    # of the second line is the value (presumably minutes to the station -
    # confirm against the page); a non-numeric second line yields None.
    stations = {}
    for node in all_of(By.CLASS_NAME, 'a10a3f92e9--underground--pjGNr'):
        rows = node.text.split('\n')
        if len(rows) < 2:
            continue
        try:
            amount = int(rows[1].split()[0])
        except Exception:
            amount = None
        stations[rows[0]] = amount
    record['metro'] = stations or None

    record['text_info'] = labelled_block('a10a3f92e9--container--tqDAE', 0)

    record['price'] = text_of(By.CLASS_NAME, 'a10a3f92e9--amount--ON6i1')

    # Title is stored as a list of words; 'rooms' is the first character of
    # the second word when that word exists and is non-empty.
    title = text_of(By.CLASS_NAME, 'a10a3f92e9--title--vlZwT')
    if not title:
        record['desc'] = None
        record['rooms'] = None
    else:
        words = title.split(' ')
        record['desc'] = words
        has_second_word = len(words) > 1 and len(words[1]) > 0
        record['rooms'] = words[1][0] if has_second_word else None

    record['living_complex'] = text_of(By.CLASS_NAME, 'a10a3f92e9--link--A5SdC')

    # These two blocks start with one / two lines that are not label/value pairs.
    record['flat_details'] = labelled_block('a10a3f92e9--group--K5ZqN', 1)
    record['offer_details'] = labelled_block('a10a3f92e9--right--_9uBM', 2)

    # New block: every "OfferFactItem" with at least two <span>s contributes a
    # top-level key (first span) and value (second span).
    try:
        for fact in driver.find_elements(By.CSS_SELECTOR, 'div[data-name="OfferFactItem"]'):
            fact_spans = fact.find_elements(By.TAG_NAME, 'span')
            if len(fact_spans) < 2:
                continue
            label = fact_spans[0].text.strip()
            content = fact_spans[1].text.strip()
            record[label] = content
    except Exception as e:
        print("Error parsing offer facts:", e)

    return record


def save_progress(results, path="flats_data_2.csv"):
    """Write everything scraped so far to a CSV file, overwriting any previous file.

    Args:
        results: list of per-flat dicts; it is converted to a DataFrame and
            written without the index column.
        path: destination CSV file. Defaults to ``"flats_data_2.csv"`` so
            existing one-argument callers keep writing to the same file.

    Returns:
        None. A one-line progress message is printed after the file is written.
    """
    df = pd.DataFrame(results)
    df.to_csv(path, index=False)
    print(f"Progress saved: {len(results)} flats.")


def wait_for_cards(driver, timeout=15):
    """Wait until at least one listing card is present on the current page.

    Failures are reported with ``print`` rather than raised, so the caller can
    carry on with whatever the page contains.

    Args:
        driver: WebDriver instance to poll.
        timeout: maximum time passed to ``WebDriverWait`` (default 15).

    Returns:
        bool: ``True`` if a ``CardComponent`` article was found before the
        wait gave up, ``False`` if the wait raised (timeout or any other
        error). Callers that ignore the return value behave as before.
    """
    try:
        locator = (By.CSS_SELECTOR, 'article[data-name="CardComponent"]')
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
    except Exception as e:
        print("Timeout waiting for cards:", e)
        return False
    return True


def _find_flat_anchor(card):
    """Return ``(anchor, href)`` for the first link in ``card`` whose href contains ``/flat/``.

    Anchors without an ``href`` attribute (``get_attribute`` returns ``None``)
    are skipped instead of raising. Returns ``(None, None)`` if nothing matches.
    """
    for anchor in card.find_elements(By.TAG_NAME, 'a'):
        href = anchor.get_attribute('href')
        if href and '/flat/' in href:
            return anchor, href
    return None, None


def _close_extra_windows(driver, main_handle):
    """Close every window except ``main_handle`` and make ``main_handle`` active."""
    for handle in driver.window_handles:
        if handle != main_handle:
            driver.switch_to.window(handle)
            driver.close()
    driver.switch_to.window(main_handle)


def _scrape_cards(driver, cards, results, seen_links, limit, error_label):
    """Open each not-yet-seen card in ``cards``, parse it and append to ``results``.

    Args:
        driver: WebDriver whose active window is the listing page.
        cards: card elements found on the listing page.
        results: list that parsed flat dicts are appended to (mutated).
        seen_links: set of hrefs already attempted (mutated); a card whose
            href is in this set is skipped so it is never parsed twice.
        limit: stop as soon as ``len(results)`` reaches this value.
        error_label: prefix printed before any per-card error.

    Returns:
        int: number of flats appended to ``results`` by this call.
    """
    main_handle = driver.current_window_handle
    added = 0

    for card in cards:
        if len(results) >= limit:
            break

        try:
            anchor, href = _find_flat_anchor(card)
        except Exception as e:
            print(error_label, e)
            continue

        if anchor is None:
            print("No valid flat link found in this flat card, skipping.")
            continue
        if href in seen_links:
            continue
        # Recorded before the attempt so each listing is tried at most once.
        seen_links.add(href)

        try:
            handles_before = set(driver.window_handles)
            anchor.click()
            # Wait for the offer tab to actually exist instead of assuming it
            # has opened after a fixed sleep; a timeout lands in the except below.
            WebDriverWait(driver, 10).until(
                lambda d: len(d.window_handles) > len(handles_before)
            )
            new_handles = [h for h in driver.window_handles if h not in handles_before]
            driver.switch_to.window(new_handles[-1])
            time.sleep(1)  # give the offer page a moment to render

            results.append(parse_flat(driver))
            added += 1
        except Exception as e:
            print(error_label, e)
        finally:
            # Never close the listing window itself; only tabs opened on top of it.
            _close_extra_windows(driver, main_handle)

    return added


def main(total_required=4040, last_page=54):
    """Scrape rental flat offers from cian.ru listing pages into a CSV file.

    Listing pages ``1..last_page`` are visited in order; on each, every card
    is opened in its own tab and parsed with ``parse_flat``. If fewer than
    ``total_required`` flats were collected after the last page, the
    "Show more" button is clicked repeatedly and newly revealed cards are
    parsed. Progress is saved after every page and once more on exit, even
    when an exception aborts the run.

    Args:
        total_required: stop once this many flats have been collected.
        last_page: number of the last paginated listing page to visit.
    """
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--incognito")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")

    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
        "Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1"
    ]
    # One user agent is picked per run, not per request.
    chrome_options.add_argument(f"--user-agent={random.choice(user_agents)}")

    driver = webdriver.Chrome(options=chrome_options)

    results = []
    seen_links = set()  # hrefs already attempted, shared by both phases below

    try:
        # Phase 1: regular pagination.
        for page_num in tqdm(range(1, last_page + 1)):
            if len(results) >= total_required:
                break

            print(f"Parsing page {page_num}")
            url = (
                "https://www.cian.ru/cat.php?deal_type=rent&engine_version=2"
                "&is_by_homeowner=1&offer_type=flat"
                f"&p={page_num}"
                "&region=1&type=4"
            )
            driver.get(url)
            wait_for_cards(driver)

            flats = driver.find_elements(By.CSS_SELECTOR, 'article[data-name="CardComponent"]')
            print(f"Found {len(flats)} flats on page {page_num}")

            _scrape_cards(driver, flats, results, seen_links, total_required,
                          "Error processing flat:")

            save_progress(results)
            time.sleep(1)

        # Phase 2: keep expanding the last page with "Show more". The page then
        # contains old and new cards together; seen_links filters the old ones.
        while len(results) < total_required:
            try:
                show_more_btn = driver.find_element(By.CSS_SELECTOR, 'button[data-name="ShowMoreButton"]')
                show_more_btn.click()
                time.sleep(2)
            except Exception as e:
                print("No more 'Show more' button or error clicking it:", e)
                break

            wait_for_cards(driver)

            flats = driver.find_elements(By.CSS_SELECTOR, 'article[data-name="CardComponent"]')
            print(f"Currently {len(flats)} cards displayed, total parsed = {len(results)}")

            added = _scrape_cards(driver, flats, results, seen_links, total_required,
                                  "Error processing flat (show more mode):")

            save_progress(results)

            if added == 0:
                # Nothing new was revealed; stop instead of clicking forever.
                print("'Show more' revealed no new flats, stopping.")
                break

    finally:
        try:
            driver.quit()
        finally:
            # Persist whatever was collected, including a partially parsed page.
            save_progress(results)

    print("Parsing completed!")


# Start the scrape when this cell/script is executed directly; the progress
# output below this cell shows it runs when the cell is executed.
if __name__ == "__main__":
    main()

  0%|          | 0/54 [00:00<?, ?it/s]

Parsing page 1
Found 28 flats on page 1
Progress saved: 28 flats.
Parsing page 2
Found 28 flats on page 2
Progress saved: 56 flats.
Parsing page 3
Found 28 flats on page 3
Progress saved: 84 flats.
Parsing page 4
Found 28 flats on page 4
Progress saved: 112 flats.
Parsing page 5
Found 28 flats on page 5
Progress saved: 140 flats.
Parsing page 6
Found 28 flats on page 6
Progress saved: 168 flats.
Parsing page 7
Found 28 flats on page 7
Progress saved: 196 flats.
Parsing page 8
Found 28 flats on page 8
