In [None]:
import requests
import pandas as pd
import json
import re
from bs4 import BeautifulSoup

# Request parameters: cookies and headers sent with every API call.
auth_cookies = {}
base_headers = {
    "Content-Type": "application/json"
}

# Accumulates the "products" records returned by every page.
raw_listings = []

# Walk through result pages 1..25.
for page_num in range(1, 26):
    payload = json.dumps({"page_number": page_num}).encode("utf-8")

    try:
        resp = requests.post(
            "https://api.cian.ru/ebc-analytics/event-enrichment/",
            cookies=auth_cookies,
            headers=base_headers,
            data=payload,
            # Without a timeout a stalled connection blocks the cell forever.
            timeout=30,
        )
        resp.raise_for_status()
        body = resp.json()
    except (requests.RequestException, ValueError) as exc:
        # A failed or non-JSON page must not discard the pages already
        # collected, so report it and move on to the next one.
        print(f"Страница {page_num}: ошибка запроса ({exc})")
        continue

    # Only a JSON object can carry the "products" key.
    products = body.get("products") if isinstance(body, dict) else None
    if products:
        raw_listings.extend(products)
    else:
        print(f"Страница {page_num}: пустой ответ")

df_raw_api = pd.DataFrame(raw_listings)
df_raw_api.head()


In [None]:
def extract_flat_attributes(block_text: str) -> dict:
    """Extract numeric flat attributes from the text of a characteristics block.

    The text is searched for the Russian labels "Общая площадь",
    "Площадь кухни", "Этаж N из M" and "Год постройки".

    Args:
        block_text: Plain text containing the labelled values.

    Returns:
        A dict holding only the attributes that were found, under the keys
        "area_total", "area_kitchen", "floor_current", "floor_max" and
        "year". Values are the matched substrings, not converted to numbers.
    """
    # A whole number with an optional decimal part written with "," or ".".
    # The decimal separator must be followed by a digit, so a list-separating
    # comma right after the value ("45, ...") is not captured, and a dot
    # decimal ("45.5") is not cut off at the dot.
    decimal = r"(\d+(?:[.,]\d+)?)"

    # (pattern, keys filled from the pattern's groups), in output order.
    patterns = (
        (r"Общая площадь\s*" + decimal, ("area_total",)),
        (r"Площадь кухни\s*" + decimal, ("area_kitchen",)),
        (r"Этаж\s*(\d+)\s*из\s*(\d+)", ("floor_current", "floor_max")),
        (r"Год постройки\s*(\d+)", ("year",)),
    )

    info = {}
    for pattern, keys in patterns:
        match = re.search(pattern, block_text)
        if match:
            info.update(zip(keys, match.groups()))

    return info


In [None]:
flat_cards = []

# Browser-like headers; identical for every card, so they are built once.
card_headers = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "text/html"
}

if not df_raw_api.empty:
    # A single session reuses the connection across all card requests.
    with requests.Session() as session:
        session.headers.update(card_headers)

        # unique() keeps first-seen order, so runs are reproducible.
        for flat_id in df_raw_api["id"].dropna().unique():
            card_url = f"https://www.cian.ru/sale/flat/{flat_id}/"

            try:
                page = session.get(card_url, timeout=30)
                page.raise_for_status()
            except requests.RequestException as exc:
                # Skip the card; the left merge below leaves its extra
                # columns empty instead of aborting the whole crawl.
                print(f"Объявление {flat_id}: не удалось загрузить карточку ({exc})")
                continue

            soup = BeautifulSoup(page.text, "html.parser")

            # Address: the breadcrumb-like links joined into one string.
            address_nodes = soup.find_all("a", {"data-name": "AddressItem"})
            address = ", ".join(a.get_text(strip=True) for a in address_nodes)

            # Metro: drop the trailing "<N> мин..." part of each item and
            # de-duplicate while keeping the order of appearance.
            metro_nodes = soup.find_all("li", {"data-name": "UndergroundItem"})
            metro_raw = [m.get_text(strip=True) for m in metro_nodes]
            metro_clean = list(dict.fromkeys(
                re.sub(r"\d+\s*мин.*", "", m).strip() for m in metro_raw
            ))

            # Characteristics block: non-breaking spaces are normalised so
            # the "\s*" in the attribute patterns behaves predictably.
            fact_block = soup.find_all("div", {"data-name": "ObjectFactoidsItem"})
            fact_text = ", ".join(f.get_text(strip=True) for f in fact_block)
            fact_text = fact_text.replace("\xa0", " ")

            extracted = extract_flat_attributes(fact_text)

            flat_cards.append([flat_id, address, metro_clean, extracted])

df_cards = pd.DataFrame(flat_cards, columns=["id", "address", "metro", "details"])

if "id" in df_raw_api.columns:
    df_merged = df_raw_api.merge(df_cards, on="id", how="left")
else:
    # The API returned nothing, so the frame has no "id" column to merge on;
    # fall back to the (empty) card frame so the CSV still gets a header.
    df_merged = df_cards

df_merged.to_csv("cian_raw.csv", index=False)


In [None]:
import ast
import numpy as np

raw_df = pd.read_csv("cian_raw.csv")

# District name: everything after "р-н" up to the next comma. This keeps
# multi-word and hyphenated names and letters such as "ё" intact.
district_re = re.compile(r"р-н\s+([^,]+)")

# Output schema; passed explicitly so an empty input still yields a frame
# that has the "offer_id" column needed by drop_duplicates.
clean_columns = [
    "offer_id", "address", "district", "price", "area", "rooms",
    "floor", "max_floor", "build_year", "metro", "url",
]

formatted_records = []

for _, row in raw_df.iterrows():
    # "details" was written to CSV as the repr of a dict; parse it back.
    d = ast.literal_eval(row["details"]) if pd.notna(row["details"]) else {}

    # An empty address comes back from CSV as NaN, which re cannot search.
    address_text = row["address"] if pd.notna(row["address"]) else ""
    m_district = district_re.search(address_text)

    # Strip the currency sign and every kind of whitespace, including the
    # non-breaking spaces used as thousands separators.
    price_text = re.sub(r"[\s₽]", "", str(row["price"]))

    # A missing relative URL must not be rendered as the text "nan".
    custom_url = row.get("customUrl", "")
    if pd.isna(custom_url):
        custom_url = ""

    rec = {
        "offer_id": row["id"],
        "address": row["address"],
        "district": m_district.group(1).strip() if m_district else "Не указан",
        "price": float(price_text),
        "area": float(str(d.get("area_total", "0")).replace(",", ".")),
        "rooms": 1,   # placeholder: the room count is not parsed yet
        "floor": int(d.get("floor_current", 0)),
        "max_floor": int(d.get("floor_max", 0)),
        "build_year": int(d["year"]) if d.get("year") else None,
        "metro": row["metro"],
        "url": f"https://www.cian.ru{custom_url}"
    }

    formatted_records.append(rec)

clean_df = (
    pd.DataFrame(formatted_records, columns=clean_columns)
    .drop_duplicates(subset="offer_id")
)
clean_df.to_csv("cian_clean.csv", index=False)
clean_df.head()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("ggplot")
sns.set_context("talk")

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would add a stray "None" line to the output.
clean_df.info()
print(clean_df.describe())

# Mean price per district
mean_price = clean_df.groupby("district")["price"].mean().sort_values()

plt.figure(figsize=(12,6))
mean_price.plot(kind="barh")
plt.title("Средняя стоимость по районам")
plt.xlabel("Цена, руб")
plt.tight_layout()
plt.show()

# Price against area
plt.figure(figsize=(8,6))
plt.scatter(clean_df["area"], clean_df["price"], alpha=0.6)
plt.xlabel("Площадь")
plt.ylabel("Цена")
plt.title("Корреляция: площадь vs цена")
plt.show()

# Price per square metre. The cleaning step stores area 0 when a card has no
# total area; dividing by it would give inf and distort the district means,
# so non-positive areas are masked to NaN first.
clean_df["price_m2"] = clean_df["price"] / clean_df["area"].where(clean_df["area"] > 0)

plt.figure(figsize=(12,6))
clean_df.groupby("district")["price_m2"].mean().sort_values().plot(kind="bar")
plt.title("Средняя цена за м²")
plt.ylabel("₽/м²")
plt.xticks(rotation=45)
# Rotated tick labels are clipped without this.
plt.tight_layout()
plt.show()


In [None]:
# Work on a copy so the cleaned frame is not modified by this cell.
feat = clean_df.copy()

# Tunable thresholds, named so they are not buried in the expressions.
modern_since_year = 2010
reference_year = 2025

# A missing build year compares as False, so such rows get is_modern = 0.
feat["is_modern"] = (feat["build_year"] >= modern_since_year).astype(int)
feat["age"] = reference_year - feat["build_year"]

# max_floor is 0 when the card had no floor data; masking it to NaN avoids
# inf / 0-by-0 results in the ratio.
feat["level_ratio"] = feat["floor"] / feat["max_floor"].where(feat["max_floor"] > 0)

# Simple distance-from-centre flag: 0 for the listed central districts,
# 1 for everything else.
central_zones = {"Тверской", "Арбат", "Хамовники", "Якиманка"}
feat["center_score"] = (~feat["district"].isin(central_zones)).astype(int)

feat.to_csv("cian_clean_features.csv", index=False)
feat.head()
