In [1]:
import pandas as pd
import requests
import json
from tqdm import tqdm
from selectolax.parser import HTMLParser

In [2]:
def _first_or_none(items):
    """Return the first element of ``items``, or None when it is missing or empty."""
    return items[0] if items else None


def get_data_from_json(offers):
    """Flatten raw offer dicts into one flat record per apartment.

    Offers without a non-empty ``floorsOffered`` value are skipped. Every other
    field is optional: a missing key, a ``None`` value or an empty list yields
    ``None`` (or ``0`` for the ``*_count`` fields) instead of raising, so a
    single sparse offer cannot abort the whole batch.

    Args:
        offers: Iterable of offer dicts (nested ``price``, ``area``, ``author``
            and ``location`` sub-dicts are read when present).

    Returns:
        list[dict]: One record per kept offer, always with the same keys in
        the same order.
    """
    result = []
    for offer in offers:
        floors_offered = offer.get('floorsOffered')
        if not floors_offered:
            continue

        # Normalise optional containers once so the lookups below cannot fail.
        location = offer.get('location') or {}
        metro = location.get('metro') or {}
        parks = location.get('parks') or []
        ponds = location.get('ponds') or []
        nearest_park = _first_or_none(parks) or {}
        nearest_pond = _first_or_none(ponds) or {}
        nearest_airport = _first_or_none(location.get('airports')) or {}

        # An offer with no 'kitchenSpace' key at all is labelled 'studio';
        # a present key with no value stays None.
        if 'kitchenSpace' in offer:
            kitchen_space = (offer['kitchenSpace'] or {}).get('value')
        else:
            kitchen_space = 'studio'

        apartment = {
            'id': offer.get('offerId'),
            'price': (offer.get('price') or {}).get('value'),
            'area': (offer.get('area') or {}).get('value'),
            'rooms': offer.get('roomsTotalKey'),
            'ceilingHeight': offer.get('ceilingHeight'),
            'kitchen_space': kitchen_space,
            'floor': floors_offered[0],
            'floorsTotal': offer.get('floorsTotal'),
            'creationDate': offer.get('creationDate'),
            'seller': (offer.get('author') or {}).get('category'),
            'address': location.get('geocoderAddress'),
            'nearest_metro': metro.get('name'),
            'time_to_metro': metro.get('minTimeToMetro'),
            'transport_to_metro': metro.get('metroTransport'),
            'branch_metro_color': _first_or_none(metro.get('lineColors')),
            'parks_count': len(parks),
            'nearest_park': nearest_park.get('name'),
            'time_park': nearest_park.get('timeOnFoot'),
            'distance_park': nearest_park.get('distanceOnFoot'),
            'ponds_count': len(ponds),
            'nearest_pond': nearest_pond.get('name'),
            'time_pond': nearest_pond.get('timeOnFoot'),
            'distance_pond': nearest_pond.get('distanceOnFoot'),
            'nearest_airport': nearest_airport.get('name'),
            'time_airport_via_car': nearest_airport.get('timeOnCar'),
            'distance_airport': nearest_airport.get('distanceOnCar'),
            'images': offer.get('large1242Images')
        }
        result.append(apartment)

    return result


In [65]:
def find_apartment_near_the_metro(metro: str, year_from: int = 1700, year_to: int = 2024,
                                  timeout: float = 30.0):
    """Scrape flat-sale offers listed for one metro station into a DataFrame.

    The search is issued once per pair of adjacent build years and paginated
    until the site stops returning usable data for that pair.

    Args:
        metro: Station slug inserted into the listing URL
            (e.g. ``'ploshchad-vosstaniya'``).
        year_from: First build year of the grid (inclusive).
        year_to: Last build year of the grid (inclusive).
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        pandas.DataFrame: One row per offer, with rows that are identical in
        every column except ``images`` collapsed into one.

    Raises:
        requests.exceptions.RequestException: On connection failure or when a
            response does not arrive within ``timeout`` seconds.
    """
    url = f"https://realty.yandex.ru/sankt-peterburg/kupit/kvartira/metro-{metro}/"
    aparts = []

    year_grid = list(range(year_from, year_to + 1))
    # Adjacent windows share a boundary year, so the same offer may be fetched
    # twice (if the site treats both bounds as inclusive - not verified here);
    # the final drop_duplicates takes care of that.
    for idx in range(len(year_grid) - 1):
        page = 1
        while True:
            params = (
                ('page', page),
                ('builtYearMin', year_grid[idx]),
                ('builtYearMax', year_grid[idx + 1]),
            )
            # Without a timeout a stalled connection would block the crawl forever.
            response = requests.get(url, params=params, timeout=timeout)
            response.encoding = 'utf-8'

            tree = HTMLParser(response.text)
            state_node = tree.css_first('script[id="initial_state_script"]')
            if not state_node:
                # No embedded state on this page: nothing more for this window.
                break
            print(f"\r{(100 * idx / len(year_grid)):.1f}%\t Текущая страница: {page}\t Размер датасета: {len(aparts)}", end="")

            content = state_node.text()
            # NOTE(review): the [23:-1] slice presumably strips a
            # `window.INITIAL_STATE = ` prefix and a trailing `;` - confirm
            # against the live page source.
            points = json.loads(content[23:-1])['map']['offers']['points']
            if not points:
                # An empty page means pagination is exhausted; stop instead of
                # requesting ever-higher page numbers.
                break

            aparts.extend(get_data_from_json(points))

            page += 1

    result = pd.DataFrame(aparts)
    # 'images' holds lists (unhashable), so it is excluded from the duplicate key.
    return result.drop_duplicates(subset=result.columns.difference(['images']).tolist())

In [66]:
# Scrape all offers for the 'ploshchad-vosstaniya' station slug
# (slow: one HTTP request per result page per build-year window).
tmp = find_apartment_near_the_metro('ploshchad-vosstaniya')

99.4%	 Текущая страница: 25	 Размер датасета: 16548

In [69]:
# Show which distinct nearest-metro names appear in the scraped offers.
tmp['nearest_metro'].unique()

array(['Маяковская', 'Площадь Восстания'], dtype=object)