In [1]:
import pandas as pd
import requests
import json
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from selectolax.parser import HTMLParser
from PIL import Image
from PIL.JpegImagePlugin import JpegImageFile
from io import BytesIO
from typing import List
from datetime import datetime

In [2]:
def _value_of(container, key):
    """Return ``container[key]['value']``, or None when either level is missing."""
    return (container.get(key) or {}).get('value')


def _first_or_empty(items):
    """Return the first element of ``items``, or an empty dict when there is none."""
    return items[0] if items else {}


def get_data_from_json(offers):
    """Flatten raw offer dicts into one flat record per apartment.

    Offers without a truthy ``floorsOffered`` value are skipped. Every other
    field is optional: a missing or empty nested object yields ``None`` (or
    ``0`` for the ``*_count`` fields) instead of raising, so one incomplete
    offer cannot abort the processing of the whole batch.

    Args:
        offers: iterable of offer dicts (the ``points`` list extracted from
            the page state by ``scrap_page``).

    Returns:
        list[dict]: one record per kept offer; the key order is fixed and
        becomes the column order when the records are loaded into a DataFrame.
    """
    result = []
    for offer in offers:
        floors_offered = offer.get('floorsOffered')
        if not floors_offered:
            continue

        location = offer.get('location') or {}
        metro = location.get('metro') or {}
        line_colors = metro.get('lineColors') or []
        parks = location.get('parks') or []
        ponds = location.get('ponds') or []
        # Only the first entry of each list is kept as the "nearest" object.
        # NOTE(review): assumes the source lists are ordered nearest-first — confirm.
        park = _first_or_empty(parks)
        pond = _first_or_empty(ponds)
        airport = _first_or_empty(location.get('airports'))

        if 'kitchenSpace' in offer:
            kitchen_space = (offer['kitchenSpace'] or {}).get('value')
        else:
            # Offers without a kitchenSpace entry are labelled 'studio'
            # (presumably: no separate kitchen). The column is therefore mixed-type.
            kitchen_space = 'studio'

        apartment = {
            'id': offer.get('offerId'),
            'price': _value_of(offer, 'price'),
            'area': _value_of(offer, 'area'),
            'rooms': offer.get('roomsTotalKey'),
            'ceilingHeight': offer.get('ceilingHeight'),
            'kitchen_space': kitchen_space,
            'floor': floors_offered[0],
            'floorsTotal': offer.get('floorsTotal'),
            'creationDate': offer.get('creationDate'),
            'seller': (offer.get('author') or {}).get('category'),
            'address': location.get('geocoderAddress'),
            'nearest_metro': metro.get('name'),
            'time_to_metro': metro.get('minTimeToMetro'),
            'transport_to_metro': metro.get('metroTransport'),
            'branch_metro_color': line_colors[0] if line_colors else None,
            'parks_count': len(parks),
            'nearest_park': park.get('name'),
            'time_park': park.get('timeOnFoot'),
            'distance_park': park.get('distanceOnFoot'),
            'ponds_count': len(ponds),
            'nearest_pond': pond.get('name'),
            'time_pond': pond.get('timeOnFoot'),
            'distance_pond': pond.get('distanceOnFoot'),
            'nearest_airport': airport.get('name'),
            'time_airport_via_car': airport.get('timeOnCar'),
            'distance_airport': airport.get('distanceOnCar'),
            'images': offer.get('large1242Images'),
        }
        result.append(apartment)

    return result


In [3]:
def scrap_page(url, params, timeout=30):
    """Fetch one search-result page and extract its apartment records.

    Args:
        url: listing URL to request.
        params: query parameters passed to ``requests.get``.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            thread indefinitely.

    Returns:
        The list produced by ``get_data_from_json`` for the offers embedded
        in the page, or ``None`` when the page has no
        ``initial_state_script`` tag or any error occurs (the error is
        printed, not raised).
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.encoding = 'utf-8'
        tree = HTMLParser(response.text)

        state_script = tree.css_first('script[id="initial_state_script"]')
        if not state_script:
            return None

        content = state_script.text()
        # The JSON payload is wrapped in a fixed-length prefix and one trailing
        # character. NOTE(review): presumably "window.INITIAL_STATE = " ... ";"
        # (23 characters + 1) — confirm against a live page.
        data_json = json.loads(content[23:-1])['map']['offers']['points']

        return get_data_from_json(data_json)

    except Exception as err:
        # Best-effort scraping: report the problem and let the caller treat
        # the page as empty.
        print(f"Ошибка в scrap_page: {err}")
        return None
    
    
def find_apartments(param: dict[str], max_workers=8):
    """Collect the apartments listed near one metro station into a DataFrame.

    Args:
        param: search settings. ``'metro'`` (station slug used in the URL) is
            required; ``'built_year_min'`` defaults to 1700 and
            ``'built_year_max'`` to the current year.
        max_workers: size of the thread pool; each construction year is
            scraped as a separate task.

    Returns:
        pandas.DataFrame with one row per apartment, de-duplicated on every
        column except ``images``.
    """
    station = param['metro']
    first_year = param.get('built_year_min', 1700)
    last_year = param.get('built_year_max', datetime.now().year)

    url = f"https://realty.yandex.ru/sankt-peterburg/kupit/kvartira/metro-{station}/"
    years = range(first_year, last_year + 1)

    def collect_year(year):
        """Walk pages 1..25 for a single year, stopping at the first empty page."""
        collected = []
        for page in range(1, 26):
            batch = scrap_page(url, {'page': page, 'builtYearMin': year, 'builtYearMax': year})
            if not batch:
                break
            collected += batch
        return collected

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(collect_year, year) for year in years]
        progress = tqdm(as_completed(pending), total=len(years), desc="Обработка годов")
        # Flatten the per-year lists in completion order.
        aparts = [apartment for done in progress for apartment in done.result()]

    frame = pd.DataFrame(aparts)
    # 'images' is left out of the duplicate key — presumably because it holds
    # lists, which drop_duplicates cannot hash.
    dedup_columns = frame.columns.difference(['images']).tolist()
    return frame.drop_duplicates(subset=dedup_columns)

In [None]:
# Empty list that is never populated in the visible cells.
# NOTE(review): the name suggests it was meant to collect per-station
# DataFrames — confirm, or remove the cell.
df_metro_stations = []


In [26]:
# Metro station slugs grouped by line colour. Each slug is inserted into the
# listing URL as `metro-<slug>` by find_apartments, so it must match the site's
# spelling exactly.
# "tekhnologicheskiy-institut" and "ploshchad-aleksandra-nevskogo" each appear
# under two lines; de-duplicate when iterating over all lines to avoid
# scraping the same slug twice.
stations = {
    "red_line": [
        "devyatkino", "grazhdanskiy-prospekt",  "akademicheskaya", "politekhnicheskaya", 
        "ploshchad-muzhestva", "lesnaya", "vyborgskaya", "ploshchad-lenina", "chernyshevskaya", 
        "ploshchad-vosstaniya", "vladimirskaya", "pushkinskaya", "tekhnologicheskiy-institut",  
        "baltiyskaya", "narvskaya", "kirovskiy-zavod",  "avtovo", "leninskiy-prospekt",  
        "prospekt-veteranov"
    ],

    "blue_line": [
        "parnas", "prospekt-prosveshcheniya", "ozerki", "udelnaya", "pionerskaya", 
        "chernaya-rechka", "petrogradskaya", "gorkovskaya", "nevskiy-prospekt", 
        "sennaya-ploshchad", "tekhnologicheskiy-institut", "frunzenskaya", "moskovskie-vorota", 
        "elektrosila", "park-pobedy", "moskovskaya", "zvezdnaya",  "kupchino"        
    ],

    "green_line": [
        "begovaya", "zenit", "primorskaya", "vasileostrovskaya", "gostiny-dvor", "mayakovskaya", 
        "ploshchad-aleksandra-nevskogo", "elizarovskaya", "lomonosovskaya", "proletarskaya", 
        "obukhovo", "rybatskoe"
    ],

    "orange_line": [
        "gorny-institut", "spasskaya", "dostoevskaya", "ligovskiy-prospekt", "novocherkasskaya", 
        "ploshchad-aleksandra-nevskogo", "ladozhskaya", "prospekt-bolshevikov", "ulitsa-dybenko"
    ],

    "purple_line": [ 
        "komendantsky-prospekt", "staraya-derevnya", "krestovskiy-ostrov", "chkalovskaya",
        # Was "zvenigorodskay" (final "a" missing).
        # NOTE(review): verify the exact slug against the site.
        "sportivnaya", "admiralteyskaya", "sadovaya", "zvenigorodskaya", "obvodniy-kanal", 
        "volkovskaya", "bukharestskaya", "mezhdunarodnaya", "prospekt-slavy", "dunaiskaya", 
        "shushary" 
    ]
}

In [24]:
# Smoke run: scrape every orange-line station for buildings dated 1800-1810
# and report how many listings came back. Only the last station's frame
# remains bound to `res` after the loop.
for metro in stations['orange_line']:
    param = dict(metro=metro, built_year_min=1800, built_year_max=1810)
    res = find_apartments(param)
    print(f"metr: {metro}\tsize={res.shape[0]}")

Обработка годов:   0%|          | 0/11 [00:00<?, ?it/s]

Обработка годов: 100%|██████████| 11/11 [00:02<00:00,  3.84it/s]


metr: gorny-institut	size=3


Обработка годов: 100%|██████████| 11/11 [00:04<00:00,  2.33it/s]


metr: spasskaya	size=29


Обработка годов: 100%|██████████| 11/11 [00:03<00:00,  3.64it/s]


metr: dostoevskaya	size=31


Обработка годов: 100%|██████████| 11/11 [00:01<00:00,  6.43it/s]


metr: ligovskiy-prospekt	size=7


Обработка годов: 100%|██████████| 11/11 [00:01<00:00,  7.04it/s]


metr: novocherkasskaya	size=0


Обработка годов: 100%|██████████| 11/11 [00:02<00:00,  3.95it/s]


metr: ploshchad-aleksandra-nevskogo	size=1


Обработка годов: 100%|██████████| 11/11 [00:02<00:00,  4.69it/s]


metr: ladozhskaya	size=0


Обработка годов: 100%|██████████| 11/11 [00:01<00:00,  5.76it/s]


metr: prospekt-bolshevikov	size=0


Обработка годов: 100%|██████████| 11/11 [00:01<00:00,  6.11it/s]

metr: ulitsa-dybenko	size=0





In [5]:
def get_image(url, timeout=30):
    """Download an image and open it with Pillow.

    Args:
        url: absolute URL, or a protocol-relative one (``//host/path``), which
            is completed with the ``https:`` scheme.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The object returned by ``Image.open``, or ``None`` when the HTTP
        request fails (the error is printed). Errors raised by ``Image.open``
        itself are not caught here and propagate to the caller.
    """
    # Only protocol-relative URLs get a scheme prepended. The former substring
    # test (`"https:" not in url`) mangled "http://..." into "https:http://..."
    # and skipped URLs that merely contained "https:" in their query string.
    if url.startswith("//"):
        url = "https:" + url

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        return Image.open(BytesIO(response.content))
    except requests.exceptions.RequestException as err:
        print(f"Ошибка при загрузке изображения: {err}")
        return None

# Downloading is I/O-bound, so a thread pool speeds it up.
def get_images_optimizer(urls: List[str], num_thread: int=5) -> "List[Image.Image]":
    """Download several images concurrently.

    Args:
        urls: image URLs, each handed to ``get_image``.
        num_thread: number of worker threads in the pool.

    Returns:
        The successfully opened images in the same order as ``urls``. URLs
        whose download returned ``None`` or raised are skipped; an exception
        is printed together with the offending URL.
    """
    images = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_thread) as executor:
        # Submit everything first so the downloads overlap, then collect in
        # submission order; as_completed would yield in nondeterministic
        # completion order and scramble the result relative to `urls`.
        submitted = [(url, executor.submit(get_image, url)) for url in urls]
        for url, future in submitted:
            try:
                image = future.result()
            except Exception as e:
                print(f"Ошибка при обработке {url}: {e}")
                continue
            if image is not None:
                images.append(image)
    return images

Сравним время загрузки изображений без многопоточности и с ней.

In [None]:
# Baseline timing: download the images of the first 10 listings one at a time.
# NOTE(review): `tmp` is not defined in any visible cell — presumably a
# DataFrame returned by find_apartments; define it explicitly before this cell.
for image_url_list in tqdm(tmp['images'][:10]):
    if image_url_list is None:  # listing without an image list
        continue
    for image_url in image_url_list:
        get_image(image_url)

In [None]:
# Same download as the sequential baseline, but each listing's images are
# fetched through the thread pool.
# NOTE(review): `tmp` is not defined in any visible cell — presumably a
# DataFrame returned by find_apartments; define it explicitly before this cell.
for image_url_list in tqdm(tmp['images'][:10]):
    if image_url_list is None:  # listing without an image list
        continue
    images = get_images_optimizer(image_url_list)

100%|██████████| 10/10 [00:08<00:00,  1.16it/s]


Как и ожидалось, многопоточная версия работает быстрее примерно в (кол-во потоков) раз.