Тема: Сбор данных о рынке недвижимости в Москве для последующего анализа и предсказания цены

Спарсим данные с сайта https://domclick.ru/

1. Подключение библиотек для парсинга и сохранения данных

In [1]:
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
import time
import pandas as pd

1.1. Раздел настроек

In [2]:
# Base address of the site being scraped; page paths are appended to it.
url = 'https://domclick.ru'
# Query-string filters sent with the first search request.
params = dict(deal_type='sale', offer_type='flat', category='living')
# Upper bound on the number of follow-up ("next") pages visited by parse_pages.
MAX_PAGE = 100
# NOTE(review): not referenced by the code visible in this notebook; presumably
# meant to be passed as the `debug` flag of init_db -- confirm.
DEBUG = False

1.2. Запросы к серверу

In [3]:
def html_from_url(url, params, timeout=50):
    """Fetch a page with a browser-like GET request.

    Args:
        url: Address to request.
        params: Query-string parameters handed to ``requests.get``.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(text, status_code, msg)`` tuple. When the request itself fails,
        ``text`` is ``''``, ``status_code`` is ``None`` (the values of a blank
        ``requests.Response``) and ``msg`` describes the failure. When the
        server answers with a status other than 200, ``msg`` carries the HTTP
        status so the caller has something to report. Otherwise ``msg`` is
        ``''``.
    """
    msg = ''
    # Blank placeholder so the return statement works even if the request fails.
    response = requests.Response()
    headers = {'accept': '*/*',  # was the malformed media range '*/&*'
               'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                             '(KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
    except requests.exceptions.ConnectionError:
        msg = 'Проблемы с интернет соединением'
    except requests.RequestException as e:
        msg = str(e)
    else:
        # Progress trace: the final URL including the encoded query string.
        print(response.url)
        if response.status_code != 200:
            msg = f'HTTP {response.status_code} {response.reason}'
    return response.text, response.status_code, msg

1.3. Функции обработки страницы

In [4]:
def get_flat_clear_price(tag):
    """Split a price element's text into a numeric amount and a currency sign.

    All space characters are removed from ``tag.text``; the last remaining
    character is taken as the currency symbol and everything before it is
    converted to ``float``.

    Returns:
        ``(amount, currency)``.
    """
    compact = ''.join(tag.text.split(' '))
    amount = float(compact[:-1])
    currency = compact[-1]
    return amount, currency

def get_flat_clear_per_square_meter_price(tag):
    """Parse the price-per-square-metre element into a ``float``.

    The ``' ₽/м²'`` suffix is stripped from ``tag.text`` first, then the
    remaining space characters are removed before the conversion.
    """
    without_unit = tag.text.replace(' ₽/м²', '')
    return float(''.join(without_unit.split(' ')))

def get_info_flat(tags):
    """Extract rooms, area and floor data from a flat's info items.

    Only the first three elements of ``tags`` are read, by position:

    0. rooms -- the text ``'студия'`` maps to ``0``; otherwise the part before
       the first ``'-'`` is parsed as an integer;
    1. area -- the first space-separated token, with a decimal comma accepted;
    2. floors -- the first and last space-separated tokens are parsed as the
       flat's floor and the building's floor count.

    Args:
        tags: Sequence of elements exposing a ``.text`` attribute.

    Returns:
        ``(rooms, square, floor, house_floor)``; values whose item is absent
        from ``tags`` stay ``None``.
    """
    rooms = square = floor = house_floor = None
    # Positional access replaces the former index loop with an if/elif ladder.
    texts = [tag.text for tag in tags[:3]]
    if texts:
        rooms = 0 if texts[0] == 'студия' else int(texts[0].split('-')[0])
    if len(texts) > 1:
        square = float(texts[1].split(' ')[0].replace(',', '.'))
    if len(texts) > 2:
        floor_parts = texts[2].split(' ')
        floor = int(floor_parts[0])
        house_floor = int(floor_parts[-1])
    return rooms, square, floor, house_floor

def get_info_metro(tag):
    """Extract the metro line and station from a listing's location element.

    Args:
        tag: Element exposing ``find(name, attrs)``, or ``None`` when the
            listing has no such element.

    Returns:
        ``(metro_line, metro_station)``. Both are ``''`` when ``tag`` is
        ``None`` or contains no line marker. The station is ``''`` when its
        span is missing. The station name is the text before the first comma.
    """
    metro_line = ''
    metro_station = ''
    if tag is None:
        # Previously this raised AttributeError on listings without the block.
        return metro_line, metro_station

    line_tag = tag.find('span', {'class': 'YBcx8'})
    if line_tag is not None:
        metro_line = line_tag.get('title', '')
        station_tag = tag.find('span', {'class': 'PAT-G'})
        if station_tag is not None:
            metro_station = station_tag.text.split(',')[0]
    return metro_line, metro_station

def parse_one_page_sales(url, tag, params, db_collection):
    """Download one search-results page and store every listing found on it.

    Args:
        url: Site base address; also prefixed to each listing's relative link.
        tag: Path (plus query, for follow-up pages) appended to ``url``.
        params: Query parameters forwarded to ``html_from_url``.
        db_collection: Collection handed to ``save_flat`` for each listing.

    Returns:
        The ``href`` of the page's ``rel="next"`` link, or ``None`` when the
        page could not be loaded or has no such link.
    """
    page_url = url + tag
    html_doc, status_code, msg = html_from_url(page_url, params)
    if status_code != 200:
        # Report the page that was actually requested, not just the site root.
        print(f'Ошибка загрузки страницы {page_url}: {msg}')
        return None

    soup = BeautifulSoup(html_doc, 'html.parser')
    # The CSS class names below are hard-coded selectors for the listing markup.
    for item in soup.find_all('a', {'class': '_3_q0k'}):
        href = f"{url}{item['href']}"
        value, currency = get_flat_clear_price(item.find('div', {'class': '_2LpFR'}))
        price_block = item.find('div', {'class': '_2OGaT'})
        # The per-square-metre price is the first direct <div> child.
        per_square_meter_price = get_flat_clear_per_square_meter_price(
            price_block.find_all('div', recursive=False)[0])
        info_list = item.find('ul', {'class': '_1I9Cg'})
        rooms, square, floor, house_floor = get_info_flat(
            info_list.find_all('li', {'class': '_2aoTm'}))
        metro_line, metro_station = get_info_metro(item.find('div', {'class': 'sT_AF'}))
        # The absolute listing URL doubles as the document's primary key.
        save_flat({'_id': href, 'value': value, 'currency': currency,
                   'psm_price': per_square_meter_price,
                   'rooms': rooms, 'square': square, 'floor': floor,
                   'house_floor': house_floor,
                   'metro_line': metro_line, 'metro_station': metro_station},
                  db_collection)

    next_link = soup.find('a', {'rel': 'next'})
    return None if next_link is None else next_link['href']

def parse_pages(url, params, debug):
    """Crawl the search results page by page and store the listings.

    The first page is requested at ``'/search'`` with ``params``. After that
    the "next" link returned by each page is followed, with an empty params
    value, until no link is returned or ``MAX_PAGE`` follow-up pages have been
    processed. There is a five-second pause after every follow-up page.

    Args:
        url: Site base address.
        params: Query parameters for the first request.
        debug: Forwarded to ``init_db``.
    """
    collection = init_db(debug)
    link = parse_one_page_sales(url, '/search', params, collection)
    followed = 0
    while True:
        if link is None or followed >= MAX_PAGE:
            break
        link = parse_one_page_sales(url, link, '', collection)
        followed += 1
        time.sleep(5)


1.4. Функции для работы с базой

In [5]:
def save_flat(item, db_collection):
    """Insert ``item`` into ``db_collection`` unless its ``_id`` is already there.

    Args:
        item: Document to store; must contain an ``'_id'`` key.
        db_collection: Object providing ``count_documents`` and ``insert_one``.
    """
    matches = db_collection.count_documents({'_id': item['_id']})
    if matches != 0:
        # A document with this key exists; leave it untouched.
        return
    db_collection.insert_one(item)
    
def init_db(debug):
    """Connect to the local MongoDB server and return the ``flats`` collection.

    Args:
        debug: When truthy, the collection is dropped before being returned.

    Returns:
        The ``flats`` collection of the ``test`` database.
    """
    flats_collection = MongoClient('mongodb://127.0.0.1:27017')['test'].flats
    if debug:
        flats_collection.drop()
    return flats_collection
                     
def get_flats():
    """Return every stored flat whose ``currency`` is ``'₽'`` as a list."""
    # debug=False: reading must never drop the collection.
    rouble_flats = init_db(False).find({'currency': '₽'})
    return [document for document in rouble_flats]

1.5. Вызов функции парсинга

In [6]:
# Run the crawl. The debug flag now comes from the DEBUG setting instead of a
# hard-coded False, so the settings cell actually controls whether the
# collection is dropped first.
parse_pages(url, params, DEBUG)

2. Построение модели предсказания цены

2.1. Подключение библиотек предсказания

In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score as r2
from sklearn.ensemble import RandomForestRegressor as RF

import warnings

# Suppress every warning for the rest of the session (keeps cell output clean).
warnings.filterwarnings('ignore')

2.2. Функции подготовки данных

In [8]:
def create_dummies(df, columns):
    """One-hot encode ``df`` and make sure every name in ``columns`` exists.

    Args:
        df: Source frame; it is not modified, because ``pd.get_dummies``
            returns a new frame.
        columns: Column names that must be present in the result.

    Returns:
        The encoded frame. Names from ``columns`` that are absent after
        encoding are appended as all-zero columns, in the order they appear in
        ``columns``. The former set-based iteration made that order vary
        between runs.
    """
    df = pd.get_dummies(df)
    for name in columns:
        if name not in df.columns:
            df[name] = 0
    return df

def get_noncategorical_columns(df):
    """Return the names of the columns whose dtype is not ``object``."""
    kept = []
    for column in df.columns:
        if df[column].dtype.name == 'object':
            continue
        kept.append(column)
    return kept

In [9]:
def get_feats(df):
    """Return the model's input columns.

    These are all non-object columns of ``df`` except the two price columns
    ``'psm_price'`` and ``'value'``. ``ValueError`` is raised if either price
    column is missing from the non-object columns.
    """
    feature_names = get_noncategorical_columns(df)
    for price_column in ('psm_price', 'value'):
        feature_names.remove(price_column)
    return feature_names

def prepare_data(df):
    """Return ``df`` one-hot encoded, with all of its feature columns present."""
    required_columns = get_feats(df)
    return create_dummies(df, required_columns)

2.3. Загрузка данных из базы

In [10]:
# Load the stored flats and build a frame without the '_id' column.
flats = get_flats()
df = pd.DataFrame(flats).drop(columns=['_id'])

In [11]:
# Print the column names, non-null counts and dtypes of the loaded data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1980 entries, 0 to 1979
Data columns (total 9 columns):
currency         1980 non-null object
floor            1980 non-null int64
house_floor      1980 non-null int64
metro_line       1980 non-null object
metro_station    1980 non-null object
psm_price        1980 non-null float64
rooms            1980 non-null int64
square           1980 non-null float64
value            1980 non-null float64
dtypes: float64(3), int64(3), object(3)
memory usage: 139.3+ KB


In [12]:
# One-hot encode the object columns of the frame.
df = prepare_data(df)
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
train, valid = train_test_split(df, test_size=0.3, random_state=62)
# Bare expression: the notebook shows both shapes as the cell output.
train.shape, valid.shape

((1386, 197), (594, 197))

In [13]:
# Summary statistics of the training split.
train.describe()

Unnamed: 0,floor,house_floor,psm_price,rooms,square,value,currency_₽,metro_line_,metro_line_Арбатско-Покровская,metro_line_Бутовская,...,metro_station_Чкаловская,metro_station_Шаболовская,metro_station_Шелепиха,metro_station_Шипиловская,metro_station_Щелковская,metro_station_Щукинская,metro_station_Электрозаводская,metro_station_Юго-Западная,metro_station_Южная,metro_station_Ясенево
count,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,...,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0,1386.0
mean,12.714286,22.128427,353001.6,2.739538,108.698947,46299320.0,1.0,0.237374,0.064214,0.006494,...,0.005772,0.005772,0.005772,0.000722,0.003608,0.007937,0.000722,0.007937,0.002886,0.002165
std,12.996608,17.251362,270676.6,1.307842,64.702614,62806250.0,0.0,0.425627,0.245221,0.080349,...,0.075781,0.075781,0.075781,0.026861,0.059976,0.088765,0.026861,0.088765,0.053663,0.046491
min,1.0,2.0,132.0,0.0,14.0,6950.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,9.0,176350.8,2.0,60.15,12399250.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,17.0,264408.5,3.0,98.0,22517470.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,16.0,26.0,436052.2,3.0,140.15,54725000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,80.0,97.0,2296686.0,16.0,596.0,524956800.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Summary statistics of the validation split, for comparison with training.
valid.describe()

Unnamed: 0,floor,house_floor,psm_price,rooms,square,value,currency_₽,metro_line_,metro_line_Арбатско-Покровская,metro_line_Бутовская,...,metro_station_Чкаловская,metro_station_Шаболовская,metro_station_Шелепиха,metro_station_Шипиловская,metro_station_Щелковская,metro_station_Щукинская,metro_station_Электрозаводская,metro_station_Юго-Западная,metro_station_Южная,metro_station_Ясенево
count,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,...,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0,594.0
mean,12.570707,21.558923,338521.4,2.658249,102.619175,41810140.0,1.0,0.225589,0.058923,0.003367,...,0.010101,0.003367,0.010101,0.001684,0.001684,0.008418,0.003367,0.006734,0.001684,0.0
std,12.293456,16.545631,253816.7,1.264127,61.195782,58355410.0,0.0,0.418322,0.235678,0.057977,...,0.100079,0.057977,0.100079,0.04103,0.04103,0.091437,0.057977,0.081853,0.04103,0.0
min,1.0,2.0,211.0,0.0,16.8,5290.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,9.0,175781.5,2.0,59.2,12325000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,17.0,239811.0,3.0,92.0,19850000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17.0,25.0,413023.2,3.0,134.275,45750000.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,77.0,97.0,1749856.0,10.0,522.0,550000000.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


2.4. Построение модели

In [15]:
# Model inputs: every non-object column of the encoded frame except the two
# price columns ('psm_price' and 'value').
feats = get_feats(df)
model = RF(random_state=128, n_estimators=200, max_depth=12, max_features= 35, min_samples_leaf=2)
# Fit the random forest to predict the total price ('value').
model.fit(train.loc[:, feats], train['value'])
# R^2 on the data the model was fitted on.
pred_train = model.predict(train.loc[:, feats])
score_r2 = r2(train['value'], pred_train)
# Bare expression: the notebook shows the score as the cell output.
score_r2

0.835614836662497

2.5. Предсказание на дополнительной выборке для оценки корректности модели

In [16]:
# R^2 on the held-out validation split (overwrites the training score).
pred_valid =  model.predict(valid.loc[:, feats])
score_r2 = r2(valid['value'], pred_valid)
# Bare expression: the notebook shows the score as the cell output.
score_r2

0.7085451701120726

In [17]:
# Rank the features by the fitted forest's importances, largest first.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature importances:")
# f is the zero-based rank and idx the feature's position in `feats`.
for f, idx in enumerate(indices):
    print("{:2d}. feature '{:5s}' ({:.4f})".format(f + 1, feats[idx], importances[idx]))

Feature importances:
 1. feature 'square' (0.4593)
 2. feature 'rooms' (0.1711)
 3. feature 'house_floor' (0.0991)
 4. feature 'floor' (0.0416)
 5. feature 'metro_station_Парк культуры' (0.0243)
 6. feature 'metro_station_Кропоткинская' (0.0198)
 7. feature 'metro_line_Сокольническая' (0.0143)
 8. feature 'metro_station_Тверская' (0.0135)
 9. feature 'metro_line_Московское центральное кольцо' (0.0115)
10. feature 'metro_line_' (0.0114)
11. feature 'metro_line_Кольцевая' (0.0112)
12. feature 'metro_station_Чеховская' (0.0106)
13. feature 'metro_station_Фрунзенская' (0.0098)
14. feature 'metro_line_Калининская' (0.0095)
15. feature 'metro_station_' (0.0094)
16. feature 'metro_station_Спортивная' (0.0086)
17. feature 'metro_station_Ростокино' (0.0069)
18. feature 'metro_station_Ломоносовский проспект' (0.0056)
19. feature 'metro_line_Замоскворецкая' (0.0048)
20. feature 'metro_station_Смоленская' (0.0040)
21. feature 'metro_station_Деловой центр' (0.0039)
22. feature 'metro_station_Маяков