# Квартирография

Геоаналитические сервисы помогают оценивать привлекательность различных локаций с точки зрения инвестиций, определять перспективы и возможности развития территорий и инфраструктуры, а также формировать оптимальную цену для сдачи и продажи недвижимости.

В рамках данной задачи Вам необходимо на основе данных о районе, доме, квартире и других географических сведений построить модель, позволяющую предсказывать стоимость квартиры за квадратный метр.

Для построения модели Вам будут доступны данные о совершённых сделках с известным значением целевой переменной price_target, а также тестовый набор, для которого необходимо сделать прогноз.

Целевая переменная анонимизирована, то есть не является исходной ценой за квадратный метр из сделки, но зависит только от нее.

In [1]:
# Suppress DeprecationWarning messages so they do not clutter the cell outputs.
from warnings import simplefilter

simplefilter('ignore', DeprecationWarning)

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import model_selection, preprocessing, metrics
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [3]:
!pip install catboost -q


[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor

## Загрузим и предобработаем данные

In [5]:
# Read the training sample (contains the target) and the test sample.
X = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Split the target off the training frame: `y` holds the price_target values,
# `X` keeps every remaining column.
y = X['price_target'].values
X = X.drop(columns='price_target')

In [6]:
# Preview the first rows of the training features.
X.head()

Unnamed: 0,region_name_cat,district_cat,corpus_cat,developer_cat,agreement_date,floor,square,rooms_4,location_logs_count_mean,location_depth,...,location_public_transport_stop_position_w_mean_distance,location_public_transport_platform_w_mean_distance,location_water_w_mean_distance,location_university_w_mean_distance,location_leisure_w_mean_distance,location_pop_shop_cnt,hc_name_cat,interior_cat,class_cat,stage_cat
0,Город,58,1331,91,2012-04-13,10.0,78.44,3,23.131066,13.0,...,1.089198,0.894947,0.772872,1.309514,0.853183,7.0,36,0.0,97865,27728
1,Город,75,1677,91,2013-09-16,2.0,34.15,1,14.090185,13.0,...,-999.0,1.063211,0.84013,-999.0,1.147596,0.0,372,32413.0,97865,70661
2,Пригород,48,316,10,2014-07-31,17.0,59.85,2,19.453795,13.0,...,0.806682,0.832622,-999.0,-999.0,0.905435,1.0,336,8977.0,97865,12638
3,Пригород,48,1409,91,2012-12-30,12.0,67.53,2,13.178136,13.0,...,-999.0,-999.0,1.322121,-999.0,1.263878,0.0,154,32413.0,97865,70661
4,Пригород,48,1590,91,2014-06-20,5.0,58.13,2,20.69177,13.0,...,0.896906,0.825295,0.996629,-999.0,1.024595,2.0,154,32413.0,97865,12638


In [8]:
def preprocess(df):
    """Clean a raw train/test frame and return the result as a new DataFrame.

    Steps applied:
      * ``rooms_4``: the labels 'студия' and '>=4' are mapped to 0 and 4, and
        the column is converted to a numeric dtype.
      * ``interior_cat``: every value is converted with ``str`` (a missing
        value becomes the string 'nan').
      * ``agreement_date``: only characters 5-6 of the date string are kept,
        i.e. the month of a 'YYYY-MM-DD' date. Missing dates stay missing.
      * Four distance columns are replaced by their absolute values.
      * Six columns that are not used downstream are dropped.

    The input frame is not modified; work happens on a copy, so the function
    can safely be called again on the same raw frame.

    Raises:
        KeyError: if one of the expected columns is absent.
        ValueError: if ``rooms_4`` holds a label that is neither numeric nor
            one of the two known text labels.
    """
    # Work on a copy so the caller's frame keeps all of its columns.
    df = df.copy()

    # Turn the two text labels into digits, then make the column numeric so
    # that means and scaling computed later operate on real numbers.
    df['rooms_4'] = pd.to_numeric(
        df['rooms_4'].replace({'студия': '0', '>=4': '4'})
    )

    df['interior_cat'] = df['interior_cat'].apply(str)

    # The .str accessor propagates missing values instead of raising on them.
    df['agreement_date'] = df['agreement_date'].str[5:7]

    # NOTE(review): the sample data shows -999.0 in distance columns, which
    # looks like a missing-value placeholder; abs() turns it into 999.0.
    # Confirm that this is the intended encoding.
    for column in (
        'location_public_transport_platform_w_mean_distance',
        'location_water_w_mean_distance',
        'location_university_w_mean_distance',
        'location_hotel_w_mean_distance',
    ):
        df[column] = df[column].abs()

    return df.drop(
        columns=[
            'location_depth',
            'location_depth.1',
            'location_depth.2',
            'location_logs_count_std',
            'location_flash_mean_mean',
            'location_hds_ratio_mean_mean',
        ]
    )

In [9]:
# Run the same cleaning routine over the training frame and then the test frame.
X, test = (preprocess(frame) for frame in (X, test))

In [10]:
# Columns that are passed to CatBoost as categorical features.
cat_features = [
    'region_name_cat',
    'district_cat',
    'corpus_cat',
    'developer_cat',
    'hc_name_cat',
    'interior_cat',
    'class_cat',
    'stage_cat',
    'agreement_date',
]
# Every other column of X is treated as numeric.
num_features = X.columns.drop(cat_features).tolist()

# Full feature list: numeric columns first, categorical columns after them.
features = [*num_features, *cat_features]

## Вариант 1: линейная регрессия на числовых признаках

Масштабируем данные и заполняем пропуски

In [11]:
# Learn the standardisation parameters from the numeric training columns only.
scaler = StandardScaler().fit(X[num_features].values)

# Apply that fitted scaler to both frames, keeping their row index and the
# numeric column names.
X_lr, test_lr = (
    pd.DataFrame(
        scaler.transform(frame[num_features].values),
        index=frame.index,
        columns=num_features,
    )
    for frame in (X, test)
)

In [12]:
# Fill gaps with the per-column means of the scaled TRAINING frame. The test
# frame gets the same fill values, so it carries no NaNs either and is imputed
# consistently with the data the linear model is fitted on.
lr_fill_values = X_lr.mean()
X_lr.fillna(lr_fill_values, inplace=True)
test_lr.fillna(lr_fill_values, inplace=True)

In [13]:
# Preview the first rows of the scaled numeric training features.
X_lr.head()

Unnamed: 0,floor,square,rooms_4,location_logs_count_mean,location_hotel_w_mean_distance,location_pop_bank_w_mean_distance,location_natural_cnt,location_office_w_mean_distance,location_barrier_w_mean_distance,location_amenity_bank_w_mean_distance,...,location_suburb_cnt,location_bridge_cnt,location_college_cnt,location_amenity_pharmacy_w_mean_distance,location_public_transport_stop_position_w_mean_distance,location_public_transport_platform_w_mean_distance,location_water_w_mean_distance,location_university_w_mean_distance,location_leisure_w_mean_distance,location_pop_shop_cnt
0,-0.008869,0.873582,1.601354,0.218035,-0.607427,0.382954,-0.345344,0.156928,0.079648,0.182073,...,-1.156723,-0.395954,0.114118,0.143006,0.279382,-0.094533,-0.683116,-1.638924,-0.297994,-0.295389
1,-1.244175,-0.880849,-0.837108,-2.885851,1.648089,-2.610447,-0.345344,-6.414886,-0.1271,0.183646,...,-1.156723,-0.395954,-0.781347,0.144055,-3.590145,-0.092737,-0.682972,0.609838,2.501597,-1.584789
2,1.072023,0.137189,0.382123,-1.044434,1.648089,-2.610447,-0.345344,0.154765,-1.082868,0.182702,...,-1.156723,-0.395954,-0.781347,0.141858,0.278289,-0.095199,1.464669,0.609838,0.198868,-1.400589
3,0.299957,0.441412,0.382123,-3.198973,1.648089,-2.610447,-0.345344,-6.414886,3.307557,-5.469667,...,-1.156723,-0.395954,-0.781347,-7.02708,-3.590145,10.562138,-0.681935,0.609838,3.607331,-1.584789
4,-0.780936,0.069056,0.382123,-0.619416,1.648089,-2.610447,-0.345344,0.160016,1.022377,-5.469667,...,-1.156723,-0.395954,-0.781347,0.14157,0.278638,-0.095277,-0.682635,0.609838,1.331969,-1.216389


In [14]:
def custom_mape(y_true, y_pred):
    """Return the mean absolute percentage error as a fraction (0.1 == 10 %).

    Both inputs are converted to float NumPy arrays first. Lists are therefore
    accepted, and pandas Series are compared element by element in positional
    order rather than being aligned on their index labels.

    No special handling is applied to zeros in ``y_true``: such entries divide
    by zero and make the result ``inf`` or ``nan``.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_true - y_pred) / y_true))

# Wrap the metric so it can be passed as `scoring=` to cross_val_score.
# NOTE(review): make_scorer defaults to greater_is_better=True. That is fine for
# reporting raw MAPE values as done here, but a search utility that maximises
# the score would then prefer a LARGER error; pass greater_is_better=False
# before using this scorer for model selection.
mape_scorer = make_scorer(custom_mape)

In [15]:
# Shuffled 3-fold splitter with a fixed seed.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Per-fold MAPE of a plain linear regression on the scaled numeric features.
cv_result = cross_val_score(
    estimator=LinearRegression(),
    X=X_lr,
    y=y,
    scoring=mape_scorer,
    cv=kfold,
)

In [16]:
# Show the per-fold scores together with their average.
cv_result, cv_result.mean()

(array([0.13969457, 0.14072732, 0.14266085]), 0.14102757979808514)

## Вариант 2: CatBoost на всех признаках

Заполним пропуски

In [17]:
# Impute numeric gaps with the TRAINING mean of each column. The same value is
# reused for the test frame, so test rows are filled consistently with the data
# the model is fitted on instead of with statistics of the test sample itself.
# The result is assigned back to the column; calling fillna(inplace=True) on a
# column selection is deprecated in recent pandas and has no effect under
# Copy-on-Write.
for f in num_features:
    train_mean = X[f].mean()
    X[f] = X[f].fillna(train_mean)
    test[f] = test[f].fillna(train_mean)

# Missing categorical values become an explicit empty-string category.
for f in cat_features:
    X[f] = X[f].fillna("")
    test[f] = test[f].fillna("")

In [18]:
# Sanity check: total number of missing values left in each frame.
X.isna().sum().sum(), test.isna().sum().sum()

(0, 0)

In [19]:
# Per-fold MAPE of a 300-tree CatBoost regressor on all features, evaluated on
# the `kfold` splitter with the custom MAPE scorer.
cv_result = cross_val_score(
    estimator=CatBoostRegressor(n_estimators=300, cat_features=cat_features),
    X=X,
    y=y,
    scoring=mape_scorer,
    cv=kfold,
)

Learning rate set to 0.209321
0:	learn: 8173.3663541	total: 238ms	remaining: 1m 11s
1:	learn: 6923.5777486	total: 326ms	remaining: 48.6s
2:	learn: 5961.6645260	total: 408ms	remaining: 40.3s
3:	learn: 5193.9337405	total: 488ms	remaining: 36.1s
4:	learn: 4642.4951796	total: 572ms	remaining: 33.7s
5:	learn: 4237.4559474	total: 644ms	remaining: 31.6s
6:	learn: 3931.7448180	total: 726ms	remaining: 30.4s
7:	learn: 3711.4749041	total: 802ms	remaining: 29.3s
8:	learn: 3551.9808316	total: 873ms	remaining: 28.2s
9:	learn: 3400.0202361	total: 947ms	remaining: 27.5s
10:	learn: 3291.6033044	total: 1.03s	remaining: 27s
11:	learn: 3206.9882072	total: 1.11s	remaining: 26.7s
12:	learn: 3131.2733896	total: 1.2s	remaining: 26.5s
13:	learn: 3065.6228552	total: 1.28s	remaining: 26.2s
14:	learn: 3009.0424210	total: 1.36s	remaining: 25.8s
15:	learn: 2958.4267798	total: 1.45s	remaining: 25.7s
16:	learn: 2912.7457370	total: 1.53s	remaining: 25.4s
17:	learn: 2877.2144736	total: 1.61s	remaining: 25.2s
18:	learn:

In [20]:
# Show the per-fold scores together with their average.
cv_result, cv_result.mean()

(array([0.04118279, 0.04151884, 0.04194395]), 0.04154852876118665)

## Обучаем модель на всех тренировочных данных и делаем прогноз

In [21]:
# Refit a 300-tree CatBoost regressor on the complete training set.
cbr = CatBoostRegressor(n_estimators=300).fit(X, y, cat_features=cat_features)

# Predict the target for every row of the test frame.
pred_cb = cbr.predict(test)

Learning rate set to 0.22317
0:	learn: 8127.2786386	total: 102ms	remaining: 30.4s
1:	learn: 6835.8785919	total: 204ms	remaining: 30.4s
2:	learn: 5809.1771259	total: 299ms	remaining: 29.6s
3:	learn: 5058.4810060	total: 397ms	remaining: 29.4s
4:	learn: 4527.0030818	total: 496ms	remaining: 29.3s
5:	learn: 4127.1453160	total: 596ms	remaining: 29.2s
6:	learn: 3833.6558261	total: 683ms	remaining: 28.6s
7:	learn: 3640.8551544	total: 763ms	remaining: 27.9s
8:	learn: 3451.7586767	total: 852ms	remaining: 27.5s
9:	learn: 3330.1206633	total: 954ms	remaining: 27.7s
10:	learn: 3238.3925632	total: 1.05s	remaining: 27.7s
11:	learn: 3154.7015963	total: 1.15s	remaining: 27.6s
12:	learn: 3088.6432339	total: 1.25s	remaining: 27.6s
13:	learn: 3021.1365092	total: 1.35s	remaining: 27.5s
14:	learn: 2973.5393140	total: 1.45s	remaining: 27.6s
15:	learn: 2927.9300208	total: 1.55s	remaining: 27.6s
16:	learn: 2890.7641604	total: 1.66s	remaining: 27.6s
17:	learn: 2847.0397859	total: 1.76s	remaining: 27.6s
18:	learn

In [22]:
# Write the predictions to a single-column CSV without the row index.
# NOTE(review): the column is named 'target_price', while the training target
# is 'price_target' - confirm which header the submission format expects.
submission = pd.DataFrame({'target_price': pred_cb})
submission.to_csv('baseline_catboost.csv', index=False)

## Что еще можно было попробовать:

* Кодирование категориальных признаков при помощи:
  * OneHotEncoding
  * MeanTargetEncoding

* Заполнение пропусков не средним, а медианой, уникальным значением или прогнозом вспомогательной модели

* Совершенно точно - подбор гиперпараметров модели (любой):
  * GridSearchCV - подбор по сетке
  * Optuna - байесовский алгоритм быстрого подбора гиперпараметров

* Конструирование новых признаков:
  * PolynomialFeatures
  * Придумывать признаки, исходя из смысла

* Снижение размерности:
  * Отбор признаков (SelectKBest, RFE, SelectFromModel)
  * Снижение размерности (PCA, KernelPCA)

* Использование нескольких моделей (возможно, взвешенное среднее или стекинг как результат):
  * KNN
  * LinearRegression, Ridge, Lasso, ElasticNet
  * SVR
  * DecisionTree, RandomForest, GradientBoosting
  * CatBoost, XGBoost, LightGBM
  * DeepLearning-модели