In [1]:
import pandas as pd
import numpy as np
import matplotlib as mlp
import seaborn as sns
import plotly as plt

In [2]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = pd.read_csv("data_step3v1.csv")

In [5]:
data.head()

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,sub_area,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,43,27.0,4.0,12.558974,1.827121,1983.0,1.909804,6.399301,2.107025,15.594247,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,34,19.0,3.0,12.558974,1.827121,1958.0,1.909804,6.399301,2.107025,15.864842,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,43,29.0,2.0,12.558974,1.827121,1958.0,1.909804,6.399301,2.107025,15.613141,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,89,50.0,9.0,12.558974,1.827121,2000.0,1.909804,6.399301,2.107025,15.914449,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,77,77.0,4.0,12.558974,1.827121,1915.0,1.909804,6.399301,2.107025,16.091227,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


Так как нам нужно собрать модель с наименьшим количеством признаков, причём таких, которые может ввести пользователь модели, не располагающий большим количеством данных, попробуем выбрать признаки, которые вероятнее всего будут известны покупателю.

In [6]:
# Keep only the features a prospective buyer is likely to know, plus the
# target column `log_price_doc`.
# The explicit .copy() makes `data` an independent frame, so the in-place
# corrections applied to it in later cells do not operate on a selection of
# the originally loaded frame (which is what triggers SettingWithCopyWarning).
data = data[[
    'full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room',
    'kitch_sq', 'state', 'sub_area', 'metro_min_avto', 'mkad_km', 'usdrub',
    'salary', 'mortgage_rate', 'unemployment', 'log_price_doc',
    'product_type_OwnerOccupier',
]].copy()

In [7]:
# Separate the target vector from the feature matrix.
Y = data['log_price_doc']
X = data.drop(columns=['log_price_doc'])

In [8]:
# Cross-validation splitter with 5 time-series splits; reused by the grid
# search and by cross_validate below.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumes the
# rows of `data` are in chronological order — confirm against the source CSV.
splitter = TimeSeriesSplit(n_splits=5)

In [9]:
# Model selection over scaler type, regression type and regularisation strength.
# NOTE: the first step is named 'imputer' although it holds a scaler; the name
# is kept because later cells read `best_params_['imputer']`.
pipe = Pipeline([('imputer', StandardScaler()), ('regression', Lasso(max_iter=100000))])

# A grid entry for 'regression' replaces the whole pipeline step, so the raised
# iteration cap must be repeated on the Lasso candidate; a bare Lasso() would
# silently fall back to the default max_iter.
parameters = {'imputer': [StandardScaler(), MinMaxScaler()],
              'regression': [Lasso(max_iter=100000), Ridge()],
              'regression__alpha': np.logspace(-5, 5, 11)}

# Scores are negated MSE on the (log) target; folds come from `splitter`.
select_model = GridSearchCV(pipe, parameters, scoring='neg_mean_squared_error', cv=splitter)
select_model.fit(X, Y)

In [10]:
print(select_model.best_estimator_)

Pipeline(steps=[('imputer', MinMaxScaler()), ('regression', Ridge(alpha=10.0))])


In [11]:
# Rebuild the winning pipeline from the refitted best estimator rather than
# from `best_params_`: the object stored under best_params_['regression'] is
# the candidate taken from the grid, and whether it carries the tuned alpha
# depends on the scikit-learn version (newer versions clone parameter values).
# The steps of `best_estimator_` always have the selected hyper-parameters;
# cross_validate clones the pipeline, so reusing fitted steps here is safe.
pipe = Pipeline([('scaler', select_model.best_estimator_['imputer']),
                 ('regression', select_model.best_estimator_['regression'])])

In [12]:
# Re-score the selected pipeline on the same folds, keeping train scores too.
cv_result_pipe = cross_validate(
    pipe, X, Y,
    cv=splitter,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# The scorer returns negated MSE, so flip the sign back before reporting.
train_error = -cv_result_pipe['train_score'].mean().round(3)
test_error = -cv_result_pipe['test_score'].mean().round(3)
print(f"Среднее MSLE на тренировочных фолдах: {train_error}")
print(f"Среднее MSLE на тестовых фолдах: {test_error}")

Среднее MSLE на тренировочных фолдах: 0.309
Среднее MSLE на тестовых фолдах: 0.251


Мы получили результат не хуже того, который выбивали на полном датасете. Это не может не радовать. Посмотрим на лассо.

In [13]:
# Lasso-only model selection over scaler type and regularisation strength.
# NOTE: the first step is named 'imputer' although it holds a scaler; the name
# is kept because later cells read `best_params_['imputer']`.
pipe_lasso = Pipeline([('imputer', StandardScaler()), ('regression', Lasso(max_iter=100000))])

# A grid entry for 'regression' replaces the whole pipeline step, so the raised
# iteration cap must be repeated here; a bare Lasso() would silently fall back
# to the default max_iter.
parameters = {'imputer': [StandardScaler(), MinMaxScaler()],
              'regression': [Lasso(max_iter=100000)],
              'regression__alpha': np.logspace(-5, 5, 11)}

# Scores are negated MSE on the (log) target; folds come from `splitter`.
select_model_lasso = GridSearchCV(pipe_lasso, parameters, scoring='neg_mean_squared_error', cv=splitter)
select_model_lasso.fit(X, Y)

In [14]:
print(select_model_lasso.best_estimator_)

Pipeline(steps=[('imputer', StandardScaler()),
                ('regression', Lasso(alpha=0.1))])


In [15]:
# Rebuild the winning Lasso pipeline from the refitted best estimator rather
# than from `best_params_`: the object stored under best_params_['regression']
# is the candidate taken from the grid, and whether it carries the tuned alpha
# depends on the scikit-learn version (newer versions clone parameter values).
# cross_validate clones the pipeline, so reusing fitted steps here is safe.
pipe_lasso = Pipeline([('scaler', select_model_lasso.best_estimator_['imputer']),
                       ('regression', select_model_lasso.best_estimator_['regression'])])

In [16]:
# Re-score the selected Lasso pipeline on the same folds, keeping train scores too.
cv_result_pipe = cross_validate(
    pipe_lasso, X, Y,
    cv=splitter,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# The scorer returns negated MSE, so flip the sign back before reporting.
train_error = -cv_result_pipe['train_score'].mean().round(3)
test_error = -cv_result_pipe['test_score'].mean().round(3)
print(f"Среднее MSLE на тренировочных фолдах: {train_error}")
print(f"Среднее MSLE на тестовых фолдах: {test_error}")

Среднее MSLE на тренировочных фолдах: 0.331
Среднее MSLE на тестовых фолдах: 0.297


In [17]:
# Names of the features whose coefficient in the best Lasso model is non-zero.
lasso_coefs = select_model_lasso.best_estimator_['regression'].coef_
res = [column for column, weight in zip(X.columns, lasso_coefs) if weight != 0]

In [18]:
res

['full_sq', 'num_room', 'sub_area']

Модель Лассо выбрала только 3 признака, но показала результат хуже, чем Ридж. Попробуем перебрать признаки руками.

In [19]:
def get_redundant_pairs(data):
    """Return the column-name pairs on and below the diagonal of the correlation matrix.

    For every column position i the pairs (cols[i], cols[j]) with j <= i are
    produced, i.e. self-pairs and one orientation of every two-column pair.
    """
    names = data.columns
    return {
        (names[row], names[col])
        for row in range(data.shape[1])
        for col in range(row + 1)
    }


def get_top_abs_correlations(data, n=5):
    """Return the `n` largest absolute pairwise correlations between columns of `data`.

    The result is a Series indexed by (column, column) pairs, sorted in
    descending order, with the pairs from `get_redundant_pairs` removed.
    """
    pairwise = data.corr().abs().unstack()
    ranked = pairwise.drop(labels=get_redundant_pairs(data)).sort_values(ascending=False)
    return ranked.iloc[:n]

print("Top Absolute Correlations")
print(get_top_abs_correlations(data, 50))

Top Absolute Correlations
usdrub          mortgage_rate                 0.684690
salary          unemployment                  0.668092
metro_min_avto  mkad_km                       0.666116
usdrub          salary                        0.629110
sub_area        metro_min_avto                0.509787
state           product_type_OwnerOccupier    0.485826
salary          mortgage_rate                 0.470063
mortgage_rate   unemployment                  0.420291
sub_area        log_price_doc                 0.395936
floor           max_floor                     0.373873
usdrub          unemployment                  0.352825
num_room        log_price_doc                 0.347790
full_sq         num_room                      0.334760
sub_area        product_type_OwnerOccupier    0.320548
full_sq         log_price_doc                 0.271408
metro_min_avto  product_type_OwnerOccupier    0.235813
max_floor       build_year                    0.233876
build_year      product_type_OwnerOccup

Какой-то критичной зависимости не видно. 

In [20]:
data.describe()

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
count,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0
mean,54.214269,34.563293,7.670803,12.558974,1970.289144,1.909804,6.399301,2.107025,15.609483,4.961273,6.274764,38.029219,57670.694703,12.532303,0.014675,15.609483,0.361754
std,38.031487,46.488769,5.30539,5.595529,29.8661,0.705433,23.408855,0.655698,0.239373,6.553515,5.142492,9.553469,5290.487057,0.652565,0.003087,0.604574,0.480516
min,0.0,0.0,0.0,0.0,1691.0,0.0,0.0,1.0,13.815512,0.0,0.013626,28.8082,44898.7,11.4,0.008,11.512935,0.0
25%,38.0,22.0,3.0,9.0,1958.0,1.909804,5.0,2.0,15.472018,1.72128,2.633404,31.9267,55485.2,12.22,0.015,15.371548,0.0
50%,49.0,35.0,7.0,12.558974,1969.0,1.909804,6.399301,2.107025,15.587766,2.803299,5.46751,34.3768,61208.0,12.3,0.015,15.65199,0.0
75%,63.0,38.0,11.0,16.0,1996.0,2.0,8.0,2.107025,15.74455,4.831733,8.184752,38.5907,61208.0,12.62,0.017,15.931766,1.0
max,5326.0,7478.0,77.0,117.0,2018.0,19.0,2014.0,33.0,16.610747,61.438472,53.277832,69.4666,64310.0,14.71,0.017708,18.526041,1.0


Смущает максимальное значение площади, посмотрим поближе.
Все эти строки мне не нравятся, т.к. соотношение общей площади, жилой площади, площади кухни и цены у них неадекватное. Просто удалим их, много не потеряем.

In [21]:
data[data['full_sq'] > 500]

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
2764,729,44.0,12.0,12.558974,1983.0,1.909804,6.399301,2.107025,16.094773,2.685885,2.556467,32.8995,48830.4,12.2,0.008,16.399508,0.0
3513,5326,22.0,13.0,12.558974,1983.0,1.909804,6.399301,2.107025,15.41331,4.907833,3.666678,31.646,48830.4,12.26,0.008,15.742503,1.0
5958,634,38.0,3.0,12.558974,1983.0,1.909804,6.399301,2.107025,15.579179,1.746993,1.053626,30.2308,55485.2,12.69,0.017,16.137898,0.0
18367,634,35.16718,3.0,17.0,2000.0,2.0,0.0,2.107025,15.472018,4.721045,5.946908,34.8408,61208.0,12.3,0.015,15.642187,1.0
22818,637,637.0,18.0,19.0,2016.0,2.0,10.0,1.0,15.585972,1.482746,13.917815,38.5907,61208.0,12.29,0.015,15.368408,1.0
23744,603,35.16718,16.0,18.0,1958.0,2.0,1.0,2.107025,15.500634,23.43747,18.223918,40.9616,61208.0,12.44,0.015,15.698435,1.0


In [22]:
# Drop the listings with an implausibly large total area. The rows inspected
# above were selected with `full_sq > 500`, so keep exactly their complement
# (`<= 500`); a strict `< 500` would also discard rows equal to 500 that were
# never reviewed.
# .copy() detaches the result so later in-place corrections are unambiguous.
data = data[data['full_sq'] <= 500].copy()

In [23]:
data[data['life_sq'] > data['full_sq']]

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
1087,44,281.00000,6.000000,12.558974,1983.000000,1.909804,6.399301,2.107025,15.594247,2.590241,1.422391,30.1175,48830.4,11.83,0.008000,15.640060,0.0
1194,9,44.00000,3.000000,12.558974,1983.000000,1.909804,6.399301,2.107025,15.460857,2.737264,0.682752,30.0675,48830.4,11.83,0.008000,15.656060,0.0
1832,18,38.00000,7.670803,12.558974,1958.000000,1.909804,6.399301,2.107025,15.523197,2.334253,3.388578,29.6495,48830.4,12.09,0.008000,15.464169,0.0
1858,30,178.00000,4.000000,12.558974,1958.000000,1.909804,6.399301,2.107025,16.035762,2.236106,10.567644,29.6549,48830.4,12.09,0.008000,15.590463,0.0
1998,5,40.00000,5.000000,12.558974,1958.000000,1.909804,6.399301,2.107025,15.417157,1.882826,3.433968,29.3530,48830.4,12.09,0.008000,15.568183,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30102,34,35.16718,12.000000,17.000000,1912.195364,1.000000,0.000000,2.107025,15.277607,15.839451,11.751277,54.2467,64310.0,13.46,0.017708,15.167717,1.0
30160,34,35.16718,3.000000,3.000000,1912.195364,1.000000,1.000000,1.000000,14.795261,32.400256,36.665459,56.2170,64310.0,13.46,0.017708,14.688602,1.0
30182,27,35.16718,11.000000,12.000000,1912.195364,1.000000,1.000000,1.000000,14.970676,18.144916,13.333998,56.2170,64310.0,13.46,0.017708,14.739714,1.0
30234,34,35.16718,2.000000,3.000000,1958.000000,1.000000,1.000000,1.000000,14.795261,32.400256,36.665459,54.3715,64310.0,13.46,0.017708,14.500122,1.0


Тут тоже неадекватные значения. Для таких строк установим значение жилой площади исходя из среднего по датасету отношения жилой площади к общей.

In [24]:
data['life_sq'].mean() / data['full_sq'].mean()

0.6403594841014485

In [25]:
# Where the living area exceeds the total area, replace it with
# total area * (mean living area / mean total area).
# The mask and the ratio are computed before the assignment, i.e. from the
# uncorrected values, exactly as the right-hand side was evaluated before.
life_exceeds_full = data['life_sq'] > data['full_sq']
life_to_full_ratio = data['life_sq'].mean() / data['full_sq'].mean()

# A single .loc assignment replaces the chained `data[col][mask] = ...`, which
# raises SettingWithCopyWarning and does not modify `data` at all when pandas
# Copy-on-Write is enabled.
data.loc[life_exceeds_full, 'life_sq'] = data.loc[life_exceeds_full, 'full_sq'] * life_to_full_ratio

In [26]:
data[data['kitch_sq'] > 200]

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
10391,79,41.0,5.0,17.0,2013.0,3.0,2013.0,1.0,15.461166,3.121542,7.371716,32.9772,55485.2,12.41,0.017,15.80081,1.0
11542,38,23.0,11.0,14.0,1971.0,2.0,620.0,2.0,15.513687,1.861349,0.930193,31.7158,55485.2,12.37,0.017,15.65606,0.0
13113,31,19.0,5.0,1.0,1958.0,1.0,1970.0,3.0,15.846923,2.140336,6.072456,32.8727,55485.2,11.93,0.017,15.110238,0.0
21397,43,43.0,3.0,1.0,2014.0,1.0,2014.0,1.0,15.4364,7.450495,3.444627,35.9408,61208.0,12.22,0.015,15.150828,1.0
28758,31,16.0,2.0,12.0,1972.0,1.0,1974.0,3.0,15.252663,28.671244,20.174858,58.198,64310.0,14.71,0.017708,13.815512,0.0


С площадью кухни сделаем то же самое.

In [27]:
data['kitch_sq'].mean() / data['full_sq'].mean()

0.11863387800630142

In [28]:
# Kitchen areas above 200 are treated as data-entry errors and replaced with
# total area * (mean kitchen area / mean total area).
# The mask and the ratio are computed before the assignment, i.e. from the
# uncorrected values, exactly as the right-hand side was evaluated before.
kitchen_too_large = data['kitch_sq'] > 200
kitchen_to_full_ratio = data['kitch_sq'].mean() / data['full_sq'].mean()

# A single .loc assignment replaces the chained `data[col][mask] = ...`, which
# raises SettingWithCopyWarning and does not modify `data` at all when pandas
# Copy-on-Write is enabled.
data.loc[kitchen_too_large, 'kitch_sq'] = data.loc[kitchen_too_large, 'full_sq'] * kitchen_to_full_ratio

In [29]:
data.describe()

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
count,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0,30465.0
mean,53.94387,33.98061,7.67018,12.558438,1970.28582,1.909795,6.118439,2.107061,15.609484,4.96097,6.274511,38.029843,57670.998467,12.532336,0.014676,15.609439,0.361694
std,21.823717,16.891919,5.305086,5.595814,29.867059,0.705502,4.185703,0.655732,0.239376,6.553236,5.142203,9.554158,5290.391868,0.652621,0.003087,0.604607,0.480499
min,0.0,0.0,0.0,0.0,1691.0,0.0,0.0,1.0,13.815512,0.0,0.013626,28.8082,44898.7,11.4,0.008,11.512935,0.0
25%,38.0,21.0,3.0,9.0,1958.0,1.909804,5.0,2.0,15.472018,1.721263,2.633404,31.9267,55485.2,12.22,0.015,15.371548,0.0
50%,49.0,34.0,7.0,12.558974,1969.0,1.909804,6.399301,2.107025,15.587766,2.803299,5.46751,34.3768,61208.0,12.3,0.015,15.65199,0.0
75%,63.0,38.0,11.0,16.0,1996.0,2.0,8.0,2.107025,15.74455,4.830788,8.184752,38.5907,61208.0,12.62,0.017,15.931766,1.0
max,461.0,407.0,77.0,117.0,2018.0,19.0,123.0,33.0,16.610747,61.438472,53.277832,69.4666,64310.0,14.71,0.017708,18.526041,1.0


In [30]:
data[data['full_sq'] < 5]

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,sub_area,metro_min_avto,mkad_km,usdrub,salary,mortgage_rate,unemployment,log_price_doc,product_type_OwnerOccupier
11347,1,0.640359,10.0,17.0,2013.0,1.0,1.0,1.0,15.201596,7.288461,7.753876,31.8246,55485.2,12.37,0.017,15.026789,1.0
16267,1,1.0,1.0,1.0,1958.0,1.0,1.0,3.0,15.461166,3.121542,7.371716,36.1726,61208.0,12.29,0.015,15.310076,1.0
16767,1,1.0,1.0,1.0,1958.0,1.0,1.0,3.0,15.461166,3.239717,7.529248,35.1734,61208.0,12.29,0.015,15.872269,1.0
17180,1,1.0,1.0,1.0,1958.0,1.0,1.0,1.0,15.86643,3.477443,3.130875,35.6511,61208.0,11.97,0.015,16.385524,1.0
17965,0,0.0,0.0,0.0,1958.0,0.0,0.0,1.0,15.461166,5.595301,7.229763,35.8774,61208.0,11.97,0.015,15.302781,1.0
18566,1,1.0,1.0,1.0,1915.0,1.0,1.0,3.0,15.585972,1.482746,13.917815,34.5449,61208.0,12.3,0.015,15.713989,1.0
22169,1,1.0,1.0,1.0,1958.0,1.0,1.0,1.0,15.60223,2.596394,7.466001,36.8475,61208.0,12.29,0.015,15.649081,1.0
22433,1,0.640359,11.0,17.0,2014.0,1.0,1.0,1.0,16.035762,1.819185,11.190586,37.5279,61208.0,12.29,0.015,16.518428,1.0
22725,1,1.0,1.0,25.0,2014.0,1.0,1.0,1.0,15.472018,1.001468,5.628641,38.4274,61208.0,12.29,0.015,15.371548,1.0
22829,1,1.0,7.0,19.0,2015.0,3.0,1.0,1.0,15.201596,13.486605,9.248265,38.5907,61208.0,12.29,0.015,15.53681,1.0


Очевидно, что общая площадь 0–1 у квартир — это ошибка. Этот признак слишком важен для нас, чтобы пренебречь такими ошибками: он может сильно влиять на итог. Поэтому просто удалим такие строки. Их не очень много, можем себе позволить.

In [31]:
# Remove the listings with a near-zero total area. The rows inspected above
# were selected with `full_sq < 5`, so keep exactly their complement (`>= 5`);
# the previous strict `> 5` additionally dropped listings with an area of
# exactly 5 that were never shown or discussed.
# .copy() detaches the result from the frame it was filtered from.
data = data[data['full_sq'] >= 5].copy()

Посмотрим, смогли ли мы улучшить результат.

In [32]:
# Rebuild the target vector and the feature matrix from the cleaned frame.
Y = data['log_price_doc']
X = data.drop(columns=['log_price_doc'])

In [33]:
# Fresh 5-split time-series splitter for the cleaned data.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumes the
# remaining rows of `data` are still in chronological order — confirm.
splitter = TimeSeriesSplit(n_splits=5)

In [34]:
# Repeat the model selection on the cleaned data.
# NOTE: the first step is named 'imputer' although it holds a scaler; the name
# is kept because later cells read `best_params_['imputer']`.
pipe = Pipeline([('imputer', StandardScaler()), ('regression', Lasso(max_iter=100000))])

# A grid entry for 'regression' replaces the whole pipeline step, so the raised
# iteration cap must be repeated on the Lasso candidate; a bare Lasso() would
# silently fall back to the default max_iter.
parameters = {'imputer': [StandardScaler(), MinMaxScaler()],
              'regression': [Lasso(max_iter=100000), Ridge()],
              'regression__alpha': np.logspace(-5, 5, 11)}

# Scores are negated MSE on the (log) target; folds come from `splitter`.
select_model = GridSearchCV(pipe, parameters, scoring='neg_mean_squared_error', cv=splitter)
select_model.fit(X, Y)

In [35]:
print(select_model.best_estimator_)

Pipeline(steps=[('imputer', StandardScaler()),
                ('regression', Ridge(alpha=0.001))])


In [36]:
# Rebuild the winning pipeline from the refitted best estimator rather than
# from `best_params_`: the object stored under best_params_['regression'] is
# the candidate taken from the grid, and whether it carries the tuned alpha
# depends on the scikit-learn version (newer versions clone parameter values).
# cross_validate clones the pipeline, so reusing fitted steps here is safe.
pipe = Pipeline([('scaler', select_model.best_estimator_['imputer']),
                 ('regression', select_model.best_estimator_['regression'])])

In [37]:
# Re-score the selected pipeline on the cleaned data, keeping train scores too.
cv_result_pipe = cross_validate(
    pipe, X, Y,
    cv=splitter,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# The scorer returns negated MSE, so flip the sign back before reporting.
train_error = -cv_result_pipe['train_score'].mean().round(3)
test_error = -cv_result_pipe['test_score'].mean().round(3)
print(f"Среднее MSLE на тренировочных фолдах: {train_error}")
print(f"Среднее MSLE на тестовых фолдах: {test_error}")

Среднее MSLE на тренировочных фолдах: 0.275
Среднее MSLE на тестовых фолдах: 0.208


Смогли, и очень сильно. Yeah Science, Bitch! Едем дальше.
А дальше у нас есть два шага, которые сильно нам смажут итоговую ошибку, и уже сложно будет судить, как сильно мы улучшили качество нашей модели. Они будут отчасти жульничеством, но, тем не менее, они явно позволят нам достичь лучших результатов в прогнозах.

Первый из них - уберем выбросы. Да, это искусственно усредняет наши данные, но данные не совершенны, в них есть ошибки, и почистить выбросы не самая плохая идея.

In [38]:
# Cut-off values for the target: the 2.5th and the 97.5th percentile.
target = data['log_price_doc']
low_quantile = target.quantile(0.025)
top_quantile = target.quantile(0.975)

print(f"Топ 2,5% значение таргета: {top_quantile.round(2)}")
print(f"Топ 97,5% значение таргета: {low_quantile.round(2)}")

Топ 2,5% значение таргета: 16.7
Топ 97,5% значение таргета: 13.82


In [39]:
# Keep only the rows whose target lies strictly between the two cut-offs.
# Note that the trimming is applied to the whole frame, so the rows later used
# as validation folds are trimmed as well.
above_low = data['log_price_doc'].gt(low_quantile)
below_top = data['log_price_doc'].lt(top_quantile)
data = data[above_low & below_top]

In [40]:
# Rebuild the target vector and the feature matrix from the trimmed frame.
Y = data['log_price_doc']
X = data.drop(columns=['log_price_doc'])

In [41]:
# Fresh 5-split time-series splitter for the trimmed data.
# NOTE(review): TimeSeriesSplit splits by row position, so this presumes the
# remaining rows of `data` are still in chronological order — confirm.
splitter = TimeSeriesSplit(n_splits=5)

In [42]:
# Repeat the model selection on the outlier-trimmed data.
# NOTE: the first step is named 'imputer' although it holds a scaler; the name
# is kept because later cells read `best_params_['imputer']`.
pipe = Pipeline([('imputer', StandardScaler()), ('regression', Lasso(max_iter=100000))])

# A grid entry for 'regression' replaces the whole pipeline step, so the raised
# iteration cap must be repeated on the Lasso candidate; a bare Lasso() would
# silently fall back to the default max_iter.
parameters = {'imputer': [StandardScaler(), MinMaxScaler()],
              'regression': [Lasso(max_iter=100000), Ridge()],
              'regression__alpha': np.logspace(-5, 5, 11)}

# Scores are negated MSE on the (log) target; folds come from `splitter`.
select_model = GridSearchCV(pipe, parameters, scoring='neg_mean_squared_error', cv=splitter)
select_model.fit(X, Y)

In [43]:
print(select_model.best_estimator_)

Pipeline(steps=[('imputer', StandardScaler()),
                ('regression', Ridge(alpha=0.001))])


In [44]:
# Rebuild the winning pipeline from the refitted best estimator rather than
# from `best_params_`: the object stored under best_params_['regression'] is
# the candidate taken from the grid, and whether it carries the tuned alpha
# depends on the scikit-learn version (newer versions clone parameter values).
# cross_validate clones the pipeline, so reusing fitted steps here is safe.
pipe = Pipeline([('scaler', select_model.best_estimator_['imputer']),
                 ('regression', select_model.best_estimator_['regression'])])

In [45]:
# Re-score the selected pipeline on the trimmed data, keeping train scores too.
cv_result_pipe = cross_validate(
    pipe, X, Y,
    cv=splitter,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

# The scorer returns negated MSE, so flip the sign back before reporting.
train_error = -cv_result_pipe['train_score'].mean().round(3)
test_error = -cv_result_pipe['test_score'].mean().round(3)
print(f"Среднее MSLE на тренировочных фолдах: {train_error}")
print(f"Среднее MSLE на тестовых фолдах: {test_error}")

Среднее MSLE на тренировочных фолдах: 0.135
Среднее MSLE на тестовых фолдах: 0.12


На этом мы, пожалуй, остановимся в части предсказания цены. В следующем ноутбуке соберём всё в финальную версию модели.

In [46]:
# Persist the cleaned frame for the next notebook. The row index is not
# written, so no index label is needed either.
data.to_csv('final_data.csv', index=False)