Импортируем необходимые библиотеки

In [1]:
import numpy as np
import pandas as pd

Считываем тренировочный датасет, пропущенные значения заполняем округленным средним по каждому признаку

In [2]:
# Load the training set and impute missing values with each numeric column's
# mean, rounded to the nearest integer.
df = pd.read_csv("data/Train.csv")
# numeric_only=True: at this point `date` is still a string column, and
# DataFrame.mean() raises TypeError on non-numeric columns in pandas >= 2.0
# (older versions silently skipped them, which is the behaviour kept here).
df.fillna(df.mean(numeric_only=True).apply(round), inplace=True)

Обработаем признак $date$. На первый взгляд может показаться, что этот признак не очень важный, но по мере обучения моделей оказывается, что его добавление в признаковое пространство даёт ощутимый прирост точности. Кроме того, можно проверить, что этот признак имеет самый большой коэффициент корреляции Пирсона с целевой переменной.

Обработка признака $date$ происходит следующим образом:
- меняем его тип с $str$ на $datetime$
- находим минимальное значение этого признака в тренировочном наборе
- считаем сколько дней прошло от минимальной даты до текущей и записываем результат в таблицу

In [3]:
# Replace `date` with the number of whole days elapsed since the earliest
# date found in the training set.
df["date"] = pd.to_datetime(df["date"])
min_date = df["date"].min()
# Vectorised timedelta arithmetic instead of calling a Python lambda per row.
df["date"] = (df["date"] - min_date).dt.days
df

Unnamed: 0,id,date,street_id,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,...,kw5,kw6,kw7,kw8,kw9,kw10,kw11,kw12,kw13,price
0,0,0,616,0.0,4,43,2,0,30.0,1.0,...,0,0,0,0,0,0,0,0,0,1738000
1,1,0,112,0.0,3,33,1,0,15.0,1.0,...,0,0,0,0,0,0,0,0,0,1169000
2,2,0,230,1.0,9,34,1,0,25.0,0.0,...,0,0,0,0,0,0,0,0,0,2821000
3,3,0,302,1.0,4,60,3,0,15.0,0.0,...,0,0,0,0,0,0,0,0,0,5714000
4,4,0,578,0.0,3,49,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,1660000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,425,612,0.0,3,36,1,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,3898000
99996,99996,425,573,0.0,4,51,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,8698000
99997,99997,425,550,1.0,9,48,2,0,30.0,0.0,...,0,0,0,0,0,0,0,0,0,6498000
99998,99998,425,595,1.0,10,51,2,1,15.0,1.0,...,0,0,0,0,0,0,0,0,0,9436000


Удаляем из датасета идентификаторы объявлений и отделяем целевую переменную от остального датасета

In [4]:
# Pull the target column out of the frame (pop removes it from df and returns
# it), then discard the listing identifier column.
target = df.pop("price")
df.drop(columns=["id"], inplace=True)

$street\_id$ очевидно является категориальным признаком, поэтому обработаем его соответствующим образом: найдем все возможные его значения и для каждого значения заведем отдельный бинарный признак, который будет показывать стоит ли наш дом на улице с этим id или нет. Сам же признак $street\_id$ удалим из датасета

In [5]:
# One-hot encode `street_id`: column k of `streets` is 1 for rows whose
# street_id equals k. The raw categorical column is then dropped from df.
street_ids = df["street_id"].to_numpy(dtype=int)
# Width is (max id + 1) rather than the count of distinct ids, so the indexing
# below stays in bounds even when the ids are not a gap-free 0..K-1 range.
# For a gap-free range both definitions give the same number.
number_of_streets = int(street_ids.max()) + 1
streets = np.zeros((df.shape[0], number_of_streets))
# One fancy-indexing assignment instead of a Python loop over every row.
streets[np.arange(df.shape[0]), street_ids] = 1
# Reuse df's index so the join below aligns row-for-row.
streets = pd.DataFrame(streets, index=df.index)
df.drop(columns=["street_id"], inplace=True)
df = df.join(streets)
df

Unnamed: 0,date,build_tech,floor,area,rooms,balcon,metro_dist,g_lift,n_photos,kw1,...,662,663,664,665,666,667,668,669,670,671
0,0,0.0,4,43,2,0,30.0,1.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,3,33,1,0,15.0,1.0,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1.0,9,34,1,0,25.0,0.0,4,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1.0,4,60,3,0,15.0,0.0,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,3,49,2,0,30.0,0.0,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,425,0.0,3,36,1,0,30.0,0.0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,425,0.0,4,51,2,0,30.0,0.0,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,425,1.0,9,48,2,0,30.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,425,1.0,10,51,2,1,15.0,1.0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Отмасштабируем количественные признаки

In [6]:
from sklearn.preprocessing import StandardScaler

# Standardise the quantitative columns with StandardScaler; every other column
# is carried over unchanged. The fitted `scaler` is kept so the test set can be
# transformed with the same statistics.
numerical_features = ['area', 'date', 'metro_dist', 'n_photos', 'rooms', 'floor']

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
data_numerical = pd.DataFrame(scaled_values, columns=numerical_features)

# Scaled columns first, then the untouched remainder, joined on the row index.
remaining_columns = df.drop(columns=numerical_features)
data_upd = data_numerical.join(remaining_columns)
data_upd

Unnamed: 0,area,date,metro_dist,n_photos,rooms,floor,build_tech,balcon,g_lift,kw1,...,662,663,664,665,666,667,668,669,670,671
0,-0.524136,-1.373816,0.989425,-0.769196,-0.137553,-0.340126,0.0,0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.093616,-1.373816,-0.848161,-0.262680,-1.347986,-0.587339,0.0,0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.036668,-1.373816,0.376896,0.750354,-1.347986,0.895941,1.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.443979,-1.373816,-0.848161,-0.262680,1.072879,-0.340126,1.0,0,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.182449,-1.373816,0.989425,-0.262680,-0.137553,-0.587339,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.922772,0.662216,0.989425,0.243837,-1.347986,-0.587339,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,-0.068553,0.662216,0.989425,0.243837,-0.137553,-0.340126,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,-0.239397,0.662216,0.989425,-1.275713,-0.137553,0.895941,1.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,-0.068553,0.662216,-0.848161,-0.769196,-0.137553,1.143154,1.0,1,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


На этом подготовка данных заканчивается. Построим модель. \
Модель будем строить следующим образом: 
- сначала обучим простой регрессор ```Ridge```, который по сути является обычным линейным регрессором с $l_2$-регуляризацией, и запишем его ответы в датасет как отдельный признак
- после, на полученном датасете обучим ```GradientBoostingRegressor``` с параметрами ```loss="huber", n_estimators=600, max_depth=5, subsample=0.8``` и точно так же запишем его ответы в наш датасет, как отдельный признак
- наконец, на полученном после этих двух шагов датасете обучим финальный estimator, а именно ```GradientBoostingRegressor``` с параметрами ```loss="huber", n_estimators=700, max_depth=5, subsample=0.8```. 

Построенная модель в некотором смысле реализует "двухуровневый стекинг", когда на вход финальной модели подаются ответы, полученные от другой стекинг-модели. Такой подход является довольно эвристическим, но он показал чуть лучший результат по сравнению с более понятной "одноуровневой" стекинг-моделью, в которой сначала обучается ```Ridge``` регрессор, а потом его ответы, вместе с непосредственно обучающей выборкой, подаются на обучение регрессору ```GradientBoostingRegressor``` с параметрами ```loss="huber", n_estimators=2000, max_depth=5, subsample=0.8```. На самом деле, главная особенность первой модели заключается в том, что на втором шаге ее построения, за счет стекинга с Ridge регрессором, получается уже достаточно хорошо обученная модель, ответы которой, вместе с обучающей выборкой и ответами Ridge регрессора, подаются на вход заключительной модели в стеке.

Реализация обеих моделей представлена ниже

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import  Ridge

# Variant 1: sequential ("two-level") stacking.
# Score (MAE) when trained on 75% of the training set: 724754.51410
# Score (MAE) when trained on the full training set (leaderboard score): 713026.55934

num_estims = [600, 700]

# Level 0: Ridge regression. Its predictions on the training data are appended
# to data_upd as a new feature column.
ridge = Ridge()
ridge.fit(data_upd, target)
data_upd["Ridge"] = ridge.predict(data_upd)
models = [("Ridge", ridge)]

# Each booster is fitted on every column accumulated so far and then appends
# its own predictions on the training data as one more column.
for n_trees in num_estims:
    estimator_name = f"GBR_{n_trees}"
    estimator = GradientBoostingRegressor(
        loss="huber",
        n_estimators=n_trees,
        max_depth=5,
        subsample=0.8,
        verbose=1,
    )
    estimator.fit(data_upd, target)
    models.append((estimator_name, estimator))
    data_upd[estimator_name] = estimator.predict(data_upd)
    

        
# Variant 2: single-level stacking (Ridge feeding a gradient-boosting final
# estimator, with the original features passed through).
# Score (MAE) when trained on 75% of the training set: 730245.56816

# Variant 1 appends its prediction columns to data_upd in place. Strip them so
# this model is fitted on the original features only, i.e. the same columns it
# is later asked to predict on. errors="ignore" makes this a no-op when
# variant 1 has not been run.
base_features = data_upd.drop(columns=["Ridge", "GBR_600", "GBR_700"],
                              errors="ignore")

model = StackingRegressor([("Ridge", Ridge())],
                          final_estimator=GradientBoostingRegressor(loss="huber",
                                                                    n_estimators=2000,
                                                                    max_depth=5,
                                                                    subsample=0.8,
                                                                    verbose=1),
                          passthrough=True)
model.fit(base_features, target)

После того, как модель обучена, остается только загрузить тестовый датасет, проделать над ним все те же манипуляции, что и с обучающей выборкой, сделать предсказание и записать ответ

In [7]:
# Load the test set and repeat the preprocessing applied to the training set.
data_test = pd.read_csv("data/Test.csv")
# numeric_only=True: `date` is still a string column here, and
# DataFrame.mean() raises TypeError on non-numeric columns in pandas >= 2.0.
data_test.fillna(data_test.mean(numeric_only=True).apply(round), inplace=True)

data_test["date"] = pd.to_datetime(data_test["date"])
# NOTE(review): this rebinds min_date to the *test* set's earliest date, so the
# day offsets are measured from a different origin than in training unless
# both sets start on the same day. Confirm this is intended.
min_date = data_test["date"].min()
# Vectorised timedelta arithmetic instead of a per-row lambda.
data_test["date"] = (data_test["date"] - min_date).dt.days

# One-hot encode street_id into the same number of columns as in training.
# Ids at or beyond number_of_streets get an all-zero row.
test_street_ids = data_test["street_id"].to_numpy(dtype=int)
in_range = test_street_ids < number_of_streets
streets = np.zeros((data_test.shape[0], number_of_streets))
streets[np.flatnonzero(in_range), test_street_ids[in_range]] = 1
streets = pd.DataFrame(streets, index=data_test.index)
data_test.drop(columns=["street_id", "id"], inplace=True)

# Reuse the scaler fitted on the training data (transform only, no refit).
data_numerical = data_test[numerical_features]
data_numerical = pd.DataFrame(scaler.transform(data_numerical), columns=numerical_features)

data_test_upd = (data_numerical.join(data_test.drop(columns=numerical_features))).join(streets)

data_test_upd

Unnamed: 0,area,date,metro_dist,n_photos,rooms,floor,build_tech,balcon,g_lift,kw1,...,662,663,664,665,666,667,668,669,670,671
0,0.387031,-1.373816,0.376896,-0.769196,1.072879,-1.081766,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.330083,-1.373816,0.009379,0.750354,-0.137553,1.143154,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.102291,-1.373816,-1.460690,-0.262680,-0.137553,-0.587339,1.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.979720,-1.373816,0.376896,-1.275713,-1.347986,-0.340126,0.0,0,1.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.216187,-1.373816,0.989425,-0.262680,1.072879,-0.340126,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,-0.524136,1.256258,0.989425,-1.275713,-0.137553,-0.834553,0.0,0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996,0.557875,1.256258,0.989425,2.269905,1.072879,0.401514,1.0,1,0.0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99997,-0.979720,1.256258,0.989425,-0.769196,-1.347986,-0.834553,1.0,1,0.0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998,-0.011605,1.256258,0.989425,-0.769196,-0.137553,-0.587339,1.0,1,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Variant 1 prediction: replay the training-time stacking order, appending each
# level's predictions as a column before the next model predicts.
ridge_name, ridge_model = models[0]
boost_name, boost_model = models[1]
data_test_upd[ridge_name] = ridge_model.predict(data_test_upd)
# Column names are taken from `models` so they match the names used when the
# models were fitted (the booster's column was fitted as "GBR_600", whereas
# this cell previously wrote it as "Boost_600").
data_test_upd[boost_name] = boost_model.predict(data_test_upd)
pred_1 = models[2][1].predict(data_test_upd)


# Variant 2 prediction: uses only the original features, so the stacked
# prediction columns added above are dropped first.
pred_2 = model.predict(data_test_upd.drop(columns=[ridge_name, boost_name]))

In [None]:
# Build the submission frame. `ans` was previously indexed without ever being
# created, which raises NameError; construct it explicitly here.
ans = pd.DataFrame({'id': list(range(100000, 200000))})

# Predictions of the first model are written to SmirnovGS2.csv
ans['price'] = pred_1
ans.to_csv("SmirnovGS2.csv", index=False)

# Predictions of the second model are written to SmirnovGS.csv
ans['price'] = pred_2
ans.to_csv("SmirnovGS.csv", index=False)