In [71]:
from utils import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.compose import make_column_selector
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression

from xgboost import XGBRegressor, DMatrix, cv

## Data description

In [2]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])
tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [3]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [4]:
train_df = encode(train_df)

### Test dataset

In [5]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [6]:
train_df['is_train'] = 1
test_df['is_train'] = 0

coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_df['latitude']
all_df['longitude'] = coords_df['longitude']

## 3. Removing outliers

In [7]:
all_df = remove_outliers(all_df)

In [8]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [9]:
# train_df = remove_fake_prices(train_df)
idx_outliers = np.loadtxt('data/idx_outliers.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 4. Modeling

### full

In [None]:
X = train_df.drop(['sub_area', 'price_doc'], axis=1).copy()
y = np.log1p(train_df['price_doc'])
X_test = test_df.drop(['sub_area'], axis=1).copy()

In [None]:
dtrain = DMatrix(X, label=y)

In [None]:
params = {'objective': 'reg:squarederror',
          # 'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

In [None]:
cv_results = cv(
    params,
    dtrain,
    num_boost_round=5000,
    early_stopping_rounds=50,
    nfold=5,
    shuffle=True,
    metrics={'rmse'},
    verbose_eval=True,
    seed=42,
)

cv_results

[0]	train-rmse:8.30445+0.00179	test-rmse:8.30447+0.00740
[1]	train-rmse:7.89017+0.00170	test-rmse:7.89021+0.00721
[2]	train-rmse:7.49667+0.00161	test-rmse:7.49672+0.00721
[3]	train-rmse:7.12276+0.00153	test-rmse:7.12279+0.00711
[4]	train-rmse:6.76758+0.00145	test-rmse:6.76761+0.00692
[5]	train-rmse:6.43023+0.00138	test-rmse:6.43025+0.00686
[6]	train-rmse:6.10967+0.00132	test-rmse:6.10971+0.00678
[7]	train-rmse:5.80513+0.00125	test-rmse:5.80517+0.00663
[8]	train-rmse:5.51585+0.00119	test-rmse:5.51590+0.00648
[9]	train-rmse:5.24103+0.00114	test-rmse:5.24110+0.00631
[10]	train-rmse:4.97995+0.00107	test-rmse:4.97996+0.00621
[11]	train-rmse:4.73195+0.00102	test-rmse:4.73198+0.00613
[12]	train-rmse:4.49635+0.00097	test-rmse:4.49638+0.00606
[13]	train-rmse:4.27254+0.00092	test-rmse:4.27263+0.00599
[14]	train-rmse:4.05992+0.00087	test-rmse:4.06000+0.00589
[15]	train-rmse:3.85801+0.00083	test-rmse:3.85809+0.00582
[16]	train-rmse:3.66614+0.00079	test-rmse:3.66623+0.00571
[17]	train-rmse:3.48389+

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,8.304450,0.001786,8.304470,0.007400
1,7.890175,0.001697,7.890209,0.007210
2,7.496672,0.001610,7.496724,0.007209
3,7.122759,0.001530,7.122786,0.007114
4,6.767581,0.001453,6.767611,0.006920
...,...,...,...,...
2402,0.081266,0.000512,0.127576,0.001777
2403,0.081258,0.000512,0.127578,0.001777
2404,0.081247,0.000514,0.127578,0.001776
2405,0.081238,0.000516,0.127578,0.001775


In [49]:
cv_results['test-rmse-mean'].argmin(), cv_results['test-rmse-mean'].min()

(2406, 0.12757579999999996)

### full (0.8)

In [55]:
X = train_df.drop(['sub_area', 'price_doc'], axis=1).copy()
y = np.log1p(train_df['price_doc'])
X_test = test_df.drop(['sub_area'], axis=1).copy()

In [56]:
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2, random_state=42)
X, y = X_train, y_train,

In [57]:
dtrain = DMatrix(X, label=y)

In [58]:
params = {'objective': 'reg:squarederror',
          # 'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

In [59]:
cv_results = cv(
    params,
    dtrain,
    num_boost_round=5000,
    early_stopping_rounds=50,
    nfold=5,
    shuffle=True,
    metrics={'rmse'},
    verbose_eval=True,
    seed=42,
)

cv_results

[0]	train-rmse:8.30560+0.00267	test-rmse:8.30558+0.01089
[1]	train-rmse:7.89137+0.00254	test-rmse:7.89141+0.01056
[2]	train-rmse:7.49788+0.00242	test-rmse:7.49794+0.01044
[3]	train-rmse:7.12399+0.00230	test-rmse:7.12406+0.01013
[4]	train-rmse:6.76884+0.00219	test-rmse:6.76890+0.00987
[5]	train-rmse:6.43148+0.00208	test-rmse:6.43156+0.00976
[6]	train-rmse:6.11093+0.00198	test-rmse:6.11096+0.00946
[7]	train-rmse:5.80643+0.00188	test-rmse:5.80649+0.00919
[8]	train-rmse:5.51713+0.00179	test-rmse:5.51722+0.00886
[9]	train-rmse:5.24234+0.00171	test-rmse:5.24234+0.00859
[10]	train-rmse:4.98126+0.00162	test-rmse:4.98129+0.00835
[11]	train-rmse:4.73327+0.00155	test-rmse:4.73331+0.00802
[12]	train-rmse:4.49767+0.00146	test-rmse:4.49771+0.00775
[13]	train-rmse:4.27387+0.00140	test-rmse:4.27389+0.00744
[14]	train-rmse:4.06126+0.00134	test-rmse:4.06130+0.00715
[15]	train-rmse:3.85934+0.00128	test-rmse:3.85939+0.00700
[16]	train-rmse:3.66749+0.00122	test-rmse:3.66755+0.00669
[17]	train-rmse:3.48524+

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,8.305604,0.002668,8.305585,0.010894
1,7.891367,0.002537,7.891406,0.010557
2,7.497880,0.002416,7.497943,0.010441
3,7.123994,0.002299,7.124056,0.010133
4,6.768838,0.002187,6.768903,0.009872
...,...,...,...,...
2100,0.079232,0.000494,0.129159,0.003350
2101,0.079221,0.000494,0.129158,0.003351
2102,0.079210,0.000494,0.129159,0.003353
2103,0.079199,0.000494,0.129158,0.003353


In [60]:
cv_results['test-rmse-mean'].argmin(), cv_results['test-rmse-mean'].min()

(2104, 0.12915539999999998)

### `product_type == 'Investment'`

In [33]:
train_df_0 = train_df[train_df['product_type'] == 0]
test_df_0 = test_df[test_df['product_type'] == 0]

X = train_df_0.drop(['sub_area', 'price_doc'], axis=1).copy()
y = np.log1p(train_df_0['price_doc'])
X_test_0 = test_df_0.drop(['sub_area'], axis=1).copy()

In [34]:
params = {'objective': 'reg:squarederror',
          # 'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

In [35]:
dtrain = DMatrix(X, label=y)

In [36]:
cv_results = cv(
    params,
    dtrain,
    num_boost_round=5000,
    early_stopping_rounds=50,
    nfold=5,
    shuffle=True,
    metrics={'rmse'},
    verbose_eval=True,
    seed=42,
)

cv_results

[0]	train-rmse:8.40581+0.00066	test-rmse:8.40581+0.00272
[1]	train-rmse:7.98644+0.00063	test-rmse:7.98646+0.00267
[2]	train-rmse:7.58810+0.00060	test-rmse:7.58811+0.00268
[3]	train-rmse:7.20964+0.00057	test-rmse:7.20964+0.00263
[4]	train-rmse:6.85012+0.00054	test-rmse:6.85017+0.00265
[5]	train-rmse:6.50864+0.00052	test-rmse:6.50868+0.00262
[6]	train-rmse:6.18420+0.00049	test-rmse:6.18418+0.00254
[7]	train-rmse:5.87601+0.00047	test-rmse:5.87603+0.00252
[8]	train-rmse:5.58324+0.00045	test-rmse:5.58319+0.00247
[9]	train-rmse:5.30514+0.00043	test-rmse:5.30515+0.00245
[10]	train-rmse:5.04096+0.00041	test-rmse:5.04089+0.00236
[11]	train-rmse:4.79001+0.00038	test-rmse:4.79001+0.00241
[12]	train-rmse:4.55163+0.00036	test-rmse:4.55161+0.00233
[13]	train-rmse:4.32518+0.00035	test-rmse:4.32521+0.00232
[14]	train-rmse:4.11006+0.00033	test-rmse:4.11012+0.00221
[15]	train-rmse:3.90577+0.00032	test-rmse:3.90590+0.00219
[16]	train-rmse:3.71166+0.00031	test-rmse:3.71174+0.00211
[17]	train-rmse:3.52727+

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,8.405808,0.000658,8.405808,0.002721
1,7.986442,0.000627,7.986459,0.002674
2,7.588097,0.000598,7.588109,0.002683
3,7.209642,0.000570,7.209640,0.002634
4,6.850124,0.000543,6.850174,0.002648
...,...,...,...,...
1126,0.098743,0.000431,0.141380,0.003058
1127,0.098721,0.000433,0.141380,0.003058
1128,0.098700,0.000433,0.141379,0.003063
1129,0.098679,0.000435,0.141379,0.003060


In [37]:
cv_results['test-rmse-mean'].argmin(), cv_results['test-rmse-mean'].min()

0.14137899999999998

### `product_type == 'OwnerOccupier'`

In [26]:
train_df_1 = train_df[train_df['product_type'] == 1]
test_df_1 = test_df[test_df['product_type'] == 1]

X = train_df_1.drop(['sub_area', 'price_doc'], axis=1).copy()
y = np.log1p(train_df_1['price_doc'])
X_test_1 = test_df_1.drop(['sub_area'], axis=1).copy()

In [27]:
params = {'objective': 'reg:squarederror',
          # 'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

In [28]:
dtrain = DMatrix(X, label=y)

In [None]:
cv_results = cv(
    params,
    dtrain,
    num_boost_round=5000,
    early_stopping_rounds=50,
    nfold=5,
    shuffle=True,
    metrics={'rmse'},
    verbose_eval=True,
    seed=42,
)

cv_results

In [32]:
cv_results['test-rmse-mean'].argmin(), cv_results['test-rmse-mean'].min()

0.0946974

In [38]:
error_sum = ((0.141379**2)*len(train_df[train_df['product_type'] == 0]) + \
             (0.0946974**2)*len(train_df[train_df['product_type'] == 1]))
baseline = np.sqrt(error_sum / len(train_df))
baseline

0.12496445868953714

### `StackingRegressor`

In [74]:
X = train_df.drop(['sub_area', 'price_doc'], axis=1).copy()
y = np.log1p(train_df['price_doc'])
X_test = test_df.drop(['sub_area'], axis=1).copy()

In [75]:
params = {'objective': 'reg:squarederror',
          'n_estimators': 2000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

In [76]:
def select_product_type(df, product_type=0):
    return df[df['product_type'] == product_type]

In [72]:
model_0 = make_pipeline(
    FunctionTransformer(select_product_type, kw_args={'product_type': 0}),
    XGBRegressor(**params)
)

model_1 = make_pipeline(
    FunctionTransformer(select_product_type, kw_args={'product_type': 1}),
    XGBRegressor(**params)
)

estimators = [('model_0', model_0), ('model_1', model_1)]
model = StackingRegressor(estimators=estimators,
                          final_estimator=LinearRegression(),
                          cv=5,
                          n_jobs=6)
model.fit(X, y)

XGBoostError: [05:34:40] ../src/data/data.cc:583: Check failed: labels_.Size() == num_row_ (22220 vs. 13434) : Size of labels must equal to number of rows.
Stack trace:
  [bt] (0) /home/stasvlad/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x9133f) [0x7fd5b554833f]
  [bt] (1) /home/stasvlad/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x110fcc) [0x7fd5b55c7fcc]
  [bt] (2) /home/stasvlad/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b90e7) [0x7fd5b56700e7]
  [bt] (3) /home/stasvlad/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b99bc) [0x7fd5b56709bc]
  [bt] (4) /home/stasvlad/.local/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x50) [0x7fd5b5537ed0]
  [bt] (5) /lib/x86_64-linux-gnu/libffi.so.7(+0x6ff5) [0x7fd60d1c5ff5]
  [bt] (6) /lib/x86_64-linux-gnu/libffi.so.7(+0x640a) [0x7fd60d1c540a]
  [bt] (7) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x5b6) [0x7fd60c7ef306]
  [bt] (8) /usr/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0x139dc) [0x7fd60c7ef9dc]



## Testing

In [None]:
test_df_nan = test_df[test_df['product_type'].isna()]
X_test_nan = test_df_nan.drop(['sub_area'], axis=1).copy()

In [None]:
submission = pd.read_csv('data/sample_submission.csv', index_col='id')
pred_0 = np.expm1(model_0.predict(X_test_0))
pred_1 = np.expm1(model_1.predict(X_test_1))
pred_nan = np.expm1(model_1.predict(X_test_nan))

if len(pred_0[pred_0 < 0]) or len(pred_1[pred_1 < 0]) or len(pred_nan[pred_nan < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [None]:
submission.loc[X_test_0.index, 'price_doc'] = 0.87*pred_0
submission.loc[X_test_1.index, 'price_doc'] = 0.93*pred_1
submission.loc[X_test_nan.index, 'price_doc'] = 0.95*pred_nan
submission.to_csv('submits/submission.csv', index='id')

In [None]:
# !kaggle competitions submit -c sberbank-russian-housing-market -f "submits/submission.csv" -m "XGBRegressor tuned"

100%|████████████████████████████████████████| 121k/121k [00:02<00:00, 61.0kB/s]
Successfully submitted to Sberbank Russian Housing Market