In [None]:
# Colab-only: mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive are reachable from the shell commands used below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp /content/drive/MyDrive/sberbank/utils.py .
!pip install geopandas



In [None]:
!mkdir data
!cp -r /content/drive/MyDrive/sberbank/. data/

mkdir: cannot create directory ‘data’: File exists


In [None]:
from utils import *

import folium

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm

from xgboost import XGBRegressor, DMatrix, cv

## Data description

In [None]:
# Read the three raw tables. `timestamp` is parsed as a datetime everywhere;
# the train/test tables are additionally indexed by their `id` column.
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])

table_options = dict(index_col='id', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', **table_options)
test_df = pd.read_csv('data/test.csv', **table_options)

# `tverskoe_issue_fix` is not defined in this notebook (presumably it comes
# from `from utils import *`). Its return value is discarded, so it is assumed
# to patch the frame in place -- TODO confirm. Train is processed before test.
for frame in (train_df, test_df):
    tverskoe_issue_fix(frame)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [None]:
# Convert every text-typed column of the macro table to float.
#
# The loop only visits columns that are still `object`, so the cell is safe to
# re-run: on a second execution there is nothing left to convert. (Previously
# the '#!' replacement ran unconditionally and raised once the column had
# already become float, because `.str` only works on string data.)
for column in macro_df.select_dtypes('object').columns:
    # Decimal commas -> decimal points; literal match, no regex semantics.
    cleaned = macro_df[column].str.replace(',', '.', regex=False)
    if column == 'child_on_acc_pre_school':
        # This column carries a '#!' placeholder; turn it into 'nan' so that
        # the float conversion below maps it to NaN.
        cleaned = cleaned.str.replace('#!', 'nan', regex=False)
    macro_df[column] = cleaned.astype(float)

# Sanity check: no text-typed columns should remain.
if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [None]:
# Apply `encode` to the training frame. `encode` is not defined in this
# notebook (presumably provided by `from utils import *`).
# NOTE(review): `train_df` is overwritten with its own encoded version, so
# re-running this cell feeds already-encoded data back into `encode` -- confirm
# the helper tolerates that, or re-run from the data-loading cell.
train_df = encode(train_df)

### Test dataset

In [None]:
# Apply `encode` to the test frame. `encode` is not defined in this notebook
# (presumably provided by `from utils import *`).
# NOTE(review): `test_df` is overwritten with its own encoded version, so
# re-running this cell feeds already-encoded data back into `encode` -- confirm
# the helper tolerates that, or re-run from the data-loading cell.
test_df = encode(test_df)

## II part (Filling missing values)

XGBoost handles missing values (`np.nan`) natively, so no imputation is performed in this notebook.

## 2. Encoding `sub_area` feature

In [None]:
# Tag each frame with its origin so the combined table can be split again
# after the shared preprocessing steps.
for frame, flag in ((train_df, 1), (test_df, 0)):
    frame['is_train'] = flag

# Coordinates are keyed by the same `id` index as the train/test tables.
coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

# Column assignment from a Series aligns on the index, i.e. on `id`.
for coord in ('latitude', 'longitude'):
    all_df[coord] = coords_df[coord]

## 3. Removing outliers

In [None]:
# Run `remove_outliers` over the combined train+test table. The helper is not
# defined in this notebook (presumably provided by `from utils import *`).
# NOTE(review): `all_df` is replaced by its own filtered version, so re-running
# this cell applies the helper a second time -- confirm that is harmless.
all_df = remove_outliers(all_df)

In [None]:
# Split the combined table back into train and test using the `is_train` flag,
# dropping the flag itself. The test part also loses the `price_doc` target
# column.
origin = all_df['is_train']
train_df = all_df.loc[origin.eq(1)].drop(columns=['is_train'])
test_df = all_df.loc[origin.eq(0)].drop(columns=['is_train', 'price_doc'])

## 4. Modeling

### `product_type == 'OwnerOccupier'`

In [None]:
# Keep only the rows whose encoded `product_type` equals 1.
# NOTE(review): the section heading maps this code to 'OwnerOccupier' -- the
# mapping lives in `encode`, which is not visible here; confirm.
train_df_1 = train_df.loc[train_df['product_type'].eq(1)]
test_df_1 = test_df.loc[test_df['product_type'].eq(1)]

# The model is fitted on log1p(price), so an RMSE on this target is an RMSLE
# on the raw prices.
y_1 = np.log1p(train_df_1['price_doc'])

# `sub_area` is excluded from the feature matrices; the target is removed from
# the training features as well.
X_1 = train_df_1.drop(columns=['sub_area', 'price_doc']).copy()
X_test_1 = test_df_1.drop(columns=['sub_area']).copy()

In [None]:
# X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)
# X_train_1.shape, X_val_1.shape

In [None]:
# Wrap the training features and the log1p-transformed target in an XGBoost
# DMatrix; this object is what the `cv` calls below consume.
dtrain = DMatrix(X_1, label=y_1)

In [None]:
# Baseline booster configuration used before any tuning.
params = dict(
    objective='reg:squarederror',
    # n_estimators=10000,
    tree_method='gpu_hist',
    base_score=7,
    booster='gbtree',
    max_depth=5,
    eval_metric='rmse',
    learning_rate=0.3,
    # reg_alpha=30,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=0.8,
    seed=42,
    nthread=-1,
)

# 5-fold shuffled CV with early stopping after 50 rounds without improvement;
# per-round progress is printed because verbose_eval is on.
cv_options = dict(
    num_boost_round=5000,
    early_stopping_rounds=50,
    nfold=5,
    shuffle=True,
    metrics={'rmse'},
    verbose_eval=True,
    seed=42,
)
cv_results = cv(params, dtrain, **cv_options)

# Show the per-round train/test RMSE table.
cv_results

[0]	train-rmse:6.00495+0.00104084	test-rmse:6.00495+0.00562791
[1]	train-rmse:4.20767+0.000740207	test-rmse:4.20762+0.00495682
[2]	train-rmse:2.95108+0.000550789	test-rmse:2.95142+0.00472417
[3]	train-rmse:2.07053+0.000398208	test-rmse:2.07096+0.00429311
[4]	train-rmse:1.45467+0.000343865	test-rmse:1.45566+0.00392862
[5]	train-rmse:1.02485+0.000387708	test-rmse:1.02606+0.00396105
[6]	train-rmse:0.72408+0.000495853	test-rmse:0.726329+0.003709
[7]	train-rmse:0.515048+0.000744981	test-rmse:0.518153+0.00373834
[8]	train-rmse:0.37066+0.000931147	test-rmse:0.375578+0.00363502
[9]	train-rmse:0.272067+0.00121559	test-rmse:0.278948+0.00379707
[10]	train-rmse:0.206378+0.00138437	test-rmse:0.215879+0.00431487
[11]	train-rmse:0.163289+0.00183735	test-rmse:0.175512+0.0041499
[12]	train-rmse:0.136261+0.00212063	test-rmse:0.151158+0.00464482
[13]	train-rmse:0.119604+0.00249298	test-rmse:0.136647+0.00522279
[14]	train-rmse:0.109827+0.0029396	test-rmse:0.128697+0.00567883
[15]	train-rmse:0.103734+0.002

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,6.004948,0.001041,6.004952,0.005628
1,4.207673,0.000740,4.207616,0.004957
2,2.951084,0.000551,2.951420,0.004724
3,2.070534,0.000398,2.070956,0.004293
4,1.454674,0.000344,1.455664,0.003929
...,...,...,...,...
426,0.025301,0.000258,0.101198,0.006374
427,0.025259,0.000256,0.101199,0.006377
428,0.025199,0.000245,0.101202,0.006375
429,0.025151,0.000253,0.101189,0.006377


### Tuning

#### `max_depth`, `min_child_weight`

In [None]:
# Starting configuration for the max_depth / min_child_weight search.
# The search loop overwrites `max_depth` and `min_child_weight` in place.
params = dict(
    objective='reg:squarederror',
    # n_estimators=10000,
    tree_method='gpu_hist',
    base_score=7,
    booster='gbtree',
    max_depth=5,
    eval_metric='rmse',
    learning_rate=0.3,
    # reg_alpha=30,
    min_child_weight=1,
    subsample=1,
    colsample_bytree=0.8,
    seed=42,
    nthread=-1,
)

In [None]:
# Coarse grid: max_depth 2..10 crossed with min_child_weight 1, 6, ..., 31.
gridsearch_params = [
    (depth, weight)
    for depth in range(2, 11)
    for weight in range(1, 32, 5)
]

# Running best score and the parameter pair that produced it.
min_rmsle, best_params = float("Inf"), None

for max_depth, min_child_weight in tqdm(gridsearch_params):
    print("CV with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))
    # The shared `params` dict is modified in place for every candidate.
    params.update(max_depth=max_depth, min_child_weight=min_child_weight)
    cv_results = cv(
        params, dtrain,
        num_boost_round=500, early_stopping_rounds=50,
        nfold=5, shuffle=True, metrics={'rmse'}, seed=42,
    )

    # The target is log1p(price), hence the RMSE is reported as RMSLE.
    # `argmin` is the 0-based row index of the best boosting round.
    test_rmse = cv_results['test-rmse-mean']
    mean_rmsle, boost_rounds = test_rmse.min(), test_rmse.argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))

    if mean_rmsle < min_rmsle:
        min_rmsle, best_params = mean_rmsle, (max_depth, min_child_weight)
    print("Best params: {}, {}, RMSLE: {}".format(*best_params, min_rmsle))

  0%|          | 0/63 [00:00<?, ?it/s]

CV with max_depth=2, min_child_weight=1
	RMSLE 0.1045346 for 498 rounds
Best params: 2, 1, RMSLE: 0.1045346
CV with max_depth=2, min_child_weight=6
	RMSLE 0.1022988 for 499 rounds
Best params: 2, 6, RMSLE: 0.1022988
CV with max_depth=2, min_child_weight=11
	RMSLE 0.1018178 for 499 rounds
Best params: 2, 11, RMSLE: 0.1018178
CV with max_depth=2, min_child_weight=16
	RMSLE 0.1017382 for 499 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=21
	RMSLE 0.10225499999999998 for 494 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=26
	RMSLE 0.1025652 for 499 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=31
	RMSLE 0.1018138 for 498 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=3, min_child_weight=1
	RMSLE 0.10184299999999999 for 486 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=3, min_child_weight=6
	RMSLE 0.09806480000000001 for 481 rounds
Best params: 3, 6, RMSL

KeyboardInterrupt: ignored

In [None]:
# Display the `best_params` value left behind by the most recently run search cell.
best_params

(4, 6)

print('''
CV with max_depth=2, min_child_weight=1
	RMSLE 0.1045346 for 498 rounds
Best params: 2, 1, RMSLE: 0.1045346
CV with max_depth=2, min_child_weight=6
	RMSLE 0.1022988 for 499 rounds
Best params: 2, 6, RMSLE: 0.1022988
CV with max_depth=2, min_child_weight=11
	RMSLE 0.1018178 for 499 rounds
Best params: 2, 11, RMSLE: 0.1018178
CV with max_depth=2, min_child_weight=16
	RMSLE 0.1017382 for 499 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=21
	RMSLE 0.10225499999999998 for 494 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=26
	RMSLE 0.1025652 for 499 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=2, min_child_weight=31
	RMSLE 0.1018138 for 498 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=3, min_child_weight=1
	RMSLE 0.10184299999999999 for 486 rounds
Best params: 2, 16, RMSLE: 0.1017382
CV with max_depth=3, min_child_weight=6
	RMSLE 0.09806480000000001 for 481 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=3, min_child_weight=11
	RMSLE 0.0985966 for 367 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=3, min_child_weight=16
	RMSLE 0.09912099999999999 for 467 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=3, min_child_weight=21
	RMSLE 0.09868039999999999 for 498 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=3, min_child_weight=26
	RMSLE 0.0984774 for 478 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=3, min_child_weight=31
	RMSLE 0.0996264 for 497 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=4, min_child_weight=1
	RMSLE 0.0992274 for 490 rounds
Best params: 3, 6, RMSLE: 0.09806480000000001
CV with max_depth=4, min_child_weight=6
	RMSLE 0.097876 for 499 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=4, min_child_weight=11
	RMSLE 0.0989528 for 313 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=4, min_child_weight=16
	RMSLE 0.0989668 for 319 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=4, min_child_weight=21
	RMSLE 0.09989659999999999 for 210 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=4, min_child_weight=26
	RMSLE 0.0982998 for 477 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=4, min_child_weight=31
	RMSLE 0.098667 for 470 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=1
	RMSLE 0.10118400000000001 for 430 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=6
	RMSLE 0.0992256 for 300 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=11
	RMSLE 0.0991844 for 204 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=16
	RMSLE 0.0988274 for 264 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=21
	RMSLE 0.0995464 for 329 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=26
	RMSLE 0.099385 for 274 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=5, min_child_weight=31
	RMSLE 0.0988004 for 364 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=1
	RMSLE 0.10146340000000001 for 179 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=6
	RMSLE 0.10078179999999999 for 167 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=11
	RMSLE 0.09901259999999999 for 314 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=16
	RMSLE 0.1001218 for 190 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=21
	RMSLE 0.0990438 for 263 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=26
	RMSLE 0.0995818 for 247 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=6, min_child_weight=31
	RMSLE 0.0990642 for 452 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=1
	RMSLE 0.10117419999999999 for 168 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=6
	RMSLE 0.09984600000000002 for 140 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=11
	RMSLE 0.09989980000000001 for 191 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=16
	RMSLE 0.10065260000000001 for 114 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=21
	RMSLE 0.10013620000000001 for 269 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=26
	RMSLE 0.09979740000000001 for 151 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=7, min_child_weight=31
	RMSLE 0.0980204 for 235 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=1
	RMSLE 0.102146 for 107 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=6
	RMSLE 0.1010558 for 99 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=11
	RMSLE 0.10004899999999999 for 134 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=16
	RMSLE 0.1007844 for 138 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=21
	RMSLE 0.0996166 for 175 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=26
	RMSLE 0.0985514 for 150 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=8, min_child_weight=31
	RMSLE 0.0999812 for 127 rounds
Best params: 4, 6, RMSLE: 0.097876
CV with max_depth=9, min_child_weight=1
''')

In [None]:
# Finer grid around the coarse optimum: max_depth 4..5 crossed with every
# odd min_child_weight from 1 to 31.
gridsearch_params = [
    (depth, weight)
    for depth in range(4, 6)
    for weight in range(1, 32, 2)
]

# Running best score and the parameter pair that produced it.
min_rmsle, best_params = float("Inf"), None

for max_depth, min_child_weight in tqdm(gridsearch_params):
    print("CV with max_depth={}, min_child_weight={}".format(max_depth, min_child_weight))
    # The shared `params` dict is modified in place for every candidate.
    params.update(max_depth=max_depth, min_child_weight=min_child_weight)
    cv_results = cv(
        params, dtrain,
        num_boost_round=500, early_stopping_rounds=50,
        nfold=5, shuffle=True, metrics={'rmse'}, seed=42,
    )

    # The target is log1p(price), hence the RMSE is reported as RMSLE.
    # `argmin` is the 0-based row index of the best boosting round.
    test_rmse = cv_results['test-rmse-mean']
    mean_rmsle, boost_rounds = test_rmse.min(), test_rmse.argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))

    if mean_rmsle < min_rmsle:
        min_rmsle, best_params = mean_rmsle, (max_depth, min_child_weight)
    print("Best params: {}, {}, RMSLE: {}".format(*best_params, min_rmsle))

  0%|          | 0/32 [00:00<?, ?it/s]

CV with max_depth=4, min_child_weight=1
	RMSLE 0.0992274 for 490 rounds
Best params: 4, 1, RMSLE: 0.0992274
CV with max_depth=4, min_child_weight=3
	RMSLE 0.0977214 for 480 rounds
Best params: 4, 3, RMSLE: 0.0977214
CV with max_depth=4, min_child_weight=5
	RMSLE 0.0980742 for 444 rounds
Best params: 4, 3, RMSLE: 0.0977214
CV with max_depth=4, min_child_weight=7
	RMSLE 0.09636439999999999 for 496 rounds
Best params: 4, 7, RMSLE: 0.09636439999999999
CV with max_depth=4, min_child_weight=9
	RMSLE 0.0979736 for 489 rounds
Best params: 4, 7, RMSLE: 0.09636439999999999
CV with max_depth=4, min_child_weight=11
	RMSLE 0.0989528 for 313 rounds
Best params: 4, 7, RMSLE: 0.09636439999999999
CV with max_depth=4, min_child_weight=13
	RMSLE 0.09923320000000001 for 473 rounds
Best params: 4, 7, RMSLE: 0.09636439999999999
CV with max_depth=4, min_child_weight=15
	RMSLE 0.09886740000000001 for 468 rounds
Best params: 4, 7, RMSLE: 0.09636439999999999
CV with max_depth=4, min_child_weight=17
	RMSLE 0.099

In [None]:
# Display the `best_params` value left behind by the most recently run search cell.
best_params

(4, 7)

`max_depth` = 4 [3 - 5]

`min_child_weight` = 7 [3 - 8]

#### `subsample`, `colsample_bytree`

In [None]:
# Starting configuration for the subsample / colsample_bytree search, with
# max_depth=4 and min_child_weight=7 fixed. Note that learning_rate is 0.2 in
# this cell. The search loop overwrites `subsample` and `colsample_bytree`.
params = dict(
    objective='reg:squarederror',
    # n_estimators=10000,
    tree_method='gpu_hist',
    booster='gbtree',
    base_score=7,
    max_depth=4,
    min_child_weight=7,
    learning_rate=0.2,
    subsample=1,
    colsample_bytree=1,
    # reg_alpha=30,
    eval_metric='rmse',
    seed=42,
    nthread=-1,
)

In [None]:
# Every (subsample, colsample_bytree) pair over 0.5, 0.6, ..., 1.0.
fractions = [i/10. for i in range(5, 11)]
gridsearch_params = [
    (row_fraction, column_fraction)
    for row_fraction in fractions
    for column_fraction in fractions
]

# Running best score and the parameter pair that produced it.
min_rmsle, best_params = float("Inf"), None

# Walk the grid backwards so the largest fractions are evaluated first.
for subsample, colsample in tqdm(gridsearch_params[::-1]):
    print("CV with subsample={}, colsample={}".format(subsample, colsample))
    # The shared `params` dict is modified in place for every candidate.
    params.update(subsample=subsample, colsample_bytree=colsample)
    cv_results = cv(
        params, dtrain,
        num_boost_round=500, early_stopping_rounds=50,
        nfold=5, shuffle=True, metrics={'rmse'}, seed=42,
    )

    # The target is log1p(price), hence the RMSE is reported as RMSLE.
    # `argmin` is the 0-based row index of the best boosting round.
    test_rmse = cv_results['test-rmse-mean']
    mean_rmsle, boost_rounds = test_rmse.min(), test_rmse.argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))

    if mean_rmsle < min_rmsle:
        min_rmsle, best_params = mean_rmsle, (subsample, colsample)
    print("Best params: {}, {}, RMSLE: {}".format(*best_params, min_rmsle))

  0%|          | 0/36 [00:00<?, ?it/s]

CV with subsample=1.0, colsample=1.0
	RMSLE 0.0972602 for 497 rounds
Best params: 1.0, 1.0, RMSLE: 0.0972602
CV with subsample=1.0, colsample=0.9
	RMSLE 0.09571159999999998 for 497 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=1.0, colsample=0.8
	RMSLE 0.0962374 for 495 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=1.0, colsample=0.7
	RMSLE 0.09606200000000001 for 495 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=1.0, colsample=0.6
	RMSLE 0.0959942 for 496 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=1.0, colsample=0.5
	RMSLE 0.09606039999999999 for 493 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=0.9, colsample=1.0
	RMSLE 0.09585059999999998 for 499 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=0.9, colsample=0.9
	RMSLE 0.0970518 for 485 rounds
Best params: 1.0, 0.9, RMSLE: 0.09571159999999998
CV with subsample=

KeyboardInterrupt: ignored

In [None]:
# Display the `best_params` value left behind by the most recently run search cell.
best_params

(1.0, 0.9)

`max_depth` = 4 [3 - 5]

`min_child_weight` = 7 [3 - 8]

`subsample` = 1 [0.9, 1]

`colsample_bytree` = 0.9 [0.9, 1]



#### `reg_lambda`

In [None]:
# Starting configuration for the reg_lambda search, carrying over the tree
# shape and sampling values chosen above. The search loop overwrites
# `reg_lambda` in place.
params = dict(
    objective='reg:squarederror',
    # n_estimators=10000,
    tree_method='gpu_hist',
    booster='gbtree',
    base_score=7,
    learning_rate=0.3,
    max_depth=4,
    min_child_weight=7,
    subsample=1,
    colsample_bytree=0.9,
    reg_lambda=1,
    # reg_alpha=0,
    eval_metric='rmse',
    seed=42,
    nthread=-1,
)

In [None]:
# Candidate values for XGBoost's `reg_lambda` (L2 regularisation on weights).
gridsearch_params = [0.1, 1.0, 5.0, 10.0, 50.0, 100.0]

# Running best score and the value that produced it.
min_rmsle = float("Inf")
best_params = None

for reg_lambda in tqdm(gridsearch_params):
    print("CV with reg_lambda={}".format(reg_lambda))
    # The shared `params` dict is modified in place for every candidate.
    params['reg_lambda'] = reg_lambda
    cv_results = cv(
        params,
        dtrain,
        num_boost_round=500,
        early_stopping_rounds=50,
        nfold=5,
        shuffle=True,
        metrics={'rmse'},
        seed=42,
    )

    # The target is log1p(price), hence the RMSE is reported as RMSLE.
    # `argmin` is the 0-based row index of the best boosting round.
    mean_rmsle = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))

    if mean_rmsle < min_rmsle:
        min_rmsle = mean_rmsle
        best_params = reg_lambda
    # Report the best value found so far. This line used to print the
    # candidate just evaluated, so the label and the RMSLE could disagree.
    print("Best params: {}, RMSLE: {}".format(best_params, min_rmsle))

  0%|          | 0/6 [00:00<?, ?it/s]

CV with reg_lambda=0.1
	RMSLE 0.0984012 for 406 rounds
Best params: 0.1, RMSLE: 0.0984012
CV with reg_lambda=1.0
	RMSLE 0.09719 for 496 rounds
Best params: 1.0, RMSLE: 0.09719
CV with reg_lambda=5.0
	RMSLE 0.0966784 for 335 rounds
Best params: 5.0, RMSLE: 0.0966784
CV with reg_lambda=10.0
	RMSLE 0.09688540000000001 for 347 rounds
Best params: 10.0, RMSLE: 0.0966784
CV with reg_lambda=50.0
	RMSLE 0.0971166 for 499 rounds
Best params: 50.0, RMSLE: 0.0966784
CV with reg_lambda=100.0
	RMSLE 0.09640679999999999 for 498 rounds
Best params: 100.0, RMSLE: 0.09640679999999999


In [None]:
# Display the `best_params` value left behind by the most recently run search cell.
best_params

`max_depth`: 4 [3 - 5]

`min_child_weight`: 7 [4 - 8]

`subsample`: 1 [0.9, 1],

`colsample_bytree`: 0.9, [0.9, 1]

`reg_lambda`: 5 [?]



#### `reg_alpha`

In [None]:
# Starting configuration for the reg_alpha search, with reg_lambda fixed at 5.
# The search loop overwrites `reg_alpha` in place.
params = dict(
    objective='reg:squarederror',
    # n_estimators=10000,
    tree_method='gpu_hist',
    booster='gbtree',
    base_score=7,
    learning_rate=0.3,
    max_depth=4,
    min_child_weight=7,
    subsample=1,
    colsample_bytree=0.9,
    reg_lambda=5,
    reg_alpha=0,
    eval_metric='rmse',
    seed=42,
    nthread=-1,
)

In [None]:
# Candidate values for XGBoost's `reg_alpha` (L1 regularisation on weights).
gridsearch_params = [0, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0]

# Running best score and the value that produced it.
min_rmsle = float("Inf")
best_params = None

for reg_alpha in tqdm(gridsearch_params):
    print("CV with reg_alpha={}".format(reg_alpha))
    # The shared `params` dict is modified in place for every candidate.
    params['reg_alpha'] = reg_alpha
    cv_results = cv(
        params,
        dtrain,
        num_boost_round=500,
        early_stopping_rounds=50,
        nfold=5,
        shuffle=True,
        metrics={'rmse'},
        seed=42,
    )

    # The target is log1p(price), hence the RMSE is reported as RMSLE.
    # `argmin` is the 0-based row index of the best boosting round.
    mean_rmsle = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSLE {} for {} rounds".format(mean_rmsle, boost_rounds))

    if mean_rmsle < min_rmsle:
        min_rmsle = mean_rmsle
        best_params = reg_alpha
    # Report the best value found so far. This line used to print the
    # candidate just evaluated, so the label and the RMSLE could disagree.
    print("Best params: {}, RMSLE: {}".format(best_params, min_rmsle))

  0%|          | 0/7 [00:00<?, ?it/s]

CV with reg_alpha=0
	RMSLE 0.0966784 for 335 rounds
Best params: 0, RMSLE: 0.0966784
CV with reg_alpha=0.1
	RMSLE 0.0955154 for 482 rounds
Best params: 0.1, RMSLE: 0.0955154
CV with reg_alpha=1.0
	RMSLE 0.0971118 for 463 rounds
Best params: 1.0, RMSLE: 0.0955154
CV with reg_alpha=5.0
	RMSLE 0.1034128 for 127 rounds
Best params: 5.0, RMSLE: 0.0955154
CV with reg_alpha=10.0
	RMSLE 0.1127802 for 70 rounds
Best params: 10.0, RMSLE: 0.0955154
CV with reg_alpha=50.0
	RMSLE 0.13565359999999999 for 33 rounds
Best params: 50.0, RMSLE: 0.0955154
CV with reg_alpha=100.0
	RMSLE 0.1472922 for 31 rounds
Best params: 100.0, RMSLE: 0.0955154


In [None]:
# Display the `best_params` value left behind by the most recently run search cell.
best_params

`max_depth`: 4 [3 - 5]

`min_child_weight`: 7 [4 - 8]

`subsample`: 1 [0.9, 1],

`colsample_bytree`: 0.9, [0.9, 1]

`reg_lambda`: 5 [?]

`reg_alpha`: 0.1 [?]



### `product_type == 'Investment'`

In [None]:
# Hand the fitted model and the test features to `create_submission`, which is
# not defined in this notebook (presumably provided by `from utils import *`).
# NOTE(review): neither `model` nor `X_test` is assigned in any visible cell
# (only `X_1` / `X_test_1` are), so this fails on Restart & Run All -- confirm
# where they are meant to come from.
create_submission(model, X_test)

In [None]:
# Build submission.csv from the model's predictions.
# NOTE(review): `model` and `X_test` are not assigned in any visible cell --
# confirm where they come from.
# NOTE(review): the visible training target is log1p(price_doc); if `model`
# was fitted on it, the predictions would need np.expm1 before export -- confirm.
pred = model.predict(X_test)

# Negative prices are invalid; warn and flip their sign.
if (pred < 0).any():
    print('WARNING: NEGATIVE PREDICTIONS')
    pred = np.abs(pred)

# Predictions are scaled by 0.85 before export (the reason for this factor is
# not documented here). They are written in the row order of the sample
# submission, so `pred` is assumed to follow that order -- TODO confirm.
submission = pd.read_csv('data/sample_submission.csv')
submission['price_doc'] = pred * 0.85
submission.to_csv('submission.csv', index=False)


In [None]:
# Upload submission.csv to the sberbank-russian-housing-market competition via
# the Kaggle CLI; the -m text is the description attached to the submission.
!kaggle competitions submit -c sberbank-russian-housing-market -f submission.csv -m "XGBRegressor w/out outliers"

In [None]:
# !kaggle competitions submissions -c sberbank-russian-housing-market