In [1]:
%cd /home/stasvlad/Documents/hse/sberbank/

/home/stasvlad/Documents/hse/sberbank


In [2]:
from utils import *

import folium
import geopandas as gpd

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_squared_log_error, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

## Data description

In [3]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])
tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [4]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
train_df = encode(train_df)

### Test dataset

In [6]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [7]:
train_df['is_train'] = 1
test_df['is_train'] = 0

coords_df = pd.read_csv('data/geo/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_df['latitude']
all_df['longitude'] = coords_df['longitude']

## 3. Removing outliers

In [8]:
all_df = remove_outliers(all_df)

In [9]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [10]:
# train_df = remove_fake_prices(train_df)
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 4. Modeling

### `product_type == 'Investment'`

In [11]:
train_df_0 = train_df[train_df['product_type'] == 0]
test_df_0 = test_df[test_df['product_type'] == 0]

X_0 = train_df_0.drop(['sub_area', 'price_doc'], axis=1).copy()
y_0 = np.log1p(train_df_0['price_doc'])
X_test_0 = test_df_0.drop(['sub_area'], axis=1).copy()

In [12]:
X_train_0, X_val_0, y_train_0, y_val_0 = train_test_split(X_0, y_0, test_size=0.2, random_state=42)
X_train_0.shape, X_val_0.shape

((13442, 295), (3361, 295))

In [13]:
params = {'objective': 'reg:squarederror',
          'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

model_0 = XGBRegressor(**params)

model_0.fit(X_train_0, y_train_0, eval_set=[(X_train_0, y_train_0), (X_val_0, y_val_0)], early_stopping_rounds=50)

[0]	validation_0-rmse:8.40020	validation_1-rmse:8.40166
[1]	validation_0-rmse:7.98123	validation_1-rmse:7.98286
[2]	validation_0-rmse:7.58322	validation_1-rmse:7.58504
[3]	validation_0-rmse:7.20514	validation_1-rmse:7.20708
[4]	validation_0-rmse:6.84599	validation_1-rmse:6.84807
[5]	validation_0-rmse:6.50481	validation_1-rmse:6.50692
[6]	validation_0-rmse:6.18072	validation_1-rmse:6.18293
[7]	validation_0-rmse:5.87285	validation_1-rmse:5.87519
[8]	validation_0-rmse:5.58042	validation_1-rmse:5.58268
[9]	validation_0-rmse:5.30262	validation_1-rmse:5.30519
[10]	validation_0-rmse:5.03876	validation_1-rmse:5.04125
[11]	validation_0-rmse:4.78810	validation_1-rmse:4.79080
[12]	validation_0-rmse:4.55005	validation_1-rmse:4.55279
[13]	validation_0-rmse:4.32389	validation_1-rmse:4.32646
[14]	validation_0-rmse:4.10905	validation_1-rmse:4.11164
[15]	validation_0-rmse:3.90499	validation_1-rmse:3.90747
[16]	validation_0-rmse:3.71113	validation_1-rmse:3.71373
[17]	validation_0-rmse:3.52699	validation

XGBRegressor(base_score=7, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, eval_metric='rmse',
             gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=4, min_child_weight=7, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=8,
             nthread=-1, num_parallel_tree=1, random_state=42, reg_alpha=1,
             reg_lambda=5, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [14]:
y_pred_train = np.abs(model_0.predict(X_train_0))
y_pred_val = np.abs(model_0.predict(X_val_0))
print('___________________________________')
print('      RMSLE\n')
print('      TRAIN_1:', np.sqrt(mean_squared_log_error(np.expm1(y_train_0), np.expm1(y_pred_train))))
print(' VALIDATION_1:', np.sqrt(mean_squared_log_error(np.expm1(y_val_0), np.expm1(y_pred_val))))
print('___________________________________')
print('      MAE\n')
print('      TRAIN_1:', mean_absolute_error(np.expm1(y_train_0), np.expm1(y_pred_train)))
print(' VALIDATION_1:', mean_absolute_error(np.expm1(y_val_0), np.expm1(y_pred_val)))

___________________________________
      RMSLE

      TRAIN_1: 0.16094985176628493
 VALIDATION_1: 0.1874219390780706
___________________________________
      MAE

      TRAIN_1: 754931.9247182339
 VALIDATION_1: 890830.3720618864


### `product_type == 'OwnerOccupier'`

In [15]:
train_df_1 = train_df[train_df['product_type'] == 1]
test_df_1 = test_df[test_df['product_type'] == 1]

X_1 = train_df_1.drop(['sub_area', 'price_doc'], axis=1).copy()
y_1 = np.log1p(train_df_1['price_doc'])
X_test_1 = test_df_1.drop(['sub_area'], axis=1).copy()

In [16]:
X_train_1, X_val_1, y_train_1, y_val_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)
X_train_1.shape, X_val_1.shape

((8795, 295), (2199, 295))

In [17]:
params = {'objective': 'reg:squarederror',
          'n_estimators': 10000,
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 7,
          'learning_rate': 0.05,
          'max_depth': 4,
          'min_child_weight': 7,
          'subsample': 1,
          'colsample_bytree': 0.9,
          'reg_lambda': 5,
          'reg_alpha': 1,
          'eval_metric': 'rmse',
          'seed': 42,
          'nthread': -1
          }

model_1 = XGBRegressor(**params)

model_1.fit(X_train_1, y_train_1, eval_set=[(X_train_1, y_train_1), (X_val_1, y_val_1)], early_stopping_rounds=50)

[0]	validation_0-rmse:8.14854	validation_1-rmse:8.13847
[1]	validation_0-rmse:7.74241	validation_1-rmse:7.73230
[2]	validation_0-rmse:7.35657	validation_1-rmse:7.34649
[3]	validation_0-rmse:6.99005	validation_1-rmse:6.98006
[4]	validation_0-rmse:6.64181	validation_1-rmse:6.63179
[5]	validation_0-rmse:6.31099	validation_1-rmse:6.30094
[6]	validation_0-rmse:5.99668	validation_1-rmse:5.98679
[7]	validation_0-rmse:5.69813	validation_1-rmse:5.68838
[8]	validation_0-rmse:5.41446	validation_1-rmse:5.40479
[9]	validation_0-rmse:5.14499	validation_1-rmse:5.13545
[10]	validation_0-rmse:4.88902	validation_1-rmse:4.87973
[11]	validation_0-rmse:4.64582	validation_1-rmse:4.63672
[12]	validation_0-rmse:4.41483	validation_1-rmse:4.40566
[13]	validation_0-rmse:4.19534	validation_1-rmse:4.18641
[14]	validation_0-rmse:3.98683	validation_1-rmse:3.97812
[15]	validation_0-rmse:3.78872	validation_1-rmse:3.78003
[16]	validation_0-rmse:3.60055	validation_1-rmse:3.59200
[17]	validation_0-rmse:3.42179	validation

XGBRegressor(base_score=7, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9, eval_metric='rmse',
             gamma=0, gpu_id=0, importance_type='gain',
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=4, min_child_weight=7, missing=nan,
             monotone_constraints='()', n_estimators=10000, n_jobs=8,
             nthread=-1, num_parallel_tree=1, random_state=42, reg_alpha=1,
             reg_lambda=5, scale_pos_weight=1, seed=42, subsample=1,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [18]:
y_pred_train = np.abs(model_1.predict(X_train_1))
y_pred_val = np.abs(model_1.predict(X_val_1))
print('___________________________________')
print('      RMSLE\n')
print('      TRAIN_1:', np.sqrt(mean_squared_log_error(np.expm1(y_train_1), np.expm1(y_pred_train))))
print(' VALIDATION_1:', np.sqrt(mean_squared_log_error(np.expm1(y_val_1), np.expm1(y_pred_val))))
print('___________________________________')
print('      MAE\n')
print('      TRAIN_1:', mean_absolute_error(np.expm1(y_train_1), np.expm1(y_pred_train)))
print(' VALIDATION_1:', mean_absolute_error(np.expm1(y_val_1), np.expm1(y_pred_val)))

___________________________________
      RMSLE

      TRAIN_1: 0.05320001537975967
 VALIDATION_1: 0.09145954955814795
___________________________________
      MAE

      TRAIN_1: 249876.4265704946
 VALIDATION_1: 382731.755457026


In [19]:
test_df_nan = test_df[test_df['product_type'].isna()]
X_test_nan = test_df_nan.drop(['sub_area'], axis=1).copy()

In [20]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
pred_0 = np.expm1(model_0.predict(X_test_0))
pred_1 = np.expm1(model_1.predict(X_test_1))
pred_nan = np.expm1(model_1.predict(X_test_nan))

if len(pred_0[pred_0 < 0]) or len(pred_1[pred_1 < 0]) or len(pred_nan[pred_nan < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [40]:
submission.loc[X_test_0.index, 'price_doc'] = 0.87*pred_0
submission.loc[X_test_1.index, 'price_doc'] = 0.93*pred_1
submission.loc[X_test_nan.index, 'price_doc'] = 0.95*pred_nan
submission.to_csv('data/submits/submission.csv', index='id')

In [44]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "m2 test"

100%|████████████████████████████████████████| 121k/121k [00:02<00:00, 46.1kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [None]:
# !kaggle competitions submissions -c sberbank-russian-housing-market

In [None]:
def get_place(my_score):
    df = pd.read_csv('submits/publicleaderboard.csv')
    scores = df['Score'].values
    scores = np.append(scores, my_score)
    scores = np.sort(scores)
    print(f'{np.where(scores == my_score)[0][0]} / {len(scores)}')

In [None]:
get_place(0.31840)

1442 / 3266
