In [1]:
%cd /home/stasvlad/Documents/hse/sberbank

/home/stasvlad/Documents/hse/sberbank


In [2]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [3]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [4]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
train_df = encode(train_df)

### Test dataset

In [6]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [7]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [8]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [9]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [10]:
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [11]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [12]:
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [13]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

### `Ensembling`

In [14]:
ETR_train = np.load('predictions/ETR_train.npy')
ETR_test = np.load('predictions/ETR_test.npy')

GBR_train = np.load('predictions/GBR_train.npy')
GBR_test = np.load('predictions/GBR_test.npy')

LGB_F_train = np.load('predictions/LGB_F_train.npy')
LGB_F_test = np.load('predictions/LGB_F_test.npy')

RF_train = np.load('predictions/RF_train.npy')
RF_test = np.load('predictions/RF_test.npy')

XGB_F_norm_mean_train = np.load('predictions/XGB_F_norm_mean_train.npy')
XGB_F_norm_mean_test = np.load('predictions/XGB_F_norm_mean_test.npy')

XGB_F_norm_median_train = np.load('predictions/XGB_F_norm_median_train.npy')
XGB_F_norm_median_test = np.load('predictions/XGB_F_norm_median_test.npy')

XGB_F_split_0_train = np.load('predictions/XGB_F_split_0_train.npy')
XGB_F_split_0_test = np.load('predictions/XGB_F_split_0_test.npy')

XGB_F_split_1_train = np.load('predictions/XGB_F_split_1_train.npy')
XGB_F_split_1_test = np.load('predictions/XGB_F_split_1_test.npy')

XGB_F_train = np.load('predictions/XGB_F_train.npy')
XGB_F_test = np.load('predictions/XGB_F_test.npy')

XGB_F_with_weights_train = np.load('predictions/XGB_F_with_weights_train.npy')
XGB_F_with_weights_test = np.load('predictions/XGB_F_with_weights_test.npy')

In [15]:
y = np.log1p(train_df['price_doc']).values

In [64]:
S_train = np.concatenate(
    [ETR_train,
     GBR_train,
     LGB_F_train,
     RF_train,
     XGB_F_norm_mean_train,
     XGB_F_norm_median_train,
     # XGB_F_split_0_train,
     # XGB_F_split_1_train,
     XGB_F_train,
     XGB_F_with_weights_train],
    axis=1
)
S_test = np.concatenate(
    [ETR_test,
     GBR_test,
     LGB_F_test,
     RF_test,
     XGB_F_norm_mean_test,
     XGB_F_norm_median_test,
     # XGB_F_split_0_test,
     # XGB_F_split_1_test,
     XGB_F_test,
     XGB_F_with_weights_test],
    axis=1)

S_train.shape, S_test.shape

((27762, 8), (7662, 8))

## LR

In [22]:
stacker = LinearRegression(fit_intercept=False)
stacker.fit(S_train, y)
y_pred = stacker.predict(S_test)
y_pred_train = stacker.predict(S_train)
print(f"train-RMSE  : {mean_squared_error(y_pred_train, y, squared=False)}")

train-RMSE  : 0.12501300072597782


In [23]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
result = np.expm1(y_pred)

if len(result[result < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [34]:
submission['price_doc'] = 0.915*result # 0.9
submission.to_csv('data/submits/submission.csv', index='id')

In [35]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "Ensemble (LR)"

100%|████████████████████████████████████████| 181k/181k [00:02<00:00, 90.3kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [65]:
class my_XGBRegressor(object):
    def __init__(self, params, product_type=-1):
        self.params = params
        self.product_type = product_type

    def fit(self, X, y, w=None):
        # if w == None:
        #    w = np.ones(X.shape[0])

        if self.product_type == 0:
            X = train_df[train_df['product_type'] == 0].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 0]['price_doc'].values)
            print(X.shape)

        if self.product_type == 1:
            X = train_df[train_df['product_type'] == 1].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 1]['price_doc'].values)
            print(X.shape)
            
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)  # random_state=42
        d_train = DMatrix(X_train, label=y_train)  # weight = w_train
        d_valid = DMatrix(X_val, label=y_val)  # weight = w_valid

        print(f"Training until validation scores don't improve for 50 rounds") # !!!
        if self.params['booster'] == 'gblinear':
            num_boost_round = 50000
        else:
            num_boost_round = 5000

        bst_partial = train_xgb(self.params,
                                d_train,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=50,
                                evals=[(d_train, 'train'), (d_valid, 'val')],
                                verbose_eval=500)

        last_round = bst_partial.best_iteration
        print(f"[{last_round}]  RMSE: {bst_partial.best_score}")

        d_all = DMatrix(X, label=y)  # weight = w
        self.bst = train_xgb(self.params,
                             d_all,
                             num_boost_round=last_round,
                             evals=[(d_train, 'train')],
                             verbose_eval=500)

    def predict(self, X_test):
        d_test = DMatrix(X_test)
        return self.bst.predict(d_test)

In [91]:
train_df['w'] = 1
train_df.loc[train_df['timestamp_year'] == 2014, 'w'] = 1.2
train_df.loc[train_df['timestamp_year'] == 2015, 'w'] = 1.5

In [93]:
params_xgb_lin = {'objective': 'reg:squarederror',
                  'booster': 'gblinear',
                  'base_score': 7,
                  'max_depth': 5,
                  'learning_rate': 0.3,
                  'eval_metric': 'rmse',
                  'seed': 42,
                  'nthread': -1
                  }

In [94]:
stacker = my_XGBRegressor(params_xgb_lin)
stacker.fit(S_train, y, train_df['w'])
y_pred = stacker.predict(S_test)
y_pred_train = stacker.predict(S_train)
print(f"train-RMSE  : {mean_squared_error(y_pred_train, y, squared=False)}")

Training until validation scores don't improve for 50 rounds
Parameters: { "max_depth" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-rmse:8.57161	val-rmse:8.57470
[500]	train-rmse:0.23250	val-rmse:0.22704
[1000]	train-rmse:0.21343	val-rmse:0.20834
[1500]	train-rmse:0.19735	val-rmse:0.19258
[2000]	train-rmse:0.18392	val-rmse:0.17941
[2500]	train-rmse:0.17278	val-rmse:0.16849
[3000]	train-rmse:0.16363	val-rmse:0.15953
[3500]	train-rmse:0.15616	val-rmse:0.15220
[4000]	train-rmse:0.15010	val-rmse:0.14625
[4500]	train-rmse:0.14522	val-rmse:0.14148
[5000]	train-rmse:0.14133	val-rmse:0.13766
[5500]	train-rmse:0.13822	val-rmse:0.13462
[6000]	train-rmse:0.13577	val-rmse:0.13222
[6500]	train-rmse:0.13383	val-rmse:0.13032
[7000]	train-rmse:0.13231	val-rmse:0.12883
[7500]	trai

In [95]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
result = np.expm1(y_pred)

if len(result[result < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [104]:
submission['price_doc'] = 0.935*result # 0.9
submission.to_csv('data/submits/submission.csv', index='id')

In [105]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "Ensemble"

100%|████████████████████████████████████████| 121k/121k [00:02<00:00, 60.5kB/s]
Successfully submitted to Sberbank Russian Housing Market