In [1]:
%cd /home/stasvlad/Documents/hse/sberbank/

/home/stasvlad/Documents/hse/sberbank


In [2]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [3]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [4]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
train_df = encode(train_df)

### Test dataset

In [6]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [7]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [8]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [9]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [10]:
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [11]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [12]:
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [13]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 6. XGBRegressor

In [14]:
class my_LGBRegressor(object):
    def __init__(self, params):
        self.params = params

    def fit(self, X, y, w=None):
        split = int(X.shape[0] * 0.8)
        indices = np.random.permutation(X.shape[0])
        train_id, test_id = indices[:split], indices[split:]
        X_train, y_train, w_train, X_val, y_val, w_val = X[train_id], y[train_id], w[train_id], X[test_id], y[test_id], w[test_id],
        d_train = lgb.Dataset(X_train, y_train, weight=w_train)
        d_valid = lgb.Dataset(X_val, y_val, weight=w_val) 

        bst_partial = lgb.train(self.params,
                                d_train, 10000,
                                valid_sets=d_valid,
                                callbacks = [lgb.early_stopping(50)])
                                
        num_round = bst_partial.best_iteration
        d_all = lgb.Dataset(X, label=y, weight=w)
        self.bst = lgb.train(self.params, d_all, num_round)

    def predict(self, X):
        return self.bst.predict(X)


class my_XGBRegressor(object):
    def __init__(self, params, product_type=-1):
        self.params = params
        self.product_type = product_type

    def fit(self, X, y, w=None):
        
        if self.product_type == 0:
            X = train_df[train_df['product_type'] == 0].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 0]['price_doc'].values)
            print(X.shape)

        if self.product_type == 1:
            X = train_df[train_df['product_type'] == 1].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 1]['price_doc'].values)
            print(X.shape)
            
        split = int(X.shape[0] * 0.8)
        indices = np.random.permutation(X.shape[0])
        train_id, test_id = indices[:split], indices[split:]

        X_train, y_train, w_train, X_val, y_val, w_val = X[train_id], y[train_id], w[train_id], X[test_id], y[test_id], w[test_id],
        X_train, y_train, w_train, X_val, y_val, w_val = X[train_id], y[train_id], w[train_id], X[test_id], y[test_id], w[test_id],

        d_train = DMatrix(X_train, label=y_train, weight = w_train)
        d_valid = DMatrix(X_val, label=y_val, weight = w_val) 

        print(f"Training until validation scores don't improve for 50 rounds") # !!!
        if self.params['booster'] == 'gblinear':
            num_boost_round = 10000
        else:
            num_boost_round = 5000

        bst_partial = train_xgb(self.params,
                                d_train,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=50,
                                evals=[(d_train, 'train'), (d_valid, 'val')],
                                verbose_eval=500)

        last_round = bst_partial.best_iteration
        print(f"[{last_round}]  RMSE: {bst_partial.best_score}")

        d_all = DMatrix(X, label=y, weight = w)
        self.bst = train_xgb(self.params,
                             d_all,
                             num_boost_round=last_round,
                             evals=[(d_train, 'train')],
                             verbose_eval=500)

    def predict(self, X_test):
        d_test = DMatrix(X_test)
        return self.bst.predict(d_test)


class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, train_df, test_df):
        w = train_df['w'].values
        X = train_df.drop(['sub_area', 'price_doc', 'w'], axis=1).values
        y = np.log1p(train_df['price_doc']).values
        X_test = test_df.drop('sub_area', axis=1).values

        all_df = pd.concat([train_df.drop(['sub_area', 'price_doc', 'w'], axis=1), test_df.drop('sub_area', axis=1)])
        imputer = SimpleImputer(strategy='median') # mean
        imputer.fit(all_df)

        kf = KFold(n_splits=self.n_folds, shuffle=True)  # random_state=42
        folds = list(kf.split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((X_test.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            print('\n\nTraining model: ' + str(type(model).__name__))
            S_test_i = np.zeros((X_test.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                print('ROUND ' + str(j+1))

                if (not isinstance(model, my_XGBRegressor)) and (not isinstance(model, my_LGBRegressor)):
                    X = imputer.transform(train_df.drop(['sub_area', 'price_doc', 'w'], axis=1).values)
                    X_test = imputer.transform(X_test)

                X_train = X[train_idx]
                y_train = y[train_idx]
                w_train = w[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                model.fit(X_train, y_train, w_train)

                y_train_pred = model.predict(X_train)
                y_pred = model.predict(X_holdout)

                print(f"[ALL]  train-RMSE  : {mean_squared_error(y_train_pred, y_train, squared=False)}")
                print(f"[ALL]  holdout-RMSE: {mean_squared_error(y_pred, y_holdout, squared=False)}")

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = model.predict(X_test)

            S_test[:, i] = S_test_i.mean(axis=1)

        self.S_train, self.S_test, self.y = S_train, S_test, y
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        y_pred_train = self.stacker.predict(S_train)
        print(f"\n\n[THE END]  train-RMSE  : {mean_squared_error(y_pred_train, y, squared=False)}")

        return y_pred

In [15]:
train_df['w'] = 1
train_df.loc[train_df['timestamp_year'] == 2014, 'w'] = 1.2
train_df.loc[train_df['timestamp_year'] == 2015, 'w'] = 1.5

## 6.1 Normalizing prices (`product_type == 'Investment'`)

In [16]:
train_df_0 = train_df[train_df['product_type'] == 0].copy()

In [17]:
train_2011_q3_index = train_df_0.loc[train_df_0['timestamp_year'] == 2011].loc[train_df_0['timestamp_month'] >= 7].loc[train_df_0['timestamp_month'] < 10].index
train_2011_q4_index = train_df_0.loc[train_df_0['timestamp_year'] == 2011].loc[train_df_0['timestamp_month'] >= 10].loc[train_df_0['timestamp_month'] <= 12].index
train_2012_q1_index = train_df_0.loc[train_df_0['timestamp_year'] == 2012].loc[train_df_0['timestamp_month'] >= 1].loc[train_df_0['timestamp_month'] < 4].index
train_2012_q2_index = train_df_0.loc[train_df_0['timestamp_year'] == 2012].loc[train_df_0['timestamp_month'] >= 4].loc[train_df_0['timestamp_month'] < 7].index
train_2012_q3_index = train_df_0.loc[train_df_0['timestamp_year'] == 2012].loc[train_df_0['timestamp_month'] >= 7].loc[train_df_0['timestamp_month'] < 10].index
train_2012_q4_index = train_df_0.loc[train_df_0['timestamp_year'] == 2012].loc[train_df_0['timestamp_month'] >= 10].loc[train_df_0['timestamp_month'] <= 12].index
train_2013_q1_index = train_df_0.loc[train_df_0['timestamp_year'] == 2013].loc[train_df_0['timestamp_month'] >= 1].loc[train_df_0['timestamp_month'] < 4].index
train_2013_q2_index = train_df_0.loc[train_df_0['timestamp_year'] == 2013].loc[train_df_0['timestamp_month'] >= 4].loc[train_df_0['timestamp_month'] < 7].index
train_2013_q3_index = train_df_0.loc[train_df_0['timestamp_year'] == 2013].loc[train_df_0['timestamp_month'] >= 7].loc[train_df_0['timestamp_month'] < 10].index
train_2013_q4_index = train_df_0.loc[train_df_0['timestamp_year'] == 2013].loc[train_df_0['timestamp_month'] >= 10].loc[train_df_0['timestamp_month'] <= 12].index
train_2014_q1_index = train_df_0.loc[train_df_0['timestamp_year'] == 2014].loc[train_df_0['timestamp_month'] >= 1].loc[train_df_0['timestamp_month'] < 4].index
train_2014_q2_index = train_df_0.loc[train_df_0['timestamp_year'] == 2014].loc[train_df_0['timestamp_month'] >= 4].loc[train_df_0['timestamp_month'] < 7].index
train_2014_q3_index = train_df_0.loc[train_df_0['timestamp_year'] == 2014].loc[train_df_0['timestamp_month'] >= 7].loc[train_df_0['timestamp_month'] < 10].index
train_2014_q4_index = train_df_0.loc[train_df_0['timestamp_year'] == 2014].loc[train_df_0['timestamp_month'] >= 10].loc[train_df_0['timestamp_month'] <= 12].index
train_2015_q1_index = train_df_0.loc[train_df_0['timestamp_year'] == 2015].loc[train_df_0['timestamp_month'] >= 1].loc[train_df_0['timestamp_month'] < 4].index
train_2015_q2_index = train_df_0.loc[train_df_0['timestamp_year'] == 2015].loc[train_df_0['timestamp_month'] >= 4].loc[train_df_0['timestamp_month'] < 7].index

In [18]:
train_q_idx = [train_2011_q3_index, train_2011_q4_index, train_2012_q1_index,
               train_2012_q2_index, train_2012_q3_index, train_2012_q4_index,
               train_2013_q1_index, train_2013_q2_index, train_2013_q3_index,
               train_2013_q4_index, train_2014_q1_index, train_2014_q2_index,
               train_2014_q3_index, train_2014_q4_index, train_2015_q1_index,
               train_2015_q2_index]

In [19]:
sum(len(q) for q in train_q_idx) == len(train_df_0)

True

In [20]:
train_q_idx.pop()

Int64Index([28765, 28771, 28772, 28774, 28775, 28777, 28778, 28781, 28783,
            28784,
            ...
            30455, 30457, 30460, 30463, 30464, 30466, 30469, 30470, 30472,
            30473],
           dtype='int64', name='id', length=716)

In [21]:
sum(len(q) for q in train_q_idx) == len(train_df_0) - len(train_df.loc[train_2015_q2_index])

True

In [22]:
for q in train_q_idx:
    print((train_df.loc[q].price_doc / train_df.loc[q].full_sq).mean())

146003.66978784843
147007.56866771728
154123.60472626315
156463.75911237008
157370.6808427691
158166.67377496415
160119.11724372697
159137.1708521332
163726.89049531604
158594.24925726495
161002.8972664781
165221.11492517497
165168.61430014294
169940.21707769358
173190.639734523


In [23]:
train_df['average_q_price'] = 1

In [24]:
train_df.loc[train_2015_q2_index, 'average_q_price'] = 1
base_price = (train_df.loc[train_2015_q2_index].price_doc / train_df.loc[train_2015_q2_index].full_sq).mean()
for q in train_q_idx:
    train_df.loc[q, 'average_q_price'] = base_price / (train_df.loc[q].price_doc / train_df.loc[q].full_sq).mean() 


## 6.1 Normalizing prices (`product_type == 'OwnerOccupied'`)

In [25]:
train_df_1 = train_df[train_df['product_type'] == 1].copy()

In [26]:
train_2011_q3_index = train_df_1.loc[train_df_1['timestamp_year'] == 2011].loc[train_df_1['timestamp_month'] >= 7].loc[train_df_1['timestamp_month'] < 10].index
train_2011_q4_index = train_df_1.loc[train_df_1['timestamp_year'] == 2011].loc[train_df_1['timestamp_month'] >= 10].loc[train_df_1['timestamp_month'] <= 12].index
train_2012_q1_index = train_df_1.loc[train_df_1['timestamp_year'] == 2012].loc[train_df_1['timestamp_month'] >= 1].loc[train_df_1['timestamp_month'] < 4].index
train_2012_q2_index = train_df_1.loc[train_df_1['timestamp_year'] == 2012].loc[train_df_1['timestamp_month'] >= 4].loc[train_df_1['timestamp_month'] < 7].index
train_2012_q3_index = train_df_1.loc[train_df_1['timestamp_year'] == 2012].loc[train_df_1['timestamp_month'] >= 7].loc[train_df_1['timestamp_month'] < 10].index
train_2012_q4_index = train_df_1.loc[train_df_1['timestamp_year'] == 2012].loc[train_df_1['timestamp_month'] >= 10].loc[train_df_1['timestamp_month'] <= 12].index
train_2013_q1_index = train_df_1.loc[train_df_1['timestamp_year'] == 2013].loc[train_df_1['timestamp_month'] >= 1].loc[train_df_1['timestamp_month'] < 4].index
train_2013_q2_index = train_df_1.loc[train_df_1['timestamp_year'] == 2013].loc[train_df_1['timestamp_month'] >= 4].loc[train_df_1['timestamp_month'] < 7].index
train_2013_q3_index = train_df_1.loc[train_df_1['timestamp_year'] == 2013].loc[train_df_1['timestamp_month'] >= 7].loc[train_df_1['timestamp_month'] < 10].index
train_2013_q4_index = train_df_1.loc[train_df_1['timestamp_year'] == 2013].loc[train_df_1['timestamp_month'] >= 10].loc[train_df_1['timestamp_month'] <= 12].index
train_2014_q1_index = train_df_1.loc[train_df_1['timestamp_year'] == 2014].loc[train_df_1['timestamp_month'] >= 1].loc[train_df_1['timestamp_month'] < 4].index
train_2014_q2_index = train_df_1.loc[train_df_1['timestamp_year'] == 2014].loc[train_df_1['timestamp_month'] >= 4].loc[train_df_1['timestamp_month'] < 7].index
train_2014_q3_index = train_df_1.loc[train_df_1['timestamp_year'] == 2014].loc[train_df_1['timestamp_month'] >= 7].loc[train_df_1['timestamp_month'] < 10].index
train_2014_q4_index = train_df_1.loc[train_df_1['timestamp_year'] == 2014].loc[train_df_1['timestamp_month'] >= 10].loc[train_df_1['timestamp_month'] <= 12].index
train_2015_q1_index = train_df_1.loc[train_df_1['timestamp_year'] == 2015].loc[train_df_1['timestamp_month'] >= 1].loc[train_df_1['timestamp_month'] < 4].index
train_2015_q2_index = train_df_1.loc[train_df_1['timestamp_year'] == 2015].loc[train_df_1['timestamp_month'] >= 4].loc[train_df_1['timestamp_month'] < 7].index

In [27]:
train_q_idx = [train_2011_q3_index, train_2011_q4_index, train_2012_q1_index,
               train_2012_q2_index, train_2012_q3_index, train_2012_q4_index,
               train_2013_q1_index, train_2013_q2_index, train_2013_q3_index,
               train_2013_q4_index, train_2014_q1_index, train_2014_q2_index,
               train_2014_q3_index, train_2014_q4_index, train_2015_q1_index,
               train_2015_q2_index]

In [28]:
sum(len(q) for q in train_q_idx) == len(train_df_1)

True

In [29]:
train_q_idx.pop()

Int64Index([28763, 28764, 28766, 28767, 28768, 28770, 28773, 28776, 28779,
            28780,
            ...
            30453, 30454, 30456, 30458, 30461, 30462, 30465, 30467, 30468,
            30471],
           dtype='int64', name='id', length=896)

In [30]:
sum(len(q) for q in train_q_idx) == len(train_df_1) - len(train_df.loc[train_2015_q2_index])

True

In [31]:
for q in train_q_idx:
    print((train_df.loc[q].price_doc / train_df.loc[q].full_sq).mean())

113073.97260273973
136145.3975765336
147778.72739137296
138993.9483576403
96237.5923084805
90742.56271618714
98639.39428290667
99585.48477661972
104655.732082085
107174.24678825888
106913.52490299725
111861.94168330009
115122.04022919902
118082.13624021263
120242.83070880704


In [32]:
train_df.loc[train_2015_q2_index, 'average_q_price'] = 1
base_price = (train_df.loc[train_2015_q2_index].price_doc / train_df.loc[train_2015_q2_index].full_sq).mean()
for q in train_q_idx:
    train_df.loc[q, 'average_q_price'] = base_price / (train_df.loc[q].price_doc / train_df.loc[q].full_sq).mean() 


In [33]:
train_df['price_doc'] = train_df['price_doc'] * train_df['average_q_price']

In [34]:
params = {'objective': 'reg:squarederror',
          'tree_method': 'gpu_hist',
          'booster': 'gbtree',
          'base_score': 5,
          'learning_rate': 0.05,
          'max_depth': 5,
          'min_child_weight': 5,
          'eval_metric': 'rmse',
          'subsample': 1,
          'colsample_bytree': 0.8,
          'reg_lambda': 1,
          'reg_alpha': 0,
          'seed': 42,
          'nthread': -1
          }

In [35]:
#stacker
LR = LinearRegression()

#base models
XGB_F = my_XGBRegressor(params)

E = Ensemble(
    n_folds=5,
    stacker=LR,
    base_models=[XGB_F]
)

y_pred = E.fit_predict(train_df, test_df)



Training model: my_XGBRegressor
ROUND 1
Training until validation scores don't improve for 50 rounds
[0]	train-rmse:10.28633	val-rmse:10.28486
[500]	train-rmse:0.09520	val-rmse:0.12781
[1000]	train-rmse:0.07312	val-rmse:0.12571
[1060]	train-rmse:0.07112	val-rmse:0.12570
[1010]  RMSE: 0.125669
[0]	train-rmse:10.28631
[500]	train-rmse:0.09851
[1000]	train-rmse:0.07861
[1009]	train-rmse:0.07831
[ALL]  train-RMSE  : 0.07862343721577819
[ALL]  holdout-RMSE: 0.1264323584414182
ROUND 2
Training until validation scores don't improve for 50 rounds
[0]	train-rmse:10.28398	val-rmse:10.28021
[500]	train-rmse:0.09415	val-rmse:0.12778
[1000]	train-rmse:0.07334	val-rmse:0.12602
[1055]	train-rmse:0.07181	val-rmse:0.12599
[1006]  RMSE: 0.125976
[0]	train-rmse:10.28401
[500]	train-rmse:0.09778
[1000]	train-rmse:0.07782
[1005]	train-rmse:0.07771
[ALL]  train-RMSE  : 0.07813928715827942
[ALL]  holdout-RMSE: 0.12781656752603515
ROUND 3
Training until validation scores don't improve for 50 rounds
[0]	trai

In [36]:
np.save('predictions/XGB_F_norm_mean_train', E.S_train)
np.save('predictions/XGB_F_norm_mean_test', E.S_test)
(np.load('predictions/XGB_F_norm_mean_train.npy') == E.S_train).all(), (np.load('predictions/XGB_F_norm_mean_test.npy') == E.S_test).all()

(True, True)

In [37]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
result = np.expm1(E.S_test)

if len(result[result < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [58]:
submission['price_doc'] = 1.02*result # 0.9
submission.to_csv('data/submits/submission.csv', index='id')

In [59]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "!XGB_F norm mean! 1.02"

100%|████████████████████████████████████████| 181k/181k [00:02<00:00, 90.4kB/s]
Successfully submitted to Sberbank Russian Housing Market