In [46]:
# Colab-only setup: mount Google Drive at /content/drive so the project files
# copied by the following cells are reachable. force_remount=True is passed so
# the call is repeated even when a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [47]:
!cp /content/drive/MyDrive/sberbank/utils.py .
!pip install geopandas



In [48]:
!mkdir data
!cp -r /content/drive/MyDrive/sberbank/. data/

mkdir: cannot create directory ‘data’: File exists


In [49]:
from utils import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [50]:
# Read the three raw tables. 'timestamp' is parsed into datetimes on load and
# the train/test rows are indexed by their 'id' column.
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id', parse_dates=['timestamp'])
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# tverskoe_issue_fix is not defined in this notebook (presumably it comes from
# the utils star import). Its return value is ignored, so it presumably
# modifies the frame in place -- TODO confirm against utils.py.
for frame in (train_df, test_df):
    tverskoe_issue_fix(frame)

Fix:  550
Fix:  149


## 1. Data preprocessing
## Part I (encoding and correcting mistakes)

### Macro dataset

In [51]:
# Some macro columns are loaded as strings because they use a decimal comma,
# and 'child_on_acc_pre_school' additionally contains the placeholder '#!'.
# Only columns that are still of object dtype are processed, so re-running the
# cell after a successful conversion is a no-op instead of an AttributeError
# from calling .str on a float column.
for column in macro_df.select_dtypes('object').columns:
    cleaned = (
        macro_df[column]
        # regex=False: literal replacement regardless of the pandas default.
        .str.replace('#!', 'nan', regex=False)   # placeholder -> parsed as NaN below
        .str.replace(',', '.', regex=False)      # decimal comma -> decimal point
    )
    macro_df[column] = cleaned.astype(float)

# Sanity check: no string-typed columns may remain.
if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [52]:
# encode() is not defined in this notebook (presumably provided by the utils
# star import). The returned frame replaces train_df, so re-running this cell
# feeds already-encoded data back into encode() -- NOTE(review): confirm that
# encode() tolerates that.
train_df = encode(train_df)

### Test dataset

In [53]:
# Same encoding step as for the training set, applied to the test frame.
# encode() is presumably provided by the utils star import.
test_df = encode(test_df)

## Part II (filling missing values)

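No imputation is performed at this stage: the XGBoost and LightGBM models handle missing values (`np.nan`) themselves. The scikit-learn models receive median-imputed features inside the `Ensemble` class below.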

## 2. Encoding `sub_area` feature

In [54]:
# Tag every row with its origin so that the two sets can be separated again
# after the processing steps that run on the combined frame.
train_df = train_df.assign(is_train=1)
test_df = test_df.assign(is_train=0)

coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat((train_df, test_df))

# Series assignment aligns on the index, so the coordinates are matched to the
# rows of all_df by 'id'.
for coord_name in ('latitude', 'longitude'):
    all_df[coord_name] = coords_df[coord_name]

## 3. Removing outliers

In [55]:
# remove_outliers() is not defined in this notebook (presumably provided by
# the utils star import); its result replaces the combined train+test frame.
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [56]:
# create_new_features() is not defined in this notebook (presumably provided
# by the utils star import); its result replaces the combined frame.
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [57]:
# Split the combined frame back into its two parts using the is_train marker.
# The marker is dropped from both; the test part also loses the target column.
train_rows = all_df['is_train'] == 1
test_rows = all_df['is_train'] == 0
train_df = all_df[train_rows].drop(columns=['is_train'])
test_df = all_df[test_rows].drop(columns=['is_train', 'price_doc'])

In [58]:
# Disabled alternative: train_df = remove_fake_prices(train_df)
# Instead, a list of row ids stored on disk is loaded and those rows are
# removed from the training set (ids are index labels of train_df).
idx_outliers = np.loadtxt('data/idx_outliers.txt').astype(int)
train_df = train_df.drop(index=idx_outliers)

### `Ensembling`

In [59]:
class my_LGBRegressor(object):
    """LightGBM regressor with a minimal fit/predict interface.

    ``fit`` first trains on a random 80% of the data with early stopping on
    the remaining 20% to find the number of boosting rounds, then retrains on
    all of the data for exactly that many rounds.

    Parameters
    ----------
    params : dict
        Parameter dictionary passed unchanged to ``lgb.train``.
    random_state : int or None, default None
        Seed for the internal train/validation split. ``None`` keeps the
        original behaviour (a different split on every call).
    """

    MAX_BOOST_ROUNDS = 10000      # upper bound for the early-stopped run
    EARLY_STOPPING_ROUNDS = 50    # patience on the validation metric
    VALIDATION_SIZE = 0.2         # share of rows held out for early stopping

    def __init__(self, params, random_state=None):
        self.params = params
        self.random_state = random_state

    @staticmethod
    def _quiet_train_kwargs(callbacks=()):
        """Return ``lgb.train`` keyword arguments that silence per-round logs.

        ``verbose_eval`` was removed from ``lgb.train`` in LightGBM 4.0; newer
        releases expect the ``log_evaluation`` callback instead. Older
        releases without that callback still get ``verbose_eval=False``.
        """
        callbacks = list(callbacks)
        if hasattr(lgb, 'log_evaluation'):
            callbacks.append(lgb.log_evaluation(period=0))
            return {'callbacks': callbacks}
        return {'callbacks': callbacks, 'verbose_eval': False}

    def fit(self, X, y, w=None):
        """Fit the booster on ``X``/``y`` and return ``self``.

        ``w`` is accepted for interface compatibility but is currently not
        used (sample weighting is disabled).
        """
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=self.VALIDATION_SIZE, random_state=self.random_state)
        d_train = lgb.Dataset(X_train, label=y_train)
        d_valid = lgb.Dataset(X_val, label=y_val)

        # Stage 1: early-stopped run, used only to pick the number of rounds.
        bst_partial = lgb.train(
            self.params,
            d_train,
            self.MAX_BOOST_ROUNDS,
            valid_sets=[d_valid],
            **self._quiet_train_kwargs(
                [lgb.early_stopping(self.EARLY_STOPPING_ROUNDS)]))

        # Fall back to the number of rounds actually trained if no best
        # iteration was recorded (a value of 0 would train an empty model).
        num_round = bst_partial.best_iteration or bst_partial.current_iteration()

        # Stage 2: retrain on all rows for the selected number of rounds.
        d_all = lgb.Dataset(X, label=y)
        self.bst = lgb.train(self.params, d_all, num_round,
                             **self._quiet_train_kwargs())
        return self

    def predict(self, X):
        """Return the predictions of the booster trained on all rows."""
        return self.bst.predict(X)


class my_XGBRegressor(object):
    """XGBoost regressor with a minimal fit/predict interface.

    ``fit`` trains with early stopping on a random 80/20 split to find the
    number of boosting rounds, then retrains on all rows for that many rounds.

    Parameters
    ----------
    params : dict
        Parameter dictionary passed unchanged to ``xgboost.train``.
    product_type : int, default -1
        When 0 or 1, ``fit`` ignores the ``X``/``y`` it is given and trains on
        the rows of the module-level ``train_df`` with that ``product_type``.
        Any other value trains on the data passed to ``fit``.
    random_state : int or None, default None
        Seed for the internal train/validation split. ``None`` keeps the
        original behaviour (a different split on every call).
    """

    EARLY_STOPPING_ROUNDS = 50    # patience on the validation metric
    VALIDATION_SIZE = 0.2         # share of rows held out for early stopping

    def __init__(self, params, product_type=-1, random_state=None):
        self.params = params
        self.product_type = product_type
        self.random_state = random_state

    def fit(self, X, y, w=None):
        """Fit the booster and return ``self``.

        ``w`` is accepted for interface compatibility but is currently not
        used (sample weighting is disabled).
        """
        if self.product_type in (0, 1):
            # Features AND target must come from the same frame. The target
            # column only exists in train_df (it is dropped from test_df).
            subset = train_df[train_df['product_type'] == self.product_type]
            X = subset.drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(subset['price_doc'].values)
            print(X.shape)

        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=self.VALIDATION_SIZE, random_state=self.random_state)
        d_train = DMatrix(X_train, label=y_train)
        d_valid = DMatrix(X_val, label=y_val)

        print("Training until validation scores don't improve for 50 rounds") # !!!
        # The linear booster is given a larger round budget than the tree booster.
        if self.params.get('booster', 'gbtree') == 'gblinear':
            num_boost_round = 10000
        else:
            num_boost_round = 5000

        # Stage 1: early-stopped run, used only to pick the number of rounds.
        # The last entry of `evals` ('val') drives early stopping.
        bst_partial = train_xgb(self.params,
                                d_train,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
                                evals=[(d_train, 'train'), (d_valid, 'val')],
                                verbose_eval=500)

        last_round = bst_partial.best_iteration
        print(f"[{last_round}]  RMSE: {bst_partial.best_score}")

        # Stage 2: retrain on all rows. best_iteration is 0-based, so the
        # model that reached the best score consists of best_iteration + 1 rounds.
        d_all = DMatrix(X, label=y)
        self.bst = train_xgb(self.params,
                             d_all,
                             num_boost_round=last_round + 1,
                             evals=[(d_train, 'train')],
                             verbose_eval=500)
        return self

    def predict(self, X_test):
        """Return the predictions of the booster trained on all rows."""
        d_test = DMatrix(X_test)
        return self.bst.predict(d_test)


class Ensemble(object):
    """Two-level stacking ensemble.

    Each base model is trained on K-1 folds and predicts the held-out fold;
    the out-of-fold predictions form the training matrix of the stacker.
    Test predictions of a base model are averaged over its K fold fits.

    Parameters
    ----------
    n_folds : int
        Number of K-fold splits.
    stacker : object
        Level-2 model exposing ``fit(X, y)`` and ``predict(X)``.
    base_models : list
        Level-1 models exposing ``fit(X, y)`` and ``predict(X)``.
    random_state : int or None, default None
        Seed for the shuffled K-fold split. ``None`` keeps the original
        behaviour (different folds on every call).
    """

    def __init__(self, n_folds, stacker, base_models, random_state=None):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models
        self.random_state = random_state

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root mean squared error of two equally long 1-D sequences.

        Implemented with numpy because the ``squared=False`` argument of
        ``sklearn.metrics.mean_squared_error`` is not available in every
        scikit-learn release.
        """
        diff = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return float(np.sqrt(np.mean(diff ** 2)))

    def fit_predict(self, train_df, test_df):
        """Fit all base models and the stacker; return test predictions.

        The target is ``log1p(price_doc)``, so the returned predictions are on
        the log scale as well. ``sub_area`` is excluded from the features.
        Side effect: stores ``S_train``, ``S_test`` and ``y`` on the instance.
        """
        X = train_df.drop(['sub_area', 'price_doc'], axis=1).values
        y = np.log1p(train_df['price_doc']).values
        # w = train_df['w'].values
        X_test = test_df.drop('sub_area', axis=1).values

        # Models that handle NaN themselves get the raw features; every other
        # model gets median-imputed copies. The copies are computed once and
        # X / X_test are never overwritten, so the order of base_models no
        # longer decides whether a boosting model sees imputed data.
        nan_native = (my_XGBRegressor, my_LGBRegressor)
        if any(not isinstance(m, nan_native) for m in self.base_models):
            # Medians are estimated on train and test features together.
            imputer = SimpleImputer(strategy='median') # mean
            imputer.fit(np.vstack([X, X_test]))
            X_filled = imputer.transform(X)
            X_test_filled = imputer.transform(X_test)
        else:
            X_filled = X_test_filled = None

        kf = KFold(n_splits=self.n_folds, shuffle=True,
                   random_state=self.random_state)
        folds = list(kf.split(X, y))

        # One column of out-of-fold / test predictions per base model.
        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((X_test.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            print('\n\nTraining model: ' + str(type(model).__name__))
            S_test_i = np.zeros((X_test.shape[0], len(folds)))

            if isinstance(model, nan_native):
                X_model, X_test_model = X, X_test
            else:
                X_model, X_test_model = X_filled, X_test_filled

            for j, (train_idx, test_idx) in enumerate(folds):
                print('ROUND ' + str(j+1))

                X_train = X_model[train_idx]
                y_train = y[train_idx]
                # w_train = w[train_idx]
                X_holdout = X_model[test_idx]
                y_holdout = y[test_idx]

                model.fit(X_train, y_train)  # w_train

                y_train_pred = model.predict(X_train)
                y_pred = model.predict(X_holdout)

                print(f"[ALL]  train-RMSE  : {self._rmse(y_train, y_train_pred)}")
                print(f"[ALL]  holdout-RMSE: {self._rmse(y_holdout, y_pred)}")

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = model.predict(X_test_model)

            # Average the test predictions of the K fold fits.
            S_test[:, i] = S_test_i.mean(axis=1)

        self.S_train, self.S_test, self.y = S_train, S_test, y
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        y_pred_train = self.stacker.predict(S_train)
        print(f"\n\n[THE END]  train-RMSE  : {self._rmse(y, y_pred_train)}")

        return y_pred


In [60]:
# Parameters for the tree-based XGBoost booster (squared-error objective,
# RMSE as evaluation metric, GPU histogram tree method).
params_xgb_tree = dict(
    objective='reg:squarederror',
    booster='gbtree',
    tree_method='gpu_hist',
    base_score=7,
    learning_rate=0.05,
    max_depth=4,
    min_child_weight=7,
    subsample=1,
    colsample_bytree=0.9,
    reg_lambda=5,
    reg_alpha=1,
    eval_metric='rmse',
    seed=42,
    nthread=-1,
)

# Parameters for the linear XGBoost booster (used as the stacker below).
params_xgb_lin = dict(
    objective='reg:squarederror',
    booster='gblinear',
    tree_method='gpu_hist',
    base_score=7,
    learning_rate=1,
    eval_metric='rmse',
    seed=42,
    nthread=-1,
)

# Parameters for the LightGBM base model.
params_lgb = dict(
    objective='regression',
    metric='rmse',
    learning_rate=0.05,
    max_depth=-1,
    sub_feature=0.7,
    sub_row=0.9,
    num_leaves=15,
    min_data=30,
    max_bin=20,
    bagging_freq=40,
    force_col_wise=True,
    verbosity=0,
)

In [62]:
# Stacker candidates (level-2 models fitted on the out-of-fold predictions).
xgb_lin = my_XGBRegressor(params_xgb_lin)
LR = LinearRegression()

# Base model candidates (level-1 models).
xgb_tree = my_XGBRegressor(params_xgb_tree)

# Variants trained only on rows with product_type 0 / 1.
xgb_tree_0 = my_XGBRegressor(params_xgb_tree, 0)
xgb_tree_1 = my_XGBRegressor(params_xgb_tree, 1)

lgb_tree = my_LGBRegressor(params_lgb)

RF = RandomForestRegressor(n_estimators=500, max_depth=5, max_features=0.2, n_jobs=-1)
ETR = ExtraTreesRegressor(n_estimators=500, max_depth=5, max_features=0.3, n_jobs=-1)
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works in both.
Ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=5), n_estimators=200)
GBR = GradientBoostingRegressor(n_estimators=200, max_depth=5, max_features=0.5)

# Only the LightGBM model is active in this run; the other candidates above
# are constructed but not passed to the ensemble.
E = Ensemble(
    n_folds=2,
    stacker=xgb_lin,
    base_models=[lgb_tree] # -Ada? -GBR? +xgb_tree_0? +xgb_tree_1?
)

# Predictions are on the log1p(price) scale (see Ensemble.fit_predict).
y_pred = E.fit_predict(train_df, test_df)



Training model: my_LGBRegressor
ROUND 1
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[710]	valid_0's rmse: 0.145456
[ALL]  train-RMSE  : 0.09754595509043086
[ALL]  holdout-RMSE: 0.13952376998080132
ROUND 2
Training until validation scores don't improve for 50 rounds.
Early stopping, best iteration is:
[638]	valid_0's rmse: 0.1339
[ALL]  train-RMSE  : 0.10099173395892287
[ALL]  holdout-RMSE: 0.13678378715893455
Training until validation scores don't improve for 50 rounds
[0]	train-rmse:0.445805	val-rmse:0.449953
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 50 rounds.
[500]	train-rmse:0.325829	val-rmse:0.329471
[1000]	train-rmse:0.247508	val-rmse:0.250838
[1500]	train-rmse:0.198697	val-rmse:0.201815
[2000]	train-rmse:0.170047	val-rmse:0.172991
[2500]	train-rmse:0.154247	val-rmse:0.157028
[3000]	train-rmse:0.145968	val-rmse:0.148604
[3500]	train-rmse:0

## Testing

In [43]:
# Template with the expected ids; the predictions are filled in below.
submission = pd.read_csv('data/sample_submission.csv', index_col='id')

# The models predict log1p(price); expm1 maps the predictions back to prices.
result = np.expm1(y_pred)

# A negative price can only arise from a log-scale prediction below zero.
if (result < 0).any():
    print('WARNING: NEGATIVE PREDICTIONS')

In [45]:
# Positional assignment: `result` must have one value per row of the
# submission template, in the same row order as the template.
submission['price_doc'] = result # 0.9
# `index` is a boolean switch in DataFrame.to_csv; the name of the id column
# in the written file is set through `index_label`.
submission.to_csv('data/submission.csv', index_label='id')

In [None]:
# !kaggle competitions submit -c sberbank-russian-housing-market -f "data/submission.csv" -m "Ensemble"