In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install geopandas

Collecting geopandas
  Downloading geopandas-0.10.2-py2.py3-none-any.whl (1.0 MB)
[K     |████████████████████████████████| 1.0 MB 5.2 MB/s 
[?25hCollecting fiona>=1.8
  Downloading Fiona-1.8.20-cp37-cp37m-manylinux1_x86_64.whl (15.4 MB)
[K     |████████████████████████████████| 15.4 MB 22.5 MB/s 
[?25hCollecting pyproj>=2.2.0
  Downloading pyproj-3.2.1-cp37-cp37m-manylinux2010_x86_64.whl (6.3 MB)
[K     |████████████████████████████████| 6.3 MB 41.0 MB/s 
Collecting cligj>=0.5
  Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Collecting click-plugins>=1.0
  Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Collecting munch
  Downloading munch-2.5.0-py2.py3-none-any.whl (10 kB)
Installing collected packages: munch, cligj, click-plugins, pyproj, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.8.20 geopandas-0.10.2 munch-2.5.0 pyproj-3.2.1


In [None]:
!cp -r /content/drive/MyDrive/sberbank/. .

In [None]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [None]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [None]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [None]:
train_df = encode(train_df)

### Test dataset

In [None]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [None]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [None]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [None]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [None]:
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [None]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [None]:
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [None]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 6. GradientBoostingRegressor

### `Ensembling`

In [None]:
class my_LGBRegressor(object):
    def __init__(self, params):
        self.params = params

    def fit(self, X, y, w=None):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)  # random_state=42
        # x_train, y_train, w_train, x_valid, y_valid,  w_valid = X[train_id], y[train_id], w[train_id], X[test_id], y[test_id], w[test_id],
        d_train = lgb.Dataset(X_train, y_train)  # weight=w_train
        d_valid = lgb.Dataset(X_val, y_val)  # weight=w_val

        bst_partial = lgb.train(self.params,
                                d_train, 10000,
                                valid_sets=d_valid,
                                callbacks = [lgb.early_stopping(50)],
                                verbose_eval=False)
                                
        num_round = bst_partial.best_iteration
        d_all = lgb.Dataset(X, label=y)  # weight=w
        self.bst = lgb.train(self.params, d_all, num_round, verbose_eval=False)

    def predict(self, X):
        return self.bst.predict(X)


class my_XGBRegressor(object):
    def __init__(self, params, product_type=-1):
        self.params = params
        self.product_type = product_type

    def fit(self, X, y, w=None):
        # if w == None:
        #    w = np.ones(X.shape[0])

        if self.product_type == 0:
            X = train_df[train_df['product_type'] == 0].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 0]['price_doc'].values)
            print(X.shape)

        if self.product_type == 1:
            X = train_df[train_df['product_type'] == 1].drop(['sub_area', 'price_doc'], axis=1).values
            y = np.log1p(test_df[test_df['product_type'] == 1]['price_doc'].values)
            print(X.shape)
            
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)  # random_state=42
        d_train = DMatrix(X_train, label=y_train)  # weight = w_train
        d_valid = DMatrix(X_val, label=y_val)  # weight = w_valid

        print(f"Training until validation scores don't improve for 50 rounds") # !!!
        if self.params['booster'] == 'gblinear':
            num_boost_round = 10000
        else:
            num_boost_round = 5000

        bst_partial = train_xgb(self.params,
                                d_train,
                                num_boost_round=num_boost_round,
                                early_stopping_rounds=50,
                                evals=[(d_train, 'train'), (d_valid, 'val')],
                                verbose_eval=500)

        last_round = bst_partial.best_iteration
        print(f"[{last_round}]  RMSE: {bst_partial.best_score}")

        d_all = DMatrix(X, label=y)  # weight = w
        self.bst = train_xgb(self.params,
                             d_all,
                             num_boost_round=last_round,
                             evals=[(d_train, 'train')],
                             verbose_eval=500)

    def predict(self, X_test):
        d_test = DMatrix(X_test)
        return self.bst.predict(d_test)


class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, train_df, test_df):
        X = train_df.drop(['sub_area', 'price_doc'], axis=1).values
        y = np.log1p(train_df['price_doc']).values
        w = train_df['w'].values
        X_test = test_df.drop('sub_area', axis=1).values

        all_df = pd.concat([train_df.drop(['sub_area', 'price_doc', 'w'], axis=1), test_df.drop('sub_area', axis=1)])
        imputer = SimpleImputer(strategy='median') # mean
        imputer.fit(all_df)

        kf = KFold(n_splits=self.n_folds, shuffle=True)  # random_state=42
        folds = list(kf.split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((X_test.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            print('\n\nTraining model: ' + str(type(model).__name__))
            S_test_i = np.zeros((X_test.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                print('ROUND ' + str(j+1))

                if (not isinstance(model, my_XGBRegressor)) and (not isinstance(model, my_LGBRegressor)):
                    X = imputer.transform(train_df.drop(['sub_area', 'price_doc', 'w'], axis=1).values)
                    X_test = imputer.transform(X_test)

                X_train = X[train_idx]
                y_train = y[train_idx]
                w_train = w[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                model.fit(X_train, y_train, w_train)  # w_train

                y_train_pred = model.predict(X_train)
                y_pred = model.predict(X_holdout)

                print(f"[ALL]  train-RMSE  : {mean_squared_error(y_train_pred, y_train, squared=False)}")
                print(f"[ALL]  holdout-RMSE: {mean_squared_error(y_pred, y_holdout, squared=False)}")

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = model.predict(X_test)

            S_test[:, i] = S_test_i.mean(axis=1)

        self.S_train, self.S_test, self.y = S_train, S_test, y
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        y_pred_train = self.stacker.predict(S_train)
        print(f"\n\n[THE END]  train-RMSE  : {mean_squared_error(y_pred_train, y, squared=False)}")

        return y_pred


In [None]:
train_df['w'] = 1
train_df.loc[train_df['timestamp_year'] == 2014, 'w'] = 1.2
train_df.loc[train_df['timestamp_year'] == 2015, 'w'] = 1.5

In [None]:
#stacker
LR = LinearRegression()

#base models
GBR = GradientBoostingRegressor(n_estimators=200, learning_rate=0.05, 
                                max_depth=5, max_features=0.8, verbose=True)

E = Ensemble(
    n_folds=5,
    stacker=LR,
    base_models=[GBR]
)

y_pred = E.fit_predict(train_df, test_df)



Training model: GradientBoostingRegressor
ROUND 1


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1856            2.04m
         2           0.1717            2.01m
         3           0.1591            1.97m
         4           0.1477            1.96m
         5           0.1373            1.94m
         6           0.1280            1.92m
         7           0.1194            1.89m
         8           0.1115            1.87m
         9           0.1044            1.84m
        10           0.0979            1.81m
        20           0.0570            1.60m
        30           0.0395            1.40m
        40           0.0310            1.20m
        50           0.0265            1.00m
        60           0.0236           48.20s
        70           0.0217           36.22s
        80           0.0202           24.24s
        90           0.0191           12.16s
       100           0.0181            0.00s
[ALL]  train-RMSE  : 0.13476719744847107
[ALL]  holdout-RMSE: 0.1470348100308359
ROUND 2


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1850            1.97m
         2           0.1712            1.97m
         3           0.1586            1.96m
         4           0.1471            1.95m
         5           0.1368            1.93m
         6           0.1273            1.92m
         7           0.1188            1.89m
         8           0.1111            1.88m
         9           0.1040            1.86m
        10           0.0975            1.84m
        20           0.0565            1.62m
        30           0.0391            1.41m
        40           0.0308            1.21m
        50           0.0262            1.01m
        60           0.0233           48.64s
        70           0.0214           36.57s
        80           0.0199           24.46s
        90           0.0188           12.27s
       100           0.0178            0.00s
[ALL]  train-RMSE  : 0.1337831718617022
[ALL]  holdout-RMSE: 0.14752805652101186
ROUND 3


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1855            2.03m
         2           0.1716            1.99m
         3           0.1591            1.98m
         4           0.1476            1.95m
         5           0.1375            1.91m
         6           0.1280            1.89m
         7           0.1196            1.88m
         8           0.1117            1.86m
         9           0.1045            1.84m
        10           0.0979            1.81m
        20           0.0569            1.61m
        30           0.0393            1.40m
        40           0.0309            1.19m
        50           0.0262           59.71s
        60           0.0232           47.87s
        70           0.0212           35.95s
        80           0.0199           24.05s
        90           0.0188           12.06s
       100           0.0179            0.00s
[ALL]  train-RMSE  : 0.13380106737680866
[ALL]  holdout-RMSE: 0.14948305033341627
ROUND 4


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1869            2.08m
         2           0.1729            2.03m
         3           0.1600            1.98m
         4           0.1485            1.95m
         5           0.1381            1.93m
         6           0.1286            1.90m
         7           0.1199            1.88m
         8           0.1120            1.86m
         9           0.1048            1.84m
        10           0.0983            1.82m
        20           0.0572            1.61m
        30           0.0394            1.41m
        40           0.0309            1.21m
        50           0.0262            1.01m
        60           0.0232           48.40s
        70           0.0212           36.43s
        80           0.0198           24.38s
        90           0.0187           12.21s
       100           0.0177            0.00s
[ALL]  train-RMSE  : 0.1332578939006281
[ALL]  holdout-RMSE: 0.1480148788430246
ROUND 5


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1873            2.05m
         2           0.1735            2.06m
         3           0.1608            2.04m
         4           0.1497            1.99m
         5           0.1391            1.97m
         6           0.1295            1.94m
         7           0.1208            1.92m
         8           0.1129            1.89m
         9           0.1056            1.87m
        10           0.0991            1.84m
        20           0.0575            1.63m
        30           0.0398            1.42m
        40           0.0313            1.22m
        50           0.0266            1.01m
        60           0.0237           48.76s
        70           0.0216           36.74s
        80           0.0201           24.59s
        90           0.0191           12.34s
       100           0.0182            0.00s
[ALL]  train-RMSE  : 0.13472455537967057
[ALL]  holdout-RMSE: 0.14360972709616682


[THE END]  trai

In [26]:
#stacker
LR = LinearRegression()

#base models
GBR = GradientBoostingRegressor(n_estimators=250, learning_rate=0.1, 
                                max_depth=5, max_features=0.8, verbose=True)

E = Ensemble(
    n_folds=5,
    stacker=LR,
    base_models=[GBR]
)

y_pred = E.fit_predict(train_df, test_df)



Training model: GradientBoostingRegressor
ROUND 1


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1714            5.09m
         2           0.1469            4.95m
         3           0.1267            4.87m
         4           0.1100            4.85m
         5           0.0963            4.81m
         6           0.0850            4.78m
         7           0.0755            4.72m
         8           0.0677            4.69m
         9           0.0611            4.67m
        10           0.0555            4.63m
        20           0.0303            4.44m
        30           0.0230            4.26m
        40           0.0197            4.11m
        50           0.0176            3.93m
        60           0.0162            3.74m
        70           0.0152            3.55m
        80           0.0144            3.36m
        90           0.0138            3.17m
       100           0.0131            2.98m
       200           0.0103            1.00m
[ALL]  train-RMSE  : 0.09722310124257587
[ALL]  holdou

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1711            4.80m
         2           0.1465            4.78m
         3           0.1264            4.80m
         4           0.1098            4.80m
         5           0.0961            4.77m
         6           0.0847            4.75m
         7           0.0754            4.72m
         8           0.0676            4.71m
         9           0.0610            4.68m
        10           0.0555            4.64m
        20           0.0305            4.41m
        30           0.0232            4.23m
        40           0.0200            4.07m
        50           0.0179            3.90m
        60           0.0166            3.72m
        70           0.0154            3.53m
        80           0.0146            3.34m
        90           0.0140            3.15m
       100           0.0134            2.95m
       200           0.0104           59.51s
[ALL]  train-RMSE  : 0.09712767395259228
[ALL]  holdou

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1717            4.83m
         2           0.1476            4.85m
         3           0.1274            4.81m
         4           0.1107            4.78m
         5           0.0970            4.75m
         6           0.0857            4.73m
         7           0.0763            4.75m
         8           0.0683            4.73m
         9           0.0618            4.72m
        10           0.0565            4.68m
        20           0.0313            4.44m
        30           0.0239            4.24m
        40           0.0204            4.07m
        50           0.0183            3.90m
        60           0.0169            3.72m
        70           0.0157            3.53m
        80           0.0148            3.33m
        90           0.0142            3.14m
       100           0.0136            2.95m
       200           0.0105           59.36s
[ALL]  train-RMSE  : 0.09807296161103359
[ALL]  holdou

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1725            4.80m
         2           0.1483            4.73m
         3           0.1283            4.76m
         4           0.1115            4.70m
         5           0.0977            4.68m
         6           0.0862            4.69m
         7           0.0765            4.69m
         8           0.0686            4.67m
         9           0.0620            4.65m
        10           0.0565            4.62m
        20           0.0310            4.35m
        30           0.0236            4.16m
        40           0.0204            4.01m
        50           0.0185            3.84m
        60           0.0170            3.67m
        70           0.0158            3.48m
        80           0.0149            3.29m
        90           0.0143            3.10m
       100           0.0137            2.92m
       200           0.0104           59.19s
[ALL]  train-RMSE  : 0.09742133232451085
[ALL]  holdou

  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


      Iter       Train Loss   Remaining Time 
         1           0.1710            4.75m
         2           0.1467            4.74m
         3           0.1264            4.71m
         4           0.1099            4.70m
         5           0.0963            4.70m
         6           0.0849            4.71m
         7           0.0756            4.73m
         8           0.0678            4.73m
         9           0.0614            4.70m
        10           0.0557            4.68m
        20           0.0304            4.41m
        30           0.0232            4.24m
        40           0.0200            4.08m
        50           0.0180            3.91m
        60           0.0165            3.73m
        70           0.0152            3.55m
        80           0.0144            3.36m
        90           0.0138            3.18m
       100           0.0133            2.98m
       200           0.0099           59.81s
[ALL]  train-RMSE  : 0.09531479621935504
[ALL]  holdou

In [30]:
np.save('/content/drive/MyDrive/GBR_train', E.S_train)
np.save('/content/drive/MyDrive/GBR_test', E.S_test)
(np.load('/content/drive/MyDrive/GBR_train.npy') == E.S_train).all(), (np.load('/content/drive/MyDrive/GBR_test.npy') == E.S_test).all()

(True, True)

In [None]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
result = np.expm1(E.S_test)

if len(result[result < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [None]:
submission['price_doc'] = result # 0.9
submission.to_csv('data/submits/submission.csv', index='id')

In [None]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "!GBR! no magic"

100%|████████████████████████████████████████| 180k/180k [00:01<00:00, 93.4kB/s]
Successfully submitted to Sberbank Russian Housing Market