In [1]:
%cd /home/stasvlad/Documents/hse/sberbank/

/home/stasvlad/Documents/hse/sberbank


In [91]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold

## Data description

In [3]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [4]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
train_df = encode(train_df)

### Test dataset

In [6]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [7]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [8]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [9]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [10]:
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [11]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [12]:
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [13]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 6. XGBRegressor

In [65]:
train_df['w'] = 1
train_df.loc[train_df['timestamp_year'] == 2014, 'w'] = 1.2
train_df.loc[train_df['timestamp_year'] == 2015, 'w'] = 1.5

In [51]:
KNN = KNeighborsRegressor(metric='haversine', n_jobs=-1)
params = {'n_neighbors': range(3, 30)}
gs = GridSearchCV(KNN, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

In [52]:
gs.fit(train_df[['latitude', 'longitude']], np.log1p(train_df['price_doc']))

GridSearchCV(cv=5, estimator=KNeighborsRegressor(metric='haversine', n_jobs=-1),
             n_jobs=-1, param_grid={'n_neighbors': range(3, 30)},
             scoring='neg_mean_squared_error')

In [53]:
gs.cv_results_['mean_test_score']

array([-0.1211907 , -0.11653159, -0.11317397, -0.11192493, -0.11082971,
       -0.11078278, -0.11048016, -0.1106676 , -0.11096293, -0.11095568,
       -0.11102286, -0.1109461 , -0.11100395, -0.11113888, -0.11121384,
       -0.11142587, -0.11127334, -0.11115468, -0.11150959, -0.11144242,
       -0.11137891, -0.11160022, -0.11150367, -0.11150859, -0.11163565,
       -0.11173755, -0.11182354])

In [82]:
gs.best_estimator_

KNeighborsRegressor(metric='haversine', n_jobs=-1, n_neighbors=9)

In [89]:
KNN = KNeighborsRegressor(metric='haversine', n_jobs=-1, n_neighbors=9)
KNN.fit(train_df[['latitude', 'longitude']], np.log1p(train_df['price_doc']))
y_pred = KNN.predict(test_df[['latitude', 'longitude']])
result = np.expm1(y_pred)
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
submission['price_doc'] = result
submission.to_csv('data/submits/submission.csv', index='id')

In [90]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "KNNRegressor"

100%|████████████████████████████████████████| 180k/180k [00:03<00:00, 59.8kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [96]:
class Ensemble(object):
    def __init__(self, n_folds, stacker, base_models):
        self.n_folds = n_folds
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, train_df, test_df):
        X = train_df[['latitude', 'longitude']].values
        y = np.log1p(train_df['price_doc']).values
        X_test = test_df[['latitude', 'longitude']].values


        kf = KFold(n_splits=self.n_folds, shuffle=True)
        folds = list(kf.split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((X_test.shape[0], len(self.base_models)))

        for i, model in enumerate(self.base_models):
            print('\n\nTraining model: ' + str(type(model).__name__))
            S_test_i = np.zeros((X_test.shape[0], len(folds)))

            for j, (train_idx, test_idx) in enumerate(folds):
                print('ROUND ' + str(j+1))

                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                model.fit(X_train, y_train)

                y_train_pred = model.predict(X_train)
                y_pred = model.predict(X_holdout)

                print(f"[ALL]  train-RMSE  : {mean_squared_error(y_train_pred, y_train, squared=False)}")
                print(f"[ALL]  holdout-RMSE: {mean_squared_error(y_pred, y_holdout, squared=False)}")

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = model.predict(X_test)

            S_test[:, i] = S_test_i.mean(axis=1)

        self.S_train, self.S_test, self.y = S_train, S_test, y
        self.stacker.fit(S_train, y)
        y_pred = self.stacker.predict(S_test)
        y_pred_train = self.stacker.predict(S_train)
        print(f"\n\n[THE END]  train-RMSE  : {mean_squared_error(y_pred_train, y, squared=False)}")

        return y_pred

In [97]:
#stacker
LR = LinearRegression()

#base models
KNN = KNeighborsRegressor(metric='haversine', n_jobs=-1, n_neighbors=9)

E = Ensemble(
    n_folds=5,
    stacker=LR,
    base_models=[KNN]
)

y_pred = E.fit_predict(train_df, test_df)



Training model: KNeighborsRegressor
ROUND 1
[ALL]  train-RMSE  : 0.2940238797918632
[ALL]  holdout-RMSE: 0.326296367693664
ROUND 2
[ALL]  train-RMSE  : 0.2929159450248351
[ALL]  holdout-RMSE: 0.3147019147931567
ROUND 3
[ALL]  train-RMSE  : 0.2923114175786264
[ALL]  holdout-RMSE: 0.3195953327742766
ROUND 4
[ALL]  train-RMSE  : 0.29328998155558056
[ALL]  holdout-RMSE: 0.3209303605302776
ROUND 5
[ALL]  train-RMSE  : 0.2916546594477965
[ALL]  holdout-RMSE: 0.3207708385425104


[THE END]  train-RMSE  : 0.31905096154710677


In [98]:
np.save('predictions/KNN_train', E.S_train)
np.save('predictions/KNN_test', E.S_test)
(np.load('predictions/KNN_train.npy') == E.S_train).all(), (np.load('predictions/KNN_test.npy') == E.S_test).all()

(True, True)

In [99]:
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')
result = np.expm1(E.S_test)

if len(result[result < 0]):
    print('WARNING: NEGATIVE PREDICTIONS')

In [100]:
submission['price_doc'] = result
submission.to_csv('data/submits/submission.csv', index='id')

In [101]:
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "KNN"

100%|████████████████████████████████████████| 180k/180k [00:03<00:00, 59.9kB/s]
Successfully submitted to Sberbank Russian Housing Market