In [1]:
# NOTE(review): machine-specific absolute path. The relative paths used by later
# cells ('data/...', 'outliers/...', 'predictions/...') are resolved against it.
%cd /home/stasvlad/Documents/hse/sberbank

/home/stasvlad/Documents/hse/sberbank


In [2]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [3]:
# Raw inputs. The macro table keeps its default index; the train and test tables
# are indexed by `id`. `timestamp` is parsed as a date in all three.
id_keyed = {'index_col': 'id', 'parse_dates': ['timestamp']}

macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', **id_keyed)
test_df = pd.read_csv('data/test.csv', **id_keyed)

# `tverskoe_issue_fix` is a project helper that is not defined in this notebook.
# Its return value is not used; it is called on the train frame first, then on test.
for frame in (train_df, test_df):
    tverskoe_issue_fix(frame)

Fix:  550
Fix:  149


## 1. Data preprocessing
## Part I (encoding and correcting mistakes)

### Macro dataset

In [4]:
# Convert every text-typed (object) column of the macro table to float.
# The column list is taken from the current dtypes, so executing this cell a
# second time is a no-op instead of failing on `.str` for a float column.
text_columns = macro_df.select_dtypes('object').columns

# 'child_on_acc_pre_school' contains the literal token '#!', which cannot be
# parsed as a number; turn it into 'nan' so that astype(float) yields NaN.
if 'child_on_acc_pre_school' in text_columns:
    macro_df['child_on_acc_pre_school'] = (
        macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan', regex=False)
    )

for column in text_columns:
    # Decimal commas -> decimal points. `regex=False` makes the replacement a
    # plain substring substitution regardless of the pandas version's default.
    macro_df[column] = macro_df[column].str.replace(',', '.', regex=False).astype(float)

# Sanity check: no object-typed columns should remain after the conversion.
if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
# `encode` is not defined in this notebook (presumably star-imported from
# `utils` or `features` -- TODO confirm). Its result replaces `train_df`.
# NOTE(review): the input frame is overwritten, so re-running this cell feeds
# already-encoded data back into `encode` -- confirm that this is harmless.
train_df = encode(train_df)

### Test dataset

In [6]:
# Same project helper as used for the training frame; its result replaces `test_df`.
# NOTE(review): the input frame is overwritten, so re-running this cell feeds
# already-encoded data back into `encode` -- confirm that this is harmless.
test_df = encode(test_df)

## Part II (filling missing values)

The `XGBRegressor` model handles missing values (`np.nan`) natively, so they are not filled in here.

## 2. Encoding `sub_area` feature

In [7]:
def _read_lat_lon(path):
    """Load a coordinates CSV as a frame indexed by `id` and sorted by that index.

    The `key` and `tolerance_m` columns are dropped and the `id` column becomes
    the index; all remaining columns are returned unchanged.
    """
    return (
        pd.read_csv(path)
        .drop(columns=['key', 'tolerance_m'])
        .set_index('id')
        .sort_index()
    )


coords_train_df = _read_lat_lon('data/geo/train_lat_lon.csv')
coords_test_df = _read_lat_lon('data/geo/test_lat_lon.csv')

# Train rows first, then test rows.
coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [8]:
# Tag each frame with its origin so the combined frame can be split again later.
for frame, flag in ((train_df, 1), (test_df, 0)):
    frame['is_train'] = flag

all_df = pd.concat([train_df, test_df])

# Attach the coordinates. Column assignment from a Series aligns on the index,
# so rows are matched by index label rather than by position.
for target, source in (('latitude', 'lat'), ('longitude', 'lon')):
    all_df[target] = coords_all_df[source]

## 3. Removing outliers

In [9]:
# `remove_outliers` is a project helper that is not defined in this notebook;
# its result replaces the combined train+test frame.
# NOTE(review): whether it drops rows or only corrects values cannot be seen from here.
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [10]:
# `create_new_features` is a project helper that is not defined in this notebook;
# its result replaces the combined train+test frame.
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [11]:
# Split the combined frame back into its parts using the `is_train` marker.
train_mask = all_df['is_train'].eq(1)
test_mask = all_df['is_train'].eq(0)

train_df = all_df.loc[train_mask].drop(columns=['is_train'])
# The test part also drops `price_doc`, the column used as the target later on.
test_df = all_df.loc[test_mask].drop(columns=['is_train', 'price_doc'])

In [12]:
# `remove_fake_prices` is a project helper that is not defined in this notebook;
# its result replaces `train_df`. The captured output below reports "REMOVED: 35".
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [13]:
# The text file is read as floats and then cast to int. The values are used as
# index *labels* of `train_df` (DataFrame.drop is label-based), and a label that
# is not present raises a KeyError.
outlier_labels = np.loadtxt('outliers/idx_outliers_full.txt')
idx_outliers = outlier_labels.astype(int)
train_df = train_df.drop(index=idx_outliers)

### Ensembling

In [165]:
def _load_pair(model_name):
    """Return the saved (train, test) prediction arrays for `model_name`.

    Reads 'predictions/<model_name>_train.npy' first and
    'predictions/<model_name>_test.npy' second.
    """
    train_part = np.load('predictions/' + model_name + '_train.npy')
    test_part = np.load('predictions/' + model_name + '_test.npy')
    return train_part, test_part


# One (train, test) pair of saved predictions per base model.
ETR_train, ETR_test = _load_pair('ETR')
GBR_train, GBR_test = _load_pair('GBR')
LGB_F_train, LGB_F_test = _load_pair('LGB_F')
RF_train, RF_test = _load_pair('RF')
XGB_F_norm_mean_train, XGB_F_norm_mean_test = _load_pair('XGB_F_norm_mean')
XGB_F_norm_median_train, XGB_F_norm_median_test = _load_pair('XGB_F_norm_median')
XGB_F_split_0_train, XGB_F_split_0_test = _load_pair('XGB_F_split_0')
XGB_F_split_1_train, XGB_F_split_1_test = _load_pair('XGB_F_split_1')
XGB_F_train, XGB_F_test = _load_pair('XGB_F')
XGB_F_with_weights_train, XGB_F_with_weights_test = _load_pair('XGB_F_with_weights')

In [166]:
# Regression target in log space: log(1 + price_doc), as a plain NumPy array.
price_values = train_df['price_doc'].values
y = np.log1p(price_values)

In [167]:
# Positional (0-based) row numbers of the training rows for each product type.
# These get train-specific names: the test-set cell defines `idx_0` / `idx_1`,
# which are read again when the submission is written, so reusing those names
# here would make that step depend on which cell happened to run last.
train_idx_0 = np.flatnonzero((train_df['product_type'] == 0).to_numpy())
train_idx_1 = np.flatnonzero((train_df['product_type'] == 1).to_numpy())

# Every training row must be covered by one of the two per-type prediction
# arrays; otherwise its entry would silently keep the 0.0 placeholder.
assert len(train_idx_0) + len(train_idx_1) == train_df.shape[0], (
    "some training rows have a product_type other than 0 or 1"
)

# Single-column matrix holding, for each row, the prediction saved for the
# model of that row's product type.
XGB_F_split_train = np.zeros(shape=(train_df.shape[0], 1))

# The right-hand sides are assigned whole; NumPy raises if their shape does not
# match the number of selected rows.
XGB_F_split_train[train_idx_0] = XGB_F_split_0_train
XGB_F_split_train[train_idx_1] = XGB_F_split_1_train

In [168]:
# Positional view of the test set: the `id` index is replaced by 0..n-1, so the
# row numbers computed below can index the prediction arrays directly.
test_df_copy = test_df.reset_index().drop(columns='id')
test_product_type = test_df_copy['product_type']

idx_0 = np.flatnonzero((test_product_type == 0).to_numpy())
idx_1 = np.flatnonzero((test_product_type == 1).to_numpy())
idx_nan = np.flatnonzero(test_product_type.isna().to_numpy())

XGB_F_split_test = np.zeros(shape=(test_df.shape[0], 1))

# Both source arrays are indexed with row numbers of the whole test set. Rows
# with a missing product type take the prediction of the product-type-1 model.
for rows, source in (
    (idx_0, XGB_F_split_0_test),
    (idx_1, XGB_F_split_1_test),
    (idx_nan, XGB_F_split_1_test),
):
    XGB_F_split_test[rows] = source[rows]

In [217]:
# Stacker inputs: one column per base model, with the same model order in both
# matrices. The `XGB_F_norm_median_*` predictions are not part of the stack.
train_columns = [
    ETR_train,
    GBR_train,
    LGB_F_train,
    RF_train,
    XGB_F_norm_mean_train,
    XGB_F_split_train,
    XGB_F_train,
    XGB_F_with_weights_train,
]
test_columns = [
    ETR_test,
    GBR_test,
    LGB_F_test,
    RF_test,
    XGB_F_norm_mean_test,
    XGB_F_split_test,
    XGB_F_test,
    XGB_F_with_weights_test,
]

S_train = np.concatenate(train_columns, axis=1)
S_test = np.concatenate(test_columns, axis=1)

S_train.shape, S_test.shape

((27762, 8), (7662, 8))

## Linear regression (LR) stacker

In [218]:
# Linear blend of the base-model predictions, fitted without an intercept.
stacker = LinearRegression(fit_intercept=False)
stacker.fit(S_train, y)

y_pred = stacker.predict(S_test)
y_pred_train = stacker.predict(S_train)

# RMSE on the same rows the stacker was fitted on (targets are log1p prices).
# The square root is taken explicitly because the `squared=False` argument of
# `mean_squared_error` is deprecated and later removed in newer scikit-learn
# releases; arguments are passed in the documented (y_true, y_pred) order.
train_rmse = np.sqrt(mean_squared_error(y, y_pred_train))
print(f"train-RMSE  : {train_rmse}")

train-RMSE  : 0.12436736423517876


In [219]:
# Fitted blend weights, one per column of `S_train` and in the same order.
stacker.coef_

array([-0.04185252,  0.06857151,  0.13419866,  0.12169046, -0.19372594,
        0.24784324,  0.41863202,  0.24441239])

In [220]:
# Submission template, indexed by `id`.
submission = pd.read_csv('data/submits/sample_submission.csv', index_col='id')

# Invert the log1p transform applied to the target.
result = np.expm1(y_pred)

# A negative value can only come from a negative log-space prediction.
has_negative = bool(np.any(result < 0))
if has_negative:
    print('WARNING: NEGATIVE PREDICTIONS')

In [221]:
# NOTE(review): 30474 is presumably the first `id` of the test set; adding a
# 0-based row number to it assumes the test ids are contiguous and in the same
# order as the rows of `test_df` -- confirm against data/test.csv.
first_test_id = 30474

# Each product-type group is scaled by its own constant factor before writing.
# NOTE(review): the origin of 0.875 / 0.965 is not shown in this notebook.
# Rows with a missing product type use the same factor as product type 1.
submission.loc[first_test_id + idx_0, 'price_doc'] = 0.875 * result[idx_0]
submission.loc[first_test_id + idx_1, 'price_doc'] = 0.965 * result[idx_1]
submission.loc[first_test_id + idx_nan, 'price_doc'] = 0.965 * result[idx_nan]

# `index` is a boolean switch; the header of the index column is set through
# `index_label`. (Passing index='id' only worked because the string is truthy.)
submission.to_csv('data/submits/submission.csv', index=True, index_label='id')

In [222]:
# NOTE(review): this shell command uploads the file to Kaggle every time the
# cell is executed, including on "Run All".
!kaggle competitions submit -c sberbank-russian-housing-market -f "data/submits/submission.csv" -m "Ensemble (LR)"

100%|████████████████████████████████████████| 181k/181k [00:03<00:00, 53.1kB/s]
Successfully submitted to Sberbank Russian Housing Market

In [223]:
# `get_place` is a project helper that is not defined in this notebook.
# NOTE(review): 0.31421 is presumably the leaderboard score returned for the
# submission above, typed in by hand -- confirm.
get_place(0.31421)

1134 / 3266
