In [1]:
%cd /home/stasvlad/Documents/hse/sberbank/

/home/stasvlad/Documents/hse/sberbank


In [2]:
from utils import *
from features import *

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from xgboost import XGBRegressor, DMatrix, cv
from xgboost import train as train_xgb

## Data description

In [3]:
macro_df = pd.read_csv('data/macro.csv', parse_dates=['timestamp'])
train_df = pd.read_csv('data/train.csv', index_col='id', parse_dates=['timestamp'])
test_df = pd.read_csv('data/test.csv', index_col='id', parse_dates=['timestamp'])

tverskoe_issue_fix(train_df)
tverskoe_issue_fix(test_df)

Fix:  550
Fix:  149


## 1. Data preprocessing
## I part (encoding and correcting mistakes)

### Macro dataset

In [4]:
macro_df['child_on_acc_pre_school'] = macro_df['child_on_acc_pre_school'].str.replace('#!', 'nan')
for column in macro_df.select_dtypes('object').columns:
    macro_df[column] = macro_df[column].str.replace(',', '.')
    macro_df[column] = macro_df[column].astype(float)

if not len(macro_df.select_dtypes('object').columns):
    print('OK')

OK


### Train dataset

In [5]:
train_df = encode(train_df)

### Test dataset

In [6]:
test_df = encode(test_df)

## II part (Filling missing values)

XGBRegressor model handles `np.NaN` values itself

## 2. Encoding `sub_area` feature

In [7]:
coords_train_df = pd.read_csv('data/geo/train_lat_lon.csv')
coords_train_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_train_df.index = coords_train_df.id
coords_train_df.drop(['id'], axis=1, inplace=True)
coords_train_df = coords_train_df.sort_index()

coords_test_df = pd.read_csv('data/geo/test_lat_lon.csv')
coords_test_df.drop(['key', 'tolerance_m'], axis=1, inplace=True)
coords_test_df.index = coords_test_df.id
coords_test_df.drop(['id'], axis=1, inplace=True)
coords_test_df = coords_test_df.sort_index()

coords_all_df = pd.concat([coords_train_df, coords_test_df])

In [8]:
train_df['is_train'] = 1
test_df['is_train'] = 0

# coords_df = pd.read_csv('data/coords.csv', index_col='id')
all_df = pd.concat([train_df, test_df])

all_df['latitude'] = coords_all_df['lat']
all_df['longitude'] = coords_all_df['lon']

## 3. Removing outliers

In [9]:
all_df = remove_outliers(all_df)

## 4. Feature engineering

In [10]:
all_df = create_new_features(all_df)

## 5. Removing fake prices

In [11]:
train_df = all_df[all_df['is_train'] == 1].drop(['is_train'], axis=1)
test_df = all_df[all_df['is_train'] == 0].drop(['is_train', 'price_doc'], axis=1)

In [12]:
train_df = remove_fake_prices(train_df)

REMOVED: 35


In [13]:
idx_outliers = np.loadtxt('outliers/idx_outliers_full.txt').astype(int)
train_df = train_df.drop(idx_outliers)

## 6. Random Forest

In [18]:
X = train_df.drop(['price_doc', 'sub_area'], axis=1)
y = np.log1p(train_df['price_doc'])

In [19]:
all_df = pd.concat([train_df.drop(['sub_area', 'price_doc'], axis=1), test_df.drop('sub_area', axis=1)])
imputer = SimpleImputer(strategy='median')
imputer.fit(all_df)

SimpleImputer(strategy='median')

In [20]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_depth=5, max_features=0.5, verbose=True, n_jobs=-1)
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cross_val_score(RF, X, y, cv=kf, scoring='neg_root_mean_squared_error', n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   25.3s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.2s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   28.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: 

array([-0.19347543, -0.18977763, -0.19218449, -0.18735222, -0.19587987])

In [26]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf_split = list(kf.split(X, y))[0]

In [45]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=10, max_features=0.5, n_jobs=-1)
RF.fit(X[kf_split[0]], y.iloc[kf_split[0]])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   44.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.1min finished


RandomForestRegressor(max_depth=10, max_features=0.5, min_samples_leaf=10,
                      n_estimators=500, n_jobs=-1, verbose=True)

In [46]:
mean_squared_error(RF.predict(X[kf_split[1]]), y.iloc[kf_split[1]], squared=False)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.4s finished


0.15301050714337347

In [47]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_depth=8, min_samples_leaf=10, max_features=0.5, n_jobs=-1)
RF.fit(X[kf_split[0]], y.iloc[kf_split[0]])

RandomForestRegressor(max_depth=8, max_features=0.5, min_samples_leaf=10,
                      n_estimators=500, n_jobs=-1)

In [48]:
mean_squared_error(RF.predict(X[kf_split[1]]), y.iloc[kf_split[1]], squared=False)

0.1628513351366661

In [49]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_depth=12, max_features=0.5, n_jobs=-1)
RF.fit(X[kf_split[0]], y.iloc[kf_split[0]])

RandomForestRegressor(max_depth=12, max_features=0.5, n_estimators=500,
                      n_jobs=-1)

In [50]:
mean_squared_error(RF.predict(X[kf_split[1]]), y.iloc[kf_split[1]], squared=False)

0.1440972771109638

In [51]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_features=0.5, n_jobs=-1)
RF.fit(X[kf_split[0]], y.iloc[kf_split[0]])

RandomForestRegressor(max_features=0.5, n_estimators=500, n_jobs=-1)

In [52]:
mean_squared_error(RF.predict(X[kf_split[1]]), y.iloc[kf_split[1]], squared=False)

0.13782184458797508

In [53]:
X = imputer.transform(X)
RF = RandomForestRegressor(n_estimators=500, max_features=0.3, n_jobs=-1)
RF.fit(X[kf_split[0]], y.iloc[kf_split[0]])

RandomForestRegressor(max_features=0.3, n_estimators=500, n_jobs=-1)

In [54]:
mean_squared_error(RF.predict(X[kf_split[1]]), y.iloc[kf_split[1]], squared=False)

0.13854546750120925

RF = RandomForestRegressor(n_estimators=500, max_features=0.5, n_jobs=-1)