In [10]:
import numpy as np
import os
import pandas as pd
import pickle
import xgboost as xgb

from scipy import stats
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, mean_absolute_error
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.options.display.max_columns=500
pd.options.display.max_rows=500
pd.options.display.max_colwidth=500
np.set_printoptions(precision=3)

%run ./run_xgb.py

In [2]:
df = pd.read_csv('./data/train.csv')
train_cols = list(df.iloc[:, 13:20].columns)
label_col = 'price_doc'

df['label_log'] = np.log(df[label_col])
df['label_binned'] = pd.cut(df['label_log'], 5, labels=[0,1,2,3,4])

In [3]:
df['label_binned'].value_counts()

2    16452
3    12574
1     1207
4      226
0       12
Name: label_binned, dtype: int64

### Model check

In [28]:
def my_acc(labels, preds):
    return accuracy_score(labels, stats.mode(preds, 0)[0].reshape((-1, 1)))


def multy_acc(labels, preds):
    return (labels == np.argmax(stats.mode(preds, 0)[0][0], axis=1)).mean()


def mae(labels, preds):
    return mean_absolute_error(labels, preds.mean(0))

In [15]:
kfold = KFold(n_splits=3)

train_params = {
    'num_boost_round': 200,
    'verbose_eval': 10,
    'early_stopping_rounds': 10
}

booster_params = {
    'objective': 'multi:softprob',
    'num_class': 5,
    'nthread': -1,
    'eta': .1,
    'max_depth': 5
}

log_params = {
    'description': 'proba',
    'result_path': './data/result'
}

In [27]:
res = run_xgb(train_df=df.loc[:2000, :],
              val_df=df.loc[2000:4000, :],
             train_cols=train_cols,
             label_cols='label_binned',
             booster_params=booster_params,
             train_params=train_params,
             log_params=log_params,
             kfold=kfold,
             metric=multy_acc)

./data/result/09-07-19
(3, 2001, 5)
[0]	train-merror:0.424288	val-merror:0.407796
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 10 rounds.
[10]	train-merror:0.4003	val-merror:0.389805
[20]	train-merror:0.384558	val-merror:0.385307
Stopping. Best iteration:
[17]	train-merror:0.388306	val-merror:0.385307

[0]	train-merror:0.388306	val-merror:0.422789
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 10 rounds.
[10]	train-merror:0.377811	val-merror:0.416792
Stopping. Best iteration:
[7]	train-merror:0.38006	val-merror:0.415292

[0]	train-merror:0.386807	val-merror:0.457271
Multiple eval metrics have been passed: 'val-merror' will be used for early stopping.

Will train until val-merror hasn't improved in 10 rounds.
[10]	train-merror:0.371814	val-merror:0.436282
[20]	train-merror:0.365067	val-merror:0.425787
[30]	train-m

In [31]:
n_folds = 3
n_examples = 10
n_targets = 1

a1 = np.zeros([n_folds, n_examples, n_targets])
a2 = np.ones([n_examples])

In [33]:
a1[0] = a2

ValueError: could not broadcast input array from shape (10) into shape (10,1)

(3, 5)

### Feature importance checker

In [27]:
with open('./data/result/09-03-14/models/model0.pickle', 'rb') as f:
    model = pickle.load(f)

In [29]:
model.feature_names

['area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'preschool_education_centers_raion']

In [69]:
%run ./check_importance.py

In [70]:
result = check_importance(df,
                          model,
                          input_col='raion_popul',
                         input_change_method='rel',
                        target_measure_method='mass_rel',
                        agg_method='mean',
                        increase_koeff=5)


Start preds mean --> 7762036.00    changed preds mean --> 7765612.50
0.0005
