# Демонстрация автоматического расчета важностей признаков

In [1]:
# Notebook display setup.
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` has been deprecated since IPython 7.14.
from IPython.display import display, HTML

# Stretch the notebook container to 95% of the browser width.
display(HTML("<style>.container {width:95% !important;}</style>"))

import warnings
# Silence every warning for the rest of the session
# (note: this also hides deprecation notices from the ML libraries).
warnings.filterwarnings('ignore')

In [2]:
# Standard analysis libraries
import pandas as pd
# Show floats with a thousands separator and two decimals, in a 20-character-wide field.
pd.options.display.float_format = '{:20,.2f}'.format

import numpy as np
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pylab as pl

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [3]:
import sys
# Put the parent directory on the import path
# (presumably where the local `autobinary` package lives — confirm the repo layout).
sys.path.append('../')

# Our in-house library
from autobinary import SentColumns, CatBoostEncoder, AutoTrees, base_pipe

In [4]:
# Report the versions of the key libraries so the run can be reproduced.
import sklearn as sk
# Imported without the short `xgb` alias: that name is bound to the
# XGBRegressor estimator in the XGBRegressor section, and sharing it meant an
# out-of-order re-run could pass the module where an estimator was expected.
import xgboost
import matplotlib as mplot
import autobinary as ab

print('sklearn: ', sk.__version__)
print('xgboost: ', xgboost.__version__)
print('matplotlib: ', mplot.__version__)
print('autobinary: ', ab.__version__)

sklearn:  1.2.1
xgboost:  1.5.2
matplotlib:  3.7.0
autobinary:  1.0.9


## 1.1 Загрузка выборки

In [5]:
# Load the house-price training sample from a path relative to the notebook.
sample = pd.read_csv('../data/train_houseprice.csv')

# Print the (rows, columns) shape, then preview the first two rows.
print('Размерность данных:', sample.shape)
sample.head(2)

Размерность данных: (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [6]:
# Numeric predictors used by every model in this notebook.
num_columns = ['LotArea', 'LotFrontage', 'TotalBsmtSF', 'PoolArea', 'FullBath']

# Categorical predictors used by every model in this notebook.
cat_columns = ['BldgType', 'CentralAir', 'GarageQual', 'Fence', 'SaleType']

# The target is kept as a one-element list, so `sample[target]` selects a DataFrame.
target = ['SalePrice']

In [7]:
# Hold out 30% of the rows as a validation split (fixed seed for reproducibility).
# NOTE(review): in the cells visible here only y_val is used afterwards (target-mean check);
# the models below are cross-validated on the training part only.
X_train, X_val, y_train, y_val = train_test_split(
    sample[num_columns+cat_columns],
    sample[target],
    test_size=0.3,
    random_state=42
)

In [8]:
# Mean target value on the training split (compare with the validation split below).
y_train[target].mean()

SalePrice             181,312.69
dtype: float64

In [9]:
# Mean target value on the validation split.
y_val[target].mean()

SalePrice             180,007.70
dtype: float64

## 1.2 Определяем конвейер обработки, стратегию CV

In [10]:
# Preprocessing pipeline built from the numeric and categorical column lists;
# the same `prep_pipe` object is handed to every AutoTrees runner below.
prep_pipe = base_pipe(num_columns=num_columns, cat_columns=cat_columns)

Определены количественные и категориальные переменные!


In [11]:
# Cross-validation strategy: 3 shuffled folds with a fixed seed.
strat = KFold(n_splits=3, shuffle=True, random_state=42)

## 2. DecisionTreeRegressor

In [12]:
# Single regression tree: squared-error split criterion, depth capped at 5,
# fixed seed.
params = {
    'criterion': 'squared_error',
    'max_depth': 5,
    'random_state': 42,
}

# No additional fit-time arguments for this estimator.
fit_params = {}

dt = DecisionTreeRegressor(**params)

In [13]:
# Bundle the estimator, its fit arguments, the preprocessing pipeline and the
# feature list into an AutoTrees runner; RMSE is the main metric it reports.
dt_model = AutoTrees(
    main_estimator=dt,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='rmse',
    model_type='decisiontree',
)

In [14]:
# Fit on each of the 3 KFold splits; train/validation RMSE is printed per fold.
dt_model.model_fit_cv(strat=strat)

1it [00:00,  9.61it/s]

rmse на обучающей выборке: 42627.775
rmse на проверочной выборке: 69226.440

********************** 1 фолд обучен! ******************************




3it [00:00,  6.76it/s]

rmse на обучающей выборке: 39851.574
rmse на проверочной выборке: 64432.509

********************** 2 фолд обучен! ******************************


rmse на обучающей выборке: 37576.294
rmse на проверочной выборке: 60019.203

********************** 3 фолд обучен! ******************************







In [15]:
# Main metric (RMSE) averaged over the validation folds.
dt_model.get_mean_cv_scores()

64559.38413951845

In [16]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
dt_model.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,29930.57,28532.63,26731.56
1,mae_valid,41030.85,39286.95,36494.4
2,mse_train,1817127228.04,1588147989.12,1411977870.82
3,mse_valid,4792300026.04,4151548208.58,3602304759.09
4,rmse_train,42627.78,39851.57,37576.29
5,rmse_valid,69226.44,64432.51,60019.2


In [17]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
fi = dt_model.get_fi()
fi

Unnamed: 0,index,importance 0,importance 1,importance 2,mean_importance
0,TotalBsmtSF,0.57,0.58,0.5,0.55
1,FullBath,0.16,0.19,0.28,0.21
2,LotFrontage,0.05,0.08,0.06,0.06
3,BldgType,0.05,0.03,0.08,0.05
4,LotArea,0.03,0.06,0.05,0.05
5,SaleType,0.03,0.03,0.0,0.02
6,GarageQual,0.03,0.03,0.0,0.02
7,Fence,0.05,0.0,0.0,0.02
8,CentralAir,0.02,0.01,0.02,0.02
9,PoolArea,0.0,0.0,0.0,0.0


## 3. RandomForestRegressor

In [18]:
# Random forest of 50 trees: squared-error split criterion, depth capped at 5,
# fixed seed.
params = {
    'criterion': 'squared_error',
    'max_depth': 5,
    'random_state': 42,
    'n_estimators': 50,
}

# No additional fit-time arguments for this estimator.
fit_params = {}

rf = RandomForestRegressor(**params)

In [19]:
# AutoTrees runner for the random forest; same features, pipeline and
# main metric (RMSE) as the single-tree run.
rf_model = AutoTrees(
    main_estimator=rf,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='rmse',
    model_type='randomforest',
)

In [20]:
# Fit on each of the 3 KFold splits; train/validation RMSE is printed per fold.
rf_model.model_fit_cv(strat=strat)

0it [00:00, ?it/s]



1it [00:00,  5.16it/s]

rmse на обучающей выборке: 37981.756
rmse на проверочной выборке: 46810.880

********************** 1 фолд обучен! ******************************




2it [00:00,  5.43it/s]

rmse на обучающей выборке: 37823.338
rmse на проверочной выборке: 50472.571

********************** 2 фолд обучен! ******************************




3it [00:00,  5.49it/s]

rmse на обучающей выборке: 35459.508
rmse на проверочной выборке: 54287.541

********************** 3 фолд обучен! ******************************







In [21]:
# Main metric (RMSE) averaged over the validation folds.
rf_model.get_mean_cv_scores()

50523.66414689851

In [22]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
rf_model.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,27174.23,27122.08,25815.23
1,mae_valid,33118.5,32469.29,33403.54
2,mse_train,1442613774.56,1430604878.95,1257376695.41
3,mse_valid,2191258474.28,2547480433.53,2947137158.75
4,rmse_train,37981.76,37823.34,35459.51
5,rmse_valid,46810.88,50472.57,54287.54


In [23]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
fi = rf_model.get_fi()
fi

Unnamed: 0,index,importance 0,importance 1,importance 2,mean_importance
0,TotalBsmtSF,0.57,0.53,0.42,0.5
1,FullBath,0.19,0.23,0.35,0.26
2,LotArea,0.07,0.08,0.1,0.08
3,LotFrontage,0.05,0.05,0.04,0.04
4,BldgType,0.04,0.03,0.05,0.04
5,SaleType,0.02,0.02,0.01,0.02
6,CentralAir,0.02,0.02,0.02,0.02
7,PoolArea,0.03,0.02,0.0,0.02
8,GarageQual,0.02,0.02,0.01,0.02
9,Fence,0.0,0.0,0.0,0.0


## 4. XGBRegressor

In [24]:
# Gradient-boosted trees: up to 1000 rounds at a small step size (eta),
# row/column subsampling, depth 6, squared-error objective, all CPU cores.
params = {
    'eta': 0.01,
    'n_estimators': 1000,
    'subsample': 0.9,
    'colsample_bytree': 0.6,
    'max_depth': 6,
    'objective': 'reg:squarederror',
    'n_jobs': -1,
    'random_state': 42,
}

# Fit-time arguments: early stopping with a 200-round patience, evaluated on
# RMSE, with the evaluation log printed every 50 rounds.
fit_params = {
    'early_stopping_rounds': 200,
    'eval_metric': 'rmse',
    'verbose': 50,
}

xgb = XGBRegressor(**params)

In [25]:
# AutoTrees runner for XGBoost.
# NOTE(review): the main metric switches to MAPE here (the tree and forest
# sections use RMSE), so get_mean_cv_scores() is not directly comparable with
# those sections; compare via get_extra_scores() instead.
xgb_model = AutoTrees(
    main_estimator=xgb,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='mape',
    model_type='xgboost',
)

In [26]:
# Fit on each of the 3 KFold splits; prints the RMSE evaluation log every 50 rounds,
# the best iteration, and train/validation MAPE per fold.
xgb_model.model_fit_cv(strat=strat)

0it [00:00, ?it/s]

[0]	validation_0-rmse:198388.87500	validation_1-rmse:189377.35938
[50]	validation_0-rmse:127582.07812	validation_1-rmse:121080.61719
[100]	validation_0-rmse:85580.90625	validation_1-rmse:81691.11719
[150]	validation_0-rmse:60365.14453	validation_1-rmse:60295.39062
[200]	validation_0-rmse:45620.53906	validation_1-rmse:49884.48047
[250]	validation_0-rmse:36938.44922	validation_1-rmse:45401.64844
[300]	validation_0-rmse:32013.08594	validation_1-rmse:43663.23438
[350]	validation_0-rmse:29039.93555	validation_1-rmse:43077.52734
[400]	validation_0-rmse:27063.90625	validation_1-rmse:42936.77344
[450]	validation_0-rmse:25602.40430	validation_1-rmse:42922.38672
[500]	validation_0-rmse:24425.60156	validation_1-rmse:42914.98438
[550]	validation_0-rmse:23607.84570	validation_1-rmse:42991.66016
[600]	validation_0-rmse:22862.62305	validation_1-rmse:43104.96094
[633]	validation_0-rmse:22334.65234	validation_1-rmse:43110.57422


1it [00:00,  1.30it/s]

BEST ITERATION:  434
mape на обучающей выборке: 0.101
mape на проверочной выборке: 0.185

********************** 1 фолд обучен! ******************************


[0]	validation_0-rmse:199086.67188	validation_1-rmse:187837.50000
[50]	validation_0-rmse:127943.14844	validation_1-rmse:119597.99219
[100]	validation_0-rmse:85926.85938	validation_1-rmse:81188.29688
[150]	validation_0-rmse:61082.99219	validation_1-rmse:60716.01562
[200]	validation_0-rmse:46793.99219	validation_1-rmse:50552.10156
[250]	validation_0-rmse:38521.19141	validation_1-rmse:46034.32422
[300]	validation_0-rmse:33786.35938	validation_1-rmse:44133.85156
[350]	validation_0-rmse:30703.49023	validation_1-rmse:43217.26953
[400]	validation_0-rmse:28673.79883	validation_1-rmse:42842.87500
[450]	validation_0-rmse:27143.68555	validation_1-rmse:42645.43750
[500]	validation_0-rmse:26032.01562	validation_1-rmse:42531.96875
[550]	validation_0-rmse:25134.87305	validation_1-rmse:42501.67578
[600]	validation_0-rmse:24356.03320	validation

2it [00:01,  1.14it/s]

BEST ITERATION:  630
mape на обучающей выборке: 0.102
mape на проверочной выборке: 0.168

********************** 2 фолд обучен! ******************************


[0]	validation_0-rmse:188640.20312	validation_1-rmse:208469.35938
[50]	validation_0-rmse:120941.39062	validation_1-rmse:140212.59375
[100]	validation_0-rmse:80450.67969	validation_1-rmse:100426.11719
[150]	validation_0-rmse:56600.60156	validation_1-rmse:78915.03906
[200]	validation_0-rmse:42598.74609	validation_1-rmse:67550.75781
[250]	validation_0-rmse:34758.57422	validation_1-rmse:61828.59375
[300]	validation_0-rmse:30054.37695	validation_1-rmse:58687.78906
[350]	validation_0-rmse:27367.99023	validation_1-rmse:57274.70312
[400]	validation_0-rmse:25603.57422	validation_1-rmse:56502.05078
[450]	validation_0-rmse:24321.00781	validation_1-rmse:56205.58984
[500]	validation_0-rmse:23338.26562	validation_1-rmse:56103.89844
[550]	validation_0-rmse:22542.18359	validation_1-rmse:56024.25781
[600]	validation_0-rmse:21809.33008	validatio

3it [00:02,  1.17it/s]

mape на обучающей выборке: 0.101
mape на проверочной выборке: 0.167

********************** 3 фолд обучен! ******************************







In [27]:
# Main metric (MAPE) averaged over the validation folds.
xgb_model.get_mean_cv_scores()

0.17366091296114003

In [28]:
# Best boosting round per fold (read from an underscore-prefixed, i.e. non-public, attribute).
xgb_model._best_iters

[434, 630, 540]

In [29]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
xgb_model.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,18503.73,17294.96,16642.67
1,mae_valid,30728.38,29055.78,33309.26
2,mse_train,679586124.89,566544198.85,514039100.18
3,mse_valid,1840259554.49,1802816970.68,3137965259.51
4,rmse_train,26068.87,23802.19,22672.43
5,rmse_valid,42898.25,42459.59,56017.54


In [30]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
# The values are on a raw, unnormalised scale, unlike the 0-1 shares shown for the tree and forest.
fi = xgb_model.get_fi()
fi

Unnamed: 0,index,importance 0,importance 1,importance 2,mean_importance
0,FullBath,34952355840.0,33296699392.0,33161064448.0,33803373226.67
1,TotalBsmtSF,16201422848.0,12633034752.0,7502221312.0,12112226304.0
2,SaleType,8990325760.0,9131326464.0,5181465600.0,7767705941.33
3,CentralAir,9841988608.0,5597278720.0,6236506112.0,7225257813.33
4,BldgType,8139835904.0,5784364544.0,6416704000.0,6780301482.67
5,PoolArea,7316493312.0,9313934336.0,1331123840.0,5987183829.33
6,GarageQual,4730639360.0,4423862272.0,4385095680.0,4513199104.0
7,LotArea,5089410048.0,3860668928.0,3282997248.0,4077692074.67
8,LotFrontage,3570960640.0,2485466880.0,2005310080.0,2687245866.67
9,Fence,2354578688.0,2187159040.0,2093010304.0,2211582677.33


In [31]:
# Presumably draws the per-fold training/validation learning curves —
# no output is captured in this export; confirm against the AutoTrees docs.
xgb_model.get_curve_plots()

## 5. LGBMRegressor

In [32]:
# LightGBM: up to 1000 rounds at learning rate 0.01, row/column subsampling,
# depth 6, regression objective, all CPU cores.
params = {
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'subsample': 0.9,
    'colsample_bytree': 0.6,
    'max_depth': 6,
    'objective': 'regression',
    'n_jobs': -1,
    'random_state': 42,
}

# Fit-time arguments: early stopping with a 200-round patience, evaluated on
# RMSE, with the evaluation log printed every 50 rounds.
fit_params = {
    'early_stopping_rounds': 200,
    'eval_metric': 'rmse',
    'verbose': 50,
}

# Instantiate the LGBMRegressor with the parameters above.
lgb = LGBMRegressor(**params)

In [33]:
# AutoTrees runner for LightGBM; MAPE is the main metric it reports.
lgb_model = AutoTrees(
    main_estimator=lgb,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='mape',
    model_type='lightboost',
)

In [34]:
# Fit on each of the 3 KFold splits; prints the rmse/l2 evaluation log every 50 rounds,
# the best iteration, and train/validation MAPE per fold.
lgb_model.model_fit_cv(strat=strat)

0it [00:00, ?it/s]

[50]	training's rmse: 64461	training's l2: 4.15521e+09	valid_1's rmse: 58935.3	valid_1's l2: 3.47336e+09
[100]	training's rmse: 54900.5	training's l2: 3.01407e+09	valid_1's rmse: 52204.3	valid_1's l2: 2.72529e+09


1it [00:00,  1.40it/s]

[150]	training's rmse: 49529.1	training's l2: 2.45314e+09	valid_1's rmse: 48919.9	valid_1's l2: 2.39316e+09
[200]	training's rmse: 46292.3	training's l2: 2.14298e+09	valid_1's rmse: 47285.6	valid_1's l2: 2.23593e+09
[250]	training's rmse: 44466	training's l2: 1.97723e+09	valid_1's rmse: 46380.2	valid_1's l2: 2.15112e+09
[300]	training's rmse: 43264.7	training's l2: 1.87183e+09	valid_1's rmse: 45817.1	valid_1's l2: 2.09921e+09
[350]	training's rmse: 42368.8	training's l2: 1.79511e+09	valid_1's rmse: 45438.9	valid_1's l2: 2.06469e+09
[400]	training's rmse: 41641.3	training's l2: 1.734e+09	valid_1's rmse: 45115.5	valid_1's l2: 2.03541e+09
[450]	training's rmse: 41117.5	training's l2: 1.69065e+09	valid_1's rmse: 44918.4	valid_1's l2: 2.01766e+09
[500]	training's rmse: 40709.3	training's l2: 1.65725e+09	valid_1's rmse: 44819.4	valid_1's l2: 2.00877e+09
[550]	training's rmse: 40296	training's l2: 1.62376e+09	valid_1's rmse: 44877.6	valid_1's l2: 2.014e+09
[600]	training's rmse: 39936	trainin

2it [00:01,  1.92it/s]

[700]	training's rmse: 37768.6	training's l2: 1.42646e+09	valid_1's rmse: 46729.5	valid_1's l2: 2.18365e+09
[750]	training's rmse: 37292.5	training's l2: 1.39073e+09	valid_1's rmse: 46739.5	valid_1's l2: 2.18458e+09
BEST ITERATION:  557
mape на обучающей выборке: 0.148
mape на проверочной выборке: 0.178

********************** 2 фолд обучен! ******************************


[50]	training's rmse: 58218.6	training's l2: 3.38941e+09	valid_1's rmse: 74758.6	valid_1's l2: 5.58885e+09
[100]	training's rmse: 50260.6	training's l2: 2.52613e+09	valid_1's rmse: 65935.4	valid_1's l2: 4.34747e+09
[150]	training's rmse: 45519.3	training's l2: 2.07201e+09	valid_1's rmse: 60725.8	valid_1's l2: 3.68762e+09
[200]	training's rmse: 42630	training's l2: 1.81732e+09	valid_1's rmse: 57558.2	valid_1's l2: 3.31295e+09
[250]	training's rmse: 40768	training's l2: 1.66203e+09	valid_1's rmse: 55776	valid_1's l2: 3.11096e+09
[300]	training's rmse: 39468	training's l2: 1.55772e+09	valid_1's rmse: 54588.5	valid_1's 

3it [00:01,  2.03it/s]

mape на обучающей выборке: 0.146
mape на проверочной выборке: 0.170

********************** 3 фолд обучен! ******************************







In [35]:
# Main metric (MAPE) averaged over the validation folds.
lgb_model.get_mean_cv_scores()

0.18063233452718205

In [36]:
# Best boosting round per fold (read from an underscore-prefixed, i.e. non-public, attribute).
lgb_model._best_iters

[507, 557, 505]

In [37]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
lgb_model.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,25744.32,25888.33,24816.22
1,mae_valid,31372.81,30403.26,33223.33
2,mse_train,1653879424.26,1530407618.22,1363617564.1
3,mse_valid,2007331202.71,2178051357.94,2817273193.41
4,rmse_train,40667.92,39120.42,36927.19
5,rmse_valid,44803.25,46669.6,53077.99


In [38]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
# The values are on a raw, unnormalised scale.
fi = lgb_model.get_fi()
fi

Unnamed: 0,index,importance 0,importance 1,importance 2,mean_importance
0,TotalBsmtSF,78536609609293.0,72730151492104.0,47749443599554.24,66338734900317.09
1,FullBath,29381155954560.0,35911452600288.0,28178495178496.0,31157034577781.33
2,LotArea,23715256227362.0,24768265349554.0,23048396383026.0,23843972653314.0
3,LotFrontage,13953753184066.5,13518057808216.0,9526502306965.4,12332771099749.3
4,SaleType,6860829247070.0,8803850589592.0,4865109552336.0,6843263129666.0
5,BldgType,5832470583730.0,4958343227800.0,6272462044626.0,5687758618718.67
6,CentralAir,3586467190673.31,2634604007456.0,3309601383896.0,3176890860675.1
7,GarageQual,2334638857720.0,2466338623356.0,2885250675018.0,2562076052031.33
8,Fence,1229052406088.0,2167860437628.0,2054426896095.0,1817113246603.67
9,PoolArea,0.0,0.0,0.0,0.0


In [39]:
# Presumably draws the per-fold training/validation learning curves —
# no output is captured in this export; confirm against the AutoTrees docs.
lgb_model.get_curve_plots()

## 6. CatBoostRegressor - MAPE

In [40]:
# CatBoost trained and evaluated on MAPE, with MAE tracked as an extra metric:
# up to 1000 iterations at learning rate 0.01, depth 6, all CPU threads.
params = {
    'learning_rate': 0.01,
    'iterations': 1000,
    'subsample': 0.9,
    'colsample_bylevel': 0.9,
    'depth': 6,
    'loss_function': 'MAPE',
    'thread_count': -1,
    'random_state': 42,
    'eval_metric': 'MAPE',
    'custom_metric': 'MAE',
    'logging_level': 'Verbose',
}

# Fit-time arguments: keep the best iteration on the evaluation set, stop
# after 200 rounds without improvement, log every 50 iterations, and skip the
# interactive training plot.
fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 200,
    'verbose': 50,
    'plot': False,
}

# Instantiate the CatBoostRegressor with the parameters above.
catb = CatBoostRegressor(**params)

In [41]:
# AutoTrees runner for the MAPE-evaluated CatBoost model; MAPE is the main
# metric it reports.
catb_model_1 = AutoTrees(
    main_estimator=catb,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='mape',
    model_type='catboost',
)

In [42]:
# Fit on each of the 3 KFold splits; prints the CatBoost log every 50 iterations,
# the best iteration, and train/validation MAPE per fold.
catb_model_1.model_fit_cv(strat=strat)

0it [00:00, ?it/s]

0:	learn: 0.2916038	test: 0.2916038	test1: 0.2949008	best: 0.2949008 (0)	total: 52.1ms	remaining: 52.1s
50:	learn: 0.2734544	test: 0.2734544	test1: 0.2821159	best: 0.2821159 (50)	total: 93.1ms	remaining: 1.73s
100:	learn: 0.2637368	test: 0.2637368	test1: 0.2755607	best: 0.2755607 (100)	total: 126ms	remaining: 1.13s
150:	learn: 0.2550280	test: 0.2550280	test1: 0.2688672	best: 0.2688672 (150)	total: 156ms	remaining: 875ms
200:	learn: 0.2482925	test: 0.2482925	test1: 0.2644480	best: 0.2644480 (200)	total: 182ms	remaining: 722ms
250:	learn: 0.2424390	test: 0.2424390	test1: 0.2605604	best: 0.2605604 (250)	total: 208ms	remaining: 621ms
300:	learn: 0.2387001	test: 0.2387001	test1: 0.2584970	best: 0.2584970 (300)	total: 236ms	remaining: 547ms
350:	learn: 0.2356392	test: 0.2356392	test1: 0.2564463	best: 0.2564463 (350)	total: 264ms	remaining: 487ms
400:	learn: 0.2325070	test: 0.2325070	test1: 0.2543495	best: 0.2543495 (400)	total: 291ms	remaining: 434ms
450:	learn: 0.2300723	test: 0.2300723	tes

1it [00:00,  1.15it/s]

750:	learn: 0.2195574	test: 0.2195574	test1: 0.2476560	best: 0.2476526 (748)	total: 513ms	remaining: 170ms
800:	learn: 0.2184984	test: 0.2184984	test1: 0.2473160	best: 0.2473160 (800)	total: 541ms	remaining: 134ms
850:	learn: 0.2175713	test: 0.2175713	test1: 0.2469639	best: 0.2469580 (843)	total: 571ms	remaining: 99.9ms
900:	learn: 0.2169951	test: 0.2169951	test1: 0.2468066	best: 0.2468041 (898)	total: 601ms	remaining: 66ms
950:	learn: 0.2164361	test: 0.2164361	test1: 0.2465599	best: 0.2465565 (949)	total: 629ms	remaining: 32.4ms
999:	learn: 0.2160873	test: 0.2160873	test1: 0.2464729	best: 0.2464724 (998)	total: 658ms	remaining: 0us

bestTest = 0.2464723603
bestIteration = 998

Shrink model to first 999 iterations.
BEST ITERATION:  998
mape на обучающей выборке: 0.216
mape на проверочной выборке: 0.246

********************** 1 фолд обучен! ******************************


0:	learn: 0.3017868	test: 0.3017868	test1: 0.2741534	best: 0.2741534 (0)	total: 836us	remaining: 836ms
50:	learn: 

2it [00:01,  1.21it/s]

999:	learn: 0.2151240	test: 0.2151240	test1: 0.2168045	best: 0.2168034 (998)	total: 596ms	remaining: 0us

bestTest = 0.2168034304
bestIteration = 998

Shrink model to first 999 iterations.
BEST ITERATION:  998
mape на обучающей выборке: 0.215
mape на проверочной выборке: 0.217

********************** 2 фолд обучен! ******************************


0:	learn: 0.2846038	test: 0.2846038	test1: 0.3093511	best: 0.3093511 (0)	total: 714us	remaining: 714ms
50:	learn: 0.2673401	test: 0.2673401	test1: 0.2930225	best: 0.2930225 (50)	total: 34.2ms	remaining: 637ms
100:	learn: 0.2560047	test: 0.2560047	test1: 0.2830181	best: 0.2830181 (100)	total: 67.8ms	remaining: 603ms
150:	learn: 0.2489051	test: 0.2489051	test1: 0.2773232	best: 0.2773232 (150)	total: 100ms	remaining: 565ms
200:	learn: 0.2422405	test: 0.2422405	test1: 0.2721824	best: 0.2721824 (200)	total: 134ms	remaining: 533ms
250:	learn: 0.2364905	test: 0.2364905	test1: 0.2678176	best: 0.2678176 (250)	total: 168ms	remaining: 501ms
300:	learn: 

3it [00:02,  1.19it/s]

mape на обучающей выборке: 0.209
mape на проверочной выборке: 0.255

********************** 3 фолд обучен! ******************************







In [43]:
# Main metric (MAPE) averaged over the validation folds.
catb_model_1.get_mean_cv_scores()

0.23947945744371227

In [44]:
# Best iteration per fold (read from an underscore-prefixed, i.e. non-public, attribute).
# Values close to the 1000-iteration cap mean early stopping did not trigger.
catb_model_1._best_iters

[998, 998, 995]

In [45]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
catb_model_1.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,47741.53,46374.29,43116.18
1,mae_valid,47773.93,42552.28,58156.73
2,mse_train,6001063557.12,5515236633.15,4730750676.17
3,mse_valid,4983948177.98,4574666145.89,8490959598.63
4,rmse_train,77466.53,74264.64,68780.45
5,rmse_valid,70597.08,67636.28,92146.4


In [46]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
fi = catb_model_1.get_fi()
fi

Unnamed: 0,index,importance_0,importance_1,importance_2,mean_importance
0,FullBath,19.64,16.91,27.21,21.25
1,TotalBsmtSF,13.49,19.78,18.95,17.41
2,LotArea,12.15,18.04,15.91,15.37
3,PoolArea,17.95,9.6,1.92,9.82
4,GarageQual,9.35,7.73,7.65,8.24
5,CentralAir,6.2,6.01,9.25,7.15
6,LotFrontage,6.48,7.86,6.54,6.96
7,SaleType,7.35,6.91,4.73,6.33
8,BldgType,3.5,4.21,5.51,4.4
9,Fence,3.91,2.94,2.33,3.06


In [47]:
# Presumably draws the per-fold training/validation learning curves —
# no output is captured in this export; confirm against the AutoTrees docs.
catb_model_1.get_curve_plots()

## 7. CatBoostRegressor - SMAPE

In [48]:
# CatBoost still trained with the MAPE loss, but evaluated on SMAPE (the
# metric used for early stopping / best-iteration selection), with MAPE
# tracked as an extra metric. The iteration budget is 2000.
params = {
    'learning_rate': 0.01,
    'iterations': 2000,
    'subsample': 0.9,
    'colsample_bylevel': 0.9,
    'depth': 6,
    'loss_function': 'MAPE',
    'thread_count': -1,
    'random_state': 42,
    'eval_metric': 'SMAPE',
    'custom_metric': 'MAPE',
    'logging_level': 'Verbose',
}

# Fit-time arguments: keep the best iteration on the evaluation set, stop
# after 200 rounds without improvement, log every 50 iterations, and skip the
# interactive training plot.
fit_params = {
    'use_best_model': True,
    'early_stopping_rounds': 200,
    'verbose': 50,
    'plot': False,
}

# Instantiate the CatBoostRegressor with the parameters above.
catb = CatBoostRegressor(**params)

In [49]:
# AutoTrees runner for the SMAPE-evaluated CatBoost model; the main metric
# reported by AutoTrees is still MAPE.
catb_model_2 = AutoTrees(
    main_estimator=catb,
    main_fit_params=fit_params,
    main_prep_pipe=prep_pipe,
    main_features=num_columns + cat_columns,
    X_train=X_train,
    y_train=y_train,
    main_metric='mape',
    model_type='catboost',
)

In [50]:
# Fit on each of the 3 KFold splits; prints the CatBoost log (SMAPE) every 50 iterations,
# the best iteration, and train/validation MAPE per fold.
catb_model_2.model_fit_cv(strat=strat)

0it [00:00, ?it/s]

0:	learn: 31.6931125	test: 31.6931125	test1: 31.2687152	best: 31.2687152 (0)	total: 754us	remaining: 1.51s
50:	learn: 29.9725116	test: 29.9725116	test1: 29.9710641	best: 29.9710641 (50)	total: 32.7ms	remaining: 1.25s
100:	learn: 29.0767090	test: 29.0767090	test1: 29.3343832	best: 29.3343832 (100)	total: 61ms	remaining: 1.15s
150:	learn: 28.2312534	test: 28.2312534	test1: 28.6607746	best: 28.6607746 (150)	total: 92.6ms	remaining: 1.13s
200:	learn: 27.5020794	test: 27.5020794	test1: 28.1607294	best: 28.1607294 (200)	total: 124ms	remaining: 1.11s
250:	learn: 26.9606752	test: 26.9606752	test1: 27.7874902	best: 27.7874902 (250)	total: 155ms	remaining: 1.08s
300:	learn: 26.5960319	test: 26.5960319	test1: 27.5756577	best: 27.5756577 (300)	total: 181ms	remaining: 1.02s
350:	learn: 26.3110226	test: 26.3110226	test1: 27.3763728	best: 27.3763728 (350)	total: 207ms	remaining: 971ms
400:	learn: 26.0080594	test: 26.0080594	test1: 27.1651754	best: 27.1651754 (400)	total: 233ms	remaining: 927ms
450:	l

1it [00:01,  1.56s/it]


bestTest = 26.27882091
bestIteration = 1986

Shrink model to first 1987 iterations.
BEST ITERATION:  1986
mape на обучающей выборке: 0.213
mape на проверочной выборке: 0.246

********************** 1 фолд обучен! ******************************


0:	learn: 32.6588338	test: 32.6588338	test1: 29.5394690	best: 29.5394690 (0)	total: 1.19ms	remaining: 2.38s
50:	learn: 30.8164865	test: 30.8164865	test1: 27.9519094	best: 27.9519094 (50)	total: 31ms	remaining: 1.19s
100:	learn: 29.5498479	test: 29.5498479	test1: 26.9692189	best: 26.9692189 (100)	total: 57.3ms	remaining: 1.08s
150:	learn: 28.2567355	test: 28.2567355	test1: 25.9365558	best: 25.9365558 (150)	total: 90.4ms	remaining: 1.11s
200:	learn: 27.4833951	test: 27.4833951	test1: 25.3520357	best: 25.3520357 (200)	total: 120ms	remaining: 1.07s
250:	learn: 26.7830800	test: 26.7830800	test1: 24.8739298	best: 24.8739298 (250)	total: 150ms	remaining: 1.05s
300:	learn: 26.3047531	test: 26.3047531	test1: 24.5756120	best: 24.5756120 (300)	total: 175

2it [00:03,  1.52s/it]

mape на обучающей выборке: 0.208
mape на проверочной выборке: 0.214

********************** 2 фолд обучен! ******************************


0:	learn: 30.5453473	test: 30.5453473	test1: 34.1319560	best: 34.1319560 (0)	total: 897us	remaining: 1.79s
50:	learn: 28.9897023	test: 28.9897023	test1: 32.6976416	best: 32.6976416 (50)	total: 30.2ms	remaining: 1.15s
100:	learn: 27.9334515	test: 27.9334515	test1: 31.7929386	best: 31.7929386 (100)	total: 56.9ms	remaining: 1.07s
150:	learn: 27.2697711	test: 27.2697711	test1: 31.3201931	best: 31.3201931 (150)	total: 82.7ms	remaining: 1.01s
200:	learn: 26.5727734	test: 26.5727734	test1: 30.8134556	best: 30.8134556 (200)	total: 112ms	remaining: 1s
250:	learn: 25.9762769	test: 25.9762769	test1: 30.3923166	best: 30.3923166 (250)	total: 142ms	remaining: 990ms
300:	learn: 25.5306562	test: 25.5306562	test1: 30.0835466	best: 30.0835466 (300)	total: 167ms	remaining: 945ms
350:	learn: 25.1361516	test: 25.1361516	test1: 29.8437273	best: 29.8437273 (350)	total: 1

3it [00:04,  1.50s/it]

1850:	learn: 22.3355857	test: 22.3355857	test1: 28.9680946	best: 28.9639254 (1825)	total: 1.06s	remaining: 85.2ms
1900:	learn: 22.3038962	test: 22.3038962	test1: 28.9685739	best: 28.9639254 (1825)	total: 1.09s	remaining: 56.8ms
1950:	learn: 22.2855474	test: 22.2855474	test1: 28.9691191	best: 28.9639254 (1825)	total: 1.12s	remaining: 28.2ms
1999:	learn: 22.2616636	test: 22.2616636	test1: 28.9688354	best: 28.9639254 (1825)	total: 1.15s	remaining: 0us

bestTest = 28.96392545
bestIteration = 1825

Shrink model to first 1826 iterations.
BEST ITERATION:  1825
mape на обучающей выборке: 0.201
mape на проверочной выборке: 0.253

********************** 3 фолд обучен! ******************************







In [51]:
# Main metric (MAPE) averaged over the validation folds.
catb_model_2.get_mean_cv_scores()

0.2375191336521265

In [52]:
# Best iteration per fold (read from an underscore-prefixed, i.e. non-public, attribute).
catb_model_2._best_iters

[1986, 1974, 1825]

In [53]:
# Per-fold MAE / MSE / RMSE on the train and validation parts.
catb_model_2.get_extra_scores()

Unnamed: 0,names,fold_1,fold_2,fold_3
0,mae_train,47402.08,45441.85,42143.96
1,mae_valid,47663.05,42060.68,57958.56
2,mse_train,5980256505.25,5419725951.91,4677322372.51
3,mse_valid,4969483139.91,4489964802.78,8469666377.31
4,rmse_train,77332.12,73618.79,68390.95
5,rmse_valid,70494.56,67007.2,92030.79


In [54]:
# Per-fold feature importances and their mean (displayed in descending order of the mean).
fi = catb_model_2.get_fi()
fi

Unnamed: 0,index,importance_0,importance_1,importance_2,mean_importance
0,FullBath,19.58,16.83,26.56,20.99
1,TotalBsmtSF,13.64,19.76,19.47,17.62
2,LotArea,12.31,18.32,16.36,15.66
3,PoolArea,17.67,9.14,1.81,9.54
4,GarageQual,9.27,7.63,7.64,8.18
5,CentralAir,6.22,5.91,9.34,7.15
6,LotFrontage,6.49,8.61,6.34,7.15
7,SaleType,7.37,6.7,4.56,6.21
8,BldgType,3.59,4.3,5.7,4.53
9,Fence,3.85,2.8,2.21,2.95


In [55]:
# Presumably draws the per-fold training/validation learning curves —
# no output is captured in this export; confirm against the AutoTrees docs.
catb_model_2.get_curve_plots()