In [1]:
import pandas as pd
import numpy as np

In [4]:
import category_encoders as ce
from sklearn.base import clone
from tqdm import tqdm

In [295]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.ensemble import RandomTreesEmbedding, StackingRegressor

In [21]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [19]:
from sklearn.metrics import mean_absolute_percentage_error as mape

In [2]:
# read data, made on previous step (car_prices_EDA)
data = pd.read_csv('data_eda_ksv.csv')

In [3]:
target = 'price'
SEED = 42

In [312]:
""" function to prepare data to ML:
    * split on tran and test dataset
    * split train on x and y var for ML
        (note) on last time, y variable was changed to log(y)
    * make dataset of test of competition - the question - 'q'
    * encode all not numeric features with some encoder
        - fit on x
        - and transform x and q
"""
def split_data(encoder=None, data=data, target=target):
    """
    Split the combined dataset into train features, log-target and the
    competition ("question") features, optionally encoding them.

    :param encoder: encoder object with fit(x, y) / transform(x) methods
        (e.g. ce.TargetEncoder()); pass None to skip encoding
    :param data: dataset with train and test rows marked by the 'test' column
        (0 - train rows, 1 - competition rows)
    :param target: name of target column
    :return: x, y, q - train features, np.log of the train target,
        question features
    """
    train = data[data.test == 0].copy()
    test = data[data.test == 1].copy()

    # the target is returned on the log scale: callers must NOT apply np.log
    # again and must pass predictions through np.exp() to get prices back
    x, y = train.drop([target], axis=1), np.log(train[target])
    # q - question of competition, prediction will be a - 'answer'
    q = test.drop([target], axis=1)

    if encoder is not None:
        # fit on the train part only, then apply the same mapping to both parts
        encoder.fit(x, y)
        x = encoder.transform(x)
        q = encoder.transform(q)

    return x, y, q

In [40]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

---
# Just RandomForest with base settings

In [283]:
regressor = RandomForestRegressor(random_state=SEED, n_jobs=-1)
regressor.fit(x_train, y_train)
predict = np.exp(regressor.predict(x_test))

#print(f"MAPE: {(mape_(np.exp(y_test), predict))*100:0.2f}%")
print(f'sklearn mape: {(mape(np.exp(y_test), predict))*100:0.2f}%')

sklearn mape: 9.31%


In [42]:
regressor = RandomForestRegressor(random_state=SEED, n_jobs=-1)
regressor.fit(x, y)
predict = np.exp(regressor.predict(q))

predict

array([ 790555.94695054, 1095997.49681796,  958106.22273016, ...,
        513839.56641181, 1163682.5211251 , 1299707.21919453])

In [43]:
q['price']=predict
submission = q[['sell_id', 'price']].copy()

In [44]:
# rescale the predicted prices with a vectorised expression
submission.price = submission.price * 1.255839 / 1.3

In [45]:
submission.to_csv('sub_12_11_2.csv', index=False)

---
# CatBoostRegressor with base settings

In [25]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [26]:
regressor = CatBoostRegressor(random_state=SEED)
regressor.fit(x_train, y_train)
predict = np.exp(regressor.predict(x_test))

# y_test is log(price) (split_data applies np.log), so it is converted back
# to the price scale before being compared with the exponentiated prediction
print(f'sklearn mape: {(mape(np.exp(y_test), predict))*100:0.2f}%')

Learning rate set to 0.086871
0:	learn: 0.8966591	total: 69.8ms	remaining: 1m 9s
1:	learn: 0.8344884	total: 88.2ms	remaining: 44s
2:	learn: 0.7783921	total: 98.7ms	remaining: 32.8s
3:	learn: 0.7273817	total: 109ms	remaining: 27.2s
4:	learn: 0.6801949	total: 119ms	remaining: 23.7s
5:	learn: 0.6381613	total: 130ms	remaining: 21.5s
6:	learn: 0.6003415	total: 139ms	remaining: 19.8s
7:	learn: 0.5659081	total: 152ms	remaining: 18.8s
8:	learn: 0.5352523	total: 165ms	remaining: 18.2s
9:	learn: 0.5063727	total: 178ms	remaining: 17.6s
10:	learn: 0.4806011	total: 189ms	remaining: 17s
11:	learn: 0.4576897	total: 201ms	remaining: 16.6s
12:	learn: 0.4366148	total: 213ms	remaining: 16.2s
13:	learn: 0.4176798	total: 225ms	remaining: 15.8s
14:	learn: 0.4007421	total: 235ms	remaining: 15.4s
15:	learn: 0.3853079	total: 250ms	remaining: 15.4s
16:	learn: 0.3712048	total: 261ms	remaining: 15.1s
17:	learn: 0.3587953	total: 271ms	remaining: 14.8s
18:	learn: 0.3470527	total: 280ms	remaining: 14.5s
19:	learn: 0

---
# Bagging (sklearn lib)

In [46]:
from sklearn.ensemble import BaggingRegressor

In [81]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [82]:
regressor = BaggingRegressor(base_estimator=RandomForestRegressor(random_state=SEED, n_jobs=-1),
                             n_estimators=10,
                             random_state=SEED).fit(x_train, y_train)
y_test_pred = np.exp(regressor.predict(x_test))
# y_test is log(price) (split_data applies np.log); the metric must compare
# prices with prices, so the target is exponentiated as well
print(f'sklearn MAPE: {(mape(np.exp(y_test), y_test_pred)) * 100:0.2f}%')

sklearn MAPE: 9.91%


In [83]:
regressor = BaggingRegressor(base_estimator=RandomForestRegressor(random_state=SEED, n_jobs=-1),
                             n_estimators=10,
                             random_state=SEED).fit(x, y)
predict = np.exp(regressor.predict(q))
predict

array([ 727555.00020816, 1051620.48625443,  949022.79892262, ...,
        394016.97204767, 1201188.58397692, 1218093.51276876])

In [84]:
q['price']=predict
submission = q[['sell_id', 'price']].copy()

In [85]:
# rescale the predicted prices with a vectorised expression
submission.price = submission.price * 1.255839 / 1.55

In [86]:
submission.to_csv('sub_12_11_3.csv', index=False)

---
# Boosting
## CatBoostRegressor with some settings

In [87]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [57]:
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          )
# y_train / y_test are already log(price) (split_data applies np.log),
# so they are passed as is - a second np.log would train on log(log(price))
model.fit(x_train, y_train,
          eval_set=(x_test, y_test),
          verbose_eval=0,
          use_best_model=True,
          )

model.save_model('catboost_single_model_baseline.model')
# score the model trained in this cell (not a `regressor` left over from an
# earlier cell) and compare on the price scale
predict = np.exp(model.predict(x_test))
print(f"Точность модели по метрике MAPE: {(mape(np.exp(y_test), predict))*100:0.2f}%")

Точность модели по метрике MAPE: 5.66%


In [None]:
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          )
# y is already log(price) (split_data applies np.log); fitting on np.log(y)
# would make np.exp(prediction) return log-prices instead of prices
model.fit(x, y,
          verbose_eval=0,
          )

model.save_model('catboost_single_model_baseline.model')

predict = np.exp(model.predict(q))
predict

In [89]:
q['price']=predict
submission = q[['sell_id', 'price']].copy()

In [90]:
# rescale the predicted prices with a vectorised expression, then save
submission.price = submission.price * 1.255839 / 1.60
submission.to_csv('sub_12_11_4.csv', index=False)

To sum up, this model gave my best leaderboard result (19.80657, 166th place), but only when I divided the predictions by 1.6 instead of the 1.255 used in the previous step.
Why this works is still a mystery to me.

----
# Boosting
## AdaBoostRegressor with base settings

In [76]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [77]:
# y_train / y_test are already log(price) (split_data applies np.log):
# fit on them directly and bring both sides back to prices for the metric
regressor = AdaBoostRegressor(base_estimator=RandomForestRegressor(random_state=SEED, n_jobs=-1),
                             n_estimators=10,
                             random_state=SEED).fit(x_train, y_train)
y_test_pred = np.exp(regressor.predict(x_test))
print(f'sklearn MAPE: {(mape(np.exp(y_test), y_test_pred)) * 100:0.2f}%')

sklearn MAPE: 9.37%


In [78]:
# y is already log(price) (split_data applies np.log); fitting on np.log(y)
# would make np.exp(prediction) return log-prices instead of prices
regressor = AdaBoostRegressor(base_estimator=RandomForestRegressor(random_state=SEED, n_jobs=-1),
                              n_estimators=10,
                              random_state=SEED).fit(x, y)
predict = np.exp(regressor.predict(q))
predict

array([ 734757.18640028, 1043470.14928105,  996620.02808889, ...,
        403244.2996093 , 1183636.66025549, 1253707.42947987])

In [79]:
q['price']=predict
submission = q[['sell_id', 'price']].copy()

In [80]:
# rescale the predicted prices with a vectorised expression, then save
submission.price = submission.price * 1.255839 / 1.45
submission.to_csv('sub_12_11_5.csv', index=False)

---
# Validation

After several unsuccessful attempts, I wanted to understand whether my feature dataset is simply too poor to predict the price.

I parsed auto.ru twice, so I decided to use the first batch as the training set and the second one as the validation set. Both have the same brand/model structure; the second batch was parsed with the "last seven days" setting.

The result with CatBoostRegressor is MAPE = 14%. That is not great, but it corresponds to 65th place on the leaderboard as of 13/12/2021.
So the feature dataset itself is not bad.


In [91]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146632 entries, 0 to 146631
Data columns (total 27 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   bodyType              146632 non-null  object 
 1   brand                 146632 non-null  object 
 2   color                 146632 non-null  object 
 3   engineDisplacement    146632 non-null  object 
 4   enginePower           146632 non-null  int64  
 5   fuelType              146632 non-null  object 
 6   mileage               146632 non-null  int64  
 7   numberOfDoors         146632 non-null  int64  
 8   parsing_unixtime      146632 non-null  int64  
 9   productionDate        146632 non-null  int64  
 10  sell_id               146632 non-null  int64  
 11  super_gen             146632 non-null  object 
 12  vehicleConfiguration  146632 non-null  object 
 13  vehicleTransmission   146632 non-null  object 
 14  vendor                146632 non-null  object 
 15  

In [None]:
from datetime import datetime

In [112]:
# convert the Unix-seconds column in one vectorised call; the result is naive
# UTC timestamps, as with the per-row datetime.utcfromtimestamp (which is
# deprecated since Python 3.12)
data['day'] = pd.to_datetime(data.parsing_unixtime, unit='s')

In [128]:
data[data.day <= datetime(2021,12,6)].shape

(122432, 28)

In [140]:
valid = data[data.day > datetime(2021,12,7)].copy()

In [141]:
train = data[(data.test == 0)&(data.day <= datetime(2021,12,7))].copy()

In [142]:
valid.shape, train.shape, data.shape

((18816, 28), (93130, 28), (146632, 28))

In [143]:
encoder = ce.TargetEncoder()

# train part: raw (not logged) target and the features without service columns
x, y = train.drop(columns=['test', 'day', target]), train[target]

# validation part: q - features, a - true prices ('answer')
q, a = valid.drop(columns=['test', 'day', target]), valid[target]

# the encoder is fitted on the train part only and applied to both parts
encoder.fit(x, y)
x, q = encoder.transform(x), encoder.transform(q)


In [144]:
# CatBoost on the time-based split: fit on the earlier rows (x, y) and
# evaluate on the later rows (q, a). Here y and a are raw prices, so the
# log transform is applied explicitly in this cell.
model = CatBoostRegressor(iterations = 5000,
                          random_seed = SEED,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          )
model.fit(x, np.log(y),
          #cat_features=cat_features_ids,
          # NOTE(review): the same (q, a) set drives use_best_model and is
          # scored below, so the printed MAPE is likely optimistic
          eval_set=(q, np.log(a)),
          verbose_eval=0,
          use_best_model=True,
          #plot=True
          )

# NOTE: the same file name is used by the earlier CatBoost cells, so this
# call overwrites a previously saved model
model.save_model('catboost_single_model_baseline.model')
predict = np.exp(model.predict(q))
# the message reads "Model accuracy by the MAPE metric"
print(f"Точность модели по метрике MAPE: {(mape(a, predict))*100:0.2f}%")

Точность модели по метрике MAPE: 14.31%


In [146]:
regressor = BaggingRegressor(base_estimator=RandomForestRegressor(random_state=SEED),
                             n_estimators=10,
                             random_state=SEED).fit(x, np.log(y))
y_test_pred = np.exp(regressor.predict(q))
print(f'sklearn MAPE: {(mape(a, y_test_pred)) * 100:0.2f}%')

sklearn MAPE: 14.75%


In [157]:
data.drop(['day'], axis=1, inplace=True)

---
# Stacking
## stacking model from sklearn library

In [147]:
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR

In [158]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [162]:
# base learners of the stack (first level)
estimators = [
    ('lr', RidgeCV()),
    ('Tree', DecisionTreeRegressor(random_state=SEED)),
    ('RF', RandomForestRegressor(random_state=SEED)),
]
# a small random forest blends the base learners' predictions (second level)
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)

# the target is on the log scale, so the prediction is exponentiated
reg.fit(x_train, y_train)
y_test_pred = np.exp(reg.predict(x_test))

In [163]:
print(f'sklearn MAPE: {(mape(np.exp(y_test), y_test_pred)) * 100:0.2f}%')

sklearn MAPE: 10.79%


In [165]:
# same stack as in the evaluation cell, refitted on the whole train set
estimators = [
    ('lr', RidgeCV()),
    ('Tree', DecisionTreeRegressor(random_state=SEED)),
    ('RF', RandomForestRegressor(random_state=SEED)),
]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10, random_state=42),
)

# the target is on the log scale, so the prediction is exponentiated
reg.fit(x, y)
predict = np.exp(reg.predict(q))

In [167]:
# attach the predictions, keep only the columns of the submission file
q['price'] = predict
submission = q.loc[:, ['sell_id', 'price']].copy()
# rescale the predicted prices with a vectorised expression, then save
submission.price = submission.price * 1.255839 / 1.5
submission.to_csv('sub_12_12.csv', index=False)

---
# Stacking
## My stacking script.
Some code was taken from DST


In [224]:
folderer = KFold(n_splits=10, shuffle=True, random_state=SEED)
regressor = RandomForestRegressor(random_state=SEED)

In [284]:
"""
    function for computing meta feature as predict value of given ML model
"""


def compute_meta_feature(regressor, x_train, x_test, y_train, folderer):
    """
    Build one meta feature (model predictions on the price scale) for stacking.

    The train part is filled with out-of-fold predictions. The test part is
    predicted by a fresh clone fitted on the training subset of the fold that
    reached the lowest MAPE.

    :param regressor: ML-regressor (only clones of it are fitted)
    :param x_train: x train set (indexed positionally with .iloc)
    :param x_test: x test set
    :param y_train: y train set on the log scale (indexed positionally with .iloc)
    :param folderer: folder splitter providing split(x_train)
    :return: new feature vectors for train and test stack
    """

    # out-of-fold predictions are written here fold by fold
    meta_train = np.zeros((len(y_train)), dtype=np.float32)
    fold_scores = []

    for fit_index, holdout_index in folderer.split(x_train):
        fold_x_fit = x_train.iloc[fit_index].copy()
        fold_x_holdout = x_train.iloc[holdout_index].copy()
        fold_y_fit = y_train.iloc[fit_index].copy()
        fold_y_holdout = y_train.iloc[holdout_index].copy()

        fold_model = clone(regressor)
        fold_model.fit(fold_x_fit, fold_y_fit)

        # back from the log scale to prices
        holdout_prices = np.exp(fold_model.predict(fold_x_holdout))
        meta_train[holdout_index] = holdout_prices

        fold_score = mape(np.exp(fold_y_holdout), holdout_prices)
        fold_scores.append(fold_score)

        # remember the training subset of the best fold seen so far
        if fold_score == min(fold_scores):
            x, y = fold_x_fit, fold_y_fit

    lowest_score = min(fold_scores)
    best = fold_scores.index(lowest_score)
    print(f'folder MAPE {np.around(fold_scores,5)}')
    print(f'Best score # {best} {lowest_score}')

    # the test meta feature comes from a model refitted on the best fold's subset
    final_model = clone(regressor)
    final_model.fit(x, y)
    meta_test = np.exp(final_model.predict(x_test))

    return meta_train, meta_test


In [249]:
"""
    function for generate datasets of meta-features with prediction on different ML models
    (use function 'compute_meta_features')
"""

def generate_meta_features(regressors, x_train, x_test, y_train, folderer):
    """
    Build the meta-feature matrices: one column per regressor.

    :param regressors: list of ML regressors
    :param x_train: x train dataset
    :param x_test: x test dataset
    :param y_train: target vector of train
    :param folderer: folder splitter (passed on to 'compute_meta_feature')
    :return: stacked meta features for train and for test,
        each with one row per sample and one column per regressor
    """
    train_columns, test_columns = [], []

    # one (train, test) pair of meta-feature vectors per regressor
    for regressor in tqdm(regressors):
        meta_train, meta_test = compute_meta_feature(
            regressor, x_train, x_test, y_train, folderer
        )
        train_columns.append(meta_train)
        test_columns.append(meta_test)

    # rows of the stack are regressors; transpose to get them as columns
    stacked_features_train = np.stack(train_columns).T
    stacked_features_test = np.stack(test_columns).T

    return stacked_features_train, stacked_features_test

In [303]:
"""
    some function for finishing step of ML: calculate metric and make prediction for submission
"""


def compute_mape(regressor, x_train=x_train, y_train=y_train, x_test=x_test, y_test=None):
    """
    Fit the regressor and calculate the MAPE metric on the price scale.

    :param regressor: ML regressor (it is fitted in place)
    :param x_train: x train set
    :param y_train: y train set (log scale: predictions are exponentiated)
    :param x_test: x test set
    :param y_test: log-scale target for x_test; when None, the notebook-level
        `y_test` that exists at call time is used (the previous behaviour)
    :return: MAPE metric rounded to 6 decimal places
    """
    if y_test is None:
        # fall back to the notebook-level holdout target, as before
        y_test = globals()['y_test']

    regressor.fit(x_train, y_train)
    y_test_pred = np.exp(regressor.predict(x_test))
    return np.around(mape(np.exp(y_test), y_test_pred), 6)

def create_submission(regressor, x=x, y=y, q=q):
    """
    Fit the regressor on the given train data and predict the question set.

    :param regressor: ML regressor (it is fitted in place)
    :param x: x set
    :param y: y set (log scale: the prediction is exponentiated)
    :param q: question set (test set for prediction)
    :return: prediction
    """
    regressor.fit(x, y)
    log_prediction = regressor.predict(q)
    return np.exp(log_prediction)

---
# Stacking 2
## 0 ensemble

In [251]:
x, y, q = split_data(encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [252]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestRegressor(n_estimators=200,
                          min_samples_split=2,
                          min_samples_leaf=2,
                          max_features='sqrt',
                          max_depth=14,
                          bootstrap=False,
                          n_jobs=-1,
                          random_state=SEED),
    ExtraTreesRegressor(n_estimators=300, random_state=SEED)
], x_train, x_test, y_train, folderer)

  0%|          | 0/2 [00:00<?, ?it/s]

folder MAPE [0.15871 0.15619 0.15927 0.15781 0.15963]
Best score # 1 0.15619053170457223


 50%|█████     | 1/2 [01:05<01:05, 65.27s/it]

folder MAPE [0.12531 0.12417 0.12553 0.12585 0.12443]
Best score # 1 0.12417068827151483


100%|██████████| 2/2 [12:26<00:00, 373.21s/it]


In [253]:
regressor = RandomForestRegressor(random_state=SEED)
compute_mape(regressor, x_train=stacked_features_train, x_test=stacked_features_test)

0.139228

In [275]:
list_to_drop = ['ПТС', 'Владельцы', 'fuelType', 'climate'] #, 'numberOfDoors', 'vehicleTransmission', 'Руль', 'owners', ]

x, y, q = split_data(data=data.drop(list_to_drop, axis=1), encoder=ce.TargetEncoder())
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=SEED)

In [276]:
folderer = KFold(n_splits=10, shuffle=True, random_state=SEED)

---
# Stacking 2
## 1 ensemble

In [285]:
stacked_features_train, stacked_features_test = generate_meta_features([
    ExtraTreesRegressor(n_estimators=300, random_state=SEED, n_jobs=-1),
    LinearRegression(n_jobs=-1),
    DecisionTreeRegressor(random_state=SEED)
], x_train, x_test, y_train, folderer)



  0%|          | 0/3 [00:00<?, ?it/s][A[A

folder MAPE [0.11059 0.11111 0.10956 0.10851 0.11018 0.11128 0.10989 0.11164 0.10892
 0.11108]
Best score # 3 0.10851200130535904




 33%|███▎      | 1/3 [09:23<18:46, 563.26s/it][A[A

 67%|██████▋   | 2/3 [09:23<03:52, 232.30s/it][A[A

folder MAPE [0.22933 0.23408 0.22469 0.22964 0.22778 0.23125 0.2267  0.2285  0.22772
 0.22873]
Best score # 2 0.22468913223387457
folder MAPE [0.13319 0.13372 0.1286  0.13652 0.13275 0.13461 0.13428 0.13767 0.13602
 0.13883]
Best score # 2 0.12859999730804916




100%|██████████| 3/3 [09:34<00:00, 191.48s/it][A[A


In [288]:
regressor = RandomForestRegressor(random_state=SEED)
compute_mape(regressor, x_train=stacked_features_train, x_test=stacked_features_test)

0.112653

---
# 2 ensemble

In [290]:
stacked_features_train, stacked_features_test = generate_meta_features([
    ExtraTreesRegressor(n_estimators=300, random_state=SEED, n_jobs=-1),
    ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    DecisionTreeRegressor(random_state=SEED)
], x_train, x_test, y_train, folderer)



  0%|          | 0/3 [00:00<?, ?it/s][A[A

folder MAPE [0.11059 0.11111 0.10956 0.10851 0.11018 0.11128 0.10989 0.11164 0.10892
 0.11108]
Best score # 3 0.10851200130535903




 33%|███▎      | 1/3 [09:13<18:26, 553.50s/it][A[A

folder MAPE [0.11097 0.11158 0.10957 0.10944 0.11049 0.1114  0.11022 0.11195 0.10905
 0.11139]
Best score # 8 0.10904975779883688




 67%|██████▋   | 2/3 [12:17<05:36, 336.25s/it][A[A

folder MAPE [0.13319 0.13372 0.1286  0.13652 0.13275 0.13461 0.13428 0.13767 0.13602
 0.13883]
Best score # 2 0.12859999730804916




100%|██████████| 3/3 [12:26<00:00, 248.98s/it][A[A


In [291]:
regressor = CatBoostRegressor(iterations = 5000,
                              random_seed = SEED,
                              eval_metric='MAPE',
                              custom_metric=['R2', 'MAE'],
                              silent=True,
                              )
compute_mape(regressor, x_train=stacked_features_train, x_test=stacked_features_test)

0.108806

In [292]:
regressor = RandomForestRegressor(random_state=SEED)
compute_mape(regressor, x_train=stacked_features_train, x_test=stacked_features_test)

0.122509

---
# 3 ensemble
The last one. I decided that I can't improve my leaderboard score any further.

In [297]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestRegressor(random_state=SEED),
    ExtraTreesRegressor(n_estimators=300, random_state=SEED, n_jobs=-1),
    ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    GradientBoostingRegressor(random_state=SEED)
], x_train, x_test, y_train, folderer)




  0%|          | 0/4 [00:00<?, ?it/s][A[A[A

folder MAPE [0.0954  0.09702 0.0937  0.09613 0.09539 0.09684 0.0936  0.09546 0.09527
 0.09533]
Best score # 6 0.09359785255261037





 25%|██▌       | 1/4 [09:44<29:14, 584.86s/it][A[A[A

folder MAPE [0.11059 0.11111 0.10956 0.10851 0.11018 0.11128 0.10989 0.11164 0.10892
 0.11108]
Best score # 3 0.10851200130535903





 50%|█████     | 2/4 [18:35<18:26, 553.23s/it][A[A[A

folder MAPE [0.11097 0.11158 0.10957 0.10944 0.11049 0.1114  0.11022 0.11195 0.10905
 0.11139]
Best score # 8 0.10904975779883694





 75%|███████▌  | 3/4 [21:23<06:17, 377.34s/it][A[A[A

folder MAPE [0.17343 0.17625 0.17032 0.17561 0.17346 0.1761  0.1687  0.17274 0.1738
 0.176  ]
Best score # 6 0.1686972666782429





100%|██████████| 4/4 [24:18<00:00, 364.58s/it][A[A[A


In [298]:
regressor = CatBoostRegressor(iterations = 5000,
                              random_seed = SEED,
                              eval_metric='MAPE',
                              custom_metric=['R2', 'MAE'],
                              silent=True,
                              )
compute_mape(regressor, x_train=stacked_features_train, x_test=stacked_features_test)

0.098601

---
### making submission

In [301]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestRegressor(random_state=SEED),
    ExtraTreesRegressor(n_estimators=300, random_state=SEED, n_jobs=-1),
    ExtraTreesRegressor(random_state=SEED, n_jobs=-1),
    GradientBoostingRegressor(random_state=SEED)
], x, q, y, folderer)





  0%|          | 0/4 [00:00<?, ?it/s][A[A[A[A

folder MAPE [0.08973 0.09177 0.09237 0.08886 0.09139 0.09168 0.09277 0.08972 0.09028
 0.09103]
Best score # 3 0.08885723299491108






 25%|██▌       | 1/4 [11:32<34:38, 692.98s/it][A[A[A[A

folder MAPE [0.10445 0.10678 0.10679 0.10446 0.10528 0.10673 0.10594 0.10477 0.10487
 0.10658]
Best score # 0 0.10444574686693764






 50%|█████     | 2/4 [20:36<20:09, 605.00s/it][A[A[A[A

folder MAPE [0.10473 0.10707 0.10732 0.10534 0.10583 0.10704 0.10637 0.10501 0.10503
 0.10693]
Best score # 0 0.10472900116527475






 75%|███████▌  | 3/4 [23:41<06:53, 413.48s/it][A[A[A[A

folder MAPE [0.17488 0.17402 0.17472 0.1698  0.17392 0.17267 0.17512 0.17257 0.17308
 0.17573]
Best score # 3 0.16980283407460273






100%|██████████| 4/4 [27:15<00:00, 408.78s/it][A[A[A[A


In [304]:
regressor = CatBoostRegressor(iterations = 5000,
                              random_seed = SEED,
                              eval_metric='MAPE',
                              custom_metric=['R2', 'MAE'],
                              silent=True,
                              )
answer = create_submission(regressor, x=stacked_features_train, q=stacked_features_test)

In [306]:
q['price']=answer

In [307]:
submission = q[['sell_id', 'price']].copy()

In [310]:
# rescale the predicted prices with a vectorised expression
submission.price = submission.price * 1.255839 / 1.5

In [311]:
submission.to_csv('sub_12_13.csv', index=False)

## resume
The score on Kaggle is 32%. After dividing by 1.5 it is 25%. No luck.
___