# Project name: [Predict Future Sales](https://www.kaggle.com/c/competitive-data-science-predict-future-sales)

# Objective

Optimize hyperparameters to get 0.923 score on [Public Leaderboard](https://www.kaggle.com/c/competitive-data-science-predict-future-sales/leaderboard)

# Version

In [1]:
__ver__ = "0.6"

# Setup

In [2]:
import numpy as np
import pandas as pd
import catboost
import sklearn
import hyperopt
from hyperopt import hp
from sklearn import preprocessing
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import statsmodels as sm
from pmdarima import arima
%matplotlib inline 

import itertools
import warnings
import sys

warnings.filterwarnings('ignore')

# Load processed data

In [3]:
# load and save ID in index
test_path = "./raw_data/test.csv.gz"
test = pd.read_csv(test_path).set_index('ID')

In [4]:
processed_path = f"./processed_data/data_0.5.pickle"
data = pd.read_pickle(processed_path)

In [5]:
# drop data with NaN in lags
data.dropna(subset=list(set(data.columns) - set("y")), inplace=True)

In [6]:
# Drop bad features
data.drop(["city", "not_sold_before", "price_discount_1"], axis=1, inplace=True)

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4712400 entries, 2784600 to 7496999
Data columns (total 31 columns):
date_block_num         int8
shop_id                int8
item_id                int16
y                      float32
item_category_id       int8
category_group         int8
sub_category           int8
month                  int8
y_1                    float32
y_2                    float32
y_3                    float32
y_4                    float32
diff_y1_y2             float32
diff_y1_y13            float32
item_sold              bool
new_item               bool
item_cluster_21        int8
item_date_y_1          float32
item_date_y_2          float32
item_date_y_3          float32
shop_date_y_1          float32
shop_date_y_2          float32
shop_date_y_12         float32
shop_cat_date_y_1      float32
shop_cat_date_y_2      float32
shop_cat_date_y_3      float32
cat_date_y_1           float32
cat_date_y_2           float32
cat_date_y_3           float32
cummean_sho

# Validation strategies

Validation strategy:

* Train - all months except last one in the training set
* Validate - a last month in the training set
* Test - a test set

In [8]:
val_range = 1
test_date_block = data.date_block_num.max()

x_train = data[data.date_block_num < test_date_block - val_range].drop("y", axis=1)
y_train = data.loc[data.date_block_num < test_date_block - val_range, "y"]

mask = (test_date_block - val_range <= data.date_block_num) & (data.date_block_num < test_date_block)
x_valid = data[mask].drop("y", axis=1)
y_valid = data.loc[mask, "y"]

x_test = data[data.date_block_num == test_date_block].drop("y", axis=1)

In [9]:
cat_columns = [i for i, col in enumerate(x_train) if not issubclass(x_train[col].dtype.type, np.floating)]
cat_columns

[0, 1, 2, 3, 4, 5, 6, 13, 14, 15]

In [10]:
train = catboost.Pool(
    data=x_train,
    label=y_train,
    cat_features=cat_columns
)

In [11]:
valid = catboost.Pool(
    data=x_valid,
    label=y_valid,
    cat_features=cat_columns
)

# Hyperparameters optimization

In [18]:
space = dict(
    random_state=284704,
    od_type="Iter",
    learning_rate=0.1,
    task_type="GPU",
    verbose=False,
    depth=hp.choice("depth", list(range(1, 17))),
    l2_leaf_reg=hp.loguniform("l2_leaf_reg", np.log(0.3), np.log(30)),
    random_strength=hp.loguniform("rand_strength", np.log(0.1), np.log(10)),
    bagging_temperature=hp.loguniform("bagging_temperature", np.log(0.1), np.log(10))
)
if sys.platform == "darwin":
    del space["task_type"]

In [19]:
class CatboostHyper:
    def __init__(self, train, valid):
        self._train = train
        self._valid = valid
        self._counter = 0
        self._rmse = None
        self._clf = None
        
    def __call__(self, params, plot=False):
        clf = catboost.CatBoostRegressor(**params)
        clf.fit(
            X=self._train, 
            eval_set=self._valid
        )
        rmse = clf.get_best_score()["validation_0"]["RMSE"]
        print(f"Run {self._counter}: "
              f"RMSE - {rmse:0.5f}, "
              f"Best Iteration - {clf.get_best_iteration() + 1}"
             )
        self._counter += 1
        if self._rmse is None or rmse < self._rmse:
            self._rmse = rmse
            self._clf = clf
            print(f"Best run: params - {clf.get_params()}")
            print()
        return rmse
            
    def feature_importance(self):
        clf = self.best_clf
        for i, v in clf.get_feature_importance(prettified=True):
            print(i.ljust(20), v)
        print()   
        for i, j, value in clf.get_feature_importance(fstr_type="Interaction", prettified=True)[:10]:
            print(x_train.columns[i].ljust(20), x_train.columns[j].ljust(20), value)
    
    @property
    def best_clf(self):
        return self._clf

In [20]:
opt = CatboostHyper(train, valid)
hyperopt.fmin(
    opt,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100)

Run 0: RMSE - 1.03089, Best Iteration - 5
Best run: params - {'iterations': 5, 'learning_rate': 0.1, 'depth': 1, 'l2_leaf_reg': 5.2003828181618745, 'loss_function': 'RMSE', 'od_type': 'Iter', 'verbose': False, 'random_strength': 0.1101051113732725, 'bagging_temperature': 8.144781580352324, 'random_state': 284704}

Run 1: RMSE - 0.98158, Best Iteration - 5
Best run: params - {'iterations': 5, 'learning_rate': 0.1, 'depth': 4, 'l2_leaf_reg': 0.6665086343057374, 'loss_function': 'RMSE', 'od_type': 'Iter', 'verbose': False, 'random_strength': 4.961808655416047, 'bagging_temperature': 0.7118780402545827, 'random_state': 284704}

Run 2: RMSE - 1.00284, Best Iteration - 5
Run 3: RMSE - 1.00784, Best Iteration - 5


{'bagging_temperature': 0.7118780402545827,
 'depth': 3,
 'l2_leaf_reg': 0.6665086343057374,
 'rand_strength': 4.961808655416047}

In [21]:
opt.feature_importance()

y_1                  75.02639680167826
item_date_y_1        8.290124226064508
y_3                  6.135055083519171
cummean_shop_item_y  4.572382848255849
shop_cat_date_y_3    3.152552694638401
new_item             2.823488345843804
date_block_num       0.0
shop_id              0.0
item_id              0.0
item_category_id     0.0
category_group       0.0
sub_category         0.0
month                0.0
y_2                  0.0
y_4                  0.0
diff_y1_y2           0.0
diff_y1_y13          0.0
item_sold            0.0
item_cluster_21      0.0
item_date_y_2        0.0
item_date_y_3        0.0
shop_date_y_1        0.0
shop_date_y_2        0.0
shop_date_y_12       0.0
shop_cat_date_y_1    0.0
shop_cat_date_y_2    0.0
cat_date_y_1         0.0
cat_date_y_2         0.0
cat_date_y_3         0.0
price_growth_1_6     0.0

y_1                  item_date_y_1        29.812237633737386
y_1                  y_3                  20.578204330390836
y_1                  shop_cat_date_y_3    1

# Submission

In [23]:
sub_df = pd.DataFrame(
    opt.best_clf.predict(x_test), 
    index=test.index, 
    columns=["item_cnt_month"]
).clip(0, 20)
sub_df.index.name = "ID"

In [24]:
sub_path = f"./submissions/submission_{__ver__}.csv"
sub_df.to_csv(sub_path)

## Ensembling

* Save all good models
* Make diverse models

* Averaging
* Weighted averaging
* Bagging (BaggingClassifier and BaggingRegressor from Sklearn, seed bagging)
* Boosting (AdaBoostClassifier from Sklearn)
* Stacking (Meta model should be modest)
* StackNet
    - Diversity of base algoritms
    - Diversity of base data
    - Simpler algoritms on higher levels
    - Feature engineering of meta feature (differences, std ...)
    - For every level 1model for 5-10 modelesin orivios level