# Imports and config

In [1]:
import os
import pickle
import warnings
import random
import pickle
import time
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import config as cfg
import data_processing as dp


warnings.filterwarnings('ignore')
plt.style.use('seaborn-dark-palette')

# Model Selection

https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/  

We will use an approach similar to that of GBM here. The various steps to be performed are:

- Choose a relatively high learning rate. Generally a learning rate of 0.1 works but somewhere between 0.05 to 0.3 - should work for different problems. Determine the optimum number of trees for this learning rate. XGBoost has a very useful function called as “cv” which performs cross-validation at each boosting iteration and thus returns the optimum number of trees required.  
- Tune tree-specific parameters ( max_depth, min_child_weight, gamma, subsample, colsample_bytree) for decided learning rate and number of trees. Note that we can choose different parameters to define a tree and I’ll take up an example here.  
- Tune regularization parameters (lambda, alpha) for xgboost which can help reduce model complexity and enhance performance.  
- Lower the learning rate and decide the optimal parameters .

In [2]:
df= pd.read_pickle("cleaned_sets/df_fs_done.pkl")

In [3]:
train = df.loc[(df["date_block_num"]>=20) & (df["date_block_num"]<=32)].sample(frac=0.6)
test = df.loc[(df["date_block_num"]>=33)]

identificators = ["shop_id","item_id","date_block_num"]
predictors = [x for x in train.columns if x not in identificators]
label = "item_cnt_next_month"

X_train = train[predictors]
y_train = train[label]

X_test = test[predictors+identificators]

In [4]:
dtrain = xgb.DMatrix(X_train, label=y_train)

### Function for fine tuning

In [5]:
def dict_to_iterlist(d):
    keys=d.keys()
    lists=d.values()
    return list(keys), list(itertools.product(*lists))

In [7]:
def fine_tune_xgb(initial_params, gridsearch_params, dtrain, early_stopping_rounds=10, cv_fold=5):
    min_rmse = float("Inf")
    best_params = None
    
    params=initial_params
    gs_param_names, combinations = dict_to_iterlist(gridsearch_params)
    nb_gs_params = len(gs_param_names)
    
    boosting_rounds=100
    
    for combi in combinations:
        start_time=time.time()
        print(", ".join([f"{gs_param_names[i]}={combi[i]}" for i in range(nb_gs_params)]))

        # Update our parameters
        for i in range(nb_gs_params):
            params[gs_param_names[i]] = combi[i]
            
        if "num_boost_round" in gs_param_names:
            boosting_rounds=combi[gs_param_names.index("num_boost_round")]
        
        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=boosting_rounds,
            seed=42,
            nfold=cv_fold,
            metrics={'rmse'},
            early_stopping_rounds=early_stopping_rounds
        )
        # Update best RMSE
        
        mean_rmse = cv_results['test-rmse-mean'].min()
        boost_rounds = cv_results['test-rmse-mean'].argmin()
        
        print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
        print("Time taken for this round {}".format(time.time()-start_time))
        if mean_rmse < min_rmse:
            min_rmse = mean_rmse
            best_params = combi
    
    print(best_params)
    print("".join(["Best params:", 
                   ", ".join([f"{gs_param_names[i]}={best_params[i]}" for i in range(nb_gs_params)]),
                  f", RMSE: {min_rmse}",]))

#### Number of estimators

In [7]:
params = {
    # Parameters that we are going to tune.
    'max_depth':5,
    'min_child_weight': 2,
    'gamma':0,
    'eta':.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric':'rmse'
}

gridsearch_params = {
    "num_boost_round":[50,100,200,300,400,500]
        }

fine_tune_xgb(params, gridsearch_params, dtrain)

num_boost_round=50
	RMSE 0.19884619999999997 for 49 rounds
Time taken for this round 174.0552237033844
num_boost_round=100
	RMSE 0.16946060000000002 for 99 rounds
Time taken for this round 363.9197838306427
num_boost_round=200
	RMSE 0.16890280000000002 for 101 rounds
Time taken for this round 402.503536939621
num_boost_round=300
	RMSE 0.16890280000000002 for 101 rounds
Time taken for this round 389.96784806251526
num_boost_round=400
	RMSE 0.16890280000000002 for 101 rounds
Time taken for this round 362.24060893058777
num_boost_round=500
	RMSE 0.16890280000000002 for 101 rounds
Time taken for this round 383.1901047229767
(200,)
Best params:num_boost_round=500, RMSE: 0.16890280000000002


num_boost_round=100 seems fine.

##### max_depth, min_child_weight

In [None]:
params = {
    # Parameters that we are going to tune.
    'max_depth':5,
    'min_child_weight': 2,
    'gamma':0,
    'eta':.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # Other parameters
    'objective':'reg:squarederror',
    'eval_metric':'rmse'
}

gridsearch_params = {
    "max_depth": [5,6,7,8,10,12],
    "min_child_weight": [2,4,6,8]
}

fine_tune_xgb(params, gridsearch_params, dtrain)

max_depth=5, min_child_weight=2
	RMSE 0.1656404 for 94 rounds
Time taken for this round 400.32938861846924
max_depth=5, min_child_weight=4
	RMSE 0.1680922 for 99 rounds
Time taken for this round 396.3620090484619
max_depth=5, min_child_weight=6
	RMSE 0.1988702 for 88 rounds
Time taken for this round 417.3902268409729
max_depth=5, min_child_weight=8
	RMSE 0.2490282 for 65 rounds
Time taken for this round 305.3179974555969
max_depth=6, min_child_weight=2
	RMSE 0.1722786 for 93 rounds
Time taken for this round 1242.185308456421
max_depth=6, min_child_weight=4
	RMSE 0.17041499999999998 for 90 rounds
Time taken for this round 431.78232884407043
max_depth=6, min_child_weight=6
	RMSE 0.2029366 for 70 rounds
Time taken for this round 389.0773892402649
max_depth=6, min_child_weight=8
	RMSE 0.24996000000000002 for 61 rounds
Time taken for this round 351.74165391921997
max_depth=7, min_child_weight=2
	RMSE 0.170522 for 77 rounds
Time taken for this round 500.659955739975
max_depth=7, min_child_we

# Prediction

In [92]:
X_test = test[["shop_id","item_id","date_block_num"]+predictors]
print(X_test.isnull().sum())
X_test.head(2)

shop_id                             0
item_id                             0
date_block_num                      0
mean_item_price                     0
std_item_price                      0
item_cnt_month                      0
days_with_sell                      0
month                               0
year                                0
nb_days                             0
std_item_price_lag1                 0
std_item_price_lag2                 0
item_cnt_month_lag1                 0
item_cnt_month_lag2                 0
mean_category_cnt_month             0
mean_category_item_price            0
std_category_item_price             0
mean_city_cnt_month                 0
mean_city_item_price                0
std_city_item_price                 0
std_category_item_price_lag1        0
is_lowest_price                     0
main_category_name_Accessories      0
main_category_name_Books            0
main_category_name_Game consoles    0
main_category_name_Games            0
main_categor

Unnamed: 0,shop_id,item_id,date_block_num,mean_item_price,std_item_price,item_cnt_month,days_with_sell,month,year,nb_days,...,main_category_name_Accessories,main_category_name_Books,main_category_name_Game consoles,main_category_name_Games,main_category_name_Gifts,main_category_name_Movies,main_category_name_Music,main_category_name_Others,main_category_name_Payment card,main_category_name_Program
33,0,16385,33,310.568627,0.0,0.0,0.0,10,15,31,...,0,0,0,1,0,0,0,0,0,0
67,0,8195,33,141.206851,0.041437,0.0,0.0,10,15,31,...,0,0,0,0,0,1,0,0,0,0


In [93]:
y_pred = xgbr.predict(X_test[predictors])
X_test["item_cnt_month_pred"] = y_pred

X_test.head(2)

ValueError: feature_names mismatch: ['mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell', 'month', 'year', 'nb_days', 'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'mean_category_cnt_month', 'mean_category_item_price', 'std_category_item_price', 'mean_city_cnt_month', 'mean_city_item_price', 'std_city_item_price', 'std_category_item_price_lag1', 'is_lowest_price', 'main_category_name_Accessories', 'main_category_name_Books', 'main_category_name_Game consoles', 'main_category_name_Games', 'main_category_name_Gifts', 'main_category_name_Movies', 'main_category_name_Music', 'main_category_name_Others', 'main_category_name_Payment card', 'main_category_name_Program', 'city_name_Adygea', 'city_name_Balashikha', 'city_name_Czechs', 'city_name_Kaluga', 'city_name_Kazan', 'city_name_Khimki', 'city_name_Kolomna', 'city_name_Krasnoyarsk', 'city_name_Kursk', 'city_name_Moscow', 'city_name_Mytishchi', 'city_name_Nizhny Novgorod', 'city_name_Novosibirsk', 'city_name_Omsk', 'city_name_Other', 'city_name_Rostov-on-Don', 'city_name_SPb', 'city_name_Samara', 'city_name_Sergiev', 'city_name_Surgut', 'city_name_Tomsk', 'city_name_Tyumen', 'city_name_Ufa', 'city_name_Vologda', 'city_name_Volzhsky', 'city_name_Voronezh', 'city_name_Yakutsk', 'city_name_Yaroslavl', 'city_name_Zhukovsky'] ['mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell', 'month', 'year', 'nb_days', 'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'mean_category_cnt_month', 'mean_category_item_price', 'std_category_item_price', 'mean_city_cnt_month', 'mean_city_item_price', 'std_city_item_price', 'std_category_item_price_lag1', 'is_lowest_price', 'main_category_name_Accessories', 'main_category_name_Books', 'main_category_name_Game consoles', 'main_category_name_Games', 'main_category_name_Gifts', 'main_category_name_Movies', 'main_category_name_Music', 'main_category_name_Others', 'main_category_name_Payment card', 'main_category_name_Program']
expected city_name_Other, city_name_Samara, city_name_Sergiev, city_name_Krasnoyarsk, city_name_Tomsk, city_name_Yakutsk, city_name_Rostov-on-Don, city_name_SPb, city_name_Omsk, city_name_Voronezh, city_name_Surgut, city_name_Kolomna, city_name_Ufa, city_name_Czechs, city_name_Moscow, city_name_Zhukovsky, city_name_Balashikha, city_name_Volzhsky, city_name_Mytishchi, city_name_Nizhny Novgorod, city_name_Novosibirsk, city_name_Kazan, city_name_Yaroslavl, city_name_Kursk, city_name_Tyumen, city_name_Khimki, city_name_Kaluga, city_name_Adygea, city_name_Vologda in input data

# Format to submission

In [62]:
to_pred = pd.read_csv(cfg.FILENAMES['TEST_SALES'])
to_pred.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [63]:
submission = pd.merge(to_pred, X_test[["item_id","shop_id","item_cnt_month_pred"]],
                      how="left", on=["item_id","shop_id"])
print(submission.isnull().sum())
submission = submission.fillna(0)
submission.head(2)

ID                          0
shop_id                     0
item_id                     0
item_cnt_month_pred    102796
dtype: int64


Unnamed: 0,ID,shop_id,item_id,item_cnt_month_pred
0,0,5,5037,0.620287
1,1,5,5320,0.0


In [68]:
submission.loc[submission["item_cnt_month_pred"]>20,"item_cnt_month_pred"]=20
submission.loc[submission["item_cnt_month_pred"]<0, "item_cnt_month_pred"]=0

In [69]:
sub_example = pd.read_csv(cfg.FILENAMES["SAMPLE_SUBM"])
sub_example.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5


In [70]:
submission_formated = (submission[["ID", "item_cnt_month_pred"]]
                       .rename({"item_cnt_month_pred":"item_cnt_month"}, axis=1))
submission_formated.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.620287
1,1,0.0


In [71]:
submission_formated.to_csv(os.path.join("submissions", "sub_04.csv"), index=False)