# Imports and config

In [2]:
import os
import pickle
import warnings
import random
import pickle
import time

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import config as cfg
import data_processing as dp


warnings.filterwarnings('ignore')
plt.style.use('seaborn-dark-palette')

In [2]:
df_fs=pd.read_pickle("cleaned_sets/df_features.pkl")

# Feature Selection

In [3]:
df_fs.head(1)

Unnamed: 0,shop_id,item_id,date_block_num,random_date,mean_item_price,std_item_price,item_cnt_month,days_with_sell,item_name,item_category_id,...,mean_city_cnt_month,mean_city_item_price,std_city_item_price,mean_category_cnt_month_lag1,mean_category_item_price_lag1,std_category_item_price_lag1,mean_city_cnt_month_lag1,mean_city_item_price_lag1,std_city_item_price_lag1,is_lowest_price
0,0,16385,0,20.01.2013,200.0,0.0,2.0,2.0,"Нэнси Дрю. Сгоревшее алиби [PC, Jewel]",30,...,0.581237,513.799861,2.299256,,,,,,,True


In [4]:
df_fs.columns

Index(['shop_id', 'item_id', 'date_block_num', 'random_date',
       'mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell',
       'item_name', 'item_category_id', 'main_category_name',
       'sub_category_name', 'city_name', 'item_cnt_next_month', 'month',
       'year', 'nb_days', 'mean_item_price_lag1', 'mean_item_price_lag2',
       'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1',
       'item_cnt_month_lag2', 'item_cnt_month_lag12',
       'mean_category_cnt_month', 'mean_category_item_price',
       'std_category_item_price', 'mean_city_cnt_month',
       'mean_city_item_price', 'std_city_item_price',
       'mean_category_cnt_month_lag1', 'mean_category_item_price_lag1',
       'std_category_item_price_lag1', 'mean_city_cnt_month_lag1',
       'mean_city_item_price_lag1', 'std_city_item_price_lag1',
       'is_lowest_price'],
      dtype='object')

In [5]:
col_to_remove = ["random_date","item_name","item_category_id", 
                 "item_cnt_month_lag12","sub_category_name"]
df_fs = df_fs.drop(col_to_remove, axis=1)

In [6]:
col_id = ["shop_id","item_id","date_block_num"]
df_fs_predictors = df_fs.loc[:,[col for col in df_fs.columns if col not in col_id]]

#### Collinearity

![caption](images/feature_heatmap.png)

In [7]:
col_to_remove = ["mean_item_price_lag1", "mean_item_price_lag2","mean_category_cnt_month_lag1",
                 "mean_category_item_price_lag1","mean_category_item_price_lag1",
                 "mean_city_cnt_month_lag1", "mean_city_item_price_lag1","std_city_item_price_lag1"]
df_fs = df_fs.drop(col_to_remove, axis=1)

##### String features to dummies

In [8]:
col_to_dummies = ["main_category_name","city_name"]
df_fs = pd.concat([df_fs.drop(col_to_dummies, axis=1), pd.get_dummies(df_fs[col_to_dummies])], axis=1)
df_fs.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,mean_item_price,std_item_price,item_cnt_month,days_with_sell,item_cnt_next_month,month,year,...,city_name_Surgut,city_name_Tomsk,city_name_Tyumen,city_name_Ufa,city_name_Vologda,city_name_Volzhsky,city_name_Voronezh,city_name_Yakutsk,city_name_Yaroslavl,city_name_Zhukovsky
0,0,16385,0,200.0,0.0,2.0,2.0,2.0,1,13,...,0,0,0,0,0,0,0,1,0,0
1,0,16385,1,200.0,0.0,2.0,2.0,0.0,2,13,...,0,0,0,0,0,0,0,1,0,0


##### Model based feature selection

First, we need to build a train and a validation set. We'll also build the test set.  

In [9]:
train = df_fs.loc[(df_fs["date_block_num"]>=2) & (df_fs["date_block_num"]<=27)]
valid = df_fs.loc[(df_fs["date_block_num"]>=28) & (df_fs["date_block_num"]<=32)]
test = df_fs.loc[(df_fs["date_block_num"]>=33)]

In [10]:
X_train_fs = train.drop(["date_block_num","item_cnt_next_month","item_id","shop_id"], axis=1)
y_train_fs = train.loc[train.index.isin(X_train_fs.index),"item_cnt_next_month"]
X_val_fs = valid.drop(["date_block_num","item_cnt_next_month","item_id","shop_id"], axis=1)
y_val_fs = valid.loc[valid.index.isin(X_val_fs.index),"item_cnt_next_month"]

In [11]:
xgbr = xgb.XGBRegressor(seed=0)
xgbr.fit(X_train_fs, y_train_fs,
        eval_metric="rmse", 
              eval_set=[(X_train_fs, y_train_fs), (X_val_fs, y_val_fs)], 
              verbose=20, 
              early_stopping_rounds=20)

[0]	validation_0-rmse:2.83787	validation_1-rmse:1.8989
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 20 rounds.
[20]	validation_0-rmse:1.93695	validation_1-rmse:1.41145
[40]	validation_0-rmse:1.84671	validation_1-rmse:1.39601
[60]	validation_0-rmse:1.81407	validation_1-rmse:1.39497
[80]	validation_0-rmse:1.79449	validation_1-rmse:1.39376
[99]	validation_0-rmse:1.77371	validation_1-rmse:1.39064


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=None,
             subsample=1, verbosity=1)

In [19]:
ft_imp ={}
for i in range(len(X_train_fs.columns)):
    ft_imp[X_train_fs.columns[i]] = xgbr.feature_importances_[i]
sorted_ft_imp=sorted(ft_imp.items(), key=lambda kv: kv[1], reverse=True)
sorted_ft_imp

[('days_with_sell', 0.2573476),
 ('city_name_Moscow', 0.10368011),
 ('city_name_Khimki', 0.099290125),
 ('item_cnt_month', 0.070304416),
 ('std_item_price_lag2', 0.060795363),
 ('mean_item_price', 0.060523406),
 ('mean_category_item_price', 0.044028115),
 ('mean_city_cnt_month', 0.043333657),
 ('std_category_item_price_lag1', 0.035584435),
 ('nb_days', 0.034289084),
 ('main_category_name_Others', 0.031003052),
 ('std_item_price_lag1', 0.027426241),
 ('city_name_SPb', 0.017428607),
 ('city_name_Other', 0.017253457),
 ('std_city_item_price', 0.017120935),
 ('month', 0.015207431),
 ('item_cnt_month_lag2', 0.01519397),
 ('mean_city_item_price', 0.014522049),
 ('item_cnt_month_lag1', 0.0111323),
 ('mean_category_cnt_month', 0.008358764),
 ('city_name_Yakutsk', 0.005214255),
 ('std_category_item_price', 0.004348342),
 ('main_category_name_Movies', 0.003333525),
 ('is_lowest_price', 0.001925058),
 ('main_category_name_Payment card', 0.0008298387),
 ('year', 0.00052589865),
 ('std_item_price',

In [14]:
xgbr.save_model("model_feature_selection")

##### Zero importance features

In [20]:
non_zero_features=[x[0] for x in sorted_ft_imp if x[1]>0]
non_zero_features

['days_with_sell',
 'city_name_Moscow',
 'city_name_Khimki',
 'item_cnt_month',
 'std_item_price_lag2',
 'mean_item_price',
 'mean_category_item_price',
 'mean_city_cnt_month',
 'std_category_item_price_lag1',
 'nb_days',
 'main_category_name_Others',
 'std_item_price_lag1',
 'city_name_SPb',
 'city_name_Other',
 'std_city_item_price',
 'month',
 'item_cnt_month_lag2',
 'mean_city_item_price',
 'item_cnt_month_lag1',
 'mean_category_cnt_month',
 'city_name_Yakutsk',
 'std_category_item_price',
 'main_category_name_Movies',
 'is_lowest_price',
 'main_category_name_Payment card',
 'year']

##### Keep only features such as cumulated importance > threshold - NOT DONE

In [22]:
cumul_imp_ft=[]
threshold = 0.95
cumul_imp=0
i=0
while cumul_imp<threshold:
    cumul_imp_ft.append(sorted_ft_imp[i][0])
    cumul_imp+=sorted_ft_imp[i][1]
    i+=1
print(cumul_imp)
print(cumul_imp_ft)
print([x for x in non_zero_features if x not in cumul_imp_ft])

0.9643320674076676
['days_with_sell', 'city_name_Moscow', 'city_name_Khimki', 'item_cnt_month', 'std_item_price_lag2', 'mean_item_price', 'mean_category_item_price', 'mean_city_cnt_month', 'std_category_item_price_lag1', 'nb_days', 'main_category_name_Others', 'std_item_price_lag1', 'city_name_SPb', 'city_name_Other', 'std_city_item_price', 'month', 'item_cnt_month_lag2', 'mean_city_item_price']
['item_cnt_month_lag1', 'mean_category_cnt_month', 'city_name_Yakutsk', 'std_category_item_price', 'main_category_name_Movies', 'is_lowest_price', 'main_category_name_Payment card', 'year']


#### Feature selection

In [23]:
not_predictors = ["shop_id","item_id","date_block_num","item_cnt_next_month"]

predictors = cumul_imp_ft
predictors

['days_with_sell',
 'city_name_Moscow',
 'city_name_Khimki',
 'item_cnt_month',
 'std_item_price_lag2',
 'mean_item_price',
 'mean_category_item_price',
 'mean_city_cnt_month',
 'std_category_item_price_lag1',
 'nb_days',
 'main_category_name_Others',
 'std_item_price_lag1',
 'city_name_SPb',
 'city_name_Other',
 'std_city_item_price',
 'month',
 'item_cnt_month_lag2',
 'mean_city_item_price']

In [27]:
df_fs_done = df_fs[not_predictors+predictors]
df_fs_done.columns

Index(['shop_id', 'item_id', 'date_block_num', 'item_cnt_next_month',
       'days_with_sell', 'city_name_Moscow', 'city_name_Khimki',
       'item_cnt_month', 'std_item_price_lag2', 'mean_item_price',
       'mean_category_item_price', 'mean_city_cnt_month',
       'std_category_item_price_lag1', 'nb_days', 'main_category_name_Others',
       'std_item_price_lag1', 'city_name_SPb', 'city_name_Other',
       'std_city_item_price', 'month', 'item_cnt_month_lag2',
       'mean_city_item_price'],
      dtype='object')

In [28]:
df_fs_done.to_pickle("cleaned_sets/df_fs_done.pkl")

# Model Selection

Results on the feature selection model.

In [4]:
df_fs_done= pd.read_pickle("cleaned_sets/df_fs_done.pkl")

##### 1st test

In [13]:
train = df_fs_done.loc[(df_fs_done["date_block_num"]>=2) & (df_fs_done["date_block_num"]<=27)]
valid = df_fs_done.loc[(df_fs_done["date_block_num"]>=28) & (df_fs_done["date_block_num"]<=32)]
test = df_fs_done.loc[(df_fs_done["date_block_num"]>=33)]

In [15]:
identificators = ["shop_id","item_id","date_block_num"]
predictors = [x for x in train.columns if x not in identificators]
label = "item_cnt_next_month"

X_train = train[predictors]
y_train = train[label]
X_valid = valid[predictors]
y_valid = valid[label]

X_test = test[predictors+identificators]

In [16]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [11]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

In [18]:
params['eval_metric'] = "rmse"
num_boost_round = 999

In [19]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dvalid, "Test")],
    early_stopping_rounds=10
)

[0]	Test-rmse:1.53574
Will train until Test-rmse hasn't improved in 10 rounds.
[1]	Test-rmse:1.30968
[2]	Test-rmse:1.15222
[3]	Test-rmse:1.09587
[4]	Test-rmse:1.05875
[5]	Test-rmse:1.04354
[6]	Test-rmse:1.02912
[7]	Test-rmse:1.02182


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/robinblanchard/Desktop/training/kaggle-predict-future-sales/venv/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3319, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-19-64089c68b389>", line 6, in <module>
    early_stopping_rounds=10
  File "/Users/robinblanchard/Desktop/training/kaggle-predict-future-sales/venv/lib/python3.7/site-packages/xgboost/training.py", line 216, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/Users/robinblanchard/Desktop/training/kaggle-predict-future-sales/venv/lib/python3.7/site-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/Users/robinblanchard/Desktop/training/kaggle-predict-future-sales/venv/lib/python3.7/site-packages/xgboost/core.py", line 1109, in update
    dtrain.handle))
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most

KeyboardInterrupt: 

#### Test using XGBoost’s CV

In [20]:
train = df_fs_done.loc[(df_fs_done["date_block_num"]>=2) & (df_fs_done["date_block_num"]<=32)]
test = df_fs_done.loc[(df_fs_done["date_block_num"]>=33)]

In [21]:
identificators = ["shop_id","item_id","date_block_num"]
predictors = [x for x in train.columns if x not in identificators]
label = "item_cnt_next_month"

X_train = train[predictors]
y_train = train[label]
X_valid = valid[predictors]
y_valid = valid[label]

X_test = test[predictors+identificators]

In [24]:
dtrain = xgb.DMatrix(X_train, label=y_train)

In [None]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=999,
    seed=42,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=2
)
cv_results



#### CV Parameters max_depth and min_child_weight

In [23]:
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in [6,8,10,12]
    for min_child_weight in range(5,8)
]

[(6, 5),
 (6, 6),
 (6, 7),
 (8, 5),
 (8, 6),
 (8, 7),
 (10, 5),
 (10, 6),
 (10, 7),
 (12, 5),
 (12, 6),
 (12, 7)]

In [None]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
    'eval_metric':'rmse'
}

In [None]:
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    start_time=time.time()
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=999,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds".format(mean_mae, boost_rounds))
    print("Time taken for this round {}".format(time.time()-start_time))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_rmse))

# Prediction

In [92]:
X_test = test[["shop_id","item_id","date_block_num"]+predictors]
print(X_test.isnull().sum())
X_test.head(2)

shop_id                             0
item_id                             0
date_block_num                      0
mean_item_price                     0
std_item_price                      0
item_cnt_month                      0
days_with_sell                      0
month                               0
year                                0
nb_days                             0
std_item_price_lag1                 0
std_item_price_lag2                 0
item_cnt_month_lag1                 0
item_cnt_month_lag2                 0
mean_category_cnt_month             0
mean_category_item_price            0
std_category_item_price             0
mean_city_cnt_month                 0
mean_city_item_price                0
std_city_item_price                 0
std_category_item_price_lag1        0
is_lowest_price                     0
main_category_name_Accessories      0
main_category_name_Books            0
main_category_name_Game consoles    0
main_category_name_Games            0
main_categor

Unnamed: 0,shop_id,item_id,date_block_num,mean_item_price,std_item_price,item_cnt_month,days_with_sell,month,year,nb_days,...,main_category_name_Accessories,main_category_name_Books,main_category_name_Game consoles,main_category_name_Games,main_category_name_Gifts,main_category_name_Movies,main_category_name_Music,main_category_name_Others,main_category_name_Payment card,main_category_name_Program
33,0,16385,33,310.568627,0.0,0.0,0.0,10,15,31,...,0,0,0,1,0,0,0,0,0,0
67,0,8195,33,141.206851,0.041437,0.0,0.0,10,15,31,...,0,0,0,0,0,1,0,0,0,0


In [93]:
y_pred = xgbr.predict(X_test[predictors])
X_test["item_cnt_month_pred"] = y_pred

X_test.head(2)

ValueError: feature_names mismatch: ['mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell', 'month', 'year', 'nb_days', 'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'mean_category_cnt_month', 'mean_category_item_price', 'std_category_item_price', 'mean_city_cnt_month', 'mean_city_item_price', 'std_city_item_price', 'std_category_item_price_lag1', 'is_lowest_price', 'main_category_name_Accessories', 'main_category_name_Books', 'main_category_name_Game consoles', 'main_category_name_Games', 'main_category_name_Gifts', 'main_category_name_Movies', 'main_category_name_Music', 'main_category_name_Others', 'main_category_name_Payment card', 'main_category_name_Program', 'city_name_Adygea', 'city_name_Balashikha', 'city_name_Czechs', 'city_name_Kaluga', 'city_name_Kazan', 'city_name_Khimki', 'city_name_Kolomna', 'city_name_Krasnoyarsk', 'city_name_Kursk', 'city_name_Moscow', 'city_name_Mytishchi', 'city_name_Nizhny Novgorod', 'city_name_Novosibirsk', 'city_name_Omsk', 'city_name_Other', 'city_name_Rostov-on-Don', 'city_name_SPb', 'city_name_Samara', 'city_name_Sergiev', 'city_name_Surgut', 'city_name_Tomsk', 'city_name_Tyumen', 'city_name_Ufa', 'city_name_Vologda', 'city_name_Volzhsky', 'city_name_Voronezh', 'city_name_Yakutsk', 'city_name_Yaroslavl', 'city_name_Zhukovsky'] ['mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell', 'month', 'year', 'nb_days', 'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'mean_category_cnt_month', 'mean_category_item_price', 'std_category_item_price', 'mean_city_cnt_month', 'mean_city_item_price', 'std_city_item_price', 'std_category_item_price_lag1', 'is_lowest_price', 'main_category_name_Accessories', 'main_category_name_Books', 'main_category_name_Game consoles', 'main_category_name_Games', 'main_category_name_Gifts', 'main_category_name_Movies', 'main_category_name_Music', 'main_category_name_Others', 'main_category_name_Payment card', 'main_category_name_Program']
expected city_name_Other, city_name_Samara, city_name_Sergiev, city_name_Krasnoyarsk, city_name_Tomsk, city_name_Yakutsk, city_name_Rostov-on-Don, city_name_SPb, city_name_Omsk, city_name_Voronezh, city_name_Surgut, city_name_Kolomna, city_name_Ufa, city_name_Czechs, city_name_Moscow, city_name_Zhukovsky, city_name_Balashikha, city_name_Volzhsky, city_name_Mytishchi, city_name_Nizhny Novgorod, city_name_Novosibirsk, city_name_Kazan, city_name_Yaroslavl, city_name_Kursk, city_name_Tyumen, city_name_Khimki, city_name_Kaluga, city_name_Adygea, city_name_Vologda in input data

# Format to submission

In [62]:
to_pred = pd.read_csv(cfg.FILENAMES['TEST_SALES'])
to_pred.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [63]:
submission = pd.merge(to_pred, X_test[["item_id","shop_id","item_cnt_month_pred"]],
                      how="left", on=["item_id","shop_id"])
print(submission.isnull().sum())
submission = submission.fillna(0)
submission.head(2)

ID                          0
shop_id                     0
item_id                     0
item_cnt_month_pred    102796
dtype: int64


Unnamed: 0,ID,shop_id,item_id,item_cnt_month_pred
0,0,5,5037,0.620287
1,1,5,5320,0.0


In [68]:
submission.loc[submission["item_cnt_month_pred"]>20,"item_cnt_month_pred"]=20
submission.loc[submission["item_cnt_month_pred"]<0, "item_cnt_month_pred"]=0

In [69]:
sub_example = pd.read_csv(cfg.FILENAMES["SAMPLE_SUBM"])
sub_example.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5


In [70]:
submission_formated = (submission[["ID", "item_cnt_month_pred"]]
                       .rename({"item_cnt_month_pred":"item_cnt_month"}, axis=1))
submission_formated.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.620287
1,1,0.0


In [71]:
submission_formated.to_csv(os.path.join("submissions", "sub_04.csv"), index=False)