# Imports and config

In [1]:
import os
import pickle
import warnings
import random
import pickle

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from cycler import cycler

import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

import config as cfg
import data_processing as dp


warnings.filterwarnings('ignore')
plt.style.use('seaborn-dark-palette')

In [28]:
df_fs=pd.read_pickle("cleaned_sets/df_features.pkl")

# Feature Selection

In [29]:
df_fs.head(1)

Unnamed: 0,shop_id,item_id,date_block_num,random_date,mean_item_price,std_item_price,item_cnt_month,days_with_sell,item_name,item_category_id,...,mean_city_cnt_month,mean_city_item_price,std_city_item_price,mean_category_cnt_month_lag1,mean_category_item_price_lag1,std_category_item_price_lag1,mean_city_cnt_month_lag1,mean_city_item_price_lag1,std_city_item_price_lag1,is_lowest_price
0,0,16385,0,20.01.2013,200.0,0.0,2.0,2.0,"Нэнси Дрю. Сгоревшее алиби [PC, Jewel]",30,...,0.581237,513.799861,2.299256,,,,,,,True


In [30]:
df_fs.columns

Index(['shop_id', 'item_id', 'date_block_num', 'random_date',
       'mean_item_price', 'std_item_price', 'item_cnt_month', 'days_with_sell',
       'item_name', 'item_category_id', 'main_category_name',
       'sub_category_name', 'city_name', 'item_cnt_next_month', 'month',
       'year', 'nb_days', 'mean_item_price_lag1', 'mean_item_price_lag2',
       'std_item_price_lag1', 'std_item_price_lag2', 'item_cnt_month_lag1',
       'item_cnt_month_lag2', 'item_cnt_month_lag12',
       'mean_category_cnt_month', 'mean_category_item_price',
       'std_category_item_price', 'mean_city_cnt_month',
       'mean_city_item_price', 'std_city_item_price',
       'mean_category_cnt_month_lag1', 'mean_category_item_price_lag1',
       'std_category_item_price_lag1', 'mean_city_cnt_month_lag1',
       'mean_city_item_price_lag1', 'std_city_item_price_lag1',
       'is_lowest_price'],
      dtype='object')

In [31]:
col_to_remove = ["random_date","item_name","item_category_id", 
                 "item_cnt_month_lag12","sub_category_name"]
df_fs = df_fs.drop(col_to_remove, axis=1)

In [32]:
col_id = ["shop_id","item_id","date_block_num"]
df_fs_predictors = df_fs.loc[:,[col for col in df_fs.columns if col not in col_id]]

#### Collinearity

![caption](images/feature_heatmap.png)

In [33]:
col_to_remove = ["mean_item_price_lag1", "mean_item_price_lag2","mean_category_cnt_month_lag1",
                 "mean_category_item_price_lag1","mean_category_item_price_lag1",
                 "mean_city_cnt_month_lag1", "mean_city_item_price_lag1","std_city_item_price_lag1"]
df_fs = df_fs.drop(col_to_remove, axis=1)

##### String features to dummies

In [36]:
col_to_dummies = ["main_category_name","city_name"]
df_fs = pd.concat([df_fs.drop(col_to_dummies, axis=1), pd.get_dummies(df_fs[col_to_dummies])], axis=1)
df_fs.head(2)

Unnamed: 0,shop_id,item_id,date_block_num,mean_item_price,std_item_price,item_cnt_month,days_with_sell,item_cnt_next_month,month,year,...,city_name_Surgut,city_name_Tomsk,city_name_Tyumen,city_name_Ufa,city_name_Vologda,city_name_Volzhsky,city_name_Voronezh,city_name_Yakutsk,city_name_Yaroslavl,city_name_Zhukovsky
0,0,16385,0,200.0,0.0,2.0,2.0,2.0,1,13,...,0,0,0,0,0,0,0,1,0,0
1,0,16385,1,200.0,0.0,2.0,2.0,0.0,2,13,...,0,0,0,0,0,0,0,1,0,0


##### Zero Importance Features

First, we need to build a train and a validation set. We'll also build the test set.  

In [37]:
train = df_fs.loc[(df_fs["date_block_num"]>=2) & (df_fs["date_block_num"]<=27)]
valid = df_fs.loc[(df_fs["date_block_num"]>=28) & (df_fs["date_block_num"]<=32)]
test = df_fs.loc[(df_fs["date_block_num"]>=33)]

In [76]:
X_train_fs = train.drop(["date_block_num","item_cnt_next_month","item_id","shop_id"], axis=1).sample(frac=0.2)
y_train_fs = train.loc[train.index.isin(X_train_fs.index),"item_cnt_next_month"]
X_val_fs = valid.drop(["date_block_num","item_cnt_next_month","item_id","shop_id"], axis=1).sample(frac=0.2)
y_val_fs = valid.loc[valid.index.isin(X_val_fs.index),"item_cnt_next_month"]

In [78]:
xgbr = xgb.XGBRegressor(seed=0)
xgbr.fit(X_train_fs, y_train_fs,
        eval_metric="rmse", 
              eval_set=[(X_train_fs, y_train_fs), (X_val_fs, y_val_fs)], 
              verbose=20, 
              early_stopping_rounds=10)

[0]	validation_0-rmse:2.60384	validation_1-rmse:2.04182
Multiple eval metrics have been passed: 'validation_1-rmse' will be used for early stopping.

Will train until validation_1-rmse hasn't improved in 10 rounds.
Stopping. Best iteration:
[9]	validation_0-rmse:2.6069	validation_1-rmse:2.02509



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=0, silent=None,
             subsample=1, verbosity=1)

In [80]:
ft_imp ={}
for i in range(len(X_train_fs.columns)):
    ft_imp[X_train_fs.columns[i]] = xgbr.feature_importances_[i]
sorted(ft_imp.items(), key=lambda kv: kv[1], reverse=True)

[('is_lowest_price', 0.30668318),
 ('city_name_Tyumen', 0.26034078),
 ('mean_city_cnt_month', 0.21589477),
 ('mean_item_price', 0.048524242),
 ('city_name_Omsk', 0.024549523),
 ('std_item_price_lag1', 0.020891327),
 ('city_name_Other', 0.017957376),
 ('mean_city_item_price', 0.016603692),
 ('std_item_price_lag2', 0.016049614),
 ('mean_category_cnt_month', 0.0150545),
 ('days_with_sell', 0.013442144),
 ('city_name_Volzhsky', 0.008228719),
 ('item_cnt_month_lag1', 0.0061957897),
 ('std_city_item_price', 0.0055480087),
 ('month', 0.005148007),
 ('mean_category_item_price', 0.0048806686),
 ('city_name_Khimki', 0.004814718),
 ('item_cnt_month_lag2', 0.0028455546),
 ('std_category_item_price', 0.0023008257),
 ('city_name_Novosibirsk', 0.0020749127),
 ('std_category_item_price_lag1', 0.001971608),
 ('std_item_price', 0.0),
 ('item_cnt_month', 0.0),
 ('year', 0.0),
 ('nb_days', 0.0),
 ('main_category_name_Accessories', 0.0),
 ('main_category_name_Books', 0.0),
 ('main_category_name_Game consol

##### Keep only features such as cumulated importance > threshold - NOT DONE

# Prediction

In [60]:
X_test = test.drop(["date_block_num","item_cnt_next_month"], axis=1)
print(X_test.isnull().sum())
X_test.head(2)

shop_id                             0
item_id                             0
mean_item_price                     0
std_item_price                      0
item_cnt_month                      0
days_with_sell                      0
month                               0
year                                0
nb_days                             0
std_item_price_lag1                 0
std_item_price_lag2                 0
item_cnt_month_lag1                 0
item_cnt_month_lag2                 0
mean_category_cnt_month             0
mean_category_item_price            0
std_category_item_price             0
mean_city_cnt_month                 0
mean_city_item_price                0
std_city_item_price                 0
std_category_item_price_lag1        0
is_lowest_price                     0
main_category_name_Accessories      0
main_category_name_Books            0
main_category_name_Game consoles    0
main_category_name_Games            0
main_category_name_Gifts            0
main_categor

Unnamed: 0,shop_id,item_id,mean_item_price,std_item_price,item_cnt_month,days_with_sell,month,year,nb_days,std_item_price_lag1,...,city_name_Surgut,city_name_Tomsk,city_name_Tyumen,city_name_Ufa,city_name_Vologda,city_name_Volzhsky,city_name_Voronezh,city_name_Yakutsk,city_name_Yaroslavl,city_name_Zhukovsky
33,0,16385,310.568627,0.0,0.0,0.0,10,15,31,0.0,...,0,0,0,0,0,0,0,1,0,0
67,0,8195,141.206851,0.041437,0.0,0.0,10,15,31,0.0,...,0,0,0,0,0,0,0,1,0,0


In [61]:
y_pred = xgbr.predict(X_test)
X_test["item_cnt_month_pred"] = y_pred

X_test.head(2)

Unnamed: 0,shop_id,item_id,mean_item_price,std_item_price,item_cnt_month,days_with_sell,month,year,nb_days,std_item_price_lag1,...,city_name_Tomsk,city_name_Tyumen,city_name_Ufa,city_name_Vologda,city_name_Volzhsky,city_name_Voronezh,city_name_Yakutsk,city_name_Yaroslavl,city_name_Zhukovsky,item_cnt_month_pred
33,0,16385,310.568627,0.0,0.0,0.0,10,15,31,0.0,...,0,0,0,0,0,0,1,0,0,0.086264
67,0,8195,141.206851,0.041437,0.0,0.0,10,15,31,0.0,...,0,0,0,0,0,0,1,0,0,0.086264


# Format to submission

In [62]:
to_pred = pd.read_csv(cfg.FILENAMES['TEST_SALES'])
to_pred.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [63]:
submission = pd.merge(to_pred, X_test[["item_id","shop_id","item_cnt_month_pred"]],
                      how="left", on=["item_id","shop_id"])
print(submission.isnull().sum())
submission = submission.fillna(0)
submission.head(2)

ID                          0
shop_id                     0
item_id                     0
item_cnt_month_pred    102796
dtype: int64


Unnamed: 0,ID,shop_id,item_id,item_cnt_month_pred
0,0,5,5037,0.620287
1,1,5,5320,0.0


In [68]:
submission.loc[submission["item_cnt_month_pred"]>20,"item_cnt_month_pred"]=20
submission.loc[submission["item_cnt_month_pred"]<0, "item_cnt_month_pred"]=0

In [69]:
sub_example = pd.read_csv(cfg.FILENAMES["SAMPLE_SUBM"])
sub_example.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.5
1,1,0.5


In [70]:
submission_formated = (submission[["ID", "item_cnt_month_pred"]]
                       .rename({"item_cnt_month_pred":"item_cnt_month"}, axis=1))
submission_formated.head(2)

Unnamed: 0,ID,item_cnt_month
0,0,0.620287
1,1,0.0


In [71]:
submission_formated.to_csv(os.path.join("submissions", "sub_04.csv"), index=False)