In [2]:
import os
import pandas as pd
import numpy as np
import plotly_express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import copy
import warnings
warnings.filterwarnings('ignore')
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib
import optuna
from tqdm import tqdm
import time
import codecs
import pickle

# Configuration

In [3]:
# Folder the raw competition CSV files are read from.
rawdata_dir = 'rawdata'
# Folder intermediate feature pickles are written to / read from.
data_dir = 'data'

# Day offsets passed to make_lags(); one sold_lag_<n> column is created per entry.
# lags = [1,2,3,6,12,24,36]
lags = [28, 35, 42, 49, 56]

# Number of days the CV window steps back per fold (also the validation span of each fold).
# NUM_FOLD_DAYS = 28
NUM_FOLD_DAYS = 56
# First / last day index of the validation period.
VALID_FIRST_DAY = 1914
VALID_LAST_DAY = 1941
# Used only to derive TRAIN_FIRST_DAY below (and in output file names).
NUM_USE_DAY = 365
# Lower bound (inclusive) on `d` for rows used as training data in the CV loop.
# TRAIN_FIRST_DAY = max(lags)
TRAIN_FIRST_DAY = VALID_FIRST_DAY - NUM_USE_DAY + 1

# Number of Optuna trials run per store.
OPTUNA_TRIAL = 70

# Read data

In [4]:
# Load the three raw tables in one pass: sales first, then calendar, then prices.
sales, calendar, prices = (
    pd.read_csv(os.path.join(rawdata_dir, csv_name))
    for csv_name in ('sales_train_evaluation.csv', 'calendar.csv', 'sell_prices.csv')
)
# Tag each frame with a human-readable name.
sales.name = 'sales'
calendar.name = 'calendar'
prices.name = 'prices'

Since the validation data is now available for days 1914-1941, add zero sales as placeholders for the test days d_1942 - d_1969.

In [5]:
# Append an all-zero int16 column for each remaining day, d_1942 through d_1969.
for day in range(1942, 1970):
    sales['d_' + str(day)] = np.zeros(len(sales), dtype=np.int16)

# Downcast

In [6]:
#Downcast in order to save memory
def downcast(df):
    """Shrink column dtypes to cut memory usage.

    The frame is modified in place and also returned.

    * Integer columns become the smallest of int8/int16/int32 whose range
      strictly contains the column's min and max, otherwise int64.
    * Float columns become float16 or float32 by the same rule, otherwise
      float64 (values may lose precision when narrowed).
    * Object/string columns become ``category``, except a column named
      ``'date'``, which is parsed as ``%Y-%m-%d`` datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to downcast.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for col, t in zip(cols, types):
        if 'int' in str(t):
            lo, hi = df[col].min(), df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                bounds = np.iinfo(candidate)
                if lo > bounds.min and hi < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or the column is empty and min/max are NaN).
                df[col] = df[col].astype(np.int64)
        elif 'float' in str(t):
            lo, hi = df[col].min(), df[col].max()
            for candidate in (np.float16, np.float32):
                bounds = np.finfo(candidate)
                if lo > bounds.min and hi < bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)
        # `np.object` was removed from NumPy (1.24); compare against the builtin
        # `object` instead and also accept pandas' dedicated string dtypes.
        elif t == object or pd.api.types.is_string_dtype(t):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

# Downcast all three raw tables, in the order sales -> prices -> calendar.
sales, prices, calendar = (downcast(frame) for frame in (sales, prices, calendar))

# Melt data

In [7]:
# Reshape wide -> long: one row per (series, day column), with the day label in 'd'
# and the unit count in 'sold'; rows containing NaN are dropped.
df = sales.melt(
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sold',
).dropna()

## Combine data
Combine price data from prices dataframe and days data from calendar dataset.

In [8]:
# Left-join calendar attributes on the day label, then weekly prices on
# (store, item, week id).
df = (
    df
    .merge(calendar, on='d', how='left')
    .merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
)

# Feature Engineering

## Label Encoding

In [9]:
# Keep a code -> label lookup for every identifier column before the labels are
# replaced by their integer codes. Category codes are positions in
# `.cat.categories`, so enumerating the categories yields the mapping directly
# instead of zipping over every row of the melted frame in Python.
d_id = dict(enumerate(df.id.cat.categories))
d_item_id = dict(enumerate(df.item_id.cat.categories))
d_dept_id = dict(enumerate(df.dept_id.cat.categories))
d_cat_id = dict(enumerate(df.cat_id.cat.categories))
d_store_id = dict(enumerate(df.store_id.cat.categories))
d_state_id = dict(enumerate(df.state_id.cat.categories))

In [10]:
# Turn day labels such as 'd_1913' into the integer 1913.
df['d'] = df['d'].str.split('_').str[1].astype(np.int16)

# Replace every categorical column by its integer codes (missing values become -1).
# The loop variable is deliberately not called `type`, which would shadow the
# builtin for the rest of the notebook.
for col_name, col_dtype in df.dtypes.items():
    if col_dtype.name == 'category':
        df[col_name] = df[col_name].cat.codes

# The calendar date is not used as a feature.
df = df.drop('date', axis=1)

## Introduce lags

In [11]:
#Introduce lags
def make_lags(df, lags):
    """Add one ``sold_lag_<n>`` float16 column per offset in ``lags``.

    Each column holds ``sold`` shifted by ``n`` rows within the group defined
    by the six identifier columns. The shift is positional, so rows are
    expected to already be in day order inside each group.

    The frame is modified in place and returned.
    """
    group_keys = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
    for offset in lags:
        shifted = df.groupby(group_keys, as_index=False)['sold'].shift(offset)
        df['sold_lag_' + str(offset)] = shifted.astype(np.float16)

    return df

In [12]:
# Add one sold_lag_<n> column to df for every offset listed in `lags`.
df = make_lags(df, lags)

## Make new features

In [13]:
def make_new_feature(df, d_last=None):
    """Add aggregate / rolling features computed from ``sold``.

    Rows with ``d > d_last`` have ``sold`` temporarily replaced by NaN while
    the features are computed, so only days up to ``d_last`` contribute to
    the statistics. The original ``sold`` values are restored before
    returning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the identifier columns, ``d`` and ``sold``. It is modified
        in place (new columns are added).
    d_last : int, optional
        Last day whose ``sold`` value may be used. Defaults to the largest
        ``d`` in ``df``. It is resolved at call time, not from whatever
        global frame existed when the function was defined.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if d_last is None:
        d_last = df['d'].max()

    # Keep the real target so it can be restored at the end.
    sold_backup = df['sold'].copy()

    # Hide the target beyond d_last. A whole-column assignment is used instead
    # of chained indexing (df['sold'][mask] = ...), which is not guaranteed to
    # write through to the frame; casting to float first lets NaN be stored.
    df['sold'] = df['sold'].astype(np.float64).mask(df['d'] > d_last)

    def _group_mean(keys):
        # Per-group mean of the masked target, broadcast back to every row.
        return df.groupby(keys)['sold'].transform('mean').astype(np.float16)

    # Plain group averages (column order is kept stable for downstream models).
    df['item_sold_avg'] = _group_mean('item_id')
    df['state_sold_avg'] = _group_mean('state_id')
    df['store_sold_avg'] = _group_mean('store_id')
    df['cat_sold_avg'] = _group_mean('cat_id')
    df['dept_sold_avg'] = _group_mean('dept_id')
    df['cat_dept_sold_avg'] = _group_mean(['cat_id', 'dept_id'])
    df['store_item_sold_avg'] = _group_mean(['store_id', 'item_id'])
    df['cat_item_sold_avg'] = _group_mean(['cat_id', 'item_id'])
    df['dept_item_sold_avg'] = _group_mean(['dept_id', 'item_id'])
    df['state_store_sold_avg'] = _group_mean(['state_id', 'store_id'])
    df['state_store_cat_sold_avg'] = _group_mean(['state_id', 'store_id', 'cat_id'])
    df['store_cat_dept_sold_avg'] = _group_mean(['store_id', 'cat_id', 'dept_id'])

    series_keys = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']

    # NOTE(review): the 7-row window ends at the current row, so for unmasked
    # rows this feature includes that row's own `sold` value -- confirm that
    # is intended.
    df['rolling_sold_mean'] = (
        df.groupby(series_keys)['sold']
        .transform(lambda x: x.rolling(window=7).mean())
        .astype(np.float16)
    )
    df['expanding_sold_mean'] = (
        df.groupby(series_keys)['sold']
        .transform(lambda x: x.expanding(2).mean())
        .astype(np.float16)
    )

    # NOTE(review): if each (series, d) pair is a single row, the "daily" mean
    # equals that row's own `sold`, making selling_trend a direct function of
    # the target for unmasked rows -- confirm that is intended.
    daily_avg_sold = _group_mean(series_keys + ['d'])
    avg_sold = _group_mean(series_keys)
    df['selling_trend'] = (daily_avg_sold - avg_sold).astype(np.float16)

    # Restore the real target (and its original dtype).
    df['sold'] = sold_backup

    return df

In [14]:
# Build the full-history feature set on a copy so `df` itself is left as-is
# for the cross-validation section.
df_all = copy.copy(df)
df_all = make_new_feature(df_all)
# Keep only days after the largest lag offset.
df_all = df_all[df_all['d'] > max(lags)]
df_all.info()

# Create the output folder if needed so the save below cannot fail on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)
# df_all.to_pickle(os.path.join(data_dir, 'data_all.pkl'))   # save df_all (use this after CV)
df_all.to_pickle(os.path.join(data_dir, 'data_all_lagOlder28Modified.pkl'))   # save df_all (use this after CV)
del df_all
gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58327370 entries, 1707440 to 60034809
Data columns (total 41 columns):
id                          int16
item_id                     int16
dept_id                     int8
cat_id                      int8
store_id                    int8
state_id                    int8
d                           int16
sold                        int16
wm_yr_wk                    int16
weekday                     int8
wday                        int8
month                       int8
year                        int16
event_name_1                int8
event_type_1                int8
event_name_2                int8
event_type_2                int8
snap_CA                     int8
snap_TX                     int8
snap_WI                     int8
sell_price                  float16
sold_lag_28                 float16
sold_lag_35                 float16
sold_lag_42                 float16
sold_lag_49                 float16
sold_lag_56                 float

0

# Cross Validation

## Cross Validation for each store (function)

In [15]:
def xgboost_cv_store(df, store):
    """Build an Optuna objective that cross-validates XGBoost for one store.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame; it is only used when a fold's cached pickle is missing.
    store : int
        Encoded ``store_id`` value; only rows of this store are trained on.

    Returns
    -------
    callable
        ``objective(trial)`` returning the mean validation RMSE over all folds.
    """
    def objective(trial):
        """Sample hyper-parameters from ``trial`` and return the mean fold RMSE."""
        # param space
        param = {
            'objective': 'reg:squarederror',
            'n_estimators': 1000,
            'learning_rate': 0.1,
            'subsample': trial.suggest_discrete_uniform('subsample', 0.6, 0.95, 0.05),
            'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.6, 0.95, 0.05),
            'max_depth':  trial.suggest_int('max_depth', 3, 9),
            'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 0.1, 10.0),
            'tree_method': 'gpu_hist'
        }
        
        ### START: CV Loop ###
        isBreakCV = False
        fold_iter = 0
        rmse_mean = 0.   # running sum of fold RMSEs; divided by the fold count at the end
        while not(isBreakCV):
            start = time.time()   # start timer
            
            # Each fold moves the train/validation boundary NUM_FOLD_DAYS further back.
            d_valid_last = VALID_LAST_DAY - NUM_FOLD_DAYS * fold_iter
            d_train_last = VALID_LAST_DAY - NUM_FOLD_DAYS * (fold_iter + 1)
            # The flag is set before training, so the fold that trips this
            # condition still runs and is the last one.
            if d_train_last < TRAIN_FIRST_DAY + NUM_FOLD_DAYS * 2:
                isBreakCV = True
            
            # make_new_feature
            # NOTE: the cache file name depends only on NUM_FOLD_DAYS and the fold number.
#             df_fold_file = os.path.join(data_dir, 'data_foldday'+str(NUM_FOLD_DAYS)+'_fold'+str(fold_iter + 1)+'.pkl')
            df_fold_file = os.path.join(data_dir, 'data_foldday'+str(NUM_FOLD_DAYS)+'_fold'+str(fold_iter + 1)+'_lagOlder28Modified.pkl')
            
            if os.path.isfile(df_fold_file):   # reuse the cached make_new_feature output if the file exists
                df_fold = pd.read_pickle(df_fold_file)
            else:
                df_fold = df[df['d'] <= d_valid_last]
                df_fold = make_new_feature(df_fold, d_train_last)   # build features from training-period information only
                df_fold.to_pickle(df_fold_file)   # save as a pickle for later reuse

            # Extract store
            df_fold = df_fold[df_fold['store_id'] == store]
                        
            # Split data
            X_train = df_fold[(df_fold['d'] >= TRAIN_FIRST_DAY) & (df_fold['d'] <= d_train_last)].drop('sold', axis=1)
            y_train = df_fold[(df_fold['d'] >= TRAIN_FIRST_DAY) & (df_fold['d'] <= d_train_last)]['sold']
            X_valid = df_fold[(df_fold['d'] > d_train_last) & (df_fold['d'] <= d_valid_last)].drop('sold', axis=1)
            y_valid = df_fold[(df_fold['d'] > d_train_last) & (df_fold['d'] <= d_valid_last)]['sold']
            
            # Train XGBoost model
            # NOTE: the validation split is used both for early stopping and for the reported RMSE.
            model = xgb.XGBRegressor(**param)
#             model.fit(X_train, y_train, eval_set=[(X_train,y_train), (X_valid, y_valid)],
#                  eval_metric='rmse', verbose=10, early_stopping_rounds=100)
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                  eval_metric='rmse', verbose=10, early_stopping_rounds=50)
            y_pred = model.predict(X_valid)
            rmse_mean += np.sqrt(mean_squared_error(y_valid.values, y_pred))
            
            fold_iter += 1
            elapsed_time = time.time() - start   # stop timer
            print("[{0}] Fold{1}: {2} [sec]".format(d_store_id[store], fold_iter, elapsed_time))
        ### END: CV Loop ###
        
        # Release the last fold's objects before the next trial starts.
        del df_fold, model, X_train, y_train, X_valid, y_valid, y_pred
        gc.collect()
        
        rmse_mean /= fold_iter
        return rmse_mean
    return objective


## Execute CV

In [16]:
stores = sales.store_id.cat.codes.unique().tolist()

# Create the output folders up front so a long optimisation run cannot fail
# at the very end because a directory is missing.
os.makedirs('report', exist_ok=True)
os.makedirs(os.path.join('result', 'best_param_cv'), exist_ok=True)

# find best params by doing CV (optuna)
# NOTE(review): `stores[5:]` skips the first five stores -- presumably a manual
# resume after an interrupted run; use `stores` to process every store.
for store in tqdm(stores[5:]):
    print('##################### CV START: {0} #####################'.format(d_store_id[store]))
    study = optuna.create_study()
    study.optimize(xgboost_cv_store(df, store), n_trials=OPTUNA_TRIAL)
    
    # save optuna log
    df_trial = study.trials_dataframe()
#     df_trial.to_pickle(os.path.join('report', 'OptunaTrials_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'))
    df_trial.to_pickle(os.path.join('report', 'OptunaTrials_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'))
    
    del df_trial
    gc.collect()
    
    # save best params found by CV
#     bestparam_filename = 'BestParamsSelectedByCV_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
    bestparam_filename = 'BestParamsSelectedByCV_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
    with open(os.path.join('result', 'best_param_cv', bestparam_filename), 'wb') as f:
        pickle.dump(study.best_params, f)


  0%|                                                                                            | 0/5 [00:00<?, ?it/s]

##################### CV START: TX_2 #####################
[0]	validation_0-rmse:3.60495
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.76390
[20]	validation_0-rmse:2.61986
[30]	validation_0-rmse:2.56457
[40]	validation_0-rmse:2.58455
[50]	validation_0-rmse:2.58544
[60]	validation_0-rmse:2.56771
[70]	validation_0-rmse:2.55816
[80]	validation_0-rmse:2.56009
Stopping. Best iteration:
[39]	validation_0-rmse:2.54629

[TX_2] Fold1: 14.465291738510132 [sec]
[0]	validation_0-rmse:3.95979
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.17383
[20]	validation_0-rmse:3.03018
[30]	validation_0-rmse:2.95976
[40]	validation_0-rmse:2.94666
[50]	validation_0-rmse:3.04966
[60]	validation_0-rmse:3.03805
[70]	validation_0-rmse:3.03242
[80]	validation_0-rmse:3.03992
Stopping. Best iteration:
[39]	validation_0-rmse:2.93812

[TX_2] Fold2: 13.590155124664307 [sec]
[0]	validation_0-rmse:3.59203
Will train until validation_0-r

[I 2020-06-30 21:09:47,257] Finished trial#0 with value: 2.7456154823303223 with parameters: {'subsample': 0.85, 'colsample_bytree': 0.85, 'max_depth': 9, 'gamma': 1.225405273875263e-07, 'min_child_weight': 0.1938952836239413}. Best is trial#0 with value: 2.7456154823303223.


[0]	validation_0-rmse:3.46890
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.79489
[20]	validation_0-rmse:2.73769
[30]	validation_0-rmse:2.70635
[40]	validation_0-rmse:2.66744
[50]	validation_0-rmse:2.64374
[60]	validation_0-rmse:2.61368
[70]	validation_0-rmse:2.60024
[80]	validation_0-rmse:2.59886
[90]	validation_0-rmse:2.60682
[100]	validation_0-rmse:2.60342
[110]	validation_0-rmse:2.61031
[120]	validation_0-rmse:2.58673
[130]	validation_0-rmse:2.58351
[140]	validation_0-rmse:2.58453
[150]	validation_0-rmse:2.57717
[160]	validation_0-rmse:2.57194
[170]	validation_0-rmse:2.56920
[180]	validation_0-rmse:2.56975
[190]	validation_0-rmse:2.57091
[200]	validation_0-rmse:2.56667
[210]	validation_0-rmse:2.56603
[220]	validation_0-rmse:2.56651
[230]	validation_0-rmse:2.56963
[240]	validation_0-rmse:2.56794
[250]	validation_0-rmse:2.56447
[260]	validation_0-rmse:2.56236
[270]	validation_0-rmse:2.56452
[280]	validation_0-rmse:2.56199
[290]	validation_0

[I 2020-06-30 21:10:33,813] Finished trial#1 with value: 2.7332220554351805 with parameters: {'subsample': 0.9, 'colsample_bytree': 0.65, 'max_depth': 4, 'gamma': 0.00040720358754261723, 'min_child_weight': 1.0070213226690587}. Best is trial#1 with value: 2.7332220554351805.


[0]	validation_0-rmse:3.47118
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.78490
[20]	validation_0-rmse:2.65720
[30]	validation_0-rmse:2.60263
[40]	validation_0-rmse:2.57012
[50]	validation_0-rmse:2.53651
[60]	validation_0-rmse:2.55128
[70]	validation_0-rmse:2.52430
[80]	validation_0-rmse:2.52908
[90]	validation_0-rmse:2.53666
[100]	validation_0-rmse:2.53038
[110]	validation_0-rmse:2.53330
[120]	validation_0-rmse:2.53513
Stopping. Best iteration:
[71]	validation_0-rmse:2.52395

[TX_2] Fold1: 9.270452737808228 [sec]
[0]	validation_0-rmse:3.84264
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.21326
[20]	validation_0-rmse:3.10122
[30]	validation_0-rmse:3.02887
[40]	validation_0-rmse:2.95341
[50]	validation_0-rmse:2.93111
[60]	validation_0-rmse:2.93014
[70]	validation_0-rmse:2.91094
[80]	validation_0-rmse:2.90461
[90]	validation_0-rmse:2.89833
[100]	validation_0-rmse:2.89566
[110]	validation_0-rmse:2.88

[I 2020-06-30 21:11:17,990] Finished trial#2 with value: 2.6946929454803468 with parameters: {'subsample': 0.7, 'colsample_bytree': 0.65, 'max_depth': 4, 'gamma': 1.6818407250916347e-06, 'min_child_weight': 0.4118247608158735}. Best is trial#2 with value: 2.6946929454803468.


[0]	validation_0-rmse:3.59709
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.69028
[20]	validation_0-rmse:2.58746
[30]	validation_0-rmse:2.55494
[40]	validation_0-rmse:2.51674
[50]	validation_0-rmse:2.49953
[60]	validation_0-rmse:2.49636
[70]	validation_0-rmse:2.49632
[80]	validation_0-rmse:2.48298
[90]	validation_0-rmse:2.46450
[100]	validation_0-rmse:2.44907
[110]	validation_0-rmse:2.45278
[120]	validation_0-rmse:2.43933
[130]	validation_0-rmse:2.44029
[140]	validation_0-rmse:2.43858
[150]	validation_0-rmse:2.44316
[160]	validation_0-rmse:2.44210
[170]	validation_0-rmse:2.44331
[180]	validation_0-rmse:2.43531
[190]	validation_0-rmse:2.44902
[200]	validation_0-rmse:2.44689
[210]	validation_0-rmse:2.44402
[220]	validation_0-rmse:2.44145
Stopping. Best iteration:
[178]	validation_0-rmse:2.43506

[TX_2] Fold1: 13.850354194641113 [sec]
[0]	validation_0-rmse:3.94932
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0

[I 2020-06-30 21:12:13,305] Finished trial#3 with value: 2.642116832733154 with parameters: {'subsample': 0.85, 'colsample_bytree': 0.75, 'max_depth': 6, 'gamma': 0.04981085863799335, 'min_child_weight': 0.4511100872353685}. Best is trial#3 with value: 2.642116832733154.


[0]	validation_0-rmse:3.59473
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.11553
[20]	validation_0-rmse:3.15188
[30]	validation_0-rmse:3.17598
[40]	validation_0-rmse:3.18982
[50]	validation_0-rmse:3.16174
[60]	validation_0-rmse:3.13536
Stopping. Best iteration:
[15]	validation_0-rmse:3.10310

[TX_2] Fold1: 7.998403310775757 [sec]
[0]	validation_0-rmse:3.94770
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.49144
[20]	validation_0-rmse:3.50918
[30]	validation_0-rmse:3.46969
[40]	validation_0-rmse:3.38612
[50]	validation_0-rmse:3.33611
[60]	validation_0-rmse:3.32779
[70]	validation_0-rmse:3.29830
[80]	validation_0-rmse:3.26899
[90]	validation_0-rmse:3.26669
[100]	validation_0-rmse:3.24029
[110]	validation_0-rmse:3.22614
[120]	validation_0-rmse:3.22176
[130]	validation_0-rmse:3.21473
[140]	validation_0-rmse:3.20996
[150]	validation_0-rmse:3.20243
[160]	validation_0-rmse:3.19478
[170]	validation_0-rmse:3

[I 2020-06-30 21:13:05,348] Finished trial#4 with value: 3.0709649085998536 with parameters: {'subsample': 0.95, 'colsample_bytree': 0.8, 'max_depth': 3, 'gamma': 2.1862635418108604e-07, 'min_child_weight': 8.27039125466486}. Best is trial#3 with value: 2.642116832733154.


[0]	validation_0-rmse:3.44778
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.45122
[20]	validation_0-rmse:2.38560
[30]	validation_0-rmse:2.34566
[40]	validation_0-rmse:2.31829
[50]	validation_0-rmse:2.30086
[60]	validation_0-rmse:2.31116
[70]	validation_0-rmse:2.31082
[80]	validation_0-rmse:2.31309
[90]	validation_0-rmse:2.31471
[100]	validation_0-rmse:2.31509
Stopping. Best iteration:
[50]	validation_0-rmse:2.30086

[TX_2] Fold1: 12.443676471710205 [sec]
[0]	validation_0-rmse:3.87050
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.96145
[20]	validation_0-rmse:2.87218
[30]	validation_0-rmse:2.83259
[40]	validation_0-rmse:2.79368
[50]	validation_0-rmse:2.78835
[60]	validation_0-rmse:2.78776
[70]	validation_0-rmse:2.78888
[80]	validation_0-rmse:2.78604
[90]	validation_0-rmse:2.78385
[100]	validation_0-rmse:2.78618
[110]	validation_0-rmse:2.78611
[120]	validation_0-rmse:2.78594
[130]	validation_0-rmse:2.7

[I 2020-06-30 21:14:01,526] Finished trial#5 with value: 2.5719700336456297 with parameters: {'subsample': 0.8, 'colsample_bytree': 0.6, 'max_depth': 8, 'gamma': 1.4694272121635845e-08, 'min_child_weight': 0.25199692639983184}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.57667
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.94787
[20]	validation_0-rmse:2.89330
[30]	validation_0-rmse:2.90727
[40]	validation_0-rmse:2.88339
[50]	validation_0-rmse:2.89041
[60]	validation_0-rmse:2.84889
[70]	validation_0-rmse:2.82722
[80]	validation_0-rmse:2.86401
[90]	validation_0-rmse:2.85858
[100]	validation_0-rmse:2.85726
[110]	validation_0-rmse:2.87154
Stopping. Best iteration:
[67]	validation_0-rmse:2.82426

[TX_2] Fold1: 9.180057764053345 [sec]
[0]	validation_0-rmse:3.93458
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.24715
[20]	validation_0-rmse:3.16812
[30]	validation_0-rmse:3.14856
[40]	validation_0-rmse:3.12200
[50]	validation_0-rmse:3.10457
[60]	validation_0-rmse:3.08080
[70]	validation_0-rmse:3.06264
[80]	validation_0-rmse:3.04402
[90]	validation_0-rmse:3.03120
[100]	validation_0-rmse:3.02470
[110]	validation_0-rmse:3.02391
[120]	validation_0-rmse:3.02

[I 2020-06-30 21:14:46,697] Finished trial#6 with value: 2.914606523513794 with parameters: {'subsample': 0.8, 'colsample_bytree': 0.85, 'max_depth': 4, 'gamma': 3.103710533894073e-05, 'min_child_weight': 0.10110171207210424}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.58095
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.10858
[20]	validation_0-rmse:3.05883
[30]	validation_0-rmse:3.01907
[40]	validation_0-rmse:3.02164
[50]	validation_0-rmse:3.03353
[60]	validation_0-rmse:3.00490
[70]	validation_0-rmse:2.99377
[80]	validation_0-rmse:2.98678
[90]	validation_0-rmse:2.94475
[100]	validation_0-rmse:2.91589
[110]	validation_0-rmse:2.91110
[120]	validation_0-rmse:2.89796
[130]	validation_0-rmse:2.89656
[140]	validation_0-rmse:2.88170
[150]	validation_0-rmse:2.86577
[160]	validation_0-rmse:2.86217
[170]	validation_0-rmse:2.85841
[180]	validation_0-rmse:2.85718
[190]	validation_0-rmse:2.85904
[200]	validation_0-rmse:2.85814
[210]	validation_0-rmse:2.84953
[220]	validation_0-rmse:2.84877
[230]	validation_0-rmse:2.84289
[240]	validation_0-rmse:2.84033
[250]	validation_0-rmse:2.84545
[260]	validation_0-rmse:2.83737
[270]	validation_0-rmse:2.82973
[280]	validation_0-rmse:2.87715
[290]	validation_0

[I 2020-06-30 21:15:43,292] Finished trial#7 with value: 3.089828062057495 with parameters: {'subsample': 0.9, 'colsample_bytree': 0.9, 'max_depth': 5, 'gamma': 1.0379056754710156e-08, 'min_child_weight': 1.393402087291755}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.57945
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.85036
[20]	validation_0-rmse:2.79641
[30]	validation_0-rmse:2.76892
[40]	validation_0-rmse:2.75494
[50]	validation_0-rmse:2.75171
[60]	validation_0-rmse:2.71863
[70]	validation_0-rmse:2.67901
[80]	validation_0-rmse:2.67067
[90]	validation_0-rmse:2.67279
[100]	validation_0-rmse:2.65785
[110]	validation_0-rmse:2.65711
[120]	validation_0-rmse:2.65563
[130]	validation_0-rmse:2.65151
[140]	validation_0-rmse:2.65429
[150]	validation_0-rmse:2.65309
[160]	validation_0-rmse:2.65610
[170]	validation_0-rmse:2.65572
Stopping. Best iteration:
[124]	validation_0-rmse:2.65017

[TX_2] Fold1: 10.329758644104004 [sec]
[0]	validation_0-rmse:3.93148
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.24353
[20]	validation_0-rmse:3.13754
[30]	validation_0-rmse:3.12073
[40]	validation_0-rmse:3.06575
[50]	validation_0-rmse:3.03854
[60]	validation_0-rmse

[I 2020-06-30 21:16:33,250] Finished trial#8 with value: 2.795354652404785 with parameters: {'subsample': 0.9, 'colsample_bytree': 0.75, 'max_depth': 4, 'gamma': 1.5428642135880798e-08, 'min_child_weight': 3.6609918024041974}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.57482
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.14082
[20]	validation_0-rmse:3.02626
[30]	validation_0-rmse:2.92494
[40]	validation_0-rmse:2.86769
[50]	validation_0-rmse:2.88944
[60]	validation_0-rmse:2.89548
[70]	validation_0-rmse:2.88671
[80]	validation_0-rmse:2.89105
[90]	validation_0-rmse:2.88917
Stopping. Best iteration:
[41]	validation_0-rmse:2.86663

[TX_2] Fold1: 8.629385948181152 [sec]
[0]	validation_0-rmse:3.93611
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:3.59976
[20]	validation_0-rmse:3.45226
[30]	validation_0-rmse:3.37426
[40]	validation_0-rmse:3.37431
[50]	validation_0-rmse:3.36140
[60]	validation_0-rmse:3.36339
[70]	validation_0-rmse:3.35205
[80]	validation_0-rmse:3.33124
[90]	validation_0-rmse:3.31387
[100]	validation_0-rmse:3.29620
[110]	validation_0-rmse:3.29122
[120]	validation_0-rmse:3.28686
[130]	validation_0-rmse:3.28273
[140]	validation_0-rmse:3.27

[I 2020-06-30 21:17:29,761] Finished trial#9 with value: 3.0488515853881837 with parameters: {'subsample': 0.85, 'colsample_bytree': 0.9, 'max_depth': 4, 'gamma': 0.10162320886498524, 'min_child_weight': 1.5826309013654887}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.44807
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.48917
[20]	validation_0-rmse:2.40206
[30]	validation_0-rmse:2.36722
[40]	validation_0-rmse:2.35272
[50]	validation_0-rmse:2.35711
[60]	validation_0-rmse:2.35934
[70]	validation_0-rmse:2.36197
[80]	validation_0-rmse:2.36648
Stopping. Best iteration:
[39]	validation_0-rmse:2.35040

[TX_2] Fold1: 14.001842260360718 [sec]
[0]	validation_0-rmse:3.85207
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.97244
[20]	validation_0-rmse:2.87479
[30]	validation_0-rmse:2.81650
[40]	validation_0-rmse:2.77669
[50]	validation_0-rmse:2.75952
[60]	validation_0-rmse:2.75891
[70]	validation_0-rmse:2.75403
[80]	validation_0-rmse:2.75957
[90]	validation_0-rmse:2.75640
[100]	validation_0-rmse:2.75779
Stopping. Best iteration:
[56]	validation_0-rmse:2.75331

[TX_2] Fold2: 14.694905996322632 [sec]
[0]	validation_0-rmse:3.50622
Will train until validation

[I 2020-06-30 21:18:32,472] Finished trial#10 with value: 2.583181095123291 with parameters: {'subsample': 0.6, 'colsample_bytree': 0.6, 'max_depth': 9, 'gamma': 0.0010941126406432754, 'min_child_weight': 0.10038267164525866}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.44807
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.48917
[20]	validation_0-rmse:2.40424
[30]	validation_0-rmse:2.36084
[40]	validation_0-rmse:2.34319
[50]	validation_0-rmse:2.32797
[60]	validation_0-rmse:2.33984
[70]	validation_0-rmse:2.34278
[80]	validation_0-rmse:2.34554
[90]	validation_0-rmse:2.34807
Stopping. Best iteration:
[48]	validation_0-rmse:2.32735

[TX_2] Fold1: 14.692296981811523 [sec]
[0]	validation_0-rmse:3.85207
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.99464
[20]	validation_0-rmse:2.89695
[30]	validation_0-rmse:2.84653
[40]	validation_0-rmse:2.80525
[50]	validation_0-rmse:2.79379
[60]	validation_0-rmse:2.79025
[70]	validation_0-rmse:2.78781
[80]	validation_0-rmse:2.78400
[90]	validation_0-rmse:2.78694
[100]	validation_0-rmse:2.78436
[110]	validation_0-rmse:2.78040
[120]	validation_0-rmse:2.77413
[130]	validation_0-rmse:2.77271
[140]	validation_0-rmse:2.7

[I 2020-06-30 21:19:44,703] Finished trial#11 with value: 2.580069923400879 with parameters: {'subsample': 0.6, 'colsample_bytree': 0.6, 'max_depth': 9, 'gamma': 0.0014947129383570562, 'min_child_weight': 0.1052828965200365}. Best is trial#5 with value: 2.5719700336456297.


[0]	validation_0-rmse:3.45404
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.46615
[20]	validation_0-rmse:2.37920
[30]	validation_0-rmse:2.33990
[40]	validation_0-rmse:2.31797
[50]	validation_0-rmse:2.33604
[60]	validation_0-rmse:2.35624
[70]	validation_0-rmse:2.35524
[80]	validation_0-rmse:2.36049
Stopping. Best iteration:
[39]	validation_0-rmse:2.31622

[TX_2] Fold1: 10.392425298690796 [sec]
[0]	validation_0-rmse:3.84871
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.87200
[20]	validation_0-rmse:2.78107
[30]	validation_0-rmse:2.74846
[40]	validation_0-rmse:2.75118
[50]	validation_0-rmse:2.75633
[60]	validation_0-rmse:2.76126
[70]	validation_0-rmse:2.76658
[80]	validation_0-rmse:2.76392
Stopping. Best iteration:
[34]	validation_0-rmse:2.73032

[TX_2] Fold2: 9.802509784698486 [sec]
[0]	validation_0-rmse:3.49450
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.7

[I 2020-06-30 21:20:29,715] Finished trial#12 with value: 2.571170949935913 with parameters: {'subsample': 0.65, 'colsample_bytree': 0.6, 'max_depth': 7, 'gamma': 0.003351610111917839, 'min_child_weight': 0.2199065549666421}. Best is trial#12 with value: 2.571170949935913.


[0]	validation_0-rmse:3.44889
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.55572
[20]	validation_0-rmse:2.47085
[30]	validation_0-rmse:2.44663
[40]	validation_0-rmse:2.42116
[50]	validation_0-rmse:2.41828
[60]	validation_0-rmse:2.43004
[70]	validation_0-rmse:2.44518
[80]	validation_0-rmse:2.44693
[90]	validation_0-rmse:2.45175
Stopping. Best iteration:
[45]	validation_0-rmse:2.41616

[TX_2] Fold1: 10.542816400527954 [sec]
[0]	validation_0-rmse:3.85318
Will train until validation_0-rmse hasn't improved in 50 rounds.
[10]	validation_0-rmse:2.91790
[20]	validation_0-rmse:2.81938
[30]	validation_0-rmse:2.78525
[40]	validation_0-rmse:2.77006
[50]	validation_0-rmse:2.76616
[60]	validation_0-rmse:2.77136
[70]	validation_0-rmse:2.77980
[80]	validation_0-rmse:2.76999
Stopping. Best iteration:
[39]	validation_0-rmse:2.75490

[TX_2] Fold2: 9.884717226028442 [sec]


KeyboardInterrupt: 

In [None]:
# The CV section is finished with `df`; drop the name and trigger a garbage collection pass.
del df
gc.collect()

## Train & Predict by best model (function)

In [None]:
# def xgboost_by_bestparam_for_valid(df, store, best_params, valid_preds):
#     print('##################### Traing4ValidByBestParam START: {0} #####################'.format(d_store_id[store]))
#     print('Best Parameters of store {0} model: {1}'.format(d_store_id[store], best_params))
    
#     # prepare data set
#     df_store = df[df['store_id'] == store]
#     X_train = df_store[df_store['d'] < VALID_FIRST_DAY].drop('sold', axis=1)
#     y_train = df_store[df_store['d'] < VALID_FIRST_DAY]['sold']
#     X_valid = df_store[(df_store['d'] >= VALID_FIRST_DAY) & (df_store['d'] <= VALID_LAST_DAY)].drop('sold', axis=1)
    
#     # train XGBoost by using best_params
#     model = xgb.XGBRegressor(**best_params)
#     model.fit(X_train, y_train, eval_metric='rmse')
    
#     # save model for validation
# #     modelfile = 'BestModel4valid_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
#     modelfile = 'BestModel4valid_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
#     with open(os.path.join('result', 'best_model', modelfile), 'wb') as f:
#         pickle.dump(model, f)
    
#     # predict for valid
#     valid_preds[X_valid.index] = model.predict(X_valid)
#     del df_store, X_train, y_train, X_valid, model
#     gc.collect()
    

In [None]:
# NOTE: this entire cell is commented out, so xgboost_by_bestparam_for_test is
# NOT defined when the notebook runs. As written, the disabled function would:
#   1. keep only the rows of `df` whose 'store_id' equals `store`;
#   2. fit an xgb.XGBRegressor(**best_params) on rows with d <= VALID_LAST_DAY
#      (i.e. the validation window is included in training), using every
#      column except 'sold' as features and 'sold' as the target;
#   3. write predictions for rows with d > VALID_LAST_DAY into `eval_preds` in
#      place (assignment keyed by X_test.index); nothing is returned;
#   4. pickle the fitted model under result/best_model/.
# NOTE(review): `d_store_id` is not defined in this cell — presumably a
# store-code -> store-name mapping built earlier in the notebook; verify.
# NOTE(review): fit() is called without an eval_set, so there is no early
# stopping here — confirm this is intended before re-enabling.
# def xgboost_by_bestparam_for_test(df, store, best_params, eval_preds):
#     print('##################### Traing4TestByBestParam START: {0} #####################'.format(d_store_id[store]))
    
#     # prepare data set
#     df_store = df[df['store_id'] == store]
#     X_train = df_store[df_store['d'] <= VALID_LAST_DAY].drop('sold', axis=1)
#     y_train = df_store[df_store['d'] <= VALID_LAST_DAY]['sold']
#     X_test = df_store[df_store['d'] > VALID_LAST_DAY].drop('sold', axis=1)
    
#     # train XGBoost by using best_params
#     model = xgb.XGBRegressor(**best_params)
#     model.fit(X_train, y_train, eval_metric='rmse')
        
#     # predict for test
#     eval_preds[X_test.index] = model.predict(X_test)
    
#     # save model for test
# #     modelfile = 'BestModel4test_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
#     modelfile = 'BestModel4test_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
#     with open(os.path.join('result', 'best_model', modelfile), 'wb') as f:
#         pickle.dump(model, f)
    
#     del df_store, X_train, y_train, X_test, model
#     gc.collect()
    

## Train & Predict by best model

In [None]:
# NOTE: this entire cell is commented out and does not run. As written, the
# disabled driver would load the full feature frame from data_dir, slice out
# the validation window (VALID_FIRST_DAY..VALID_LAST_DAY) and the test window
# (d > VALID_LAST_DAY), and then, for each store, load the CV-selected
# hyper-parameters from result/best_param_cv/, add fixed training settings,
# and call the two xgboost_by_bestparam_* helpers to fill `valid_preds` and
# `eval_preds`.
# NOTE(review): `stores` and `d_store_id` are not defined in this cell —
# presumably created earlier in the notebook; verify before re-enabling.
# NOTE(review): `valid_preds` / `eval_preds` are the 'sold' columns taken from
# filtered slices of df_all, and the helpers assign into them in place; whether
# those writes also reach `valid` / `test` depends on pandas view-vs-copy
# behaviour — confirm.
# # train & predict (best model)
# # df_all = pd.read_pickle(os.path.join(data_dir, 'data_all.pkl'))
# df_all = pd.read_pickle(os.path.join(data_dir, 'data_all_lagOlder28Modified.pkl'))

# valid = df_all[(df_all['d'] >= VALID_FIRST_DAY) & (df_all['d'] <= VALID_LAST_DAY)][['id','d','sold']]
# test = df_all[df_all['d'] > VALID_LAST_DAY][['id','d','sold']]
# valid_preds = valid['sold']
# eval_preds = test['sold']

# for store in tqdm(stores):
#     # read best param selected by CV
# #     bestparam_filename = 'BestParamsSelectedByCV_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
#     bestparam_filename = 'BestParamsSelectedByCV_'+d_store_id[store]+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
#     with open(os.path.join('result', 'best_param_cv', bestparam_filename), 'rb') as f:
#         best_params = pickle.load(f)

#     # add other params
#     best_params['objective'] = 'reg:squarederror'
#     best_params['n_estimators'] = 1000
#     best_params['learning_rate'] = 0.1
#     best_params['tree_method'] = 'gpu_hist'
    
#     # best model for valid
#     xgboost_by_bestparam_for_valid(df_all, store, best_params, valid_preds) # is this OK? (author's open question)

    
#     # best model for test
#     xgboost_by_bestparam_for_test(df_all, store, best_params, eval_preds)


In [None]:
# NOTE: this entire cell is commented out and does not run. As written, it
# would pickle `valid_preds` and `eval_preds` into result/preds/, with
# NUM_FOLD_DAYS and NUM_USE_DAY embedded in the file names.
# NOTE(review): both variables are only assigned in another commented-out
# cell, so this cell cannot be re-enabled on its own.
# # save valid_preds
# # valid_preds_filename = 'ValidPreds_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
# valid_preds_filename = 'ValidPreds_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
# with open(os.path.join('result', 'preds', valid_preds_filename), 'wb') as f:
#     pickle.dump(valid_preds, f)
    
# # save eval_preds (predictions for the test period, d > VALID_LAST_DAY)
# # eval_preds_filename = 'EvalPreds_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.pkl'
# eval_preds_filename = 'EvalPreds_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.pkl'
# with open(os.path.join('result', 'preds', eval_preds_filename), 'wb') as f:
#     pickle.dump(eval_preds, f)

In [None]:
# NOTE: commented out and does not run. As written, it would drop the
# `df_all` name and trigger a garbage-collection pass.
# del df_all
# gc.collect()

# Make submission

In [None]:
# NOTE: this entire cell is commented out and does not run. As written, it
# would build the submission frame in two halves and write it as CSV:
#   * validation half — if `actual` is False, the real sales columns
#     d_1914..d_1941 from `sales` are used (ids taken from
#     sales_train_validation.csv); otherwise the model's `valid_preds` are
#     pivoted to one row per id;
#   * evaluation half — `eval_preds` pivoted to one row per id.
# In both halves the day columns are relabelled F1..F28. With
# isNegative2Zero=True, negative values are replaced by 0 before the file is
# written to submission_point/.
# NOTE(review): the F1..F28 relabelling assumes each pivot yields exactly 28
# day columns — confirm.
# NOTE(review): `d_id` is not defined in this cell — presumably a mapping
# from encoded id back to the original id string; verify.
# NOTE(review): `valid['sold'] = ...`, `test['sold'] = ...` and
# `validation['id'] = ...` assign into frames obtained by slicing; this looks
# like a pandas SettingWithCopy pattern — confirm the writes land as intended.
# #Set actual equal to false if you want to top in the public leaderboard :P
# actual = True
# if actual == False:
#     #Get the validation results(We already have them as less than one month left for competition to end)
#     validation = sales[['id']+['d_' + str(i) for i in range(1914,1942)]]
#     validation['id']=pd.read_csv(os.path.join(rawdata_dir, 'sales_train_validation.csv')).id
#     validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
# else:
#     #Get the actual validation results
#     valid['sold'] = valid_preds
#     validation = valid[['id','d','sold']]
#     validation = pd.pivot(validation, index='id', columns='d', values='sold').reset_index()
#     validation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
#     validation.id = validation.id.map(d_id).str.replace('evaluation','validation')

# #Get the evaluation results
# test['sold'] = eval_preds
# evaluation = test[['id','d','sold']]
# evaluation = pd.pivot(evaluation, index='id', columns='d', values='sold').reset_index()
# evaluation.columns=['id'] + ['F' + str(i + 1) for i in range(28)]
# #Remap the category id to their respective categories
# evaluation.id = evaluation.id.map(d_id)

# #Prepare the submission
# submit = pd.concat([validation,evaluation]).reset_index(drop=True)

# isNegative2Zero = True
# if isNegative2Zero:
#     submit = submit.set_index("id", drop=True)
#     submit[submit < 0] = 0
#     submit = submit.reset_index()

# # submit_file = 'xgboost_cv_UseValidPredsIs'+str(actual)+'_Negative2ZeroIs'+str(isNegative2Zero)+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'.csv'
# submit_file = 'xgboost_cv_UseValidPredsIs'+str(actual)+'_Negative2ZeroIs'+str(isNegative2Zero)+'_foldday'+str(NUM_FOLD_DAYS)+'_useday'+str(NUM_USE_DAY)+'_lagOlder28Modified.csv'
# submit.to_csv(os.path.join('submission_point', submit_file), index=False)