In [59]:
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import xgboost
from xgboost import XGBRegressor
from xgboost import plot_importance

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, MinMaxScaler

%matplotlib inline
sns.set(style="darkgrid")
pd.set_option('display.float_format', lambda x: '%.2f' % x)
warnings.filterwarnings("ignore")

from pandas.testing import assert_frame_equal
from lightgbm import LGBMRegressor


In [58]:
def smape(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [6]:
train = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail3\train.csv')
test = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail3\test.csv')
submission = pd.read_csv(r'C:\Users\pc\Desktop\Data Science Folder\Retail3\sample_submission.csv')

In [7]:
print('\n******* title ********\n', 'shape: ', train.shape, '\n', train.columns, '\n', train.describe(), '\n', train.isnull().sum() )

print('\n******* title ********\n', 'shape: ', test.shape, '\n', test.columns, '\n', test.describe(), '\n', test.isnull().sum() )


print('\n******* title ********\n', 'shape: ', submission.shape, '\n', submission.columns, '\n', submission.describe(), '\n', submission.isnull().sum() )



******* title ********
 shape:  (913000, 4) 
 Index(['date', 'store', 'item', 'sales'], dtype='object') 
           store      item     sales
count 913000.00 913000.00 913000.00
mean       5.50     25.50     52.25
std        2.87     14.43     28.80
min        1.00      1.00      0.00
25%        3.00     13.00     30.00
50%        5.50     25.50     47.00
75%        8.00     38.00     70.00
max       10.00     50.00    231.00 
 date     0
store    0
item     0
sales    0
dtype: int64

******* title ********
 shape:  (45000, 4) 
 Index(['id', 'date', 'store', 'item'], dtype='object') 
             id    store     item
count 45000.00 45000.00 45000.00
mean  22499.50     5.50    25.50
std   12990.53     2.87    14.43
min       0.00     1.00     1.00
25%   11249.75     3.00    13.00
50%   22499.50     5.50    25.50
75%   33749.25     8.00    38.00
max   44999.00    10.00    50.00 
 id       0
date     0
store    0
item     0
dtype: int64

******* title ********
 shape:  (45000, 2) 
 Index

In [8]:
test_shop_ids = test['store'].unique()
test_item_ids = test['item'].unique()

lk_train = train[train['store'].isin(test_shop_ids)]

lk_train = train[train['item'].isin(test_item_ids)]







In [9]:
lk_train.equals(train)

True

In [10]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [11]:
train['date'] = pd.to_datetime(train.date,format="%Y-%m-%d")

train['month'] = train['date'].dt.month
train['day'] = train['date'].dt.day

train.head()

Unnamed: 0,date,store,item,sales,month,day
0,2013-01-01,1,1,13,1,1
1,2013-01-02,1,1,11,1,2
2,2013-01-03,1,1,14,1,3
3,2013-01-04,1,1,13,1,4
4,2013-01-05,1,1,10,1,5


In [12]:
train = train.sort_values('date')


In [13]:
train2 = train.copy()

train2 = train2.groupby(by=['month', 'day', 'store', 'item'], as_index=False).agg({'sales':['mean', 'min', 'max', 'std']})

train2.head()

Unnamed: 0_level_0,month,day,store,item,sales,sales,sales,sales
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,mean,min,max,std
0,1,1,1,1,14.6,9,21,5.18
1,1,1,1,2,34.8,32,40,3.27
2,1,1,1,3,22.6,15,36,8.08
3,1,1,1,4,14.0,9,21,4.8
4,1,1,1,5,14.0,6,25,6.96


In [14]:
train2.columns = ['month', 'day', 'store', 'item', 'mean_sales', 'min_sales', 'max_sales', 'std_sales']

train2.head()


Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales
0,1,1,1,1,14.6,9,21,5.18
1,1,1,1,2,34.8,32,40,3.27
2,1,1,1,3,22.6,15,36,8.08
3,1,1,1,4,14.0,9,21,4.8
4,1,1,1,5,14.0,6,25,6.96


In [15]:
train2.shape

(183000, 8)

In [16]:
train2.describe()

Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales
count,183000.0,183000.0,183000.0,183000.0,183000.0,183000.0,183000.0,182500.0
mean,6.51,15.76,5.5,25.5,52.21,38.86,66.31,11.14
std,3.45,8.81,2.87,14.43,26.47,21.42,32.88,6.08
min,1.0,1.0,1.0,1.0,4.2,0.0,5.0,0.0
25%,4.0,8.0,3.0,13.0,30.6,22.0,40.0,6.58
50%,7.0,16.0,5.5,25.5,48.0,35.0,61.0,9.91
75%,10.0,23.0,8.0,38.0,69.4,52.0,87.0,14.46
max,12.0,31.0,10.0,50.0,160.8,134.0,231.0,46.73


# empty df

In [17]:
store_ids = train2['store'].unique()
item_ids = train2['item'].unique()
empty_df = []
for i in range (1,13):
    for j in range(1,32):
        for store in store_ids:
            for item in item_ids:
                empty_df.append([i, j, store, item])


    
empty_df = pd.DataFrame(empty_df, columns=['month', 'day','store','item'])

In [18]:
train2 = pd.merge(empty_df, train2, on=['month', 'day','store','item'], how='left')
train.fillna(0, inplace=True)

print( train2.head() )

print( 'train2:  ', train2.shape ) # became 6 million rows

   month  day  store  item  mean_sales  min_sales  max_sales  std_sales
0      1    1      1     1       14.60       9.00      21.00       5.18
1      1    1      1     2       34.80      32.00      40.00       3.27
2      1    1      1     3       22.60      15.00      36.00       8.08
3      1    1      1     4       14.00       9.00      21.00       4.80
4      1    1      1     5       14.00       6.00      25.00       6.96
train2:   (186000, 8)


# shift

In [19]:
train3 = train2.copy()

train3['shift_sales'] =  train3.groupby(['store', 'item'])['mean_sales'].shift(-1)

print ( train3.head(5) )
print( train3.isnull().sum())


   month  day  store  item  mean_sales  min_sales  max_sales  std_sales  \
0      1    1      1     1       14.60       9.00      21.00       5.18   
1      1    1      1     2       34.80      32.00      40.00       3.27   
2      1    1      1     3       22.60      15.00      36.00       8.08   
3      1    1      1     4       14.00       9.00      21.00       4.80   
4      1    1      1     5       14.00       6.00      25.00       6.96   

   shift_sales  
0        14.40  
1        38.80  
2        24.80  
3        13.60  
4         9.60  
month             0
day               0
store             0
item              0
mean_sales     3000
min_sales      3000
max_sales      3000
std_sales      3500
shift_sales    3500
dtype: int64


In [20]:
# min_prediods= 1 =) we don't have null values ( at least
# one observation required to have a value)
# Min value 
f_min = lambda x: x.rolling(window=3, min_periods=1).min() 
# Max value
f_max = lambda x: x.rolling(window=3, min_periods=1).max()
# Mean value
f_mean = lambda x: x.rolling(window=3, min_periods=1).mean()
# Standard deviation
# std: dispersion of a dataset relative to its mean and is calculated as the square root of the variance
f_std = lambda x: x.rolling(window=3, min_periods=1).std()

function_list = [f_min, f_max, f_mean, f_std]
function_name = ['min', 'max', 'mean', 'std']
for i in range(len(function_list)):
    train3[('sales_lag_%s' % function_name[i])] = train.groupby(['store', 'item'])['sales'].apply(function_list[i])

# Fill the empty std features with 0
train3.fillna(0, inplace=True)

train3.head()

Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std
0,1,1,1,1,14.6,9.0,21.0,5.18,14.4,13.0,13.0,13.0,0.0
1,1,1,1,2,34.8,32.0,40.0,3.27,38.8,11.0,13.0,12.0,1.41
2,1,1,1,3,22.6,15.0,36.0,8.08,24.8,11.0,14.0,12.67,1.53
3,1,1,1,4,14.0,9.0,21.0,4.8,13.6,11.0,14.0,12.67,1.53
4,1,1,1,5,14.0,6.0,25.0,6.96,9.6,10.0,14.0,12.33,2.08


In [21]:
lag_list = [1, 2, 3]

for lag in lag_list:
    ft_name = ('sales_shifted%s' % lag)
    train3[ft_name] = train3.groupby(['store' , 'item'])['mean_sales'].shift(lag)
    # Fill the empty shifted features with 0
    train3[ft_name].fillna(0, inplace=True)

train3.head()
    

Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std,sales_shifted1,sales_shifted2,sales_shifted3
0,1,1,1,1,14.6,9.0,21.0,5.18,14.4,13.0,13.0,13.0,0.0,0.0,0.0,0.0
1,1,1,1,2,34.8,32.0,40.0,3.27,38.8,11.0,13.0,12.0,1.41,0.0,0.0,0.0
2,1,1,1,3,22.6,15.0,36.0,8.08,24.8,11.0,14.0,12.67,1.53,0.0,0.0,0.0
3,1,1,1,4,14.0,9.0,21.0,4.8,13.6,11.0,14.0,12.67,1.53,0.0,0.0,0.0
4,1,1,1,5,14.0,6.0,25.0,6.96,9.6,10.0,14.0,12.33,2.08,0.0,0.0,0.0


In [22]:
train3

Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std,sales_shifted1,sales_shifted2,sales_shifted3
0,1,1,1,1,14.60,9.00,21.00,5.18,14.40,13.00,13.00,13.00,0.00,0.00,0.00,0.00
1,1,1,1,2,34.80,32.00,40.00,3.27,38.80,11.00,13.00,12.00,1.41,0.00,0.00,0.00
2,1,1,1,3,22.60,15.00,36.00,8.08,24.80,11.00,14.00,12.67,1.53,0.00,0.00,0.00
3,1,1,1,4,14.00,9.00,21.00,4.80,13.60,11.00,14.00,12.67,1.53,0.00,0.00,0.00
4,1,1,1,5,14.00,6.00,25.00,6.96,9.60,10.00,14.00,12.33,2.08,0.00,0.00,0.00
5,1,1,1,6,35.80,30.00,52.00,9.18,35.00,10.00,13.00,11.67,1.53,0.00,0.00,0.00
6,1,1,1,7,34.20,25.00,42.00,6.69,36.00,10.00,12.00,10.67,1.15,0.00,0.00,0.00
7,1,1,1,8,46.20,33.00,65.00,13.74,45.40,9.00,12.00,10.33,1.53,0.00,0.00,0.00
8,1,1,1,9,30.60,18.00,47.00,13.01,26.80,9.00,12.00,10.33,1.53,0.00,0.00,0.00
9,1,1,1,10,42.20,33.00,53.00,8.17,47.20,9.00,12.00,10.00,1.73,0.00,0.00,0.00


In [23]:
train3['item_trend'] = train['sales']

for lag in lag_list:
    ft_name = ('sales_shifted%s' % lag)
    train3['item_trend'] -= train3[ft_name]

train3['item_trend'] /= len(lag_list) + 1

train3.head()


Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std,sales_shifted1,sales_shifted2,sales_shifted3,item_trend
0,1,1,1,1,14.6,9.0,21.0,5.18,14.4,13.0,13.0,13.0,0.0,0.0,0.0,0.0,3.25
1,1,1,1,2,34.8,32.0,40.0,3.27,38.8,11.0,13.0,12.0,1.41,0.0,0.0,0.0,2.75
2,1,1,1,3,22.6,15.0,36.0,8.08,24.8,11.0,14.0,12.67,1.53,0.0,0.0,0.0,3.5
3,1,1,1,4,14.0,9.0,21.0,4.8,13.6,11.0,14.0,12.67,1.53,0.0,0.0,0.0,3.25
4,1,1,1,5,14.0,6.0,25.0,6.96,9.6,10.0,14.0,12.33,2.08,0.0,0.0,0.0,2.5


In [24]:
# Shop mean encoding
gp_store_mean = train3.groupby(['store']).agg({'mean_sales': ['mean']})
#print( gp_store_mean ) # groupby eliminate all columns but shop_id and item_cnt_month: mean
gp_store_mean.columns = ['shop_mean']
#print( gp_store_mean )
gp_store_mean.reset_index(inplace=True)
print( gp_store_mean )

train3 = pd.merge(train3, gp_store_mean, on=['store'], how='left')

train3.head()

   store  shop_mean
0      1      46.47
1      2      65.90
2      3      58.53
3      4      53.98
4      5      39.10
5      6      39.06
6      7      35.75
7      8      63.06
8      9      54.12
9     10      57.72


Unnamed: 0,month,day,store,item,mean_sales,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std,sales_shifted1,sales_shifted2,sales_shifted3,item_trend,shop_mean
0,1,1,1,1,14.6,9.0,21.0,5.18,14.4,13.0,13.0,13.0,0.0,0.0,0.0,0.0,3.25,46.47
1,1,1,1,2,34.8,32.0,40.0,3.27,38.8,11.0,13.0,12.0,1.41,0.0,0.0,0.0,2.75,46.47
2,1,1,1,3,22.6,15.0,36.0,8.08,24.8,11.0,14.0,12.67,1.53,0.0,0.0,0.0,3.5,46.47
3,1,1,1,4,14.0,9.0,21.0,4.8,13.6,11.0,14.0,12.67,1.53,0.0,0.0,0.0,3.25,46.47
4,1,1,1,5,14.0,6.0,25.0,6.96,9.6,10.0,14.0,12.33,2.08,0.0,0.0,0.0,2.5,46.47


In [25]:
# Item mean encoding.
gp_item_mean = train3.groupby(['item']).agg({'mean_sales': ['mean']})
gp_item_mean.columns = ['item_mean']
gp_item_mean.reset_index(inplace=True)
train3 = pd.merge(train3, gp_item_mean, on=['item'], how='left')

# Shop with item mean encoding.
gp_shop_item_mean = train3.groupby(['store', 'item']).agg({'mean_sales': ['mean']})
gp_shop_item_mean.columns = ['store_item_mean']
gp_shop_item_mean.reset_index(inplace=True)
train3 = pd.merge(train3, gp_shop_item_mean, on=['store', 'item'], how='left')

In [26]:
# Month mean encoding.
gp_month_mean = train3.groupby(['month']).agg({'mean_sales': ['mean']})
gp_month_mean.columns = ['month_mean']
gp_month_mean.reset_index(inplace=True)
train3 = pd.merge(train3, gp_month_mean, on=['month'], how='left')

In [27]:

train_set = train3

#train_set = train3[ train3['month']<=10 ]
#print( train_set.shape )

validation_set = train3[ train3['month']>10 ]
print ( validation_set.shape)

#print( 'remember we have excluded first 3 months')


(31000, 21)


In [28]:
X_train = train_set.drop(['mean_sales'], axis=1)
Y_train = train_set['mean_sales'].astype(int)
X_validation = validation_set.drop(['mean_sales'], axis=1)
Y_validation = validation_set['mean_sales'].astype(int)

# random forest regressor

In [29]:
rf_features = X_train.columns

rf_train = X_train[rf_features]
rf_val = X_validation[rf_features]
rf_model = RandomForestRegressor(n_estimators=50, max_depth=7, random_state=0, n_jobs=-1)
rf_model.fit(rf_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [60]:
rf_train_pred = rf_model.predict(rf_train)
rf_val_pred = rf_model.predict(rf_val)

rmse = np.sqrt(mean_squared_error(rf_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(rf_val_pred, Y_validation))
print('rmse of validation : ' , rmse)


x = smape(rf_train_pred, Y_train)
print('smape of train : ' , x)
               
x = smape(rf_val_pred, Y_validation)
print('smape of validation : ' , x)


print( rf_model.score(rf_train, Y_train) )
print( rf_model.score(rf_val, Y_validation) )

rmse of train :  2.613594373496178
rmse of validation :  2.431748554049246
smape of train :  4.238790342884243
smape of validation :  4.418068688033247
0.9906674983581286
0.989862186448763


# linear regression

In [31]:
lr_features = X_train.columns


lr_train = X_train[lr_features]
lr_val = X_validation[lr_features]


In [32]:
lr_scaler = MinMaxScaler()
lr_scaler.fit(lr_train)
lr_train = lr_scaler.transform(lr_train)
lr_val = lr_scaler.transform(lr_val)
lr_model = LinearRegression(n_jobs=-1)
lr_model.fit(lr_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [61]:
lr_train_pred = lr_model.predict(lr_train)
lr_val_pred = lr_model.predict(lr_val)

rmse = np.sqrt(mean_squared_error(lr_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(lr_val_pred, Y_validation))
print('rmse of validation : ' , rmse)


print( lr_model.score(lr_train, Y_train) )
print( lr_model.score(lr_val, Y_validation) )

x = smape(rf_train_pred, Y_train)
print('smape of train : ' , x)
               
x = smape(rf_val_pred, Y_validation)
print('smape of validation : ' , x)


rmse of train :  2.642420661757992
rmse of validation :  2.470857892324716
0.9904604999402513
0.9895334752752717
smape of train :  4.238790342884243
smape of validation :  4.418068688033247


# lightgbm

In [34]:
lgbm_model=LGBMRegressor(
        n_estimators=200,
        learning_rate=0.03,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40)

In [62]:
lgbm_model.fit(X_train, Y_train)


lightgbm_train_pred = lgbm_model.predict(X_train)
lightgbm_val_pred = lgbm_model.predict(X_validation)

rmse = np.sqrt(mean_squared_error(Y_train, lightgbm_train_pred ))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(Y_validation, lightgbm_val_pred))
print('rmse of validation : ' , rmse)

x = smape(rf_train_pred, Y_train)
print('smape of train : ' , x)
               
x = smape(rf_val_pred, Y_validation)
print('smape of validation : ' , x)

print( lgbm_model.score(X_train, Y_train) )
print( lgbm_model.score(X_validation, Y_validation) )

rmse of train :  2.2160863030253357
rmse of validation :  2.070914240858201
smape of train :  4.238790342884243
smape of validation :  4.418068688033247
0.9932904245498907
0.9926475649583539


In [36]:
"""
xgb_features = X_train.columns


xgb_train = X_train[xgb_features]
xgb_val = X_validation[xgb_features]

xgb_model = XGBRegressor(max_depth=8, 
                         n_estimators=500, 
                         min_child_weight=1000,  
                         colsample_bytree=0.7, 
                         subsample=0.7, 
                         eta=0.3, 
                         seed=0)
xgb_model.fit(xgb_train, 
              Y_train, 
              eval_metric="rmse", 
              eval_set=[(xgb_train, Y_train), (xgb_val, Y_validation)], 
              verbose=20, 
              early_stopping_rounds=20)
    """

'\nxgb_features = X_train.columns\n\n\nxgb_train = X_train[xgb_features]\nxgb_val = X_validation[xgb_features]\n\nxgb_model = XGBRegressor(max_depth=8, \n                         n_estimators=500, \n                         min_child_weight=1000,  \n                         colsample_bytree=0.7, \n                         subsample=0.7, \n                         eta=0.3, \n                         seed=0)\nxgb_model.fit(xgb_train, \n              Y_train, \n              eval_metric="rmse", \n              eval_set=[(xgb_train, Y_train), (xgb_val, Y_validation)], \n              verbose=20, \n              early_stopping_rounds=20)\n    '

In [37]:
"""
xgb_train_pred = xgb_model.predict(xgb_train)
xgb_val_pred = xgb_model.predict(xgb_val)


from sklearn.metrics import mean_squared_error 

rmse = np.sqrt(mean_squared_error(xgb_train_pred, Y_train))
print('rmse of train : ' , rmse)
               
rmse = np.sqrt(mean_squared_error(xgb_val_pred, Y_validation))
print('rmse of validation : ' , rmse)

print( xgb_model.score(X_train, Y_train) )
print( xgb_model.score(X_validation, Y_validation) )

"""

"\nxgb_train_pred = xgb_model.predict(xgb_train)\nxgb_val_pred = xgb_model.predict(xgb_val)\n\n\nfrom sklearn.metrics import mean_squared_error \n\nrmse = np.sqrt(mean_squared_error(xgb_train_pred, Y_train))\nprint('rmse of train : ' , rmse)\n               \nrmse = np.sqrt(mean_squared_error(xgb_val_pred, Y_validation))\nprint('rmse of validation : ' , rmse)\n\nprint( xgb_model.score(X_train, Y_train) )\nprint( xgb_model.score(X_validation, Y_validation) )\n\n"

In [38]:
def XGB_regressor(train_X, train_y, test_X, test_y, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'reg:linear'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'mae'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())

    xgtrain = xgboost.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgboost.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgboost.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgboost.DMatrix(test_X)
        model = xgboost.train(plst, xgtrain, num_rounds)
        
    return model    
    
    
dmatrix_model = XGB_regressor(train_X = X_train, train_y = Y_train, test_X = X_validation, test_y = Y_validation)


[0]	train-mae:45.445	test-mae:40.9645
Multiple eval metrics have been passed: 'test-mae' will be used for early stopping.

Will train until test-mae hasn't improved in 20 rounds.
[1]	train-mae:40.9027	test-mae:36.8629
[2]	train-mae:36.8152	test-mae:33.1731
[3]	train-mae:33.1354	test-mae:29.851
[4]	train-mae:29.8249	test-mae:26.8625
[5]	train-mae:26.8438	test-mae:24.1715
[6]	train-mae:24.1612	test-mae:21.7512
[7]	train-mae:21.7463	test-mae:19.5738
[8]	train-mae:19.5733	test-mae:17.6144
[9]	train-mae:17.6174	test-mae:15.8502
[10]	train-mae:15.8562	test-mae:14.2619
[11]	train-mae:14.2716	test-mae:12.8342
[12]	train-mae:12.8452	test-mae:11.5476
[13]	train-mae:11.5619	test-mae:10.3912
[14]	train-mae:10.4068	test-mae:9.35004
[15]	train-mae:9.36828	test-mae:8.41513
[16]	train-mae:8.43392	test-mae:7.57576
[17]	train-mae:7.59393	test-mae:6.81859
[18]	train-mae:6.84924	test-mae:6.14497
[19]	train-mae:6.17144	test-mae:5.53673
[20]	train-mae:5.56472	test-mae:4.99084
[21]	train-mae:5.0224	test-mae:

[201]	train-mae:1.57179	test-mae:1.49667
[202]	train-mae:1.5716	test-mae:1.49577
[203]	train-mae:1.57131	test-mae:1.49544
[204]	train-mae:1.57091	test-mae:1.49531
[205]	train-mae:1.57036	test-mae:1.49455
[206]	train-mae:1.57012	test-mae:1.49443
[207]	train-mae:1.56976	test-mae:1.4942
[208]	train-mae:1.56934	test-mae:1.49371
[209]	train-mae:1.569	test-mae:1.49336
[210]	train-mae:1.56864	test-mae:1.49308
[211]	train-mae:1.56815	test-mae:1.49271
[212]	train-mae:1.56754	test-mae:1.49237
[213]	train-mae:1.56696	test-mae:1.49181
[214]	train-mae:1.5666	test-mae:1.49161
[215]	train-mae:1.56628	test-mae:1.4915
[216]	train-mae:1.56591	test-mae:1.49121
[217]	train-mae:1.56545	test-mae:1.49099
[218]	train-mae:1.56503	test-mae:1.49068
[219]	train-mae:1.56472	test-mae:1.49054
[220]	train-mae:1.56431	test-mae:1.49019
[221]	train-mae:1.56384	test-mae:1.49005
[222]	train-mae:1.56361	test-mae:1.48999
[223]	train-mae:1.56296	test-mae:1.48978
[224]	train-mae:1.56262	test-mae:1.48967
[225]	train-mae:1.5621

[402]	train-mae:1.49965	test-mae:1.43891
[403]	train-mae:1.49946	test-mae:1.43888
[404]	train-mae:1.49916	test-mae:1.43834
[405]	train-mae:1.49888	test-mae:1.43807
[406]	train-mae:1.49857	test-mae:1.43761
[407]	train-mae:1.49821	test-mae:1.43727
[408]	train-mae:1.49808	test-mae:1.43712
[409]	train-mae:1.49761	test-mae:1.43689
[410]	train-mae:1.49732	test-mae:1.43671
[411]	train-mae:1.49712	test-mae:1.43666
[412]	train-mae:1.49696	test-mae:1.43664
[413]	train-mae:1.49669	test-mae:1.43621
[414]	train-mae:1.49639	test-mae:1.43596
[415]	train-mae:1.49609	test-mae:1.43571
[416]	train-mae:1.49567	test-mae:1.43557
[417]	train-mae:1.49526	test-mae:1.43532
[418]	train-mae:1.49495	test-mae:1.43517
[419]	train-mae:1.49475	test-mae:1.43496
[420]	train-mae:1.49437	test-mae:1.43458
[421]	train-mae:1.49415	test-mae:1.43451
[422]	train-mae:1.49379	test-mae:1.43409
[423]	train-mae:1.49344	test-mae:1.4337
[424]	train-mae:1.49317	test-mae:1.43358
[425]	train-mae:1.49253	test-mae:1.43202
[426]	train-mae:1

# work on test 

In [39]:
print( train3.head() )
print ( test.head() )

   month  day  store  item  mean_sales  min_sales  max_sales  std_sales  \
0      1    1      1     1       14.60       9.00      21.00       5.18   
1      1    1      1     2       34.80      32.00      40.00       3.27   
2      1    1      1     3       22.60      15.00      36.00       8.08   
3      1    1      1     4       14.00       9.00      21.00       4.80   
4      1    1      1     5       14.00       6.00      25.00       6.96   

   shift_sales  sales_lag_min     ...      sales_lag_mean  sales_lag_std  \
0        14.40          13.00     ...               13.00           0.00   
1        38.80          11.00     ...               12.00           1.41   
2        24.80          11.00     ...               12.67           1.53   
3        13.60          11.00     ...               12.67           1.53   
4         9.60          10.00     ...               12.33           2.08   

   sales_shifted1  sales_shifted2  sales_shifted3  item_trend  shop_mean  \
0            0.0

In [40]:
test['date'] = pd.to_datetime(test.date,format="%Y-%m-%d")

test['month'] = test['date'].dt.month
test['day'] = test['date'].dt.day

test.head()

Unnamed: 0,id,date,store,item,month,day
0,0,2018-01-01,1,1,1,1
1,1,2018-01-02,1,1,1,2
2,2,2018-01-03,1,1,1,3
3,3,2018-01-04,1,1,1,4
4,4,2018-01-05,1,1,1,5


In [41]:
test = test.drop(['date'], axis = 1)
test.head()

Unnamed: 0,id,store,item,month,day
0,0,1,1,1,1
1,1,1,1,1,2
2,2,1,1,1,3
3,3,1,1,1,4
4,4,1,1,1,5


In [42]:
test.head()

Unnamed: 0,id,store,item,month,day
0,0,1,1,1,1
1,1,1,1,1,2
2,2,1,1,1,3
3,3,1,1,1,4
4,4,1,1,1,5


In [43]:
test.head()


Unnamed: 0,id,store,item,month,day
0,0,1,1,1,1
1,1,1,1,1,2
2,2,1,1,1,3
3,3,1,1,1,4
4,4,1,1,1,5


In [44]:
IDs = test['id']
test = test[['month', 'day', 'store', 'item']]
print(IDs.head() )
print( test.head() )
print( test.shape )

0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64
   month  day  store  item
0      1    1      1     1
1      1    2      1     1
2      1    3      1     1
3      1    4      1     1
4      1    5      1     1
(45000, 4)


In [45]:
test = test.merge(train3.drop('mean_sales', axis=1), on=['month', 'day', 'store', 'item'], how='left'    )



In [46]:
print( test.shape )
print( test.isnull().sum() )




(45000, 20)
month              0
day                0
store              0
item               0
min_sales          0
max_sales          0
std_sales          0
shift_sales        0
sales_lag_min      0
sales_lag_max      0
sales_lag_mean     0
sales_lag_std      0
sales_shifted1     0
sales_shifted2     0
sales_shifted3     0
item_trend         0
shop_mean          0
item_mean          0
store_item_mean    0
month_mean         0
dtype: int64


In [47]:
trn = pd.DataFrame( { 'x':[1,1], 'y':[1,5], 'z':[6,7] }  )
tst = pd.DataFrame( { 'x':[1,1,1], 'y':[1,3,5] } )

In [48]:
print('*** tst **** \n', tst )
print('*** trn **** \n', trn )

*** tst **** 
    x  y
0  1  1
1  1  3
2  1  5
*** trn **** 
    x  y  z
0  1  1  6
1  1  5  7


In [49]:
#l = [0,1,1]
#if l in tst:
 #   print('ok')

In [50]:
#for index, row in test.iterrows():
 #   print(row)

In [51]:
test.head()

Unnamed: 0,month,day,store,item,min_sales,max_sales,std_sales,shift_sales,sales_lag_min,sales_lag_max,sales_lag_mean,sales_lag_std,sales_shifted1,sales_shifted2,sales_shifted3,item_trend,shop_mean,item_mean,store_item_mean,month_mean
0,1,1,1,1,9.0,21.0,5.18,14.4,13.0,13.0,13.0,0.0,0.0,0.0,0.0,3.25,46.47,21.61,19.64,35.52
1,1,2,1,1,11.0,19.0,2.97,12.6,21.0,23.0,22.0,1.0,14.6,0.0,0.0,2.1,46.47,21.61,19.64,35.52
2,1,3,1,1,10.0,16.0,2.41,15.0,15.0,32.0,22.33,8.74,14.4,14.6,0.0,-3.5,46.47,21.61,19.64,35.52
3,1,4,1,1,12.0,20.0,3.16,13.4,12.0,20.0,16.0,4.0,12.6,14.4,14.6,-6.4,46.47,21.61,19.64,35.52
4,1,5,1,1,10.0,17.0,2.61,15.0,25.0,30.0,27.67,2.52,15.0,12.6,14.4,-4.25,46.47,21.61,19.64,35.52


In [52]:


test_pred = dmatrix_model.predict(xgboost.DMatrix(test), ntree_limit = dmatrix_model.best_ntree_limit)


test_pred

array([14.179281, 14.32444 , 12.838291, ..., 70.541695, 67.12164 ,
       66.09892 ], dtype=float32)

In [53]:
sub = pd.DataFrame(IDs, columns=['id'])
sub['sales'] = test_pred
sub.to_csv('submission.csv', index=False)
sub.head(10)
                

Unnamed: 0,id,sales
0,0,14.18
1,1,14.32
2,2,12.84
3,3,14.96
4,4,13.32
5,5,14.95
6,6,11.71
7,7,13.92
8,8,14.44
9,9,15.68
