# Problem Statement

One of the largest retail chains in the world wants to use their vast data source to build an efficient forecasting model to predict the sales for each SKU in its portfolio at its 76 different stores using historical sales data for the past 3 years on a week-on-week basis. Sales and promotional information is also available for each week - product and store wise.

However, no other information regarding stores and products are available. Can you still forecast accurately the sales values for every such product/SKU-store combination for the next 12 weeks accurately? If yes, then dive right in

# Variable Definition

record_ID: Unique ID for each week store sku combination
week: Starting Date of the week
store_id: Unique ID for each store (no numerical order to be assumed)
sku_id: Unique ID for each product (no numerical order to be assumed)
total_price: Sales Price of the product 
base_price: Base price of the product
is_featured_sku: Was part of the featured item of the week
is_display_sku: Product was on display at a prominent place at the store
units_sold(Target): Total Units sold for that week-store-sku combination

# Approach (high level)

Considered this as a regression problem with 'units_sold' as a target
Generated following new features:
(a) Count of records per 'sku-id','store-id' and combination of both
(b) Average units sold per 'sku-id','store-id' and combination of both
(c) Average base-price & total-price per 'sku-id','store-id' and combination of both
(d) Week of the year
(e) Week number from start of data
(f) Week of the month
(g) Sine & Cosine transform of week number to capture cyclic nature
(e) Price difference percent between base price & total-price
Categorical Encoded 'sku-id' & 'store-id' with MEstimateEncoder()
Trained the data on RandomForest & LGBM Regressor
Tuned the above models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 

from sklearn.metrics import mean_squared_error,mean_squared_log_error
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,GridSearchCV,RandomizedSearchCV,cross_val_score,RepeatedKFold
from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer,RobustScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PolynomialFeatures

import xgboost as xgb
import lightgbm as lgb
import sklearn.ensemble as ensemble
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier,RandomForestRegressor,BaggingRegressor,AdaBoostRegressor,GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,LogisticRegression,Lasso, Ridge,LogisticRegressionCV,RidgeCV,LassoCV,ElasticNetCV,OrthogonalMatchingPursuit,ElasticNet,LassoLarsCV,BayesianRidge
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC,SVR
from scipy import stats
from scipy.stats import norm, skew
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.kernel_ridge import KernelRidge


from category_encoders.ordinal import OrdinalEncoder
from category_encoders.woe import WOEEncoder
from category_encoders.target_encoder import TargetEncoder
from category_encoders.sum_coding import SumEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.one_hot import OneHotEncoder
from scipy.special import boxcox1p
from bayes_opt import BayesianOptimization

In [2]:
warnings.filterwarnings('ignore')

In [3]:
import os
print(os.listdir("../input"))

['supply-chain']


In [4]:
train=pd.read_csv('/kaggle/input/supply-chain/Traindata.csv')
test=pd.read_csv('/kaggle/input/supply-chain/DataSetRetail.csv')
submission=pd.read_csv('/kaggle/input/supply-chain/submission format.csv')

In [5]:
train.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
0,1,17-01-2011,8091,216418,99.0375,111.8625,0,0,20
1,2,17-01-2011,8091,216419,99.0375,99.0375,0,0,28
2,3,17-01-2011,8091,216425,133.95,133.95,0,0,19
3,4,17-01-2011,8091,216233,133.95,133.95,0,0,44
4,5,17-01-2011,8091,217390,141.075,141.075,0,0,52


In [6]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku
0,212645,16-07-2013,8091,216418,108.3,108.3,0,0
1,212646,16-07-2013,8091,216419,109.0125,109.0125,0,0
2,212647,16-07-2013,8091,216425,133.95,133.95,0,0
3,212648,16-07-2013,8091,216233,133.95,133.95,0,0
4,212649,16-07-2013,8091,217390,176.7,176.7,0,0


In [7]:
submission.head()

Unnamed: 0,record_ID,units_sold
0,212645,0
1,212646,0
2,212647,0
3,212648,0
4,212649,0


In [8]:
train.describe()

Unnamed: 0,record_ID,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold
count,150150.0,150150.0,150150.0,150149.0,150150.0,150150.0,150150.0,150150.0
mean,106271.555504,9199.422511,254761.132468,206.626751,219.425927,0.095611,0.1332,51.674206
std,61386.037861,615.591445,85547.306447,103.308516,110.961712,0.294058,0.339792,60.207904
min,1.0,8023.0,216233.0,41.325,61.275,0.0,0.0,1.0
25%,53111.25,8562.0,217217.0,130.3875,133.2375,0.0,0.0,20.0
50%,106226.5,9371.0,222087.0,198.075,205.9125,0.0,0.0,35.0
75%,159452.75,9731.0,245338.0,233.7,234.4125,0.0,0.0,62.0
max,212644.0,9984.0,679023.0,562.1625,562.1625,1.0,1.0,2876.0


In [9]:
train.dtypes

record_ID            int64
week                object
store_id             int64
sku_id               int64
total_price        float64
base_price         float64
is_featured_sku      int64
is_display_sku       int64
units_sold           int64
dtype: object

In [10]:
train.isna().sum()

record_ID          0
week               0
store_id           0
sku_id             0
total_price        1
base_price         0
is_featured_sku    0
is_display_sku     0
units_sold         0
dtype: int64

In [11]:
#Imputing missing value with the relevant total price
train.total_price=train.total_price.fillna(469.5375)

In [12]:
print(train.isna().sum().sum())
print(test.isna().sum().sum())

0
0


# Feature Engineering

In [13]:
train.columns

Index(['record_ID', 'week', 'store_id', 'sku_id', 'total_price', 'base_price',
       'is_featured_sku', 'is_display_sku', 'units_sold'],
      dtype='object')

In [14]:
#New Feature Creation functions

def gen_count_id(train,test,col,name):
    temp=train.groupby(col)['record_ID'].count().reset_index().rename(columns={'record_ID':name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name]=train[name].astype(float)
    test[name]=test[name].astype(float)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

def gen_average_units(train,test,col,name):
    temp=train.groupby(col)['units_sold'].mean().reset_index().rename(columns={'units_sold':name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

def gen_average_price(train,test,col,price='base_price',name='name'):
    temp=train.groupby(col)[price].mean().reset_index().rename(columns={price:name})
    train=pd.merge(train,temp,how='left',on=col)
    test=pd.merge(test,temp,how='left',on=col)
    train[name].fillna(np.median(temp[name]),inplace=True)
    test[name].fillna(np.median(temp[name]),inplace=True)
    return train,test

In [15]:
train,test = gen_count_id(train,test,col=['sku_id','store_id'],name='count_id_sku_store') #Genearting count of records per 'sku-id & store-id' 
train,test = gen_count_id(train,test,col=['sku_id'],name='count_id_sku') #Genearting count of records per 'sku-id'
train,test = gen_count_id(train,test,col=['store_id'],name='count_id_store') #Genearting count of records per 'store-id'

train,test = gen_average_units(train,test,col=['sku_id','store_id'],name='count_sku_store_id') #Genearting average units sold per 'sku-id & store-id'
train,test = gen_average_units(train,test,col=['store_id'],name='count_store_id') #Genearting average units sold per 'store-id'
train,test = gen_average_units(train,test,col=['sku_id'],name='count_sku_id') #Genearting average units sold per 'sku-id'

train,test = gen_average_price(train,test,col=['sku_id','store_id'],price='base_price',name='price_sku_store') #Genearting average base price per 'sku-id & store-id'
train,test = gen_average_price(train,test,col=['sku_id','store_id'],price='total_price',name='price_to_sku_store') #Genearting average total price per 'sku-id & store-id'
train,test = gen_average_price(train,test,col=['store_id'],price='base_price',name='price_store_id') #Genearting average base price per 'store-id'
train,test = gen_average_price(train,test,col=['sku_id'],price='base_price',name='price_sku_id') #Genearting average base price per 'sku-id'
train,test = gen_average_price(train,test,col=['store_id'],price='total_price',name='price_to_store_id') #Genearting average total price per 'store-id'
train,test = gen_average_price(train,test,col=['sku_id'],price='total_price',name='price_to_sku_id') #Genearting average total price per 'sku-id'

In [16]:
#Converting week feature
le = OrdinalEncoder()
train['week_1']=le.fit_transform(train['week'])
le = OrdinalEncoder()
test['week_1']=le.fit_transform(test['week'])+130

#Creating week number feature
train['week_num']=train.week_1%52
test['week_num']=test.week_1%52

train['week_num1']=train.week_1%4
test['week_num1']=test.week_1%4

# Encoding 'week' it using sine and cosine transform; considering it as a cyclic feature 
train['week_sin'] = np.sin(2 * np.pi * train['week_1'] / 52.143)
train['week_cos'] = np.cos(2 * np.pi * train['week_1'] / 52.143)
test['week_sin'] = np.sin(2 * np.pi * test['week_1'] / 52.143)
test['week_cos'] = np.cos(2 * np.pi * test['week_1'] / 52.143)

#Creating feature: percent difference between base price and checkout price.
train['price_diff_percent'] = (train['base_price'] - train['total_price']) / train['base_price']
test['price_diff_percent'] = (test['base_price'] - test['total_price']) / test['base_price']

In [17]:
train.tail()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,units_sold,count_id_sku_store,...,price_store_id,price_sku_id,price_to_store_id,price_to_sku_id,week_1,week_num,week_num1,week_sin,week_cos,price_diff_percent
150145,212638,09-07-2013,9984,223245,235.8375,235.8375,0,0,38,130.0,...,197.030107,214.878438,186.580537,203.073612,130,26,2,0.043065,-0.999072,0.0
150146,212639,09-07-2013,9984,223153,235.8375,235.8375,0,0,30,130.0,...,197.030107,220.461485,186.580537,206.303241,130,26,2,0.043065,-0.999072,0.0
150147,212642,09-07-2013,9984,245338,357.675,483.7875,1,1,31,130.0,...,197.030107,476.750449,186.580537,432.766248,130,26,2,0.043065,-0.999072,0.260677
150148,212643,09-07-2013,9984,547934,141.7875,191.6625,0,1,12,130.0,...,197.030107,174.616247,186.580537,166.385369,130,26,2,0.043065,-0.999072,0.260223
150149,212644,09-07-2013,9984,679023,234.4125,234.4125,0,0,15,130.0,...,197.030107,209.117532,186.580537,198.309455,130,26,2,0.043065,-0.999072,0.0


In [18]:
test.head()

Unnamed: 0,record_ID,week,store_id,sku_id,total_price,base_price,is_featured_sku,is_display_sku,count_id_sku_store,count_id_sku,...,price_store_id,price_sku_id,price_to_store_id,price_to_sku_id,week_1,week_num,week_num1,week_sin,week_cos,price_diff_percent
0,212645,16-07-2013,8091,216418,108.3,108.3,0,0,130.0,8840.0,...,181.312372,94.688268,172.272756,91.982702,131,27,3,-0.077343,-0.997005,0.0
1,212646,16-07-2013,8091,216419,109.0125,109.0125,0,0,130.0,8710.0,...,181.312372,94.908763,172.272756,92.349162,131,27,3,-0.077343,-0.997005,0.0
2,212647,16-07-2013,8091,216425,133.95,133.95,0,0,130.0,8580.0,...,181.312372,128.28521,172.272756,125.156355,131,27,3,-0.077343,-0.997005,0.0
3,212648,16-07-2013,8091,216233,133.95,133.95,0,0,130.0,9620.0,...,181.312372,128.101871,172.272756,124.942208,131,27,3,-0.077343,-0.997005,0.0
4,212649,16-07-2013,8091,217390,176.7,176.7,0,0,130.0,9100.0,...,181.312372,158.990538,172.272756,150.867334,131,27,3,-0.077343,-0.997005,0.0


In [19]:
X=train[list(set(train.columns)-set(['record_ID','units_sold','week']))]
Y= np.log1p(train['units_sold'])
X_test=test[list(set(test.columns)-set(['record_ID','week']))]

In [20]:
X.dtypes

count_id_sku          float64
price_sku_id          float64
count_store_id        float64
store_id                int64
price_diff_percent    float64
count_id_store        float64
week_cos              float64
price_sku_store       float64
sku_id                  int64
total_price           float64
week_sin              float64
count_id_sku_store    float64
price_store_id        float64
price_to_store_id     float64
price_to_sku_store    float64
count_sku_store_id    float64
week_num1               int64
count_sku_id          float64
is_display_sku          int64
is_featured_sku         int64
week_1                  int64
price_to_sku_id       float64
base_price            float64
week_num                int64
dtype: object

In [21]:
X['sku_id'] = X['sku_id'].astype('category')
X['store_id'] = X['store_id'].astype('category')

In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150150 entries, 0 to 150149
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   count_id_sku        150150 non-null  float64 
 1   price_sku_id        150150 non-null  float64 
 2   count_store_id      150150 non-null  float64 
 3   store_id            150150 non-null  category
 4   price_diff_percent  150150 non-null  float64 
 5   count_id_store      150150 non-null  float64 
 6   week_cos            150150 non-null  float64 
 7   price_sku_store     150150 non-null  float64 
 8   sku_id              150150 non-null  category
 9   total_price         150150 non-null  float64 
 10  week_sin            150150 non-null  float64 
 11  count_id_sku_store  150150 non-null  float64 
 12  price_store_id      150150 non-null  float64 
 13  price_to_store_id   150150 non-null  float64 
 14  price_to_sku_store  150150 non-null  float64 
 15  count_sku_store_i

In [23]:
print(len(X_test.columns))
print(len(X.columns))

24
24


In [24]:
print(X_test.isna().sum().sum())
print(X.isna().sum().sum())

0
0


In [25]:
category_list=['store_id','sku_id']

In [26]:
encoder_final=MEstimateEncoder()
encoder_final.fit(X[category_list], Y)

cat_enc = encoder_final.transform(X[category_list], Y)
continuous_train = X.drop(columns= category_list)
X = pd.concat([cat_enc,continuous_train],axis=1)

test_enc=encoder_final.transform(X_test[category_list])
continuous_test=X_test.drop(columns= category_list)
X_test=pd.concat([test_enc,continuous_test],axis=1)

In [27]:
X.head()

Unnamed: 0,store_id,sku_id,count_id_sku,price_sku_id,count_store_id,price_diff_percent,count_id_store,week_cos,price_sku_store,total_price,...,price_to_sku_store,count_sku_store_id,week_num1,count_sku_id,is_display_sku,is_featured_sku,week_1,price_to_sku_id,base_price,week_num
0,3.250779,4.201821,8840.0,94.688268,32.805983,0.11465,1170.0,0.992749,105.800769,99.0375,...,102.117692,26.376923,1,88.923869,0,0,1,91.982702,111.8625,1
1,3.250779,4.051982,8710.0,94.908763,32.805983,0.0,1170.0,0.992749,105.839135,99.0375,...,102.473942,28.307692,1,72.182664,0,0,1,92.349162,99.0375,1
2,3.250779,3.303344,8580.0,128.28521,32.805983,0.0,1170.0,0.992749,131.204135,133.95,...,126.671538,25.669231,1,34.019231,0,0,1,125.156355,133.95,1
3,3.250779,3.715659,9620.0,128.101871,32.805983,0.0,1170.0,0.992749,129.916154,133.95,...,126.167308,29.107692,1,46.821206,0,0,1,124.942208,133.95,1
4,3.250779,3.850743,9100.0,158.990538,32.805983,0.0,1170.0,0.992749,164.439519,141.075,...,152.968269,29.469231,1,62.312747,0,0,1,150.867334,141.075,1


In [28]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150150 entries, 0 to 150149
Data columns (total 24 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   store_id            150150 non-null  float64
 1   sku_id              150150 non-null  float64
 2   count_id_sku        150150 non-null  float64
 3   price_sku_id        150150 non-null  float64
 4   count_store_id      150150 non-null  float64
 5   price_diff_percent  150150 non-null  float64
 6   count_id_store      150150 non-null  float64
 7   week_cos            150150 non-null  float64
 8   price_sku_store     150150 non-null  float64
 9   total_price         150150 non-null  float64
 10  week_sin            150150 non-null  float64
 11  count_id_sku_store  150150 non-null  float64
 12  price_store_id      150150 non-null  float64
 13  price_to_store_id   150150 non-null  float64
 14  price_to_sku_store  150150 non-null  float64
 15  count_sku_store_id  150150 non-nul

In [29]:
X_test.head()

Unnamed: 0,store_id,sku_id,count_id_sku,price_sku_id,count_store_id,price_diff_percent,count_id_store,week_cos,price_sku_store,total_price,...,price_to_sku_store,count_sku_store_id,week_num1,count_sku_id,is_display_sku,is_featured_sku,week_1,price_to_sku_id,base_price,week_num
0,3.250779,4.201821,8840.0,94.688268,32.805983,0.0,1170.0,-0.997005,105.800769,108.3,...,102.117692,26.376923,3,88.923869,0,0,131,91.982702,108.3,27
1,3.250779,4.051982,8710.0,94.908763,32.805983,0.0,1170.0,-0.997005,105.839135,109.0125,...,102.473942,28.307692,3,72.182664,0,0,131,92.349162,109.0125,27
2,3.250779,3.303344,8580.0,128.28521,32.805983,0.0,1170.0,-0.997005,131.204135,133.95,...,126.671538,25.669231,3,34.019231,0,0,131,125.156355,133.95,27
3,3.250779,3.715659,9620.0,128.101871,32.805983,0.0,1170.0,-0.997005,129.916154,133.95,...,126.167308,29.107692,3,46.821206,0,0,131,124.942208,133.95,27
4,3.250779,3.850743,9100.0,158.990538,32.805983,0.0,1170.0,-0.997005,164.439519,176.7,...,152.968269,29.469231,3,62.312747,0,0,131,150.867334,176.7,27


# Model Building

In [30]:
X.columns

Index(['store_id', 'sku_id', 'count_id_sku', 'price_sku_id', 'count_store_id',
       'price_diff_percent', 'count_id_store', 'week_cos', 'price_sku_store',
       'total_price', 'week_sin', 'count_id_sku_store', 'price_store_id',
       'price_to_store_id', 'price_to_sku_store', 'count_sku_store_id',
       'week_num1', 'count_sku_id', 'is_display_sku', 'is_featured_sku',
       'week_1', 'price_to_sku_id', 'base_price', 'week_num'],
      dtype='object')

In [31]:
del X['week_num1']

In [32]:
x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size = 0.2,random_state=23)

In [33]:
len(x_train.columns)

23

In [34]:
rf_base = RandomForestRegressor()
rf_base.fit(x_train,y_train)


rf_tuned = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
                      max_features='sqrt', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=10,
                      min_weight_fraction_leaf=0.0, n_estimators=600,
                      n_jobs=None, oob_score=True, random_state=None,
                      verbose=0, warm_start=False)
rf_tuned.fit(x_train,y_train)

RandomForestRegressor(max_depth=30, max_features='sqrt', min_samples_split=10,
                      n_estimators=600, oob_score=True)

In [35]:
model_lgb_base=lgb.LGBMRegressor(objective='regression')
model_lgb_base.fit(x_train,y_train)

model_lgb_tuned=lgb.LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=200, n_jobs=-1,
              num_leaves=1200, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

model_lgb_tuned.fit(x_train,y_train)

LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, feature_fraction=0.5,
              max_depth=30, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=200, num_leaves=1200)

In [36]:
prediction_rfb_valid=rf_base.predict(x_valid)
prediction_rft_valid=rf_tuned.predict(x_valid)
prediction_lgbmb_valid=model_lgb_base.predict(x_valid)
prediction_lgbmt_valid=model_lgb_tuned.predict(x_valid)

rf_base_msle=100*mean_squared_log_error(y_valid,prediction_rfb_valid)
rf_tuned_msle=100*mean_squared_log_error(y_valid,prediction_rft_valid)
lgbm_base_msle=100*mean_squared_log_error(y_valid,prediction_lgbmb_valid)
lgbm_tuned_msle=100*mean_squared_log_error(y_valid,prediction_lgbmt_valid)

prediction_ensemble_base=(((1-rf_base_msle)*prediction_rfb_valid)+((1-lgbm_base_msle)*prediction_lgbmb_valid))/(2-rf_base_msle-lgbm_base_msle)
prediction_ensemble_tuned=(((1-rf_tuned_msle)*prediction_rft_valid)+((1-lgbm_tuned_msle)*prediction_lgbmt_valid))/(2-rf_tuned_msle-lgbm_tuned_msle)

ensemble_base_msle=100*mean_squared_log_error(y_valid,prediction_ensemble_base)
ensemble_tuned_msle=100*mean_squared_log_error(y_valid,prediction_ensemble_tuned)


print("RF Base: {}; RF Tuned: {}".format(rf_base_msle,rf_tuned_msle))
print("LGBM Base: {}; LGBM Tuned: {}".format(lgbm_base_msle,lgbm_tuned_msle))
print("Ensemble Base: {}; Ensemble Tuned: {}".format(ensemble_base_msle,ensemble_tuned_msle))

RF Base: 0.84662639504341; RF Tuned: 0.8534504127247554
LGBM Base: 0.9034433568006958; LGBM Tuned: 0.7297141894931608
Ensemble Base: 0.8185060670127743; Ensemble Tuned: 0.7461411084989064


In [37]:
model = lgb.LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, boosting_type='gbdt',
              class_weight=None, colsample_bytree=1.0, feature_fraction=0.5,
              importance_type='split', learning_rate=0.1, max_depth=30,
              min_child_samples=20, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, n_estimators=100, n_jobs=-1,
              num_leaves=1400, objective=None, random_state=None, reg_alpha=0.0,
              reg_lambda=0.0, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

model.fit(X,Y)

LGBMRegressor(bagging_fraction=0.8, bagging_frequency=4, feature_fraction=0.5,
              max_depth=30, min_child_weight=30, min_data_in_leaf=70,
              min_split_gain=0.0001, num_leaves=1400)

In [38]:
X_test.head()

Unnamed: 0,store_id,sku_id,count_id_sku,price_sku_id,count_store_id,price_diff_percent,count_id_store,week_cos,price_sku_store,total_price,...,price_to_sku_store,count_sku_store_id,week_num1,count_sku_id,is_display_sku,is_featured_sku,week_1,price_to_sku_id,base_price,week_num
0,3.250779,4.201821,8840.0,94.688268,32.805983,0.0,1170.0,-0.997005,105.800769,108.3,...,102.117692,26.376923,3,88.923869,0,0,131,91.982702,108.3,27
1,3.250779,4.051982,8710.0,94.908763,32.805983,0.0,1170.0,-0.997005,105.839135,109.0125,...,102.473942,28.307692,3,72.182664,0,0,131,92.349162,109.0125,27
2,3.250779,3.303344,8580.0,128.28521,32.805983,0.0,1170.0,-0.997005,131.204135,133.95,...,126.671538,25.669231,3,34.019231,0,0,131,125.156355,133.95,27
3,3.250779,3.715659,9620.0,128.101871,32.805983,0.0,1170.0,-0.997005,129.916154,133.95,...,126.167308,29.107692,3,46.821206,0,0,131,124.942208,133.95,27
4,3.250779,3.850743,9100.0,158.990538,32.805983,0.0,1170.0,-0.997005,164.439519,176.7,...,152.968269,29.469231,3,62.312747,0,0,131,150.867334,176.7,27


In [39]:
del X_test['week_num1']

In [40]:
prediction=model.predict(X_test)

In [41]:
final_prediction=np.round(np.expm1(prediction))
submission['units_sold']=final_prediction

In [42]:
submission.head()

Unnamed: 0,record_ID,units_sold
0,212645,15.0
1,212646,20.0
2,212647,28.0
3,212648,30.0
4,212649,20.0


In [44]:
submission.to_csv('sales_forecast_submission.csv', index=False)