# Importing Libraries

In [None]:
import math
import pickle

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Loading Pickled Files

In [None]:
# Load the pickled feature DataFrame from the mounted Drive folder and print
# its column / dtype summary.
PICKLED_DF_PATH = '/content/drive/MyDrive/Applied AI/Capstone/Predict Future Sales/Final_Files/pickled_df.pkl'
df = pd.read_pickle(PICKLED_DF_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8801390 entries, 0 to 8801389
Data columns (total 42 columns):
 #   Column                             Dtype  
---  ------                             -----  
 0   date_block_num                     int8   
 1   shop_id                            int8   
 2   item_id                            int16  
 3   item_cnt_month                     float16
 4   city_enc                           int8   
 5   item_category_id                   int8   
 6   Category_id                        int8   
 7   first_sale_date                    int8   
 8   month                              int8   
 9   first_instance_of_shop             int8   
 10  item_cnt_month_lag-3               float16
 11  item_cnt_month_lag-2               float16
 12  item_cnt_month_lag-1               float16
 13  item_rate_month_lag-3              float16
 14  item_rate_month_lag-2              float16
 15  item_rate_month_lag-1              float16
 16  item_price_lag-3  

# Splitting the Data

In [None]:
# Time-based split on `date_block_num`: blocks below 33 are used for training,
# block 33 for validation and block 34 for the final predictions.
# `item_cnt_month` is the target, so it is removed from every feature frame.
X_train = df.loc[df["date_block_num"] < 33]
y_train = X_train["item_cnt_month"]
X_train = X_train.drop(columns=["item_cnt_month"])

X_val = df.loc[df["date_block_num"] == 33]
y_val = X_val["item_cnt_month"]
X_val = X_val.drop(columns=["item_cnt_month"])

# No target is extracted for block 34; only its features are kept.
X_test = df.loc[df["date_block_num"] == 34].drop(columns=["item_cnt_month"])

# Random Forest Regressor

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Shallow random-forest baseline: 100 trees, depth capped at 5, 'sqrt' feature
# sampling per split, and a fixed seed so reruns give the same forest.
rf_regressor = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    max_depth=5,
    random_state=18,
)

# Kept as the last expression so the fitted estimator is displayed by the cell.
rf_regressor.fit(X_train, y_train)

CPU times: user 9min 12s, sys: 6.59 s, total: 9min 18s
Wall time: 9min 16s


RandomForestRegressor(max_depth=5, max_features='sqrt', random_state=18)

In [None]:
# Validation RMSE for the random-forest model (month block 33).
# `math` and `mean_squared_error` are provided by the notebook's import cell,
# so they are not re-imported here.
pred = rf_regressor.predict(X_val)
MSE = mean_squared_error(y_val, pred)
RMSE = math.sqrt(MSE)
print(f'RMSE value is: {RMSE}')


RMSE value is: 0.8842876779995373


# Linear Regression Model

In [None]:
%%time

from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline with default settings.
LinearModel = LinearRegression()
LinearModel.fit(X_train, y_train)

# Validation RMSE (month block 33); `math` and `mean_squared_error` were
# imported in earlier cells.
pred = LinearModel.predict(X_val)
MSE = mean_squared_error(y_true=y_val, y_pred=pred)
RMSE = math.sqrt(MSE)
print(f'RMSE value is: {RMSE}')


RMSE value is: 0.8934985989257929
CPU times: user 49.4 s, sys: 25.1 s, total: 1min 14s
Wall time: 34.9 s


# XGBoost

In [None]:
import xgboost

In [None]:
# XGBoost regressor: shallow trees (max_depth=3), half of the columns sampled
# per tree, and split/leaf regularisation via gamma and min_child_weight.
# `verbosity=1` (warnings only, the library default) replaces the deprecated
# `silent=False` flag, which newer XGBoost releases no longer recognise.
xgb_regressor = xgboost.XGBRegressor(
    colsample_bytree=0.5,
    gamma=0.1,
    min_child_weight=7,
    verbosity=1,
    validate_parameters=True,
    max_depth=3,
)

In [None]:
%%time

# Fit the XGBoost regressor on the training months (date_block_num < 33).
# Left as the last expression so the fitted estimator is displayed.
xgb_regressor.fit(X_train,y_train)

CPU times: user 25min 35s, sys: 7.77 s, total: 25min 43s
Wall time: 25min 36s


XGBRegressor(colsample_bytree=0.5, gamma=0.1, min_child_weight=7, silent=False,
             validate_parameters=True)

In [None]:
# Validation RMSE for the XGBoost regressor (month block 33).
pred = xgb_regressor.predict(X_val)
MSE = mean_squared_error(y_true=y_val, y_pred=pred)
RMSE = math.sqrt(MSE)
print(f'RMSE value is: {RMSE}')


RMSE value is: 0.8690382993662482


# Bagging Regressor

In [None]:
%%time

from sklearn.ensemble import BaggingRegressor

# Bagging ensemble of 35 estimators (scikit-learn's default base estimator),
# seeded for reproducibility.
# NOTE(review): `max_samples=1000` is an integer, so each estimator is trained
# on 1000 sampled rows rather than a fraction of the data -- confirm that such
# a small sample is intended.
bag_regressor = BaggingRegressor(
    n_estimators=35,
    random_state=1,
    max_samples=1000,
).fit(X_train, y_train)

CPU times: user 7min 56s, sys: 6.02 s, total: 8min 2s
Wall time: 7min 58s


In [None]:
# Validation RMSE for the bagging regressor (month block 33).
pred = bag_regressor.predict(X_val)
MSE = mean_squared_error(y_true=y_val, y_pred=pred)
RMSE = math.sqrt(MSE)
print(f'RMSE value is: {RMSE}')

RMSE value is: 0.9447815211138822


# LightGBM (Light Gradient Boosting Machine)

In [None]:
%%time
# LightGBM (Light Gradient Boosting Machine) model; the parameters below can
# be tuned further for accuracy.
import lightgbm as lgb

params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.005,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'force_col_wise': True,
    'random_state': 10,
    'num_rounds': 600,      # number of boosting iterations
    'early_stopping': 150,  # stop after 150 rounds without validation improvement
}

# Training months as the train set, month block 33 as the validation set.
lgb_train = lgb.Dataset(X_train, y_train)
lgb_val = lgb.Dataset(X_val, y_val)

# Both sets are evaluated and the metric is logged every 50 iterations.
# NOTE(review): `verbose_eval` was removed from `lgb.train` in LightGBM 4.0
# (the `log_evaluation` callback replaces it) -- confirm the installed version.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=(lgb_train, lgb_val),
    verbose_eval=50,
)



Training until validation scores don't improve for 150 rounds.
[50]	training's rmse: 1.1301	valid_1's rmse: 0.989922
[100]	training's rmse: 1.05244	valid_1's rmse: 0.933817
[150]	training's rmse: 0.998374	valid_1's rmse: 0.900684
[200]	training's rmse: 0.961513	valid_1's rmse: 0.881552
[250]	training's rmse: 0.935576	valid_1's rmse: 0.870126
[300]	training's rmse: 0.916309	valid_1's rmse: 0.866266
[350]	training's rmse: 0.902279	valid_1's rmse: 0.863024
[400]	training's rmse: 0.891612	valid_1's rmse: 0.860515
[450]	training's rmse: 0.882833	valid_1's rmse: 0.858662
[500]	training's rmse: 0.875557	valid_1's rmse: 0.85893
[550]	training's rmse: 0.869308	valid_1's rmse: 0.858227
[600]	training's rmse: 0.863655	valid_1's rmse: 0.857906
Did not meet early stopping. Best iteration is:
[600]	training's rmse: 0.863655	valid_1's rmse: 0.857906
CPU times: user 22min 10s, sys: 7.01 s, total: 22min 17s
Wall time: 22min 11s


# Stacking

In [None]:
# Show the (rows, columns) shape of the validation feature frame.
X_val.shape

(221676, 41)

In [None]:
from sklearn.model_selection import train_test_split

# Data for the stacking stage: every labelled month (date_block_num < 34),
# with the target `item_cnt_month` separated from the features.
X = df.loc[df["date_block_num"] < 34]
y = X["item_cnt_month"]
X = X.drop(columns=["item_cnt_month"])

# Random 90/10 split of the stacking data.
# NOTE(review): the base models above were fitted on months < 33, so most rows
# in this split were already seen by them during training; their predictions
# on those rows are in-sample, which can bias the meta-learner. Consider
# building the meta-features only from rows the base models did not train on.
x_train, x_test, y_Train, y_Test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# XGBoost on Stacked Data

In [None]:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_Train, y_Test = train_test_split(X_val, y_val, test_size=0.1, random_state=42)

In [None]:
%%time
# Random-forest predictions on the stacking training split; these become the
# first meta-feature column.
model1=rf_regressor.predict(x_train)

CPU times: user 41.4 s, sys: 3.67 s, total: 45 s
Wall time: 44.7 s


In [None]:
%%time
# Bagging-regressor predictions on the stacking training split; these become
# the second meta-feature column.
model2=bag_regressor.predict(x_train)

CPU times: user 7min 35s, sys: 6.3 s, total: 7min 41s
Wall time: 7min 37s


In [None]:
%%time
# LightGBM predictions on the stacking training split; these become the third
# meta-feature column. `model` is the booster returned by `lgb.train`.
model3 = model.predict(x_train)

CPU times: user 11min 3s, sys: 4.04 s, total: 11min 7s
Wall time: 11min 4s


In [None]:
%%time
# Linear-regression predictions on the stacking training split; these become
# the fourth meta-feature column.
model4=LinearModel.predict(x_train)

CPU times: user 20.9 s, sys: 3.94 s, total: 24.8 s
Wall time: 23.6 s


In [None]:
# Meta-feature matrix: one column per base model, in the order random forest,
# bagging, LightGBM, linear regression. Despite the name it is a NumPy array.
pred_df = np.column_stack([model1, model2, model3, model4])


In [None]:
%%time

# Meta-learner for the stack: an XGBoost regressor trained on the four
# base-model prediction columns in `pred_df`.
# `verbosity=1` (warnings only, the library default) replaces the deprecated
# `silent=False` flag, which newer XGBoost releases no longer recognise.
stack_xgb_regressor = xgboost.XGBRegressor(
    colsample_bytree=0.5,
    gamma=0.1,
    min_child_weight=7,
    verbosity=1,
    validate_parameters=True,
    max_depth=3,
)

# Kept as the last expression so the fitted estimator is displayed by the cell.
stack_xgb_regressor.fit(pred_df, y_Train)

CPU times: user 7min 18s, sys: 464 ms, total: 7min 19s
Wall time: 7min 17s


XGBRegressor(colsample_bytree=0.5, gamma=0.1, min_child_weight=7, silent=False,
             validate_parameters=True)

In [None]:
def stacking_test(df, stack_xgb_regressor, rf_regressor, bag_regressor, model, LinearModel):
    """Run the stacked ensemble on a feature set and return its predictions.

    Each base model predicts on ``df``; the four prediction vectors are stacked
    column-wise (random forest, bagging, ``model``, linear regression -- the
    same order used to build the meta-learner's training matrix) and passed to
    the meta-learner.

    Parameters
    ----------
    df : features accepted by every base model's ``predict``.
    stack_xgb_regressor : fitted meta-learner exposing ``predict``.
    rf_regressor, bag_regressor, model, LinearModel : fitted base models
        exposing ``predict``.

    Returns
    -------
    Whatever ``stack_xgb_regressor.predict`` returns for the stacked matrix.

    Side effects
    ------------
    Prints the shape of the stacked meta-feature matrix.
    """
    base_learners = (rf_regressor, bag_regressor, model, LinearModel)

    # One column per base learner, in the order listed above.
    meta_features = np.column_stack([learner.predict(df) for learner in base_learners])
    print(meta_features.shape)

    return stack_xgb_regressor.predict(meta_features)




In [None]:
# Show the (rows, columns) shape of the hold-out part of the stacking split.
x_test.shape

(858719, 41)

In [None]:
%%time

# RMSE of the stacked ensemble on the 10% hold-out from the stacking split.
pred = stacking_test(
    x_test, stack_xgb_regressor, rf_regressor, bag_regressor, model, LinearModel
)
MSE = mean_squared_error(y_true=y_Test, y_pred=pred)
RMSE = math.sqrt(MSE)
print(f'RMSE value is: {RMSE}')

(858719, 4)
RMSE value is: 0.953470136345822
CPU times: user 2min 13s, sys: 3.51 s, total: 2min 17s
Wall time: 2min 14s


# LightGBM on Stacked Data

In [None]:
from sklearn.model_selection import train_test_split

# Split the meta-feature matrix (`pred_df`) and its targets 80/20 for the
# LightGBM meta-learner.
# NOTE(review): this rebinds `x_train` and `x_test`, which previously held the
# 41-column feature frames; from here on they are 4-column meta-feature arrays,
# so earlier cells cannot be re-run after this one without re-creating them.
x_train, x_test, Y_Train, Y_Test = train_test_split(
    pred_df, y_Train, test_size=0.2, random_state=42
)

In [None]:
# LightGBM settings for the meta-learner (same values as the base LightGBM run).
params = {
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.005,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'force_col_wise': True,
    'random_state': 10,
    'num_rounds': 600,      # number of boosting iterations
    'early_stopping': 150,  # stop after 150 rounds without validation improvement
}

In [None]:
# Wrap the meta-feature split as LightGBM datasets.
lgb_train = lgb.Dataset(x_train, Y_Train)
lgb_val = lgb.Dataset(x_test, Y_Test)

# Train the LightGBM meta-learner, logging the metric every 50 iterations.
# NOTE(review): this rebinds `model`, which until now was the base LightGBM
# model used as a stacking input; after this cell `model` expects the
# 4-column meta-features, so `stacking_test(..., model, ...)` must not be
# re-run without re-training the base model first.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    valid_sets=(lgb_train, lgb_val),
    verbose_eval=50,
)



Training until validation scores don't improve for 150 rounds.
[50]	training's rmse: 1.11315	valid_1's rmse: 1.10934
[100]	training's rmse: 1.02473	valid_1's rmse: 1.02213
[150]	training's rmse: 0.966025	valid_1's rmse: 0.964648
[200]	training's rmse: 0.926888	valid_1's rmse: 0.926732
[250]	training's rmse: 0.901545	valid_1's rmse: 0.902517
[300]	training's rmse: 0.884875	valid_1's rmse: 0.886914
[350]	training's rmse: 0.873772	valid_1's rmse: 0.876755
[400]	training's rmse: 0.866759	valid_1's rmse: 0.870579
[450]	training's rmse: 0.861972	valid_1's rmse: 0.866559
[500]	training's rmse: 0.858326	valid_1's rmse: 0.863691
[550]	training's rmse: 0.855753	valid_1's rmse: 0.861792
[600]	training's rmse: 0.853906	valid_1's rmse: 0.860605
Did not meet early stopping. Best iteration is:
[600]	training's rmse: 0.853906	valid_1's rmse: 0.860605


# Saving LGBM Model

In [None]:
# Persist the LightGBM model currently bound to `model` to the Drive folder.
# `pickle` is imported in the notebook's import cell. The context manager
# guarantees the file is flushed and closed even if pickling fails; the
# original passed a bare `open(...)` whose handle was never closed.
with open('/content/drive/MyDrive/Applied AI/Capstone/Predict Future Sales/Final_Files/LGBM_Final_Model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the saved LightGBM model to check that it round-trips.
# The context manager closes the file handle, which the original bare
# `open(...)` call left open.
# NOTE: `pickle.load` can execute arbitrary code -- only load files that were
# written by this notebook or come from a trusted source.
with open('/content/drive/MyDrive/Applied AI/Capstone/Predict Future Sales/Final_Files/LGBM_Final_Model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)

array([5.20445618, 0.04454566, 0.03145738, ..., 0.39012119, 0.6617227 ,
       0.03824956])

In [None]:
# Sanity check: predict with the reloaded model. `x_test` here is the hold-out
# part of the meta-feature split, not the original 41-column feature frame.
pickled_model.predict(x_test)

array([5.20445618, 0.04454566, 0.03145738, ..., 0.39012119, 0.6617227 ,
       0.03824956])