In [3]:
import pandas as pd
import numpy as np
import pickle
import joblib
import sys
import os

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

from pygam import LinearGAM

sys.path.append(os.path.abspath(os.path.join(os.getcwd(),'..')))
from Utils.Model import models_blending_list, model_blending, model_blending_predictions, model_building

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv("../Data/df_le.csv", index_col=0)

In [5]:
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Categories,Outlet_Age,Item_MRP_Categories
0,9.3,0,0.016047,249.8092,9,1,0,1,3735.138,1,15,1
1,5.92,1,0.019278,48.2692,3,1,2,2,443.4228,0,5,0
2,17.5,0,0.01676,141.618,9,1,0,1,2097.27,1,15,3
3,19.2,1,0.0,182.095,0,2,2,0,732.38,1,16,3
4,8.93,0,0.0,53.8614,1,0,2,1,994.7052,2,27,0


In [6]:
df.shape

(8523, 12)

In [7]:
X = df.drop(columns=['Item_Outlet_Sales'])
X

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
0,9.300,0,0.016047,249.8092,9,1,0,1,1,15,1
1,5.920,1,0.019278,48.2692,3,1,2,2,0,5,0
2,17.500,0,0.016760,141.6180,9,1,0,1,1,15,3
3,19.200,1,0.000000,182.0950,0,2,2,0,1,16,3
4,8.930,0,0.000000,53.8614,1,0,2,1,2,27,0
...,...,...,...,...,...,...,...,...,...,...,...
8518,6.865,0,0.056783,214.5218,1,0,2,1,1,27,1
8519,8.380,1,0.046982,108.1570,7,2,1,1,1,12,2
8520,10.600,0,0.035186,85.1224,6,2,1,1,2,10,2
8521,7.210,1,0.145221,103.1332,3,1,2,2,1,5,2


In [8]:
y = df['Item_Outlet_Sales']
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

In [9]:
# Train-Test-Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
X_train

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
549,9.500,1,0.035206,171.3448,9,1,0,1,1,15,3
7757,18.000,0,0.047473,170.5422,7,2,1,1,2,12,3
764,17.600,1,0.076122,111.7202,8,2,0,1,1,17,2
6867,8.325,0,0.029845,41.6138,7,2,1,1,1,12,0
2716,12.850,0,0.137228,155.5630,8,2,0,1,1,17,3
...,...,...,...,...,...,...,...,...,...,...,...
5734,9.395,1,0.286345,139.1838,0,2,2,0,1,16,3
5191,15.600,0,0.117575,75.6670,2,2,1,1,1,7,2
5390,17.600,0,0.018944,237.3590,7,2,1,1,2,12,1
860,20.350,0,0.054363,117.9466,2,2,1,1,1,7,2


In [11]:
X_test

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
7503,14.30000,0,0.026300,79.4302,1,0,2,1,1,27,2
2957,7.93000,0,0.071136,42.7086,8,2,0,1,2,17,0
7031,14.50000,1,0.041313,42.0454,9,1,0,1,1,15,0
1084,11.84746,1,0.044767,173.7054,5,1,2,3,0,29,3
856,10.19500,1,0.012456,197.5110,6,2,1,1,1,10,3
...,...,...,...,...,...,...,...,...,...,...,...
7205,11.80000,1,0.093656,127.1704,8,2,0,1,1,17,2
3257,7.02000,0,0.000000,148.1734,3,1,2,2,1,5,3
6346,14.50000,1,0.041215,42.0454,1,0,2,1,1,27,0
6318,9.80000,1,0.141184,50.5008,3,1,2,2,1,5,0


In [12]:
y_train

549     2386.2272
7757    3103.9596
764     1125.2020
6867     284.2966
2716    4224.5010
          ...    
5734     280.9676
5191    1301.6390
5390    6145.3340
860     1649.8524
7270     965.4100
Name: Item_Outlet_Sales, Length: 6818, dtype: float64

In [13]:
y_test

7503    1743.0644
2957     356.8688
7031     377.5086
1084    5778.4782
856     2356.9320
          ...    
7205    3004.0896
3257     890.8404
6346     629.1810
6318     253.0040
6339     976.7286
Name: Item_Outlet_Sales, Length: 1705, dtype: float64

In [14]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)

X_train_scaled.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
0,-0.799489,1.344216,-0.600703,0.470709,1.520681,-0.665305,-1.383482,-0.259489,-0.182954,-0.136169,1.058184
1,1.208955,-0.743928,-0.362159,0.457877,0.813523,0.799354,-0.149659,-0.259489,1.727281,-0.493521,1.058184
2,1.11444,1.344216,0.194933,-0.482625,1.167102,0.799354,-1.383482,-0.259489,-0.182954,0.102066,0.120236
3,-1.077127,-0.743928,-0.704944,-1.603553,0.813523,0.799354,-0.149659,-0.259489,-0.182954,-0.493521,-1.75566
4,-0.007926,-0.743928,1.383177,0.218375,1.167102,0.799354,-1.383482,-0.259489,-0.182954,0.102066,1.058184


In [16]:
with open('../Pickle Files/Scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [17]:
X_test_scaled = scaler.transform(X_test)

In [18]:
models = [('LinearRegression', LinearRegression()), ('ElasticNet', ElasticNet()), 
          ('RandomForestRegressor', RandomForestRegressor()), ('ExtraTreesRegressor', ExtraTreesRegressor()), 
          ('GradientBoostingRegressor', GradientBoostingRegressor()),
          ('MLPRegressor', MLPRegressor())]

In [19]:
model_building(X_train_scaled, y_train, 3, models)

LinearRegression 0.4963111999380221
ElasticNet 0.4462428019307918
RandomForestRegressor 0.5517727235580955
ExtraTreesRegressor 0.5084885825599266
GradientBoostingRegressor 0.5923023747367715
MLPRegressor 0.5470535597213777


In [20]:
model_building(X_train_scaled, y_train, 5, models)

LinearRegression 0.4966295593034424
ElasticNet 0.4463419910017688
RandomForestRegressor 0.5506580610870581
ExtraTreesRegressor 0.5085900862965536
GradientBoostingRegressor 0.5940270517546347
MLPRegressor 0.5592708794034928


In [21]:
# Linear GAM

gam = LinearGAM().gridsearch(X_train_scaled.values, y_train.values)
gam.summary()

  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--
  9% (1 of 11) |##                       | Elapsed Time: 0:00:00 ETA:   0:00:06
 18% (2 of 11) |####                     | Elapsed Time: 0:00:01 ETA:   0:00:05
 27% (3 of 11) |######                   | Elapsed Time: 0:00:01 ETA:   0:00:04
 36% (4 of 11) |#########                | Elapsed Time: 0:00:02 ETA:   0:00:03
 45% (5 of 11) |###########              | Elapsed Time: 0:00:02 ETA:   0:00:03
 54% (6 of 11) |#############            | Elapsed Time: 0:00:03 ETA:   0:00:02
 63% (7 of 11) |###############          | Elapsed Time: 0:00:03 ETA:   0:00:01
 72% (8 of 11) |##################       | Elapsed Time: 0:00:04 ETA:   0:00:01
 81% (9 of 11) |####################     | Elapsed Time: 0:00:04 ETA:   0:00:00
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:04 ETA:   0:00:00
100% (11 of 11) |########################| Elapsed Time: 0:00:05 Time:  0:00:05


LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                     26.7769
Link Function:                     IdentityLink Log Likelihood:                               -102286.7839
Number of Samples:                         6818 AIC:                                           204629.1218
                                                AICc:                                          204629.3572
                                                GCV:                                          1316668.1529
                                                Scale:                                        1307363.4949
                                                Pseudo R-Squared:                                   0.5598
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [10

In [22]:
# Voting Regressor

regressor1 = LinearRegression()
regressor2 = GradientBoostingRegressor()
regressor3 = MLPRegressor()

voting_regressor = VotingRegressor(estimators=[('LR', regressor1), ('GBR', regressor2), ('MLP', regressor3)])
scores = cross_val_score(voting_regressor, X_train_scaled, y_train, cv=6)

In [23]:
scores

array([0.55895152, 0.5892069 , 0.57681643, 0.55503737, 0.60232914,
       0.54736159])

In [24]:
scores.mean()

0.5716171599145953

In [25]:
# Stacking

regressor1 = LinearRegression()
regressor2 = GradientBoostingRegressor()

level1_estimators = [('LR', regressor1), ('GBR', regressor2)]
level2_estimator = MLPRegressor()

stacking_regressor = StackingRegressor(estimators=level1_estimators, final_estimator=level2_estimator)
scores = cross_val_score(stacking_regressor, X_train_scaled, y_train, cv=6)
# scores = cross_val_score(stacking_regressor, X_train_scaled, y_train, cv=10)

In [26]:
scores

array([0.58007755, 0.60777453, 0.60126634, 0.57348253, 0.62154107,
       0.56652378])

In [27]:
scores.mean()

0.591777633568646

In [28]:
# Blending

X_train1, X_val, y_train1, y_val = train_test_split(X_train_scaled, y_train, test_size=0.1, random_state=42)

models = models_blending_list()
model_blender = model_blending(models, X_train1, X_val, y_train1, y_val)
scores = cross_val_score(model_blender, X_train_scaled, y_train, cv=6)
scores

array([0.5470483 , 0.57985827, 0.56748985, 0.54604016, 0.5943861 ,
       0.53631004])

In [29]:
scores.mean()

0.5618554532968754

In [30]:
# Blending model predictions
y_pred = model_blending_predictions(models, model_blender, X_test_scaled)
print('R^2 of Blending is ')
print(r2_score(y_test, y_pred))

# Stacking model predictions
model_stack = stacking_regressor.fit(X_train_scaled, y_train)
y_pred = model_stack.predict(X_test_scaled)
print('R^2 of Stacking is ')
print(r2_score(y_test, y_pred))

# Voting Regressor model predictions
model_voting = voting_regressor.fit(X_train_scaled, y_train)
y_pred = model_voting.predict(X_test_scaled)
print('R^2 of Voting is ')
print(r2_score(y_test, y_pred))

# Gradient Boosting Regressor model predictions
model_gbm = GradientBoostingRegressor()
model_gbm.fit(X_train_scaled, y_train)
y_pred = model_gbm.predict(X_test_scaled)
print('R^2 of GBM is ')
print(r2_score(y_test, y_pred))


R^2 of Blending is 
0.6043028886816211
R^2 of Stacking is 
0.6079858304563734
R^2 of Voting is 
0.594267134647419
R^2 of GBM is 
0.6069637540862726


In [33]:
# For interpretability we arre choosing GBM model.
with open('../Pickle Files/GBM_Model.pkl', 'wb') as file:
    pickle.dump(model_gbm, file)


In [34]:
with open('../Pickle Files/GBM_Model.pkl', 'rb') as file:
    model = pickle.load(file)

predicted_sales = model.predict(X_test_scaled)
print(f'Predicted Outlet Sales for the Item: ${predicted_sales}')

Predicted Outlet Sales for the Item: $[1277.97598449  684.85009372  716.35859719 ...  724.37210875  628.52804095
 1641.84092283]
