In [27]:
import pandas as pd
import numpy as np
import pickle
import sys
import os

from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from pygam import LinearGAM

# Put the parent of the current working directory on the import path so the
# project-local `Utils` package resolves. This depends on the directory the
# kernel was started in, not on where this notebook file lives.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(),'..')))
from Utils.Model import models_blending_list, model_blending, model_blending_predictions, model_building

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any convergence or deprecation warnings the
# estimators below may raise — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [28]:
# Load the prepared dataset; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="../Data/df_le_scaled.csv", index_col=0)

In [29]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Item_Categories,Outlet_Age,Item_MRP_Categories
0,-0.840601,-0.738147,-0.970732,1.747454,1.507813,-0.66408,-1.369334,-0.252658,0.910601,-0.179795,-0.139541,-0.81748
1,-1.639173,1.354743,-0.908111,-1.489023,-0.607071,-0.66408,1.091569,1.002972,-1.01844,-2.095286,-1.334103,-1.757239
2,1.096763,-0.738147,-0.956917,0.01004,1.507813,-0.66408,-1.369334,-0.252658,-0.049238,-0.179795,-0.139541,1.06204
3,1.498411,1.354743,-1.281758,0.66005,-1.664513,0.799954,1.091569,-1.508289,-0.849103,-0.179795,-0.020085,1.06204
4,-0.928018,-0.738147,-1.281758,-1.39922,-1.312032,-2.128115,1.091569,-0.252658,-0.695373,1.735696,1.293934,-1.757239


In [30]:
# (rows, columns) of the loaded frame.
df.shape

(8523, 12)

In [31]:
# Feature matrix: every column except the target 'Item_Outlet_Sales'.
X = df.drop('Item_Outlet_Sales', axis='columns')
X

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
0,-0.840601,-0.738147,-0.970732,1.747454,1.507813,-0.664080,-1.369334,-0.252658,-0.179795,-0.139541,-0.817480
1,-1.639173,1.354743,-0.908111,-1.489023,-0.607071,-0.664080,1.091569,1.002972,-2.095286,-1.334103,-1.757239
2,1.096763,-0.738147,-0.956917,0.010040,1.507813,-0.664080,-1.369334,-0.252658,-0.179795,-0.139541,1.062040
3,1.498411,1.354743,-1.281758,0.660050,-1.664513,0.799954,1.091569,-1.508289,-0.179795,-0.020085,1.062040
4,-0.928018,-0.738147,-1.281758,-1.399220,-1.312032,-2.128115,1.091569,-0.252658,1.735696,1.293934,-1.757239
...,...,...,...,...,...,...,...,...,...,...,...
8518,-1.415903,-0.738147,-0.181193,1.180783,-1.312032,-2.128115,1.091569,-0.252658,-0.179795,1.293934,-0.817480
8519,-1.057964,1.354743,-0.371154,-0.527301,0.802852,0.799954,-0.138882,-0.252658,-0.179795,-0.497909,0.122280
8520,-0.533458,-0.738147,-0.599784,-0.897208,0.450371,0.799954,-0.138882,-0.252658,1.735696,-0.736822,0.122280
8521,-1.334392,1.354743,1.532880,-0.607977,-0.607071,-0.664080,1.091569,1.002972,-0.179795,-1.334103,0.122280


In [32]:
# Target vector: the 'Item_Outlet_Sales' column on its own.
y = df.loc[:, 'Item_Outlet_Sales']
y

0       0.910601
1      -1.018440
2      -0.049238
3      -0.849103
4      -0.695373
          ...   
8518    0.349915
8519   -0.956402
8520   -0.579100
8521   -0.196725
8522   -0.829594
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

In [33]:
# Train-Test-Split
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [34]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
549,-0.793348,1.354743,-0.599405,0.487415,1.507813,-0.664080,-1.369334,-0.252658,-0.179795,-0.139541,1.062040
7757,1.214895,-0.738147,-0.361643,0.474526,0.802852,0.799954,-0.138882,-0.252658,1.735696,-0.497909,1.062040
764,1.120389,1.354743,0.193620,-0.470081,1.155333,0.799954,-1.369334,-0.252658,-0.179795,0.099372,0.122280
6867,-1.070958,-0.738147,-0.703303,-1.595901,0.802852,0.799954,-0.138882,-0.252658,-0.179795,-0.497909,-1.757239
2716,-0.001864,-0.738147,1.377965,0.233979,1.155333,0.799954,-1.369334,-0.252658,-0.179795,0.099372,1.062040
...,...,...,...,...,...,...,...,...,...,...,...
5734,-0.818156,1.354743,4.268121,-0.029050,-1.664513,0.799954,1.091569,-1.508289,-0.179795,-0.020085,1.062040
5191,0.647861,-0.738147,0.997049,-1.049049,-0.959551,0.799954,-0.138882,-0.252658,-0.179795,-1.095190,0.122280
5390,1.120389,-0.738147,-0.914595,1.547519,0.802852,0.799954,-0.138882,-0.252658,1.735696,-0.497909,-0.817480
860,1.770115,-0.738147,-0.228111,-0.370092,-0.959551,0.799954,-0.138882,-0.252658,-0.179795,-1.095190,0.122280


In [35]:
# Inspect the held-out test features produced by the split.
X_test

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Categories,Outlet_Age,Item_MRP_Categories
7503,0.340718,-0.738147,-0.772020,-0.988617,-1.312032,-2.128115,1.091569,-0.252658,-0.179795,1.293934,0.122280
2957,-1.164282,-0.738147,0.096983,-1.578319,1.155333,0.799954,-1.369334,-0.252658,1.735696,0.099372,-1.757239
7031,0.387971,1.354743,-0.481034,-1.588970,1.507813,-0.664080,-1.369334,-0.252658,-0.179795,-0.139541,-1.757239
1084,-0.238728,1.354743,-0.414092,0.525323,0.097891,-0.664080,1.091569,2.258603,-2.095286,1.532846,1.062040
856,-0.629145,1.354743,-1.040342,0.907611,0.450371,0.799954,-0.138882,-0.252658,-0.179795,-0.736822,1.062040
...,...,...,...,...,...,...,...,...,...,...,...
7205,-0.249941,1.354743,0.533459,-0.221970,1.155333,0.799954,-1.369334,-0.252658,-0.179795,0.099372,0.122280
3257,-1.379282,-0.738147,-1.281758,0.115312,-0.607071,-0.664080,1.091569,1.002972,-0.179795,-1.334103,1.062040
6346,0.387971,1.354743,-0.482942,-1.588970,-1.312032,-2.128115,1.091569,-0.252658,-0.179795,1.293934,-1.757239
6318,-0.722469,1.354743,1.454639,-1.453187,-0.607071,-0.664080,1.091569,1.002972,-0.179795,-1.334103,-1.757239


In [36]:
# Inspect the training target values.
y_train

549     0.120100
7757    0.540712
764    -0.618898
6867   -1.111693
2716    1.197382
          ...   
5734   -1.113644
5191   -0.515501
5390    2.323046
860    -0.311437
7270   -0.712541
Name: Item_Outlet_Sales, Length: 6818, dtype: float64

In [37]:
# Inspect the held-out test target values.
y_test

7503   -0.256812
2957   -1.069164
7031   -1.057068
1084    2.108058
856     0.102932
          ...   
7205    0.482185
3257   -0.756241
6346   -0.909581
6318   -1.130031
6339   -0.705908
Name: Item_Outlet_Sales, Length: 1705, dtype: float64

In [38]:
# Candidate regressors to benchmark, as (label, estimator) pairs.
# Every estimator that draws random numbers gets a fixed random_state so the
# benchmark scores can be regenerated after a kernel restart. LinearRegression
# is deterministic, and ElasticNet only consumes a seed when
# selection='random' (not the default), so both are left as they were.
models = [('LinearRegression', LinearRegression()), ('ElasticNet', ElasticNet()),
          ('RandomForestRegressor', RandomForestRegressor(random_state=42)),
          ('ExtraTreesRegressor', ExtraTreesRegressor(random_state=42)),
          ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42)),
          ('MLPRegressor', MLPRegressor(random_state=42))]

In [39]:
# Score every candidate with the project helper; it prints one
# "<model name> <score>" line per model.
# NOTE(review): the third argument (3) is presumably the number of
# cross-validation folds — confirm against Utils.Model.model_building.
model_building(X_train, y_train, 3, models)

LinearRegression 0.4963111999380221
ElasticNet 0.050613336457214074
RandomForestRegressor 0.5491339765586986
ExtraTreesRegressor 0.5068825945911802
GradientBoostingRegressor 0.5919927815549019
MLPRegressor 0.5780200335846292


In [40]:
# Same benchmark as the previous call, with the third argument raised to 5.
# NOTE(review): presumably 5 cross-validation folds instead of 3 — confirm
# against Utils.Model.model_building.
model_building(X_train, y_train, 5, models)

LinearRegression 0.4966295593034424
ElasticNet 0.05091409727371508
RandomForestRegressor 0.5522427971468143
ExtraTreesRegressor 0.5079607885690548
GradientBoostingRegressor 0.5936175387753281
MLPRegressor 0.5791154305796506


In [41]:
# Linear GAM
# Fit a generalised additive model on the raw NumPy arrays, letting pyGAM's
# grid search tune it, then print the fitted-model report.

gam = LinearGAM()
gam = gam.gridsearch(X_train.to_numpy(), y_train.to_numpy())
gam.summary()

  0% (0 of 11) |                         | Elapsed Time: 0:00:00 ETA:  --:--:--


  9% (1 of 11) |##                       | Elapsed Time: 0:00:00 ETA:   0:00:05
 18% (2 of 11) |####                     | Elapsed Time: 0:00:01 ETA:   0:00:04
 27% (3 of 11) |######                   | Elapsed Time: 0:00:01 ETA:   0:00:04
 36% (4 of 11) |#########                | Elapsed Time: 0:00:02 ETA:   0:00:03
 45% (5 of 11) |###########              | Elapsed Time: 0:00:02 ETA:   0:00:03
 54% (6 of 11) |#############            | Elapsed Time: 0:00:03 ETA:   0:00:02
 63% (7 of 11) |###############          | Elapsed Time: 0:00:04 ETA:   0:00:02
 72% (8 of 11) |##################       | Elapsed Time: 0:00:04 ETA:   0:00:01
 81% (9 of 11) |####################     | Elapsed Time: 0:00:05 ETA:   0:00:01
 90% (10 of 11) |#####################   | Elapsed Time: 0:00:06 ETA:   0:00:00
100% (11 of 11) |########################| Elapsed Time: 0:00:07 Time:  0:00:07


LinearGAM                                                                                                 
Distribution:                        NormalDist Effective DoF:                                     26.7769
Link Function:                     IdentityLink Log Likelihood:                                 -8368.5583
Number of Samples:                         6818 AIC:                                            16792.6704
                                                AICc:                                           16792.9059
                                                GCV:                                                0.4522
                                                Scale:                                               0.449
                                                Pseudo R-Squared:                                   0.5598
Feature Function                  Lambda               Rank         EDoF         P > x        Sig. Code   
s(0)                              [10

In [42]:
# Voting Regressor
# Combines a linear model, a gradient-boosted ensemble and an MLP into one
# VotingRegressor and scores it with 6-fold cross-validation.
# The two stochastic members get a fixed random_state so the fold scores are
# reproducible; LinearRegression is deterministic and needs none.

regressor1 = LinearRegression()
regressor2 = GradientBoostingRegressor(random_state=42)
regressor3 = MLPRegressor(random_state=42)

voting_regressor = VotingRegressor(estimators=[('LR', regressor1), ('GBR', regressor2), ('MLP', regressor3)])
scores = cross_val_score(voting_regressor, X_train, y_train, cv=6)

In [43]:
# Per-fold scores from the most recent cross_val_score call (one value per fold).
scores

array([0.566522  , 0.60243912, 0.59581584, 0.56157768, 0.61242467,
       0.56031026])

In [44]:
# Collapse the fold scores into a single average for comparison across ensembles.
scores.mean()

0.5831815964732977

In [45]:
# Stacking
# Level-1 learners (linear regression and gradient boosting) feed their
# predictions to an MLP that acts as the level-2 combiner; the stack is scored
# with 6-fold cross-validation.
# The stochastic estimators get a fixed random_state so the fold scores are
# reproducible; LinearRegression is deterministic and needs none.

regressor1 = LinearRegression()
regressor2 = GradientBoostingRegressor(random_state=42)

level1_estimators = [('LR', regressor1), ('GBR', regressor2)]
level2_estimator = MLPRegressor(random_state=42)

stacking_regressor = StackingRegressor(estimators=level1_estimators, final_estimator=level2_estimator)
scores = cross_val_score(stacking_regressor, X_train, y_train, cv=6)

In [46]:
# Per-fold scores from the most recent cross_val_score call (one value per fold).
scores

array([0.5802029 , 0.60801363, 0.60448885, 0.5739029 , 0.61838796,
       0.56723052])

In [47]:
# Collapse the fold scores into a single average for comparison across ensembles.
scores.mean()

0.592037791989303

In [48]:
# Blending
# Split a further 10% validation slice off the training data for the blender.

X_train1, X_val, y_train1, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

# `models` is rebound here: from this point on it holds whatever
# models_blending_list() returns, not the (name, estimator) list built earlier.
models = models_blending_list()
# NOTE(review): presumably fits the base models on X_train1 and the blender on
# their X_val predictions — confirm against Utils.Model.model_blending.
model_blender = model_blending(models, X_train1, X_val, y_train1, y_val)
# NOTE(review): cross_val_score refits a clone of model_blender directly on
# X_train; if the blender normally consumes base-model predictions rather than
# raw features, these scores do not measure the blended pipeline — verify.
scores = cross_val_score(model_blender, X_train, y_train, cv=6)
scores

array([0.545117  , 0.59339672, 0.59464261, 0.55669926, 0.62258553,
       0.55406087])

In [49]:
# Collapse the fold scores into a single average for comparison across ensembles.
scores.mean()

0.5777503293326237

In [50]:
# Hold-out evaluation: fit each candidate on the full training split and
# report its R^2 on the test split.
# NOTE(review): the test split is used here to compare candidates, so the
# reported R^2 of whichever model is picked is an optimistic estimate.

# Blending model predictions
y_pred = model_blending_predictions(models, model_blender, X_test)
print('R^2 of Blending is ')
print(r2_score(y_test, y_pred))

# Stacking model predictions
# fit() returns the estimator itself, so model_stack is stacking_regressor, now fitted.
model_stack = stacking_regressor.fit(X_train, y_train)
y_pred = model_stack.predict(X_test)
print('R^2 of Stacking is ')
print(r2_score(y_test, y_pred))

# Voting Regressor model predictions
model_voting = voting_regressor.fit(X_train, y_train)
y_pred = model_voting.predict(X_test)
print('R^2 of Voting is ')
print(r2_score(y_test, y_pred))

# Gradient Boosting Regressor model predictions
# Fixed random_state: this is the model that gets pickled afterwards, so the
# saved artifact and its reported R^2 must be reproducible on re-run.
model_gbm = GradientBoostingRegressor(random_state=42)
model_gbm.fit(X_train, y_train)
y_pred = model_gbm.predict(X_test)
print('R^2 of GBM is ')
print(r2_score(y_test, y_pred))


R^2 of Blending is 
0.6059392228174556
R^2 of Stacking is 
0.6102198431278503
R^2 of Voting is 
0.6053462776569933
R^2 of GBM is 
0.6069622482385981


In [51]:
# For interpretability we are choosing the GBM model.
# Serialise the fitted estimator to 'GBM_Model.pkl' in the current working directory.
with open('GBM_Model.pkl', mode='wb') as model_file:
    pickle.dump(model_gbm, model_file)
