# BigMart Sales Prediction Part 2

In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.ensemble import AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression,LogisticRegression,ridge_regression,ElasticNet
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score,accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.tree import DecisionTreeRegressor,ExtraTreeRegressor
from xgboost import XGBRegressor,XGBRFRegressor


In [2]:
# Read the training split from the working directory; the bare name on the
# last line renders the frame as the cell output.
train = pd.read_csv(filepath_or_buffer="train.csv")
train

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,FDW44,9.500,Regular,0.035206,Fruits and Vegetables,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,2386.2273
1,NCF54,18.000,Low Fat,0.047473,Household,170.5422,OUT045,2002,Medium,Tier 2,Supermarket Type1,3103.9597
2,FDY03,17.600,Regular,0.076122,Meat,111.7202,OUT046,1997,Small,Tier 1,Supermarket Type1,1125.2020
3,FDQ20,8.325,Low Fat,0.029845,Fruits and Vegetables,41.6138,OUT045,2002,Medium,Tier 2,Supermarket Type1,284.2966
4,FDP34,12.850,Low Fat,0.137228,Snack Foods,155.5630,OUT046,1997,Small,Tier 1,Supermarket Type1,4224.5010
...,...,...,...,...,...,...,...,...,...,...,...,...
6813,FDY08,9.395,Regular,0.286345,Fruits and Vegetables,139.1838,OUT010,1998,Medium,Tier 3,Grocery Store,280.9676
6814,FDC41,15.600,Low Fat,0.117575,Frozen Foods,75.6670,OUT017,2007,Medium,Tier 2,Supermarket Type1,1301.6390
6815,NCQ53,17.600,Low Fat,0.018944,Health and Hygiene,237.3590,OUT045,2002,Medium,Tier 2,Supermarket Type1,6145.3340
6816,FDL46,20.350,Low Fat,0.054363,Snack Foods,117.9466,OUT017,2007,Medium,Tier 2,Supermarket Type1,1649.8524


In [3]:
# Read the hold-out split from the working directory; the bare name on the
# last line renders the frame as the cell output.
test = pd.read_csv(filepath_or_buffer="test.csv")
test

Unnamed: 0,item_identifier,item_weight,item_fat_content,item_visibility,item_type,item_mrp,outlet_identifier,outlet_establishment_year,outlet_size,outlet_location_type,outlet_type,item_outlet_sales
0,FDI28,14.300000,Low Fat,0.026300,Frozen Foods,79.4302,OUT013,1987,High,Tier 3,Supermarket Type1,1743.0645
1,NCM17,7.930000,Low Fat,0.071136,Health and Hygiene,42.7086,OUT046,1997,Small,Tier 1,Supermarket Type1,356.8688
2,FDC14,14.500000,Regular,0.041313,Canned,42.0454,OUT049,1999,Medium,Tier 1,Supermarket Type1,377.5086
3,DRC36,12.857645,Regular,0.044767,Soft Drinks,173.7054,OUT027,1985,Medium,Tier 3,Supermarket Type3,5778.4780
4,FDS27,10.195000,Regular,0.012456,Meat,197.5110,OUT035,2004,Small,Tier 2,Supermarket Type1,2356.9320
...,...,...,...,...,...,...,...,...,...,...,...,...
1700,FDJ34,11.800000,Regular,0.093656,Snack Foods,127.1704,OUT046,1997,Small,Tier 1,Supermarket Type1,3004.0896
1701,FDS55,7.020000,Low Fat,0.000000,Fruits and Vegetables,148.1734,OUT018,2009,Medium,Tier 3,Supermarket Type2,890.8404
1702,FDC14,14.500000,Regular,0.041215,Canned,42.0454,OUT013,1987,High,Tier 3,Supermarket Type1,629.1810
1703,FDY12,9.800000,Regular,0.141184,Baking Goods,50.5008,OUT018,2009,Medium,Tier 3,Supermarket Type2,253.0040


In [4]:
class CustomTransformer(BaseEstimator,TransformerMixin):
    """Label-encode the categorical sales columns and drop identifier columns.

    ``fit`` learns one ``LabelEncoder`` per categorical column and ``transform``
    re-uses those fitted encoders, so every frame passed through the same
    instance gets the same category -> integer mapping. All other columns
    (including the target, if present in ``X``) are passed through unchanged.
    """

    # Columns whose categories are replaced by integer codes.
    CATEGORICAL_COLUMNS = ('item_fat_content','item_type','outlet_size','outlet_location_type','outlet_type')
    # Columns removed from the output array.
    DROPPED_COLUMNS = ('item_identifier','outlet_identifier','outlet_establishment_year')

    def fit(self, X, y=None):
        """Learn the category -> code mapping of every categorical column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing at least ``CATEGORICAL_COLUMNS``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        CustomTransformer
            The fitted instance (``self``).

        Raises
        ------
        KeyError
            If one of the categorical columns is missing from ``X``.
        """
        # A separate encoder per column keeps each learned mapping available
        # for later transform() calls.
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self.CATEGORICAL_COLUMNS
        }
        return self

    def fit_transform(self,X,y=None):
        """Fit the encoders on ``X`` and return the encoded array for ``X``."""
        return self.fit(X, y).transform(X)

    def transform(self,X,y=None):
        """Encode ``X`` with the encoders learned by ``fit``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns as the frame used for fitting.
            It is copied, never modified in place.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        numpy.ndarray
            Remaining columns of ``X`` in their original order, with the
            categorical columns replaced by integer codes.

        Raises
        ------
        NotFittedError
            If called before ``fit`` / ``fit_transform``.
        ValueError
            If ``X`` contains a category that was not seen during fitting
            (raised by ``LabelEncoder.transform``).
        KeyError
            If an expected column is missing from ``X``.
        """
        if not hasattr(self, "encoders_"):
            raise NotFittedError(
                "This CustomTransformer instance is not fitted yet. "
                "Call 'fit' or 'fit_transform' before 'transform'."
            )
        data = X.copy()
        for col, encoder in self.encoders_.items():
            data[col] = encoder.transform(data[col])
        # list(...) matters: pandas would treat a tuple as one single label.
        data = data.drop(columns=list(self.DROPPED_COLUMNS))
        return np.array(data)

In [5]:
# One transformer instance, used below for both the train and the test frame.
Transformer = CustomTransformer()

In [6]:
# Label-encode the categorical columns and drop the identifier/year columns;
# the result is a NumPy array whose last column is item_outlet_sales.
Train = Transformer.fit_transform(X=train)
Train

array([[9.5000000e+00, 1.0000000e+00, 3.5205867e-02, ..., 0.0000000e+00,
        1.0000000e+00, 2.3862273e+03],
       [1.8000000e+01, 0.0000000e+00, 4.7473136e-02, ..., 1.0000000e+00,
        1.0000000e+00, 3.1039597e+03],
       [1.7600000e+01, 1.0000000e+00, 7.6121830e-02, ..., 0.0000000e+00,
        1.0000000e+00, 1.1252020e+03],
       ...,
       [1.7600000e+01, 0.0000000e+00, 1.8943666e-02, ..., 1.0000000e+00,
        1.0000000e+00, 6.1453340e+03],
       [2.0350000e+01, 0.0000000e+00, 5.4362696e-02, ..., 1.0000000e+00,
        1.0000000e+00, 1.6498524e+03],
       [1.6350000e+01, 0.0000000e+00, 1.6993204e-02, ..., 0.0000000e+00,
        1.0000000e+00, 9.6541000e+02]])

In [7]:
# Run the test frame through the same transformer instance; as with Train,
# the target (item_outlet_sales) ends up in the last column of the array.
Test = Transformer.transform(X=test)
Test

array([[1.4300000e+01, 0.0000000e+00, 2.6299797e-02, ..., 2.0000000e+00,
        1.0000000e+00, 1.7430645e+03],
       [7.9300000e+00, 0.0000000e+00, 7.1135870e-02, ..., 0.0000000e+00,
        1.0000000e+00, 3.5686880e+02],
       [1.4500000e+01, 1.0000000e+00, 4.1313200e-02, ..., 0.0000000e+00,
        1.0000000e+00, 3.7750860e+02],
       ...,
       [1.4500000e+01, 1.0000000e+00, 4.1214745e-02, ..., 2.0000000e+00,
        1.0000000e+00, 6.2918100e+02],
       [9.8000000e+00, 1.0000000e+00, 1.4118382e-01, ..., 2.0000000e+00,
        2.0000000e+00, 2.5300400e+02],
       [7.9050000e+00, 0.0000000e+00, 5.5098433e-02, ..., 1.0000000e+00,
        1.0000000e+00, 9.7672860e+02]])

## Splitting the train and test arrays into features and target

In [8]:
# The last column of each encoded array is the target; everything before it
# is the feature matrix.
xtrain, ytrain = Train[:, :-1], Train[:, -1]
xtest, ytest = Test[:, :-1], Test[:, -1]

In [9]:
# Fixed seed for every estimator that uses randomness, so the reported scores
# are identical on each Restart & Run All.
RANDOM_STATE = 42

# Deterministic linear baselines (no seed needed with their default settings).
pipeline_lr = Pipeline([('Linear_model', LinearRegression())])
pipeline_ElS = Pipeline([('Elastic_model', ElasticNet())])

# Single-tree models.
pipeline_dc = Pipeline([('dc_model', DecisionTreeRegressor(random_state=RANDOM_STATE))])
pipeline_Edc = Pipeline([('Edc_model', ExtraTreeRegressor(random_state=RANDOM_STATE))])

# Ensembles from scikit-learn.
pipeline_rf = Pipeline([('rf_model', RandomForestRegressor(random_state=RANDOM_STATE))])
pipeline_AdB = Pipeline([('Adaboost_model', AdaBoostRegressor(random_state=RANDOM_STATE))])
pipeline_Bag = Pipeline([('Bagging_model', BaggingRegressor(random_state=RANDOM_STATE))])
pipeline_GDB = Pipeline([('Gradientboost', GradientBoostingRegressor(random_state=RANDOM_STATE))])

# Ensembles from xgboost.
pipeline_xg = Pipeline([('xgboost', XGBRegressor(random_state=RANDOM_STATE))])
# NOTE: shares the step name 'xgboost' with pipeline_xg; kept unchanged so any
# existing named_steps look-ups keep working.
pipeline_xgrf = Pipeline([('xgboost', XGBRFRegressor(random_state=RANDOM_STATE))])

In [11]:
# Display name -> pipeline; the insertion order is the evaluation order.
models = dict([
    ('linear_regression', pipeline_lr),
    ('ElasticNet', pipeline_ElS),
    ('Decision Tree', pipeline_dc),
    ('Extra Tree', pipeline_Edc),
    ('Random Forest', pipeline_rf),
    ('Adaboost', pipeline_AdB),
    ('Bagging', pipeline_Bag),
    ('Gradientboost', pipeline_GDB),
    ('XGBoost', pipeline_xg),
    ('XGBoostRF', pipeline_xgrf),
])

def model_prediction(name, model):
    """Fit ``model`` on the training split, print its metrics and return them.

    Reads the module-level arrays ``xtrain``, ``ytrain``, ``xtest`` and
    ``ytest``.

    Parameters
    ----------
    name
        Label printed in the report header and stored under ``'model'``.
    model
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.

    Returns
    -------
    dict
        Keys: ``model``, ``mae``, ``mse``, ``r2``, ``train_r2``,
        ``train_rmse``, ``test_rmse``, ``model_accuracy`` and
        ``diff_test_train_acc``. The MAE/MSE/R2 entries refer to the test
        split; ``model_accuracy`` is the harmonic mean of the train and test
        R2 scores.
    """
    model.fit(xtrain, ytrain)
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)

    # Each metric is computed exactly once and re-used below.
    MAE = mean_absolute_error(ytest, y_test_pred)
    MSE = mean_squared_error(ytest, y_test_pred)
    R2 = r2_score(ytest, y_test_pred)
    train_acc = r2_score(ytrain, y_train_pred)
    test_acc = R2
    train_rmse = np.sqrt(mean_squared_error(ytrain, y_train_pred))
    test_rmse = np.sqrt(MSE)

    # Harmonic mean of the two R2 scores; undefined when they cancel out.
    acc_sum = train_acc + test_acc
    if acc_sum != 0:
        model_accuracy = (2 * (train_acc * test_acc)) / acc_sum
    else:
        model_accuracy = float("nan")
    diff_test_train_acc = abs(test_acc - train_acc)

    print(f"Model name : {name}")
    print(f"mean_absolute_error for test data is: {MAE}")
    print(f"mean_squared_error for test data is: {MSE}")
    print(f"r2_score for test data is: {R2}")
    print(f"model accuracy is: {model_accuracy}")
    print(f"difference_in_test_train_acc is: {diff_test_train_acc}\n")

    return {
        'model': name,
        'mae': MAE,
        'mse': MSE,
        'r2': R2,
        'train_r2': train_acc,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'model_accuracy': model_accuracy,
        'diff_test_train_acc': diff_test_train_acc,
    }

        
# Fit and report every registered pipeline in turn, in dict insertion order.
for name, model in models.items():
    model_prediction(name, model)

Model name : linear_regression
mean_absolute_error for test data is: 856.1001291548591
mean_squared_error for test data is: 1293619.8058890766
r2_score for test data is: 0.5240492820293436
model accuracy is: 0.5137070569078588
difference_in_test_train_acc is: 0.020284138415284536

Model name : ElasticNet
mean_absolute_error for test data is: 891.5841673852548
mean_squared_error for test data is: 1438050.765731585
r2_score for test data is: 0.47091000670185434
model accuracy is: 0.46119541260739133
difference_in_test_train_acc is: 0.019036476961644677

Model name : Decision Tree
mean_absolute_error for test data is: 1016.8402161876833
mean_squared_error for test data is: 2179370.8092979626
r2_score for test data is: 0.19816232196850092
model accuracy is: 0.3307770881042786
difference_in_test_train_acc is: 0.8018376780314991

Model name : Extra Tree
mean_absolute_error for test data is: 1021.8542228152492
mean_squared_error for test data is: 2157664.14696557
r2_score for test data is: 0.