- Based on the feature selection we have some relevant features lets build models with those features

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,Normalizer,StandardScaler,PolynomialFeatures
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV,train_test_split

In [2]:
def regression_validation(y_predict,y_test) :
    print("Mean Absolute Error = ",mean_absolute_error(y_predict,y_test))
    print("Mean Squared Error = ",mean_squared_error(y_predict,y_test))
    print("Root Mean Squared Error = ",np.sqrt(mean_squared_error(y_predict,y_test)))
    print("R2 Score = ",r2_score(y_predict,y_test))

In [3]:
encoder = LabelEncoder()
scaling =  StandardScaler()

In [4]:
data = pd.read_csv('Bigmart Sales Modelling Dataset.csv')
data.head()

Unnamed: 0,Item_Identifier,Item_Code,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,Food,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,Drinks,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,Food,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,Food,19.2,Regular,0.070482,Fruits and Vegetables,182.095,OUT010,1998,Medium,Tier 3,Grocery Store,732.38
4,NCD19,Non-Consumable,8.93,No Fat,0.070482,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [5]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier','Outlet_Size',
                'Outlet_Location_Type','Outlet_Type']]
y = data.Item_Outlet_Sales

In [6]:
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])


In [7]:
X.head()

Unnamed: 0,Item_Code,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,1,0,0.016047,249.8092,9,1,0,1
1,0,2,0.019278,48.2692,3,1,2,2
2,1,0,0.01676,141.618,9,1,0,1
3,1,2,0.070482,182.095,0,1,2,0
4,2,1,0.070482,53.8614,1,0,2,1


## Linear Regression

In [8]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier','Outlet_Size',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=5)

In [10]:
modelLR = LinearRegression()
modelLR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
y_predict = modelLR.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  900.7921698301203
Mean Squared Error =  1386172.1715202064
Root Mean Squared Error =  1177.3581322266418
R2 Score =  0.07876410542951773


## Polynomial Regression

In [12]:
poly = PolynomialFeatures(degree=2)
X = poly.fit_transform(X)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=5)

In [13]:
modelLR = LinearRegression()
modelLR.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [14]:
y_predict = modelLR.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  778.7923888042089
Mean Squared Error =  1135561.8710198987
Root Mean Squared Error =  1065.627454141408
R2 Score =  0.35007191725903974


## Lasso Regression

In [15]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier','Outlet_Size',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [16]:
modelLa = Lasso(alpha=1.0,random_state=3)
modelLa.fit(X_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=3,
   selection='cyclic', tol=0.0001, warm_start=False)

In [17]:
y_predict = modelLa.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  890.1997301937444
Mean Squared Error =  1376383.913137191
Root Mean Squared Error =  1173.19389409304
R2 Score =  0.07176036461729107


## Ridge Regression

In [18]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier','Outlet_Size',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [19]:
modelRi = Ridge(alpha=1.0)
modelRi.fit(X_train,y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [20]:
y_predict = modelRi.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  889.0737732227458
Mean Squared Error =  1374078.1486445728
Root Mean Squared Error =  1172.2107953113948
R2 Score =  0.07663749022294952


## Stochastic Gradient Descent

In [21]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier','Outlet_Size',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [22]:
modelSD = SGDRegressor()
modelSD.fit(X_train,y_train)



SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=None, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

In [23]:
y_predict = modelSD.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  1180817359048.8816
Mean Squared Error =  1.7773207449171285e+24
Root Mean Squared Error =  1333161934994.0684
R2 Score =  -3.5775444352876518


## Desicion Tree

In [24]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [25]:
modelDT = DecisionTreeRegressor(criterion='mse',max_depth=5)
modelDT.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=5, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [26]:
y_predict = modelDT.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  742.967408634172
Mean Squared Error =  1117976.9294119687
Root Mean Squared Error =  1057.3442814012703
R2 Score =  0.3568384178636902


## RandomForest 

In [27]:
X = data.loc[:,['Item_Code','Item_Fat_Content','Item_Visibility','Item_MRP','Outlet_Identifier',
                'Outlet_Location_Type','Outlet_Type']]
for  cols in X.select_dtypes(['object']):
    X[cols] = encoder.fit_transform(X[cols])

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=2)

In [28]:
modelRF = RandomForestRegressor(max_depth=5,n_estimators=8,random_state=10)
modelRF.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=8, n_jobs=1,
           oob_score=False, random_state=10, verbose=0, warm_start=False)

In [29]:
y_predict = modelRF.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  737.6450656985775
Mean Squared Error =  1105848.5052457755
Root Mean Squared Error =  1051.5933174216045
R2 Score =  0.36322340855662383


## SVM

In [30]:
modelSVM = SVR(kernel='linear',C=100)
modelSVM.fit(X_train,y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [31]:
y_predict = modelSVM.predict(X_test)
regression_validation(y_predict,y_test)

Mean Absolute Error =  883.6828516509213
Mean Squared Error =  1397599.10033088
Root Mean Squared Error =  1182.2009559845906
R2 Score =  -0.14466681947729843


## Conclusion
- The ML algorithm that perform the best was Random Forest with RMSE = 1051 
- In Future we will be looking at hyperparameter tuning and build models.