In [2]:
# A chain of supermarket sotre wishes to build a sales prediction model that can help it understand what products sell in which 
# type of store. The data set contains a snappshot of historical sales from a selection of 10 stores on 1559 SKUs. The focus is
# on building a predictive model for sales based on the available features. 

#The first analysis is done using the statistical model followed by the machine learning approach.

In [3]:
import pandas as pd 
import numpy as np 
import sklearn as skl 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm

In [4]:
train_df = pd.read_csv('Train.csv')
test_df  = pd.read_csv('Test.csv')
sales_df = pd.read_csv('sales-data.csv')

In [5]:
train_df.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [6]:
test_df.isna().sum()

Item_Identifier                 0
Item_Weight                   976
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  1606
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [7]:
train_df.describe()
#sales_df.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867,2181.288914
std,4.643456,0.051598,62.275067,8.37176,1706.499616
min,4.555,0.0,31.29,1985.0,33.29
25%,8.77375,0.026989,93.8265,1987.0,834.2474
50%,12.6,0.053931,143.0128,1999.0,1794.331
75%,16.85,0.094585,185.6437,2004.0,3101.2964
max,21.35,0.328391,266.8884,2009.0,13086.9648


# Step 1 : Traditional Method of Building the Model 

In [8]:
# In traditional approach we begin with a set of hypotheses around what to expect before we look at the output from the model.
# For this problem, few probable hypothesis could be that expected SKU sales would be more for the outlets where (1) the items
# is displayed prominently , (2) stores that are large and (3) price of the SKU is less on average.

# The next step once a set of hypotheses are formulated is to incorporate specific inputs as features into a model(usually linear)
# and test if our hypotheses are actually true.

In [9]:
train_df.columns

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')

In [10]:
#df1 = df[['a', 'b']]
#df1 = df.iloc[:, 0:2] # Remember that Python does not slice inclusive of the ending index.
x = train_df[['Item_Visibility','Item_MRP','Outlet_Establishment_Year']]
print(train_df.columns)
#x = train_df['Outlet_Type']
y = train_df['Item_Outlet_Sales']

Index(['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility',
       'Item_Type', 'Item_MRP', 'Outlet_Identifier',
       'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type',
       'Outlet_Type', 'Item_Outlet_Sales'],
      dtype='object')


In [11]:
sales_model = LinearRegression().fit(x,y)

In [12]:
sales_model.coef_

array([-4382.52991617,    15.55676418,   -12.6179102 ])

In [13]:
#X = sm.add_constant(X.ravel())
results = sm.OLS(y,x).fit()
results.summary() 

0,1,2,3
Dep. Variable:,Item_Outlet_Sales,R-squared (uncentered):,0.749
Model:,OLS,Adj. R-squared (uncentered):,0.749
Method:,Least Squares,F-statistic:,8467.0
Date:,"Sat, 22 Jan 2022",Prob (F-statistic):,0.0
Time:,01:41:20,Log-Likelihood:,-73762.0
No. Observations:,8523,AIC:,147500.0
Df Residuals:,8520,BIC:,147600.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Item_Visibility,-4216.6575,291.318,-14.474,0.000,-4787.712,-3645.603
Item_MRP,15.5645,0.241,64.457,0.000,15.091,16.038
Outlet_Establishment_Year,0.1328,0.021,6.327,0.000,0.092,0.174

0,1,2,3
Omnibus:,814.484,Durbin-Watson:,1.989
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1716.759
Skew:,0.613,Prob(JB):,0.0
Kurtosis:,4.825,Cond. No.,38800.0


In [14]:
#This is the traditional marketing research approach. It is important to note that generating a set of hypotheses about 
#the data generating process and collecting data allows the analyst to test these hypotheses. 

In [15]:
train_x = train_df.drop('Item_Outlet_Sales',axis=1)

In [16]:
#Step 1 : Metric choice and data split 
#Since this is a problem involves prediction of numeric outcome i.e. sales , we choose RMSE as the metric. We then split the 
#training data further into training and vaildation subsets.
x_train,x_test,y_train,y_test = train_test_split(train_x,y,random_state=123,test_size=0.20)

In [17]:
x_train.reset_index().drop('index',axis=1)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDS33,,Regular,0.216108,Snack Foods,86.8514,OUT019,1985,Small,Tier 1,Grocery Store
1,NCY05,13.500,Low Fat,0.055076,Health and Hygiene,33.3874,OUT049,1999,Medium,Tier 1,Supermarket Type1
2,FDC51,10.895,Regular,0.009614,Dairy,122.9730,OUT013,1987,High,Tier 3,Supermarket Type1
3,FDX03,15.850,Regular,0.061045,Meat,44.7744,OUT013,1987,High,Tier 3,Supermarket Type1
4,FDZ58,17.850,Low Fat,0.052472,Snack Foods,123.7072,OUT017,2007,,Tier 2,Supermarket Type1
...,...,...,...,...,...,...,...,...,...,...,...
6813,FDN21,18.600,Low Fat,0.077169,Snack Foods,160.6236,OUT018,2009,Medium,Tier 3,Supermarket Type2
6814,DRF25,,Low Fat,0.068153,Soft Drinks,36.0190,OUT019,1985,Small,Tier 1,Grocery Store
6815,DRC01,5.920,Regular,0.019184,Soft Drinks,50.3692,OUT013,1987,High,Tier 3,Supermarket Type1
6816,FDN12,15.600,Low Fat,0.000000,Baking Goods,111.8544,OUT018,2009,Medium,Tier 3,Supermarket Type2


In [18]:
y_train.reset_index().drop('index',axis=1)

Unnamed: 0,Item_Outlet_Sales
0,354.2056
1,282.2992
2,1231.7300
3,181.0976
4,2450.1440
...,...
6813,1611.2360
6814,73.2380
6815,591.2304
6816,1454.1072


In [19]:
x_test.reset_index().drop('index',axis=1)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDL21,15.850,Regular,0.007145,Snack Foods,41.0480,OUT035,2004,Small,Tier 2,Supermarket Type1
1,FDO08,,Regular,0.094154,Fruits and Vegetables,165.7526,OUT019,1985,Small,Tier 1,Grocery Store
2,DRC12,17.850,Low Fat,0.037886,Soft Drinks,190.4188,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDL28,,Regular,0.000000,Frozen Foods,230.0668,OUT019,1985,Small,Tier 1,Grocery Store
4,FDY15,,Regular,0.170001,Dairy,155.9630,OUT027,1985,Medium,Tier 3,Supermarket Type3
...,...,...,...,...,...,...,...,...,...,...,...
1700,FDO52,11.600,Regular,0.077321,Frozen Foods,169.3106,OUT045,2002,,Tier 2,Supermarket Type1
1701,FDM01,7.895,Regular,0.094549,Breakfast,102.6332,OUT035,2004,Small,Tier 2,Supermarket Type1
1702,DRN47,12.100,Low Fat,0.028165,Hard Drinks,178.1660,OUT010,1998,,Tier 3,Grocery Store
1703,FDP45,15.700,Regular,0.030625,Snack Foods,252.2724,OUT046,1997,Small,Tier 1,Supermarket Type1


In [20]:
y_test.reset_index().drop('index',axis=1)

Unnamed: 0,Item_Outlet_Sales
0,479.3760
1,657.8104
2,952.0940
3,691.1004
4,4537.4270
...,...
1700,2224.4378
1701,1640.5312
1702,539.2980
1703,1258.3620


In [21]:
# We reiterate the fact that we do not touch the test data until we have the final models tuned to the training data.

# Step 2 : Pre-processing

In [22]:
x_train['Item_MRP'].describe()

count    6818.000000
mean      141.004066
std        62.342862
min        31.290000
25%        93.846200
50%       143.031200
75%       185.584500
max       266.888400
Name: Item_MRP, dtype: float64

In [23]:
x_train['Item_MRP_sc'] = (x_train['Item_MRP']- x_train['Item_MRP'].mean())/x_train['Item_MRP'].std()
x_train['Item_MRP_sc'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Item_MRP_sc'] = (x_train['Item_MRP']- x_train['Item_MRP'].mean())/x_train['Item_MRP'].std()


count    6.818000e+03
mean    -2.646819e-15
std      1.000000e+00
min     -1.759850e+00
25%     -7.564277e-01
50%      3.251589e-02
75%      7.150848e-01
max      2.019226e+00
Name: Item_MRP_sc, dtype: float64

In [24]:
x_test['Item_MRP_sc'] = (x_test['Item_MRP']- x_test['Item_MRP'].mean())/x_test['Item_MRP'].std()
x_test['Item_MRP_sc'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Item_MRP_sc'] = (x_test['Item_MRP']- x_test['Item_MRP'].mean())/x_test['Item_MRP'].std()


count    1.705000e+03
mean     2.611987e-15
std      1.000000e+00
min     -1.749264e+00
25%     -7.626631e-01
50%      2.905354e-02
75%      7.251742e-01
max      2.030600e+00
Name: Item_MRP_sc, dtype: float64

In [25]:
x_train['Item_Visibility_sc'] = (x_train['Item_Visibility']- x_train['Item_Visibility'].mean())/x_train['Item_Visibility'].std()
x_train['Item_Visibility_sc'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train['Item_Visibility_sc'] = (x_train['Item_Visibility']- x_train['Item_Visibility'].mean())/x_train['Item_Visibility'].std()


count    6.818000e+03
mean    -1.610151e-15
std      1.000000e+00
min     -1.284823e+00
25%     -7.596378e-01
50%     -2.326221e-01
75%      5.647383e-01
max      5.063513e+00
Name: Item_Visibility_sc, dtype: float64

In [26]:
x_test['Item_Visibility_sc'] = (x_test['Item_Visibility']- x_test['Item_Visibility'].mean())/x_test['Item_Visibility'].std()
x_test['Item_Visibility_sc'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test['Item_Visibility_sc'] = (x_test['Item_Visibility']- x_test['Item_Visibility'].mean())/x_test['Item_Visibility'].std()


count    1.705000e+03
mean    -2.158440e-15
std      1.000000e+00
min     -1.269629e+00
25%     -7.544122e-01
50%     -2.530338e-01
75%      5.165733e-01
max      4.960316e+00
Name: Item_Visibility_sc, dtype: float64

In [27]:
x_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
6926,FDS33,,Regular,0.216108,Snack Foods,86.8514,OUT019,1985,Small,Tier 1,Grocery Store,-0.868627,2.926362
5168,NCY05,13.5,Low Fat,0.055076,Health and Hygiene,33.3874,OUT049,1999,Medium,Tier 1,Supermarket Type1,-1.726207,-0.211592
2095,FDC51,10.895,Regular,0.009614,Dairy,122.973,OUT013,1987,High,Tier 3,Supermarket Type1,-0.289224,-1.097482
4448,FDX03,15.85,Regular,0.061045,Meat,44.7744,OUT013,1987,High,Tier 3,Supermarket Type1,-1.543555,-0.095265
3874,FDZ58,17.85,Low Fat,0.052472,Snack Foods,123.7072,OUT017,2007,,Tier 2,Supermarket Type1,-0.277447,-0.262326


In [28]:
x_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
4532,FDL21,15.85,Regular,0.007145,Snack Foods,41.048,OUT035,2004,Small,Tier 2,Supermarket Type1,-1.610728,-1.134079
186,FDO08,,Regular,0.094154,Fruits and Vegetables,165.7526,OUT019,1985,Small,Tier 1,Grocery Store,0.399941,0.516573
7557,DRC12,17.85,Low Fat,0.037886,Soft Drinks,190.4188,OUT049,1999,Medium,Tier 1,Supermarket Type1,0.797646,-0.550895
6308,FDL28,,Regular,0.0,Frozen Foods,230.0668,OUT019,1985,Small,Tier 1,Grocery Store,1.436909,-1.269629
7297,FDY15,,Regular,0.170001,Dairy,155.963,OUT027,1985,Medium,Tier 3,Supermarket Type3,0.242099,1.955477


In [29]:
len(x_train)

6818

In [30]:
x_train_01 = x_train.copy()

In [31]:
len(x_test)

1705

In [32]:
#imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#imputer.fit(x_train_01)

In [33]:
x_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
6926,FDS33,,Regular,0.216108,Snack Foods,86.8514,OUT019,1985,Small,Tier 1,Grocery Store,-0.868627,2.926362
5168,NCY05,13.5,Low Fat,0.055076,Health and Hygiene,33.3874,OUT049,1999,Medium,Tier 1,Supermarket Type1,-1.726207,-0.211592
2095,FDC51,10.895,Regular,0.009614,Dairy,122.973,OUT013,1987,High,Tier 3,Supermarket Type1,-0.289224,-1.097482
4448,FDX03,15.85,Regular,0.061045,Meat,44.7744,OUT013,1987,High,Tier 3,Supermarket Type1,-1.543555,-0.095265
3874,FDZ58,17.85,Low Fat,0.052472,Snack Foods,123.7072,OUT017,2007,,Tier 2,Supermarket Type1,-0.277447,-0.262326


In [34]:
x_train['Outlet_Type']

6926        Grocery Store
5168    Supermarket Type1
2095    Supermarket Type1
4448    Supermarket Type1
3874    Supermarket Type1
              ...        
7382    Supermarket Type2
7763        Grocery Store
5218    Supermarket Type1
1346    Supermarket Type2
3582    Supermarket Type2
Name: Outlet_Type, Length: 6818, dtype: object

In [35]:
x_train['Outlet_Identifier'].value_counts()

OUT046    761
OUT049    757
OUT013    757
OUT017    756
OUT018    740
OUT045    737
OUT035    733
OUT027    733
OUT010    424
OUT019    420
Name: Outlet_Identifier, dtype: int64

In [36]:
Label_enc = LabelEncoder()
Label_enc.fit(x_train['Outlet_Identifier'])
x_train.Outlet_Identifier  = Label_enc.transform(x_train.Outlet_Identifier)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [37]:
def label_enc(col_arr,df):
    Label_enc = LabelEncoder()
    
    for i in range(0,len(col_arr)):
        Label_enc.fit(df[col_arr[i]])
        df[col_arr[i]]  = Label_enc.transform(df[col_arr[i]]) 

In [38]:
label_enc(['Item_Type','Item_Fat_Content','Item_Type','Outlet_Type','Outlet_Identifier','Outlet_Size','Outlet_Location_Type'],x_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_arr[i]]  = Label_enc.transform(df[col_arr[i]])


In [39]:
label_enc(['Outlet_Identifier','Outlet_Type','Item_Type','Item_Fat_Content','Item_Type','Outlet_Identifier','Outlet_Size','Outlet_Location_Type'],x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_arr[i]]  = Label_enc.transform(df[col_arr[i]])


In [40]:
x_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
4532,FDL21,15.85,2,0.007145,13,41.048,6,2004,2,1,1,-1.610728,-1.134079
186,FDO08,,2,0.094154,6,165.7526,4,1985,2,0,0,0.399941,0.516573
7557,DRC12,17.85,1,0.037886,14,190.4188,9,1999,1,0,1,0.797646,-0.550895
6308,FDL28,,2,0.0,5,230.0668,4,1985,2,0,0,1.436909,-1.269629
7297,FDY15,,2,0.170001,4,155.963,5,1985,1,2,3,0.242099,1.955477


In [41]:
#x_train['Item_Weight'].fillna(method = 'ffill')
#x_train['Item_Weight'].fillna(method = 'bfill')

In [42]:
#x_test['Item_Weight'].fillna(method = 'bfill')
#x_test['Item_Weight'].fillna(method = 'ffill')

In [43]:
#x_test.head()

In [44]:
#x_test['Item_Weight'].fillna(method = 'bfill')
#x_test['Item_Weight'].fillna(method = 'ffill')

In [45]:
x_train['Item_Weight'].fillna(x_train['Item_Weight'].mean(),inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [46]:
x_test['Item_Weight'].fillna(x_test['Item_Weight'].mean(),inplace = True)

In [47]:
x_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
6926,FDS33,12.867419,2,0.216108,13,86.8514,4,1985,2,0,0,-0.868627,2.926362
5168,NCY05,13.5,1,0.055076,8,33.3874,9,1999,1,0,1,-1.726207,-0.211592
2095,FDC51,10.895,2,0.009614,4,122.973,1,1987,0,2,1,-0.289224,-1.097482
4448,FDX03,15.85,2,0.061045,10,44.7744,1,1987,0,2,1,-1.543555,-0.095265
3874,FDZ58,17.85,1,0.052472,13,123.7072,2,2007,3,1,1,-0.277447,-0.262326


In [48]:
x_test.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
4532,FDL21,15.85,2,0.007145,13,41.048,6,2004,2,1,1,-1.610728,-1.134079
186,FDO08,12.817953,2,0.094154,6,165.7526,4,1985,2,0,0,0.399941,0.516573
7557,DRC12,17.85,1,0.037886,14,190.4188,9,1999,1,0,1,0.797646,-0.550895
6308,FDL28,12.817953,2,0.0,5,230.0668,4,1985,2,0,0,1.436909,-1.269629
7297,FDY15,12.817953,2,0.170001,4,155.963,5,1985,1,2,3,0.242099,1.955477


# Step 3 : Model Training,performance measurement and model choice.

In [49]:
# Here we usually start with models that have worked for this kind of data from prior experience or even just a choice of go-to 
# models. To illustrate the hyperparameter tuning process involved in training a machine learning model, we use a random forest
# model to learn from the data. This model is importanat hyperparameter that needs to be tuned during training- the number of trees.
# Focus is on tuning the model.

In [50]:
# class sklearn.ensemble.RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2,
#min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, 
#bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, 
#ccp_alpha=0.0, max_samples=None)

In [51]:
#class sklearn.ensemble.RandomForestRegressor(n_estimators=100, *, criterion='squared_error', max_depth=None, 
#min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, 
#min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, 
#warm_start=False, ccp_alpha=0.0, max_samples=None)

In [69]:
sales_model_random_forest = RandomForestRegressor(n_estimators= 1500,random_state = 42 )

In [53]:
x_train_01 = x_train.drop('Item_Identifier',axis=1)

In [54]:
x_train_01

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_MRP_sc,Item_Visibility_sc
6926,12.867419,2,0.216108,13,86.8514,4,1985,2,0,0,-0.868627,2.926362
5168,13.500000,1,0.055076,8,33.3874,9,1999,1,0,1,-1.726207,-0.211592
2095,10.895000,2,0.009614,4,122.9730,1,1987,0,2,1,-0.289224,-1.097482
4448,15.850000,2,0.061045,10,44.7744,1,1987,0,2,1,-1.543555,-0.095265
3874,17.850000,1,0.052472,13,123.7072,2,2007,3,1,1,-0.277447,-0.262326
...,...,...,...,...,...,...,...,...,...,...,...,...
7382,18.600000,1,0.077169,13,160.6236,3,2009,1,2,2,0.314704,0.218927
7763,12.867419,1,0.068153,14,36.0190,4,1985,2,0,0,-1.683995,0.043244
5218,5.920000,2,0.019184,14,50.3692,1,1987,0,2,1,-1.453813,-0.910993
1346,15.600000,1,0.000000,0,111.8544,3,2009,1,2,2,-0.467570,-1.284823


In [70]:
sales_model_random_forest.fit(x_train_01,y_train)

RandomForestRegressor(n_estimators=1500, random_state=42)

In [71]:
sales_model_random_forest.score(x_train_01,y_train)

0.9405197543019842

In [57]:
x_test_01 = x_test.drop('Item_Identifier',axis=1)

In [58]:
y_predicted = sales_model_random_forest.predict(x_test_01)

In [59]:
y_test_01 = y_test.reset_index().drop('index',axis=1)

In [60]:
y_predicted_01 = pd.Series(y_predicted)

In [61]:
y_predicted_01

0        748.339226
1        358.539958
2       3343.867314
3        543.958600
4       4219.147968
           ...     
1700    2737.869470
1701    1890.252806
1702     505.229014
1703    3915.476588
1704    2336.625100
Length: 1705, dtype: float64

In [62]:
data = {"y_test" : y_test_01,
        "y_predicted" : y_predicted_01}

z = pd.concat(data,axis=1)
z

Unnamed: 0_level_0,y_test,y_predicted
Unnamed: 0_level_1,Item_Outlet_Sales,0
0,479.3760,748.339226
1,657.8104,358.539958
2,952.0940,3343.867314
3,691.1004,543.958600
4,4537.4270,4219.147968
...,...,...
1700,2224.4378,2737.869470
1701,1640.5312,1890.252806
1702,539.2980,505.229014
1703,1258.3620,3915.476588


In [63]:
len(x_test_01)

1705

In [64]:
y_test_01 = np.array(y_test_01)

In [65]:
y_predicted_01 = np.array(y_predicted)

In [66]:
#Calculate absolute errors 
errors = abs(y_predicted_01 - y_test_01)
np.mean(errors)

1724.0617573247325

In [67]:
#calculate the MAPE 
mape = 100* (errors/y_test_01)
np.mean(mape)

254.39006428970697

In [72]:
# To be continued with Decision Tree Project 