<a href="https://colab.research.google.com/github/Rovidicus/Prediction-of-Product-Sales/blob/main/Project1_Part6_core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import modeling tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
# set the default output to pandas
from sklearn import set_config
set_config(transform_output='pandas')

In [3]:
# Transform the training data
X_train_tf = pd.read_csv("Data/X_train_tf.csv")
# Transform the testing data
X_test_tf = pd.read_csv("Data/X_test_tf.csv")
# View the processed training data
X_train_tf.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.817249,-0.712775,1.828109,1.327849,-0.338062,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.55634,-1.291052,0.603369,1.327849,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.131512,1.813319,0.244541,0.136187,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.169219,-1.004931,-0.952591,0.732018,-1.193861,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.528819,-0.965484,-0.33646,0.493686,1.373536,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# View the processed testing data
X_test_tf.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.3310089,-0.776646,-0.998816,-1.293807,0.517737,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.179892,0.100317,-1.585194,-0.102145,-1.193861,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.3784469,-0.482994,-1.595784,0.136187,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4.213344e-16,-0.41544,0.506592,-1.532139,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.6426567,-1.047426,0.886725,0.732018,-1.193861,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [7]:
y_train = pd.read_csv("Data/y_train.csv")
y_test = pd.read_csv("Data/y_test.csv")

# CRISP-DM Phase 4 - Modeling

## Build a linear regression model to predict sales.

In [5]:
# Provide a label for this data
label = 'Test Data'
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [8]:
from sklearn.linear_model import LinearRegression
# Instantiate model
lin_reg = LinearRegression()
#Fit model
lin_reg.fit(X_train_tf, y_train)

In [9]:
 # Test function with default arguments
evaluate_regression(lin_reg, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.129
- MSE = 1,297,558.136
- RMSE = 1,139.104
- R^2 = 0.562

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 804.120
- MSE = 1,194,349.715
- RMSE = 1,092.863
- R^2 = 0.567


In [10]:
# Test function by saving results as a datframe
results_df = evaluate_regression(lin_reg, X_train_tf, y_train, X_test_tf, y_test,
                              verbose=False, output_frame=True)
results_df




Unnamed: 0,MAE,MSE,RMSE,R^2
Training Data,847.129,1297558.136,1139.104,0.562
Test Data,804.12,1194349.715,1092.863,0.567


### To what extent is this model overfit/underfit?

- With only a .56 r^2 score on training and testing data and mean values in the hundreds, thousands and millions, the model is very underfit for the task of predicting Item Outlet Sales.

## Build a Random Forest model to predict sales.

In [12]:
# Instantiate default random forest model
rf = RandomForestRegressor(random_state = 42)

In [14]:
# Fit the model pipeline on the training data only
rf.fit(X_train_tf, y_train)

  rf.fit(X_train_tf, y_train)


In [15]:
# Use custom function to evaluate default model
evaluate_regression(rf, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 296.748
- MSE = 183,363.537
- RMSE = 428.210
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 764.548
- MSE = 1,215,647.820
- RMSE = 1,102.564
- R^2 = 0.559


In [17]:
# Getting parameters for tuning
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [21]:
# Define param grid with options to try
params = {'max_depth': [None,10,15,20],
          'n_estimators':[10,100,150,200],
          'min_samples_leaf':[1,2,3,4],
          'max_features':['sqrt','log2',None,1],
          'oob_score':[True,False]
          }

In [22]:
# Instantiate the gridsearch
gridsearch = GridSearchCV(rf, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train_tf, y_train)

Fitting 3 folds for each of 512 candidates, totalling 1536 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [23]:
# Obtain best parameters
gridsearch.best_params_

{'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 1,
 'n_estimators': 100,
 'oob_score': True}

In [24]:
# Define and refit best model
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 642.115
- MSE = 822,190.390
- RMSE = 906.747
- R^2 = 0.722

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 738.477
- MSE = 1,129,483.891
- RMSE = 1,062.772
- R^2 = 0.591


## Did the performance improve?

- Barely in the test data, and much worse in training. That means the forest model isn't overfit at least but much tweaking remains to be done.

## which model to recommend?

- Both models need improvement. However the linear regression model, while consistent, has high bias. With the random forest model there's less overfitting and a slight improvement in r^2 thus that would be recommended.

- With an r^2 of 0.591 the random forest model is accounting for about 59% of the error rate but cannot yet predict 41%. Adjustments need to be made for accuracy.
- The MAE (mean absolute error) for testing is 738, which for item outlet sales a significant variance that needs to be narrowed. This is showing that the model can't yet be relied on to predict future sales within 738 of what its value should be.
- The random forest model training data at least isn't overfit at r2 of 0.722 for training and testing r2 of 0.591. These metrics could be improved possibly with eliminating some irrelevant features and refitting.