<a href="https://colab.research.google.com/github/Rovidicus/Prediction-of-Product-Sales/blob/main/Project1_Part6_core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import standard packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Import modeling tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor
# set the default output to pandas
from sklearn import set_config
set_config(transform_output='pandas')

In [2]:
# Loading in data previously cleaned and transformed
X_train_tf = pd.read_csv("Data/X_train_tf.csv")
X_test_tf = pd.read_csv("Data/X_test_tf.csv")
y_train = pd.read_csv("Data/y_train.csv")
y_test = pd.read_csv("Data/y_test.csv")

In [3]:
# View the processed training data
X_train_tf.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.817249,-0.712775,1.828109,1.327849,-0.338062,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.55634,-1.291052,0.603369,1.327849,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.131512,1.813319,0.244541,0.136187,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.169219,-1.004931,-0.952591,0.732018,-1.193861,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1.528819,-0.965484,-0.33646,0.493686,1.373536,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [4]:
# View the processed testing data
X_test_tf.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,Item_Type_Breakfast,...,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.3310089,-0.776646,-0.998816,-1.293807,0.517737,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,-1.179892,0.100317,-1.585194,-0.102145,-1.193861,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.3784469,-0.482994,-1.595784,0.136187,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,4.213344e-16,-0.41544,0.506592,-1.532139,-0.338062,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-0.6426567,-1.047426,0.886725,0.732018,-1.193861,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


# CRISP-DM Phase 4 - Modeling

In [5]:
# Provide a label for this data
label = 'Test Data'
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics

def evaluate_regression(reg, X_train_tf, y_train, X_test_tf, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train_tf)

  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test_tf)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )

  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

## Bagging Regressor

In [6]:
from sklearn.ensemble import BaggingRegressor

In [7]:
# Instantiate a Default Model
bagreg = BaggingRegressor(random_state = 42)
# Fit the model on the training data only
bagreg.fit(X_train_tf, y_train)
# Call custom function for evaluation
evaluate_regression(bagreg, X_train_tf, y_train, X_test_tf, y_test)

  return column_or_1d(y, warn=True)


------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 319.186
- MSE = 240,380.649
- RMSE = 490.286
- R^2 = 0.919

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 779.986
- MSE = 1,289,416.156
- RMSE = 1,135.525
- R^2 = 0.533


## Decision Tree Regressor

In [8]:
# For reproducible results, set the random state
dec_tree = DecisionTreeRegressor(random_state = 42)
# Fit the model on the training data only
dec_tree.fit(X_train_tf, y_train)

In [9]:
# Make predictions for training and testing data
train_preds = dec_tree.predict(X_train_tf)
test_preds = dec_tree.predict(X_test_tf)

In [10]:
# Evaluate the model
train_r2_score = r2_score(y_train, train_preds)
test_r2_score = r2_score(y_test, test_preds)
# Print the results
print (f'Training R2 is {train_r2_score:.3f}.')
print (f'Testing R2 is {test_r2_score:.3f}.')

Training R2 is 1.000.
Testing R2 is 0.187.


In [11]:
# What was the depth of our default tree?
dec_tree.get_depth()

40

In [12]:
# Let's go extreme and set our max depth to 2
dec_tree_2 = DecisionTreeRegressor(max_depth = 2, random_state = 42)
# Fit on training data
dec_tree_2.fit(X_train_tf, y_train)
# Get predictions 
train_preds = dec_tree_2.predict(X_train_tf)
test_preds = dec_tree_2.predict(X_test_tf)
# Calculate r2 score
train_2_score = r2_score(y_train, train_preds)
test_2_score = r2_score(y_test, test_preds)
# Print results
print (f'Training R2 is {train_2_score:.3f}.')
print (f'Testing R2 is {test_2_score:.3f}.')

Training R2 is 0.432.
Testing R2 is 0.434.


In [13]:
# Instantiate model with max_depth of 10
dec_tree_10 = DecisionTreeRegressor(max_depth = 10, random_state = 42)
# Fit on training data
dec_tree_10.fit(X_train_tf, y_train)
# Get predictions 
train_preds = dec_tree_10.predict(X_train_tf)
test_preds = dec_tree_10.predict(X_test_tf)
# Calculate r2 score
train_10_score = r2_score(y_train, train_preds)
test_10_score = r2_score(y_test, test_preds)
# Print results
print (f'Training R2 is {train_10_score:.3f}.')
print (f'Testing R2 is {test_10_score:.3f}.')

Training R2 is 0.685.
Testing R2 is 0.527.


In [14]:
# List of values to try for max_depth:
depths = list(range(2, 27)) # will try every value between 2 and 27
# list to collect the results of each depth
scores_list = []
# Loop through depths
for depth in depths:
    dec_tree = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dec_tree.fit(X_train_tf, y_train)
    # Get predictions 
    train_preds = dec_tree.predict(X_train_tf)
    test_preds = dec_tree.predict(X_test_tf)
    # Calculate score
    train_r2 = r2_score(y_train, train_preds)
    test_r2 = r2_score(y_test, test_preds)
    # Add dictionray of scores to score_list
    results = {'max_depth':depth,
               'Train R2':train_r2,
               'Test R2':test_r2}
    scores_list.append(results)
    # Print the results
    print(f'When max_depth is {depth},')
    print(f'- the training R2 is {train_r2:.3f}.')
    print(f'- the testing R2 is {test_r2:.3f}.')
    print()

When max_depth is 2,
- the training R2 is 0.432.
- the testing R2 is 0.434.

When max_depth is 3,
- the training R2 is 0.524.
- the testing R2 is 0.524.

When max_depth is 4,
- the training R2 is 0.583.
- the testing R2 is 0.584.

When max_depth is 5,
- the training R2 is 0.604.
- the testing R2 is 0.595.

When max_depth is 6,
- the training R2 is 0.615.
- the testing R2 is 0.582.

When max_depth is 7,
- the training R2 is 0.627.
- the testing R2 is 0.576.

When max_depth is 8,
- the training R2 is 0.644.
- the testing R2 is 0.557.

When max_depth is 9,
- the training R2 is 0.666.
- the testing R2 is 0.540.

When max_depth is 10,
- the training R2 is 0.685.
- the testing R2 is 0.527.

When max_depth is 11,
- the training R2 is 0.709.
- the testing R2 is 0.507.

When max_depth is 12,
- the training R2 is 0.735.
- the testing R2 is 0.486.

When max_depth is 13,
- the training R2 is 0.762.
- the testing R2 is 0.440.

When max_depth is 14,
- the training R2 is 0.792.
- the testing R2 is 0.

In [15]:
# Let's run the model with our optimized value for max_depth
dec_tree_5 = DecisionTreeRegressor(max_depth = 5, random_state = 42)
# Fit on the training data
dec_tree_5.fit(X_train_tf, y_train)
# Get predictions 
train_preds = dec_tree_5.predict(X_train_tf)
test_preds = dec_tree_5.predict(X_test_tf)
# Calculate score
train_5_score = r2_score(y_train, train_preds)
test_5_score = r2_score(y_test, test_preds)
# Print results
print (f'Training R2 is {train_5_score:.3f}.')
print (f'Testing R2 is {test_5_score:.3f}.')

Training R2 is 0.604.
Testing R2 is 0.595.


In [16]:
# Looking at options for tuning this model
dec_tree.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 26,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 42,
 'splitter': 'best'}

In [17]:
# Setting params
params = {'ccp_alpha': [0,0.3,0.5,0.7],
          'max_depth': [None,4,8,12],
          'min_impurity_decrease':[0.0,0.1,0.2,0.3],
          'min_samples_leaf':[1,4,6,8],
          'max_features':[None,1,2]
          }

In [18]:
# Instantiate the gridsearch
gridsearch = GridSearchCV(dec_tree_5, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train_tf, y_train)

Fitting 3 folds for each of 768 candidates, totalling 2304 fits


In [19]:
# Obtain best parameters
gridsearch.best_params_

{'ccp_alpha': 0,
 'max_depth': 4,
 'max_features': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 8}

In [20]:
# Define and refit best model
best_dt = gridsearch.best_estimator_
evaluate_regression(best_dt, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 788.105
- MSE = 1,235,201.599
- RMSE = 1,111.396
- R^2 = 0.583

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 750.161
- MSE = 1,147,721.135
- RMSE = 1,071.317
- R^2 = 0.584


## Linear Regression Model

In [21]:
from sklearn.linear_model import LinearRegression
# Instantiate model
lin_reg = LinearRegression()
#Fit model
lin_reg.fit(X_train_tf, y_train)

In [22]:
 # Test function with default arguments
evaluate_regression(lin_reg, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 847.129
- MSE = 1,297,558.136
- RMSE = 1,139.104
- R^2 = 0.562

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 804.120
- MSE = 1,194,349.715
- RMSE = 1,092.863
- R^2 = 0.567


In [23]:
# Test function by saving results as a datframe
results_df = evaluate_regression(lin_reg, X_train_tf, y_train, X_test_tf, y_test,
                              verbose=False, output_frame=True)
results_df




Unnamed: 0,MAE,MSE,RMSE,R^2
Training Data,847.129,1297558.136,1139.104,0.562
Test Data,804.12,1194349.715,1092.863,0.567


### To what extent is this model overfit/underfit?

- With only a .56 r^2 score on training and testing data and mean values in the hundreds, thousands and millions, the model is very underfit for the task of predicting Item Outlet Sales.

## Build a Random Forest model to predict sales.

In [24]:
# Instantiate default random forest model
rf = RandomForestRegressor(random_state = 42)

In [25]:
# Fit the model pipeline on the training data only
rf.fit(X_train_tf, y_train)

  rf.fit(X_train_tf, y_train)


In [26]:
# Use custom function to evaluate default model
evaluate_regression(rf, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 296.748
- MSE = 183,363.537
- RMSE = 428.210
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 764.548
- MSE = 1,215,647.820
- RMSE = 1,102.564
- R^2 = 0.559


In [27]:
# Getting parameters for tuning
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [28]:
# Define param grid with options to try
params = {'ccp_alpha': [0,0.3,0.5,0.7],
          'max_depth': [8,10,12],
          'n_estimators':[75,100,125],
          'min_samples_leaf':[4,6,8],
          'max_features':[None]
          }

In [29]:
# Instantiate the gridsearch
gridsearch = GridSearchCV(rf, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train_tf, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [30]:
# Obtain best parameters
gridsearch.best_params_

{'ccp_alpha': 0.5,
 'max_depth': 8,
 'max_features': None,
 'min_samples_leaf': 8,
 'n_estimators': 75}

In [31]:
# Define and refit best model
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train_tf, y_train, X_test_tf, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 715.112
- MSE = 1,038,031.023
- RMSE = 1,018.838
- R^2 = 0.649

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 726.556
- MSE = 1,094,859.511
- RMSE = 1,046.355
- R^2 = 0.603


## Did the performance improve?

- Barely in the test data, and much worse in training. That means the forest model isn't overfit at least but much tweaking remains to be done.

## which model to recommend?

- Both models need improvement. However the linear regression model, while consistent, has high bias. With the random forest model there's less overfitting and a slight improvement in r^2 thus that would be recommended.

- The random forest model training data at least isn't overfit at r2 of 0.649 for training and testing r2 of 0.603. As it stands, tuned with gridsearch, this is the best performing model