# Import Statements

In [301]:
# IMPORT STATEMENTS

#Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import ast

# import matplotlib and seaborn to plot charts and graphs
import matplotlib.pyplot as plt
import seaborn as sns

# to split training and testing data
from sklearn.model_selection import train_test_split

# carry out one hot encoding
from feature_engine.encoding import OneHotEncoder

# Import Scalers
# for min-max scaling
from sklearn.preprocessing import MinMaxScaler
# for standardization
from sklearn.preprocessing import StandardScaler
# for robust scaling
from sklearn.preprocessing import RobustScaler

# import XGBRegressor from xgboost package
from xgboost import XGBRegressor

# to evaluate model metrics
from sklearn.metrics import mean_squared_error, r2_score

# import grid search cv to improve decision tree model by helping search for parameters
from sklearn.model_selection import GridSearchCV

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F

# to export model
import joblib

# Functions

In [302]:
def model_performance(model):
    print('the training root mean squared error is: ',  np.sqrt(mean_squared_error(model.predict(X_train_scaled), y_train)))
    print('the testing root mean squared error is: ',  np.sqrt(mean_squared_error(model.predict(X_test_scaled), y_test)))

    print()

    #training mse
    train_mse = mean_squared_error(model.predict(X_train_scaled), y_train)
    print('the training mean squared error is: ', train_mse)
    #testing mse
    test_mse = mean_squared_error(model.predict(X_test_scaled), y_test)
    print('the testing mean squared error is: ', test_mse)

    print()

    print('training accuracy is: ', model.score(X_train_scaled, y_train))
    print('testing accuracy is: ', model.score(X_test_scaled, y_test))


# Load Tables From Snowflake

In [303]:
# Get account credentials from a json file
with open("data_scientist_auth.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "analytics",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [304]:
# retrieve order details usa table from snowflake
order_data_from_snowflake = session.table("frostbyte_tasty_bytes.analytics.ORDER_DETAILS_USA_MATCHED").collect()

In [305]:
# convert order_data_from_snowflake to pandas dataframe
order_df = pd.DataFrame(order_data_from_snowflake)
#order_df = order_data_from_snowflake.to_pandas()

In [306]:
# Get the total quantity sold for each menu item 
## group by 'MENU_ITEM_ID' and calculate the total quantity sold
total_qty_sold_per_item = order_df.groupby('MENU_ITEM_ID')['QUANTITY'].sum().reset_index()

## rename the 'QUANTITY' column to 'TOTAL_QTY_SOLD'
total_qty_sold_per_item = total_qty_sold_per_item.rename(columns={'QUANTITY': 'TOTAL_QTY_SOLD'})

# Menu Table 

In [307]:
# retrieve menu data from snowflake
menu_data_from_snowflake = session.table("frostbyte_tasty_bytes.raw_pos.MENU").collect()

In [308]:
# convert menu_data_from_snowflake to pandas dataframe
menu_df = pd.DataFrame(menu_data_from_snowflake)

In [309]:
# merge total_qty_sold_per_item with final_product_df
menu_df = pd.merge(menu_df, total_qty_sold_per_item, on='MENU_ITEM_ID')

# Order Table

In [310]:
# preview order table
order_df.head()

Unnamed: 0,ORDER_ID,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,SHIFT_ID,SHIFT_START_TIME,SHIFT_END_TIME,ORDER_CHANNEL,ORDER_TS,SERVED_TS,...,E_MAIL,PHONE_NUMBER,ORDER_DETAIL_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT
0,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494568,21,,0,3,14.0,42.0,
1,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494569,23,,1,1,12.0,12.0,
2,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494570,27,,2,1,6.0,6.0,
3,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494571,28,,3,1,21.0,21.0,
4,452165023,110872,17,14954.0,200544059,15:00:00,22:00:00,,2022-07-30 17:34:42,,...,Jase.Haynes@aol.com,861-138-0409,884494572,29,,4,1,6.0,6.0,


In [311]:
order_df = order_df.drop(["ORDER_ID", "SHIFT_ID", "SHIFT_START_TIME", "SHIFT_END_TIME", "ORDER_CHANNEL", "SERVED_TS", "E_MAIL", "PHONE_NUMBER", "ORDER_DETAIL_ID", "DISCOUNT_ID", "LINE_NUMBER", "QUANTITY", "UNIT_PRICE", "PRICE", "ORDER_ITEM_DISCOUNT_AMOUNT", "BIRTHDAY_DATE", "SIGN_UP_DATE", "CHILDREN_COUNT", "MARITAL_STATUS", "FAVOURITE_BRAND", "GENDER", "PREFERRED_LANGUAGE", "POSTAL_CODE", "COUNTRY", "FIRST_NAME", "MAX_ORDER_TS", "ORDER_DISCOUNT_AMOUNT", "ORDER_TAX_AMOUNT", "ORDER_AMOUNT", "ORDER_CURRENCY", "LAST_NAME"], axis=1)

order_df.head()

Unnamed: 0,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,ORDER_TS,ORDER_TOTAL,CITY,MENU_ITEM_ID
0,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,21
1,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,23
2,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,27
3,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,28
4,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,29


In [312]:
order_df['YEAR'] = order_df['ORDER_TS'].dt.year
order_df['MONTH'] = order_df['ORDER_TS'].dt.month
order_df.head()

Unnamed: 0,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,ORDER_TS,ORDER_TOTAL,CITY,MENU_ITEM_ID,YEAR,MONTH
0,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,21,2022,7
1,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,23,2022,7
2,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,27,2022,7
3,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,28,2022,7
4,110872,17,14954.0,2022-07-30 17:34:42,87.0,Denver,29,2022,7


In [313]:
# Convert 'location_id' column to integers
order_df['LOCATION_ID'] = order_df['LOCATION_ID'].astype(int)

order_df.head()

Unnamed: 0,CUSTOMER_ID,TRUCK_ID,LOCATION_ID,ORDER_TS,ORDER_TOTAL,CITY,MENU_ITEM_ID,YEAR,MONTH
0,110872,17,14954,2022-07-30 17:34:42,87.0,Denver,21,2022,7
1,110872,17,14954,2022-07-30 17:34:42,87.0,Denver,23,2022,7
2,110872,17,14954,2022-07-30 17:34:42,87.0,Denver,27,2022,7
3,110872,17,14954,2022-07-30 17:34:42,87.0,Denver,28,2022,7
4,110872,17,14954,2022-07-30 17:34:42,87.0,Denver,29,2022,7


In [314]:
# Group order total to truck id
SUM_SALES_CITY = order_df.groupby(['YEAR', 'MONTH', 'TRUCK_ID'])['ORDER_TOTAL'].sum().reset_index()
SUM_SALES_CITY.head()

Unnamed: 0,YEAR,MONTH,TRUCK_ID,ORDER_TOTAL
0,2022,6,4,50.0
1,2022,6,5,215.0
2,2022,6,8,360.0
3,2022,6,14,66.0
4,2022,6,15,410.0


# Preview Menu Table

In [315]:
# preview menu table
menu_df.head()

Unnamed: 0,MENU_ID,MENU_TYPE_ID,MENU_TYPE,TRUCK_BRAND_NAME,MENU_ITEM_ID,MENU_ITEM_NAME,ITEM_CATEGORY,ITEM_SUBCATEGORY,COST_OF_GOODS_USD,SALE_PRICE_USD,MENU_ITEM_HEALTH_METRICS_OBJ,TOTAL_QTY_SOLD
0,10001,1,Ice Cream,Freezing Point,10,Lemonade,Beverage,Cold Option,0.65,3.5,"{\n ""menu_item_health_metrics"": [\n {\n ...",607
1,10002,1,Ice Cream,Freezing Point,11,Sugar Cone,Dessert,Cold Option,2.5,6.0,"{\n ""menu_item_health_metrics"": [\n {\n ...",1815
2,10003,1,Ice Cream,Freezing Point,12,Waffle Cone,Dessert,Cold Option,2.5,6.0,"{\n ""menu_item_health_metrics"": [\n {\n ...",1786
3,10004,1,Ice Cream,Freezing Point,13,Two Scoop Bowl,Dessert,Cold Option,3.0,7.0,"{\n ""menu_item_health_metrics"": [\n {\n ...",1682
4,10005,1,Ice Cream,Freezing Point,14,Bottled Water,Beverage,Cold Option,0.5,2.0,"{\n ""menu_item_health_metrics"": [\n {\n ...",629


In [316]:
menu_df = menu_df.drop(["MENU_ID", "MENU_TYPE_ID", "MENU_ITEM_NAME", "COST_OF_GOODS_USD", "MENU_ITEM_HEALTH_METRICS_OBJ"]
                        , axis=1)

In [317]:
menu_df.head()

Unnamed: 0,MENU_TYPE,TRUCK_BRAND_NAME,MENU_ITEM_ID,ITEM_CATEGORY,ITEM_SUBCATEGORY,SALE_PRICE_USD,TOTAL_QTY_SOLD
0,Ice Cream,Freezing Point,10,Beverage,Cold Option,3.5,607
1,Ice Cream,Freezing Point,11,Dessert,Cold Option,6.0,1815
2,Ice Cream,Freezing Point,12,Dessert,Cold Option,6.0,1786
3,Ice Cream,Freezing Point,13,Dessert,Cold Option,7.0,1682
4,Ice Cream,Freezing Point,14,Beverage,Cold Option,2.0,629


# Merge menu table with order table and total sales

In [318]:
# merge menu table with order table
merge_df = pd.merge(menu_df, order_df, on='MENU_ITEM_ID')

KeyError: 'MENU_ITEM_ID'

In [None]:
merge_df.head()

In [None]:
stop

# Location Table

In [None]:
# retrieve menu data from snowflake
location_data_from_snowflake = session.table("frostbyte_tasty_bytes.raw_pos.LOCATION").collect()

In [None]:
# convert menu_data_from_snowflake to pandas dataframe
location_df = pd.DataFrame(location_data_from_snowflake)

In [None]:
location_df.head()

In [None]:
location_df = location_df.drop(["PLACEKEY", "ISO_COUNTRY_CODE", "CITY", "REGION"], axis=1)

location_df.head()

# Final Table

In [None]:
# merge table with location table
final_df = pd.merge(merge_df, location_df, on='LOCATION_ID')

In [None]:
final_df.head()

In [None]:
# Convert 'sale_price_usd' column to 2 decimal places
final_df['SALE_PRICE_USD'] = final_df['SALE_PRICE_USD'].apply(lambda x: '{:.2f}'.format(x))

final_df.head()

In [None]:
stop

# Check for Missing Values

In [None]:
final_df.isnull().sum()

No missing values have been detected. Hence, no process required at this stage.

# Dealing with Outliers

In [None]:
# plot box plot for 'TOTAL_QTY_SOLD' variable
plt.figure(figsize=(3,5))
sns.boxplot(y=final_df['TOTAL_QTY_SOLD'])
plt.title('TOTAL_QTY_SOLD boxplot')
plt.show()

Based on the boxplot shown above, there are no outliers detected. Hence, no process required for this stage.

Note: Outliers for the target variable wold not dealt with even if observed as it would be considered cherry picking convenient data. Furthermore, due to the relatively small number of menu items, if subsetting of the target variable was done, there might be too limited data to build an accurate model.

# Train Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    final_df.drop('TOTAL_QTY_SOLD', axis=1), final_df['TOTAL_QTY_SOLD'], test_size=0.3, random_state=0)

X_train.shape, X_test.shape

# Categorical Variable Encoding 

In [None]:
# let's create the encoder

ohe_enc = OneHotEncoder(
    top_categories=None,
    drop_last=True)  # to return k-1, false to return k

In [None]:
# fit the encoder to the train set: it will learn the variables and 
# categories to encode

ohe_enc.fit(X_train)

In [None]:
X_train = ohe_enc.transform(X_train)
X_test = ohe_enc.transform(X_test)

# Scaling

## Min Max Scaler

In [None]:
# set up the scaler
scaler = MinMaxScaler()

# fit the scaler to the train set, it will learn the parameters
scaler.fit(X_train)

# transform train and test sets
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Standard Scaler

In [None]:
# # set up the scaler
# scaler = StandardScaler()

# # fit the scaler to the train set, it will learn the parameters
# scaler.fit(X_train)

# # transform train and test sets
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

Standard Scaler did not lead to a difference in result.

In [None]:
# # set up the scaler
# scaler = RobustScaler()

# # fit the scaler to the train set, it will learn the parameters
# scaler.fit(X_train)

# # transform train and test sets
# X_train_scaled = scaler.transform(X_train)
# X_test_scaled = scaler.transform(X_test)

Robust Scaler did not lead to a difference in result.

In [None]:
# let's transform the returned NumPy arrays to dataframes
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Build Model

In [None]:
# Build XGBoost Model
xgb = XGBRegressor(n_estimators = 20, learning_rate = 0.1, use_label_encoder = False, eval_metric='logloss')
# fit the model to the training data
xgb.fit(X_train_scaled, y_train)
model_performance(xgb)

# Improve Model (GridSearch CV)

In [None]:
# param_grid = {'n_estimators': [30, 40, 50, 60],
#               'learning_rate': [0.1, 0.2, 0.3, 0.4],
#               'max_depth': [5,6,7,8]}

# xgb_grid = GridSearchCV(xgb, param_grid, cv=5)
# xgb_grid.fit(X_train_scaled, y_train)

# print("Best parameters found: ", xgb_grid.best_params_)
# print("Best score found: ", xgb_grid.best_score_)

# Improved Model Results

In [None]:
# Build XGBoost Model
xgb_improved = XGBRegressor(learning_rate = 0.1, max_depth = 5, n_estimators = 50)
# fit the model to the training data
xgb_improved.fit(X_train_scaled, y_train)

model_performance(xgb_improved)

In [None]:
xgb_improved.feature_importances_