In [71]:
# IMPORT STATEMENTS

#Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import ast

# import matplotlib and seaborn to plot charts and graphs
import matplotlib.pyplot as plt
import seaborn as sns

# Import Snowflake modules
from snowflake.snowpark import Session
import snowflake.snowpark.functions as F

In [72]:
# Get account credentials from a json file
with open("data_scientist_auth.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "raw_pos",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [73]:
# retrieve order details usa table from snowflake
order_detail = session.table("frostbyte_tasty_bytes.raw_pos.ORDER_DETAIL")
order_header = session.table("frostbyte_tasty_bytes.raw_pos.ORDER_HEADER")
location_table = session.table("frostbyte_tasty_bytes.raw_pos.LOCATION")

In [74]:
# Get a list of LOCATION_IDs where the COUNTRY column's value is 'United States'
## Filter the 'location_table' where the 'COUNTRY' column is 'United States'
filtered_location_table = location_table.filter(location_table['COUNTRY'] == 'United States')

## Select the 'LOCATION_ID' column from the filtered DataFrame
location_id_df = filtered_location_table.select('LOCATION_ID')

## Convert the 'LOCATION_ID' column to a Python list
location_id_list = location_id_df.collect()

## Extract the values from the DataFrame and convert them to a list
location_id_list = [row['LOCATION_ID'] for row in location_id_list]

In [75]:
# Merge the two tables using the 'ORDER_ID' column as the common key
merged_df = order_detail.join(order_header, on='ORDER_ID', how='inner')

In [76]:
# Get rows where the LOCATION_ID is for United States
merged_df = merged_df.filter(F.col('LOCATION_ID').isin(location_id_list))

In [77]:
merged_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"ORDER_DETAIL_ID"  |"MENU_ITEM_ID"  |"l_twl7_DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"r_6639_DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
------------------------------------------------------------------------------------------------------------------------------------

In [78]:
merged_df = merged_df.select("MENU_ITEM_ID", "ORDER_TS", "QUANTITY", "PRICE")

In [79]:
merged_df = merged_df.withColumn('YEAR', F.year(merged_df['ORDER_TS']))
merged_df = merged_df.withColumn('MONTH', F.month(merged_df['ORDER_TS']))

merged_df.show()

----------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_TS"           |"QUANTITY"  |"PRICE"  |"YEAR"  |"MONTH"  |
----------------------------------------------------------------------------------
|82              |2021-09-10 11:47:23  |1           |15.0000  |2021    |9        |
|83              |2021-09-10 11:47:23  |1           |9.0000   |2021    |9        |
|82              |2021-09-10 11:48:05  |1           |15.0000  |2021    |9        |
|81              |2021-09-10 11:48:05  |1           |12.0000  |2021    |9        |
|83              |2021-09-10 11:48:05  |2           |18.0000  |2021    |9        |
|84              |2021-09-10 11:48:05  |1           |2.0000   |2021    |9        |
|82              |2021-09-10 11:48:37  |2           |30.0000  |2021    |9        |
|81              |2021-09-10 11:48:37  |1           |12.0000  |2021    |9        |
|83              |2021-09-10 11:48:37  |2           |18.0000  |2021    |9        |
|86 

In [80]:
merged_df = merged_df.select("MENU_ITEM_ID", "ORDER_TS", "QUANTITY", "PRICE", "YEAR")

In [81]:
merged_df.show()

------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_TS"           |"QUANTITY"  |"PRICE"  |"YEAR"  |
------------------------------------------------------------------------
|29              |2022-05-07 18:58:14  |1           |6.0000   |2022    |
|22              |2022-05-07 18:58:14  |1           |17.0000  |2022    |
|25              |2022-05-07 18:58:14  |2           |6.0000   |2022    |
|23              |2022-05-07 18:58:38  |1           |12.0000  |2022    |
|25              |2022-05-07 18:58:38  |1           |3.0000   |2022    |
|26              |2022-05-07 18:58:38  |1           |3.0000   |2022    |
|28              |2022-05-07 18:58:38  |1           |21.0000  |2022    |
|21              |2022-05-07 18:58:38  |1           |14.0000  |2022    |
|27              |2022-05-07 18:58:38  |1           |6.0000   |2022    |
|27              |2022-05-07 18:58:51  |1           |6.0000   |2022    |
---------------------------------------------------

In [82]:
df = merged_df.to_pandas()

In [83]:
# Create a new column for the year from the 'ORDER_TS' column
df['YEAR'] = df['ORDER_TS'].dt.year

# Pivot the DataFrame to get the desired table format
result_df = df.pivot_table(index='MENU_ITEM_ID', columns='YEAR', values=['QUANTITY', 'PRICE'],
                           aggfunc={'QUANTITY': 'sum', 'PRICE': 'sum'})

# Flatten the multi-level column index
result_df.columns = [f'{col[1]}_{col[0].upper()}' for col in result_df.columns]

# Reset the index
result_df.reset_index(inplace=True)

# Sort the columns based on the year and then rearrange them in pairs (QUANTITY, PRICE) for each year
sorted_cols = sorted(result_df.columns, key=lambda x: x[-9:-5] + x[-14:-10])
result_df = result_df[sorted_cols]

# Show the result
print(result_df)


    2019_PRICE  2020_PRICE  2021_PRICE  2022_PRICE  MENU_ITEM_ID  \
0      18567.5     91994.0    557627.0    522613.0            10   
1      93582.0    468804.0   2822118.0   2660430.0            11   
2      93672.0    468228.0   2825964.0   2650650.0            12   
3     109487.0    543158.0   3299023.0   3091858.0            13   
4      10422.0     53136.0    317534.0    298794.0            14   
..         ...         ...         ...         ...           ...   
95         NaN   1500521.0  10029514.0  10165045.0           152   
96         NaN   1497408.0  10030801.0  10170446.0           153   
97         NaN     60808.0    411864.0    416368.0           154   
98         NaN     92121.0    615144.0    622926.0           155   
99         NaN     92256.0    617544.0    624612.0           156   

    2019_QUANTITY  2020_QUANTITY  2021_QUANTITY  2022_QUANTITY  
0          5305.0        26284.0       159322.0       149318.0  
1         15597.0        78134.0       470353.0      

In [84]:
result_df.head()

Unnamed: 0,2019_PRICE,2020_PRICE,2021_PRICE,2022_PRICE,MENU_ITEM_ID,2019_QUANTITY,2020_QUANTITY,2021_QUANTITY,2022_QUANTITY
0,18567.5,91994.0,557627.0,522613.0,10,5305.0,26284.0,159322.0,149318.0
1,93582.0,468804.0,2822118.0,2660430.0,11,15597.0,78134.0,470353.0,443405.0
2,93672.0,468228.0,2825964.0,2650650.0,12,15612.0,78038.0,470994.0,441775.0
3,109487.0,543158.0,3299023.0,3091858.0,13,15641.0,77594.0,471289.0,441694.0
4,10422.0,53136.0,317534.0,298794.0,14,5211.0,26568.0,158767.0,149397.0


In [85]:
# Extract the columns with the pattern 'YEAR_QUANTITY' and 'YEAR_PRICE'
quantity_cols = [col for col in result_df.columns if col.endswith('_QUANTITY')]
price_cols = [col for col in result_df.columns if col.endswith('_PRICE')]

# Create a new list with the desired order of columns
desired_order = ['MENU_ITEM_ID'] + sum([list(item) for item in zip(quantity_cols, price_cols)], [])

# Reorder the columns based on the desired order
result_df = result_df[desired_order]

# Show the result
result_df.head()

Unnamed: 0,MENU_ITEM_ID,2019_QUANTITY,2019_PRICE,2020_QUANTITY,2020_PRICE,2021_QUANTITY,2021_PRICE,2022_QUANTITY,2022_PRICE
0,10,5305.0,18567.5,26284.0,91994.0,159322.0,557627.0,149318.0,522613.0
1,11,15597.0,93582.0,78134.0,468804.0,470353.0,2822118.0,443405.0,2660430.0
2,12,15612.0,93672.0,78038.0,468228.0,470994.0,2825964.0,441775.0,2650650.0
3,13,15641.0,109487.0,77594.0,543158.0,471289.0,3299023.0,441694.0,3091858.0
4,14,5211.0,10422.0,26568.0,53136.0,158767.0,317534.0,149397.0,298794.0


In [86]:
col_list = ["2022_QUANTITY", "2022_PRICE"]
for col in col_list:
    result_df[col] = (result_df[col] / 11) * 12

In [87]:
result_df.head()

Unnamed: 0,MENU_ITEM_ID,2019_QUANTITY,2019_PRICE,2020_QUANTITY,2020_PRICE,2021_QUANTITY,2021_PRICE,2022_QUANTITY,2022_PRICE
0,10,5305.0,18567.5,26284.0,91994.0,159322.0,557627.0,162892.363636,570123.3
1,11,15597.0,93582.0,78134.0,468804.0,470353.0,2822118.0,483714.545455,2902287.0
2,12,15612.0,93672.0,78038.0,468228.0,470994.0,2825964.0,481936.363636,2891618.0
3,13,15641.0,109487.0,77594.0,543158.0,471289.0,3299023.0,481848.0,3372936.0
4,14,5211.0,10422.0,26568.0,53136.0,158767.0,317534.0,162978.545455,325957.1


In [88]:
# Calculate year-on-year increase for quantity
result_df['Quantity_Increase_2020 (%)'] = ((result_df['2020_QUANTITY'] - result_df['2019_QUANTITY']) / result_df['2019_QUANTITY']) * 100
result_df['Quantity_Increase_2021 (%)'] = ((result_df['2021_QUANTITY'] - result_df['2020_QUANTITY']) / result_df['2020_QUANTITY']) * 100
result_df['Quantity_Increase_2022 (%)'] = ((result_df['2022_QUANTITY'] - result_df['2021_QUANTITY']) / result_df['2021_QUANTITY']) * 100

# Calculate year-on-year increase for price
result_df['Price_Increase_2020 (%)'] = ((result_df['2020_PRICE'] - result_df['2019_PRICE']) / result_df['2019_PRICE']) * 100
result_df['Price_Increase_2021 (%)'] = ((result_df['2021_PRICE'] - result_df['2020_PRICE']) / result_df['2020_PRICE']) * 100
result_df['Price_Increase_2022 (%)'] = ((result_df['2022_PRICE'] - result_df['2021_PRICE']) / result_df['2021_PRICE']) * 100

In [89]:
result_df.head()

Unnamed: 0,MENU_ITEM_ID,2019_QUANTITY,2019_PRICE,2020_QUANTITY,2020_PRICE,2021_QUANTITY,2021_PRICE,2022_QUANTITY,2022_PRICE,Quantity_Increase_2020 (%),Quantity_Increase_2021 (%),Quantity_Increase_2022 (%),Price_Increase_2020 (%),Price_Increase_2021 (%),Price_Increase_2022 (%)
0,10,5305.0,18567.5,26284.0,91994.0,159322.0,557627.0,162892.363636,570123.3,395.457116,506.155836,2.240973,395.457116,506.155836,2.240973
1,11,15597.0,93582.0,78134.0,468804.0,470353.0,2822118.0,483714.545455,2902287.0,400.955312,501.982492,2.840748,400.955312,501.982492,2.840748
2,12,15612.0,93672.0,78038.0,468228.0,470994.0,2825964.0,481936.363636,2891618.0,399.859083,503.544427,2.323249,399.859083,503.544427,2.323249
3,13,15641.0,109487.0,77594.0,543158.0,471289.0,3299023.0,481848.0,3372936.0,396.0936,507.378148,2.240451,396.0936,507.378148,2.240451
4,14,5211.0,10422.0,26568.0,53136.0,158767.0,317534.0,162978.545455,325957.1,409.84456,497.587323,2.652658,409.84456,497.587323,2.652658


In [90]:
# define the desired column order
desired_columns = ["MENU_ITEM_ID", "2019_QUANTITY",	"2019_PRICE", "Quantity_Increase_2020 (%)", "Price_Increase_2020 (%)", "2020_QUANTITY",	"2020_PRICE",
                       "Quantity_Increase_2021 (%)", "Price_Increase_2021 (%)", "2021_QUANTITY", "2021_PRICE", "Quantity_Increase_2022 (%)", "Price_Increase_2022 (%)",
                       "2022_QUANTITY", "2022_PRICE"]
    
# Reorder the columns based on the desired order
result_df = result_df[desired_order]

In [91]:
result_df.head()

Unnamed: 0,MENU_ITEM_ID,2019_QUANTITY,2019_PRICE,2020_QUANTITY,2020_PRICE,2021_QUANTITY,2021_PRICE,2022_QUANTITY,2022_PRICE
0,10,5305.0,18567.5,26284.0,91994.0,159322.0,557627.0,162892.363636,570123.3
1,11,15597.0,93582.0,78134.0,468804.0,470353.0,2822118.0,483714.545455,2902287.0
2,12,15612.0,93672.0,78038.0,468228.0,470994.0,2825964.0,481936.363636,2891618.0
3,13,15641.0,109487.0,77594.0,543158.0,471289.0,3299023.0,481848.0,3372936.0
4,14,5211.0,10422.0,26568.0,53136.0,158767.0,317534.0,162978.545455,325957.1


In [92]:
# # Extract the columns with the pattern 'YEAR_QUANTITY' and 'YEAR_PRICE'
# quantity_cols = [col for col in result_df.columns if col.endswith('_QUANTITY')]
# price_cols = [col for col in result_df.columns if col.endswith('_PRICE')]
# quantity_increase_cols = [col for col in result_df.columns if col.startswith('Quantity_Increase')]
# price_increase_cols = [col for col in result_df.columns if col.startswith('Price_Increase')]

# # Create a new list with the desired order of columns
# desired_order = ['MENU_ITEM_ID'] + sum([[q, p, qi, pi] for q, p, qi, pi in zip(quantity_cols, price_cols, quantity_increase_cols, price_increase_cols)], [])

# # Reorder the columns based on the desired order
# df = result_df[desired_order]