In [94]:
# IMPORT STATEMENTS

#Import Python packages
import ast
import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Import Snowflake modules
from snowflake.snowpark import Session, DataFrame
# NOTE(review): pyspark import looks unused in the visible cells — confirm before removing
from pyspark.sql.functions import year
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import *

# import matplotlib and seaborn to plot charts and graphs
import matplotlib.pyplot as plt
import seaborn as sns

In [95]:
# Load the Snowflake account credentials from a local JSON file
with open("data_scientist_auth.json") as f:
    data = json.load(f)

username, password, account = data["username"], data["password"], data["account"]

# Connection settings: credentials come from the file above; role, warehouse,
# database and schema are fixed for this analysis
connection_parameters = dict(
    account=account,
    user=username,
    password=password,
    role="TASTY_BI",
    warehouse="TASTY_BI_WH",
    database="frostbyte_tasty_bytes",
    schema="raw_pos",
)

# Open the Snowpark session used by every later cell
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

In [96]:
# Get Snowpark handles for the order, location and menu tables in raw_pos
order_detail, order_header, location_table, menu_table = (
    session.table(table_name)
    for table_name in (
        "frostbyte_tasty_bytes.raw_pos.ORDER_DETAIL",
        "frostbyte_tasty_bytes.raw_pos.ORDER_HEADER",
        "frostbyte_tasty_bytes.raw_pos.LOCATION",
        "frostbyte_tasty_bytes.raw_pos.MENU",
    )
)

In [97]:
# Build a Python list of the LOCATION_IDs whose COUNTRY is 'United States'
## Keep only the 'location_table' rows where 'COUNTRY' equals 'United States'
is_united_states = location_table['COUNTRY'] == 'United States'
filtered_location_table = location_table.filter(is_united_states)

## Project the filtered rows down to the 'LOCATION_ID' column
location_id_df = filtered_location_table.select('LOCATION_ID')

## Fetch the rows from Snowflake and unpack the 'LOCATION_ID' value of each one
location_id_list = [row['LOCATION_ID'] for row in location_id_df.collect()]

In [98]:
# Inner-join order lines to their order headers on the shared 'ORDER_ID' key
merged_df = order_detail.join(
    order_header,
    how='inner',
    on='ORDER_ID',
)

In [99]:
# Keep only the order rows whose LOCATION_ID is one of the United States locations
is_us_location = F.col('LOCATION_ID').isin(location_id_list)
merged_df = merged_df.filter(is_us_location)

In [100]:
# Preview the joined, US-only rows (10 rows is the Snowpark default for show())
merged_df.show(n=10)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"ORDER_DETAIL_ID"  |"MENU_ITEM_ID"  |"l_9928_DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"r_s599_DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
------------------------------------------------------------------------------------------------------------------------------------

In [101]:
# Keep only the columns needed for the product-team order extract
order_columns = [
    "MENU_ITEM_ID",
    "ORDER_ID",
    "QUANTITY",
    "UNIT_PRICE",
    "PRICE",
    "ORDER_TS",
    "ORDER_TOTAL",
]
final_df = merged_df.select(order_columns)

In [102]:
# Preview the selected order columns (10 rows is the Snowpark default for show())
final_df.show(n=10)

-----------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_ID"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_TS"           |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------
|106             |456679686   |1           |3.0000        |3.0000   |2020-11-28 16:18:38  |67.0000        |
|106             |456679687   |3           |3.0000        |9.0000   |2020-11-28 16:20:53  |64.0000        |
|105             |456679687   |1           |3.0000        |3.0000   |2020-11-28 16:20:53  |64.0000        |
|103             |456679687   |1           |10.0000       |10.0000  |2020-11-28 16:20:53  |64.0000        |
|101             |456679687   |3           |8.0000        |24.0000  |2020-11-28 16:20:53  |64.0000        |
|102             |456679687   |2           |9.0000        |18.0000  |2020-11-28 16:20:53  |64.0000        |
|103             |456679688 

In [103]:
# Pull the Snowpark result into a local pandas DataFrame.
# The name 'final_df' is rebound from a Snowpark DataFrame to a pandas one, so
# guard the conversion: without it, re-running this cell fails because a pandas
# DataFrame has no to_pandas() method.
if not isinstance(final_df, pd.DataFrame):
    final_df = final_df.to_pandas()

In [104]:
# Display the pandas order extract as the cell output
final_df

Unnamed: 0,MENU_ITEM_ID,ORDER_ID,QUANTITY,UNIT_PRICE,PRICE,ORDER_TS,ORDER_TOTAL
0,43,438422784,1,15.0,15.0,2021-04-15 19:59:23,28.0
1,41,438422785,4,8.0,32.0,2021-04-15 20:01:21,92.0
2,42,438422785,1,10.0,10.0,2021-04-15 20:01:21,92.0
3,43,438422785,3,15.0,45.0,2021-04-15 20:01:21,92.0
4,44,438422785,1,2.0,2.0,2021-04-15 20:01:21,92.0
...,...,...,...,...,...,...,...
65483023,151,447755751,3,11.0,33.0,2022-09-21 09:14:12,66.0
65483024,152,447755752,2,11.0,22.0,2022-09-21 09:15:24,36.0
65483025,153,447755752,1,11.0,11.0,2022-09-21 09:15:24,36.0
65483026,155,447755752,1,3.0,3.0,2022-09-21 09:15:24,36.0


In [105]:
# Export the order extract as CSV (without the index column) to the current
# user's Downloads folder. The path is built from the home directory instead of
# a hard-coded 'C:/Users/<name>/...' so the cell also runs on other machines.
orders_csv_path = Path.home() / "Downloads" / "product_team_orders_info.csv"
final_df.to_csv(orders_csv_path, index=False)

In [106]:
# List each distinct "MENU_ITEM_ID" that appears in the order extract
unique_menu_item_ids = (
    final_df.loc[:, "MENU_ITEM_ID"]
    .unique()
    .tolist()
)

In [107]:
# Download the MENU table into a local pandas DataFrame
menu_df = menu_table.to_pandas()

In [108]:
# Convert unique_menu_item_ids to a set for faster lookup
unique_menu_item_ids_set = set(unique_menu_item_ids)

# Keep only the menu items that appear in the order extract.
# .copy() makes the result an independent frame, so assigning columns to
# menu_df in later cells does not raise SettingWithCopyWarning.
menu_df = menu_df[menu_df["MENU_ITEM_ID"].isin(unique_menu_item_ids_set)].copy()

In [109]:
# Parse each menu item's health-metrics JSON string into a nested dictionary.
# json.loads is the parser for JSON text; ast.literal_eval only understands
# Python literals and fails on JSON tokens such as true / false / null.
health_metrics_parsed = menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'].apply(json.loads)

# Use json_normalize to flatten the nested JSON data: one output row per entry
# in each item's 'menu_item_health_metrics' list
menu_item_metrics = pd.json_normalize(
    health_metrics_parsed.tolist(),
    record_path='menu_item_health_metrics',
)

# Rename the flag columns
menu_item_metrics = menu_item_metrics.rename(columns={
    'is_dairy_free_flag': 'DAIRY_FREE',
    'is_gluten_free_flag': 'GLUTEN_FREE',
    'is_healthy_flag': 'HEALTHY',
    'is_nut_free_flag': 'NUT_FREE'
})

# Encode the 'Y' / 'N' flags as 1 / 0
menu_item_metrics = menu_item_metrics.replace({'Y': 1, 'N': 0})

# Attach the flattened metrics to the menu rows.
# menu_df was filtered in an earlier cell, so its index may have gaps, while
# menu_item_metrics carries a fresh 0..n-1 index. pd.concat(axis=1) aligns on
# index labels, so reset menu_df's index first to pair the rows by position.
# NOTE(review): assumes exactly one 'menu_item_health_metrics' record per menu
# item, so both frames have the same number of rows — confirm against the data.
menu_df = pd.concat([menu_df.reset_index(drop=True), menu_item_metrics], axis=1)

# Drop the original 'MENU_ITEM_HEALTH_METRICS_OBJ' and 'ingredients' column
menu_df = menu_df.drop(columns=['MENU_ITEM_HEALTH_METRICS_OBJ', 'ingredients'])

In [110]:
# Export the flattened menu information as CSV (without the index column) to
# the current user's Downloads folder. The path is built from the home directory
# instead of a hard-coded 'C:/Users/<name>/...' so the cell also runs on other
# machines.
menu_csv_path = Path.home() / "Downloads" / "product_team_menu_info.csv"
menu_df.to_csv(menu_csv_path, index=False)

In [111]:
# # Merge 'menu_df' and 'final_df' on the common column 'MENU_ITEM_ID'
# final_merged_df = pd.merge(menu_df, final_df, on='MENU_ITEM_ID', how='inner')

In [112]:
# # Assuming 'ORDER_TS' column is in datetime format
# final_merged_df['YEAR'] = final_merged_df['ORDER_TS'].dt.year
# final_merged_df['MONTH'] = final_merged_df['ORDER_TS'].dt.month

In [113]:
# final_merged_df.head()

In [115]:
# Total of the line-item PRICE column across the whole order extract
final_df.loc[:, "PRICE"].sum()

947663807.75