In [84]:
# IMPORT STATEMENTS

# Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import ast

# Import Snowflake modules
from snowflake.snowpark import Session, DataFrame
# NOTE(review): this is a PySpark import in an otherwise Snowpark notebook, and `year` is not
# used in any of the cells visible here — confirm whether it is needed at all.
from pyspark.sql.functions import year
import snowflake.snowpark.functions as F
# NOTE(review): wildcard import; none of the cells visible here reference a name from
# snowflake.snowpark.types — confirm which types (if any) are actually required.
from snowflake.snowpark.types import *

# Import matplotlib and seaborn to plot charts and graphs
# NOTE(review): matplotlib.pyplot is already imported as `plt` further up; this line repeats it.
import matplotlib.pyplot as plt
import seaborn as sns

In [85]:
# Read the Snowflake login details from the local JSON credentials file
with open("data_scientist_auth.json") as auth_file:
    data = json.load(auth_file)

# Pull out the three fields the connection needs
username, password, account = (
    data[field] for field in ("username", "password", "account")
)

# Connection settings: the credentials read above plus the fixed role, warehouse,
# database and schema used by this notebook
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "raw_pos",
}

# Open the Snowpark session with those settings
session_builder = Session.builder.configs(connection_parameters)
session = session_builder.create()

In [86]:
# Create Snowpark table handles for the four raw_pos tables used in this notebook:
# order line items, order headers, locations and the menu
order_detail, order_header, location_table, menu_table = (
    session.table(table_name)
    for table_name in (
        "frostbyte_tasty_bytes.raw_pos.ORDER_DETAIL",
        "frostbyte_tasty_bytes.raw_pos.ORDER_HEADER",
        "frostbyte_tasty_bytes.raw_pos.LOCATION",
        "frostbyte_tasty_bytes.raw_pos.MENU",
    )
)

In [87]:
# Build a Python list with the LOCATION_ID of every location whose COUNTRY is 'United States'

## Keep only the 'United States' rows of the location table
is_united_states = location_table.col('COUNTRY') == 'United States'
filtered_location_table = location_table.filter(is_united_states)

## Narrow the filtered rows down to the 'LOCATION_ID' column
location_id_df = filtered_location_table.select('LOCATION_ID')

## Fetch the rows to the client and unpack each one into its plain LOCATION_ID value
location_id_list = [row['LOCATION_ID'] for row in location_id_df.collect()]

In [88]:
# Inner-join the order line items to their order headers on the shared 'ORDER_ID' key
merged_df = order_detail.join(
    order_header,
    how='inner',
    on='ORDER_ID',
)

In [89]:
# Keep only the joined rows whose LOCATION_ID is one of the United States location ids
in_us_location = F.col('LOCATION_ID').isin(location_id_list)
merged_df = merged_df.filter(in_us_location)

In [90]:
# Print a preview of the joined orders after the LOCATION_ID filter
merged_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"ORDER_DETAIL_ID"  |"MENU_ITEM_ID"  |"l_3yd5_DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"r_k81b_DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
------------------------------------------------------------------------------------------------------------------------------------

In [91]:
# Project the joined orders onto the seven columns listed here
final_df = merged_df.select(
    "MENU_ITEM_ID",
    "ORDER_ID",
    "QUANTITY",
    "UNIT_PRICE",
    "PRICE",
    "ORDER_TS",
    "ORDER_TOTAL",
)

In [92]:
# Print a preview of the selected order columns
final_df.show()

-----------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_ID"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_TS"           |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------
|71              |444016143   |1           |9.0000        |9.0000   |2022-04-06 16:04:17  |23.0000        |
|72              |444016144   |1           |7.0000        |7.0000   |2022-04-06 16:04:58  |7.0000         |
|75              |444016145   |1           |3.0000        |3.0000   |2022-04-06 16:05:07  |61.0000        |
|74              |444016145   |2           |2.0000        |4.0000   |2022-04-06 16:05:07  |61.0000        |
|73              |444016145   |2           |12.0000       |24.0000  |2022-04-06 16:05:07  |61.0000        |
|72              |444016145   |3           |7.0000        |21.0000  |2022-04-06 16:05:07  |61.0000        |
|71              |444016145 

In [93]:
# Materialise the Snowpark DataFrame as a pandas DataFrame on the client.
# This cell rebinds `final_df`, so the conversion is guarded: once `final_df` is already a
# pandas DataFrame, running the cell again is a no-op instead of failing because pandas
# DataFrames have no `to_pandas` method.
# NOTE(review): the recorded output of this cell is a SnowparkFetchDataException caused by a
# failed memory allocation. The guard does not address that; consider aggregating or
# filtering in Snowflake before fetching.
if not isinstance(final_df, pd.DataFrame):
    final_df = final_df.to_pandas()

SnowparkFetchDataException: (1406): Failed to fetch a Pandas Dataframe. The error is: Unable to allocate 6.19 MiB for an array with shape (2, 405760) and data type float64

In [None]:
# Display the orders DataFrame (the recorded output shows a truncated head/tail view)
final_df

Unnamed: 0,MENU_ITEM_ID,ORDER_ID,QUANTITY,UNIT_PRICE,PRICE,ORDER_TS,ORDER_TOTAL
0,72,436294482,2,7.0,14.0,2020-04-24 15:22:16,38.0
1,73,436294482,1,12.0,12.0,2020-04-24 15:22:16,38.0
2,75,436294482,1,3.0,3.0,2020-04-24 15:22:16,38.0
3,71,436294482,1,9.0,9.0,2020-04-24 15:22:16,38.0
4,72,436294483,1,7.0,7.0,2020-04-24 15:22:33,49.0
...,...,...,...,...,...,...,...
65483023,43,438422782,1,15.0,15.0,2021-04-15 19:59:04,71.0
65483024,44,438422782,1,2.0,2.0,2021-04-15 19:59:04,71.0
65483025,43,438422783,1,15.0,15.0,2021-04-15 19:59:10,15.0
65483026,42,438422784,1,10.0,10.0,2021-04-15 19:59:23,28.0


In [None]:
# Write the orders DataFrame to CSV, leaving the index out of the file.
# NOTE(review): the destination is an absolute path inside one user's Downloads folder, so
# this cell only works on that machine.
orders_csv_path = 'C:/Users/donsu/Downloads/product_team_orders_info.csv'
final_df.to_csv(orders_csv_path, index=False)

In [None]:
# Collect every distinct "MENU_ITEM_ID" found in the orders as a plain Python list
menu_item_id_column = final_df["MENU_ITEM_ID"]
unique_menu_item_ids = menu_item_id_column.unique().tolist()

In [None]:
# Fetch the MENU table from Snowflake into a pandas DataFrame
menu_df = menu_table.to_pandas()

In [None]:
# Hold the ordered menu item ids in a set
unique_menu_item_ids_set = set(unique_menu_item_ids)

# Keep only the menu rows whose MENU_ITEM_ID is one of those ids
is_ordered_item = menu_df["MENU_ITEM_ID"].isin(unique_menu_item_ids_set)
menu_df = menu_df[is_ordered_item]

In [None]:
# Parse each MENU_ITEM_HEALTH_METRICS_OBJ string into a nested Python object.
# .assign() returns a new frame rather than writing into the filtered menu_df in place,
# which avoids pandas' SettingWithCopyWarning.
menu_df = menu_df.assign(
    MENU_ITEM_HEALTH_METRICS_OBJ=menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'].apply(ast.literal_eval)
)

# Use json_normalize to flatten the records stored under 'menu_item_health_metrics'
# NOTE(review): this assumes exactly one health-metrics record per menu item — confirm.
menu_item_metrics = pd.json_normalize(menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'], record_path='menu_item_health_metrics')

# Rename the flag columns
menu_item_metrics = menu_item_metrics.rename(columns={
    'is_dairy_free_flag': 'DAIRY_FREE',
    'is_gluten_free_flag': 'GLUTEN_FREE',
    'is_healthy_flag': 'HEALTHY',
    'is_nut_free_flag': 'NUT_FREE'
})

# Replace 'Y' with 1 and 'N' with 0 in the DataFrame
menu_item_metrics = menu_item_metrics.replace({'Y': 1, 'N': 0})

# pd.concat(axis=1) aligns rows on index labels. menu_df still carries the labels left over
# from earlier filtering, while the flattened metrics may come back with a fresh 0..n-1
# index, which would pair metrics with the wrong menu items and pad the rest with NaN.
# Give the metrics the same labels first; this assignment also raises a ValueError if the
# two frames do not have the same number of rows.
menu_item_metrics.index = menu_df.index

# Concatenate the flattened DataFrame with the original DataFrame
menu_df = pd.concat([menu_df, menu_item_metrics], axis=1)

# Drop the original 'MENU_ITEM_HEALTH_METRICS_OBJ' and 'ingredients' columns
menu_df = menu_df.drop(columns=['MENU_ITEM_HEALTH_METRICS_OBJ', 'ingredients'])

In [None]:
# Write the menu DataFrame to CSV, leaving the index out of the file.
# NOTE(review): the destination is an absolute path inside one user's Downloads folder, so
# this cell only works on that machine.
menu_csv_path = 'C:/Users/donsu/Downloads/product_team_menu_info.csv'
menu_df.to_csv(menu_csv_path, index=False)

In [None]:
# # Merge 'menu_df' and 'final_df' on the common column 'MENU_ITEM_ID'
# final_merged_df = pd.merge(menu_df, final_df, on='MENU_ITEM_ID', how='inner')

MemoryError: Unable to allocate 1.95 GiB for an array with shape (4, 65483028) and data type int64

In [None]:
# # Assuming 'ORDER_TS' column is in datetime format
# final_merged_df['YEAR'] = final_merged_df['ORDER_TS'].dt.year
# final_merged_df['MONTH'] = final_merged_df['ORDER_TS'].dt.month

In [None]:
# final_merged_df.head()