In [52]:
# IMPORT STATEMENTS

#Import Python packages
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import ast

# Import Snowflake modules
from snowflake.snowpark import Session, DataFrame
from pyspark.sql.functions import year
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import *

# import matplotlib and seaborn to plot charts and graphs
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
# Get account credentials from a json file
with open("data_scientist_auth.json") as f:
    data = json.load(f)
    username = data["username"]
    password = data["password"]
    account = data["account"]

# Specify connection parameters
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "raw_pos",
}

# Create Snowpark session
session = Session.builder.configs(connection_parameters).create()

In [54]:
# retrieve order details usa table from snowflake
order_detail = session.table("frostbyte_tasty_bytes.raw_pos.ORDER_DETAIL")
order_header = session.table("frostbyte_tasty_bytes.raw_pos.ORDER_HEADER")
location_table = session.table("frostbyte_tasty_bytes.raw_pos.LOCATION")
menu_table = session.table("frostbyte_tasty_bytes.raw_pos.MENU")

In [55]:
# Get a list of LOCATION_IDs where the COUNTRY column's value is 'United States'
## Filter the 'location_table' where the 'COUNTRY' column is 'United States'
filtered_location_table = location_table.filter(location_table['COUNTRY'] == 'United States')

## Select the 'LOCATION_ID' column from the filtered DataFrame
location_id_df = filtered_location_table.select('LOCATION_ID')

## Convert the 'LOCATION_ID' column to a Python list
location_id_list = location_id_df.collect()

## Extract the values from the DataFrame and convert them to a list
location_id_list = [row['LOCATION_ID'] for row in location_id_list]

In [56]:
# Merge the two tables using the 'ORDER_ID' column as the common key
merged_df = order_detail.join(order_header, on='ORDER_ID', how='inner')

In [57]:
# Get rows where the LOCATION_ID is for United States
merged_df = merged_df.filter(F.col('LOCATION_ID').isin(location_id_list))

In [58]:
merged_df.show()

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"ORDER_ID"  |"ORDER_DETAIL_ID"  |"MENU_ITEM_ID"  |"l_rk2j_DISCOUNT_ID"  |"LINE_NUMBER"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_ITEM_DISCOUNT_AMOUNT"  |"TRUCK_ID"  |"LOCATION_ID"  |"CUSTOMER_ID"  |"r_pvy2_DISCOUNT_ID"  |"SHIFT_ID"  |"SHIFT_START_TIME"  |"SHIFT_END_TIME"  |"ORDER_CHANNEL"  |"ORDER_TS"           |"SERVED_TS"  |"ORDER_CURRENCY"  |"ORDER_AMOUNT"  |"ORDER_TAX_AMOUNT"  |"ORDER_DISCOUNT_AMOUNT"  |"ORDER_TOTAL"  |
------------------------------------------------------------------------------------------------------------------------------------

In [59]:
final_df = merged_df.select("MENU_ITEM_ID", "ORDER_ID", "QUANTITY", "UNIT_PRICE", "PRICE", "ORDER_TS", "ORDER_TOTAL")

In [60]:
final_df.show()

-----------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_ID"  |"QUANTITY"  |"UNIT_PRICE"  |"PRICE"  |"ORDER_TS"           |"ORDER_TOTAL"  |
-----------------------------------------------------------------------------------------------------------
|132             |455182391   |2           |11.0000       |22.0000  |2022-04-27 10:29:20  |41.0000        |
|133             |455182391   |1           |6.0000        |6.0000   |2022-04-27 10:29:20  |41.0000        |
|131             |455182392   |1           |13.0000       |13.0000  |2022-04-27 10:30:38  |33.0000        |
|132             |455182392   |1           |11.0000       |11.0000  |2022-04-27 10:30:38  |33.0000        |
|133             |455182392   |1           |6.0000        |6.0000   |2022-04-27 10:30:38  |33.0000        |
|135             |455182392   |1           |3.0000        |3.0000   |2022-04-27 10:30:38  |33.0000        |
|131             |455182393 

In [61]:
final_df = final_df.to_pandas()

In [62]:
final_df

Unnamed: 0,MENU_ITEM_ID,ORDER_ID,QUANTITY,UNIT_PRICE,PRICE,ORDER_TS,ORDER_TOTAL
0,11,4063758,2,6.0,12.0,2022-11-01 08:03:24,45.0
1,12,4063758,2,6.0,12.0,2022-11-01 08:03:24,45.0
2,13,4063758,1,7.0,7.0,2022-11-01 08:03:24,45.0
3,14,4063758,1,2.0,2.0,2022-11-01 08:03:24,45.0
4,17,4063758,1,4.0,4.0,2022-11-01 08:03:24,45.0
...,...,...,...,...,...,...,...
65483023,82,448632542,3,15.0,45.0,2022-10-31 11:49:42,74.0
65483024,84,448632542,1,2.0,2.0,2022-10-31 11:49:42,74.0
65483025,85,448632542,1,3.0,3.0,2022-10-31 11:49:42,74.0
65483026,83,448632543,2,9.0,18.0,2022-10-31 11:49:42,21.0


In [63]:
menu_df = menu_table.to_pandas()

In [64]:
# Convert the string JSON data to a nested dictionary
menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'] = menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'].apply(ast.literal_eval)

# Use json_normalize to flatten the nested JSON data
menu_item_metrics = pd.json_normalize(menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'], record_path='menu_item_health_metrics')

# Rename the columns
menu_item_metrics = menu_item_metrics.rename(columns={
    'is_dairy_free_flag': 'DAIRY_FREE',
    'is_gluten_free_flag': 'GLUTEN_FREE',
    'is_healthy_flag': 'HEALTHY',
    'is_nut_free_flag': 'NUT_FREE'
})

# Replace 'Y' with 'Yes' and 'N' with 'No' in the DataFrame
menu_item_metrics = menu_item_metrics.replace({'Y': 1, 'N': 0})

# Concatenate the flattened DataFrame with the original DataFrame
menu_df = pd.concat([menu_df, menu_item_metrics], axis=1)

# Drop the original 'MENU_ITEM_HEALTH_METRICS_OBJ' and 'ingredients' column 
menu_df = menu_df.drop(columns=['MENU_ITEM_HEALTH_METRICS_OBJ', 'ingredients'])

In [65]:
menu_filtered_df = menu_df[["MENU_ITEM_ID", 'MENU_TYPE', 'TRUCK_BRAND_NAME', 'ITEM_CATEGORY', 'ITEM_SUBCATEGORY',
       'COST_OF_GOODS_USD', "DAIRY_FREE", "GLUTEN_FREE", "HEALTHY", "NUT_FREE"]]

In [67]:
# Merge 'menu_filtered_df' and 'final_df' on the common column 'MENU_ITEM_ID'
final_merged_df = pd.merge(menu_filtered_df, final_df, on='MENU_ITEM_ID', how='inner')

MemoryError: Unable to allocate 250. MiB for an array with shape (65483028, 1) and data type int32

In [None]:
# Assuming 'ORDER_TS' column is in datetime format
final_merged_df['YEAR'] = final_merged_df['ORDER_TS'].dt.year
final_merged_df['MONTH'] = final_merged_df['ORDER_TS'].dt.month

KeyError: 'ORDER_TS'

In [None]:
final_merged_df.head()

Unnamed: 0,MENU_ITEM_ID,MENU_TYPE,TRUCK_BRAND_NAME,ITEM_CATEGORY,ITEM_SUBCATEGORY,COST_OF_GOODS_USD,DAIRY_FREE,GLUTEN_FREE,HEALTHY,NUT_FREE,ORDER_ID,QUANTITY,UNIT_PRICE,PRICE,ORDER_TS,ORDER_TOTAL
0,10,Ice Cream,Freezing Point,Beverage,Cold Option,0.65,1,1,0,1,442938842,2,3.5,7.0,2022-02-06 08:03:49,41.0
1,10,Ice Cream,Freezing Point,Beverage,Cold Option,0.65,1,1,0,1,442938844,1,3.5,3.5,2022-02-06 08:05:44,19.5
2,10,Ice Cream,Freezing Point,Beverage,Cold Option,0.65,1,1,0,1,442938845,1,3.5,3.5,2022-02-06 08:05:51,35.5
3,10,Ice Cream,Freezing Point,Beverage,Cold Option,0.65,1,1,0,1,442938853,2,3.5,7.0,2022-02-06 08:09:47,63.0
4,10,Ice Cream,Freezing Point,Beverage,Cold Option,0.65,1,1,0,1,442938855,2,3.5,7.0,2022-02-06 08:10:52,48.0


In [None]:
final_merged_df.to_csv('C:/Users/donsu/Downloads/product_team_power_bi_dataset.csv', index=False)

KeyboardInterrupt: 