In [82]:
# IMPORT STATEMENTS

# Import Python packages
import ast
import json
from pathlib import Path

import numpy as np
import pandas as pd

# Import Snowflake modules
from snowflake.snowpark import Session, DataFrame
# NOTE(review): pyspark is not a Snowflake module and `year` looks unused in the
# visible cells — confirm before removing this import.
from pyspark.sql.functions import year
import snowflake.snowpark.functions as F
from snowflake.snowpark.types import *

# import matplotlib and seaborn to plot charts and graphs
import matplotlib.pyplot as plt
import seaborn as sns

In [83]:
# Read the Snowflake login details from the local JSON auth file
with open("data_scientist_auth.json") as f:
    data = json.load(f)

# Pull out the three fields the connection needs (the file is already closed here)
username, password, account = data["username"], data["password"], data["account"]

# Connection settings: credentials from the file, plus the fixed role, warehouse,
# database and schema this notebook runs against
connection_parameters = {
    "account": account,
    "user": username,
    "password": password,
    "role": "TASTY_BI",
    "warehouse": "TASTY_BI_WH",
    "database": "frostbyte_tasty_bytes",
    "schema": "raw_pos",
}

# Open the Snowpark session used by every later cell
session = Session.builder.configs(connection_parameters).create()

In [84]:
# Reference the four raw POS tables in Snowflake that the rest of the notebook uses
order_detail, order_header, location_table, menu_table = map(
    session.table,
    (
        "frostbyte_tasty_bytes.raw_pos.ORDER_DETAIL",
        "frostbyte_tasty_bytes.raw_pos.ORDER_HEADER",
        "frostbyte_tasty_bytes.raw_pos.LOCATION",
        "frostbyte_tasty_bytes.raw_pos.MENU",
    ),
)

In [85]:
# Build a Python list with the LOCATION_ID of every location in the United States
## Keep only the rows of 'location_table' whose 'COUNTRY' is 'United States'
filtered_location_table = location_table.filter(location_table['COUNTRY'] == 'United States')

## Narrow the filtered DataFrame down to its 'LOCATION_ID' column
location_id_df = filtered_location_table.select('LOCATION_ID')

## Run the query and unwrap each returned row into its bare LOCATION_ID value
location_id_list = [row['LOCATION_ID'] for row in location_id_df.collect()]

In [86]:
# Inner-join the order lines (order_detail) to their order headers (order_header),
# using the 'ORDER_ID' column as the common key
merged_df = order_detail.join(order_header, on='ORDER_ID', how='inner')

In [87]:
# Keep only the rows whose LOCATION_ID appears in location_id_list
# (the United States location ids collected in the earlier cell)
merged_df = merged_df.filter(F.col('LOCATION_ID').isin(location_id_list))

In [88]:
# Build the reporting frame: attach the menu attributes to every filtered order
# line and keep the columns the later cells work with.
# NOTE(review): `final_df` was never assigned anywhere before this line, so on a
# fresh kernel this cell raised NameError. The join key and the column list below
# are reconstructed from the column header this cell printed — confirm that they
# match the original (missing) logic.
final_df = merged_df.join(menu_table, on='MENU_ITEM_ID', how='inner').select(
    "MENU_ITEM_ID",
    "ORDER_ID",
    "QUANTITY",
    "UNIT_PRICE",
    "COST_OF_GOODS_USD",
    "PRICE",
    "ORDER_TS",
    "ORDER_AMOUNT",
    "ORDER_TOTAL",
    "MENU_TYPE",
    "TRUCK_BRAND_NAME",
    "MENU_ITEM_NAME",
    "ITEM_CATEGORY",
    "ITEM_SUBCATEGORY",
)

final_df.show()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_ID"  |"QUANTITY"  |"UNIT_PRICE"  |"COST_OF_GOODS_USD"  |"PRICE"  |"ORDER_TS"           |"ORDER_AMOUNT"  |"ORDER_TOTAL"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_NAME"         |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|71              |444016143   |1           |9.0000        |5.0000               |9.0000   |2022-04-06 16:04:17  |23.0000         |23.0000        |Vegetarian   |Plant Palace        |Veggie Burger            |Main             |Hot Option          |
|72         

In [89]:
# Narrow final_df to the columns needed downstream
final_df = final_df.select(
    [
        "MENU_ITEM_ID",
        "ORDER_ID",
        "QUANTITY",
        "UNIT_PRICE",
        "COST_OF_GOODS_USD",
        "PRICE",
        "ORDER_TS",
        "ORDER_TOTAL",
        "MENU_TYPE",
        "TRUCK_BRAND_NAME",
        "MENU_ITEM_NAME",
        "ITEM_CATEGORY",
        "ITEM_SUBCATEGORY",
    ]
)

In [90]:
# Print a sample of final_df to check the column selection made in the previous cell
final_df.show()

-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"MENU_ITEM_ID"  |"ORDER_ID"  |"QUANTITY"  |"UNIT_PRICE"  |"COST_OF_GOODS_USD"  |"PRICE"  |"ORDER_TS"           |"ORDER_TOTAL"  |"MENU_TYPE"  |"TRUCK_BRAND_NAME"  |"MENU_ITEM_NAME"       |"ITEM_CATEGORY"  |"ITEM_SUBCATEGORY"  |
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|82              |440500497   |1           |15.0000       |6.0000               |15.0000  |2021-09-10 11:47:23  |24.0000        |Crepes       |Le Coin des Crêpes  |Chicken Pot Pie Crepe  |Main             |Hot Option          |
|83              |440500497   |1           |9.0000        |4.0000               |9.0000 

In [92]:
# Download the Snowpark result into a local pandas DataFrame.
# The name `final_df` is rebound from a Snowpark DataFrame to a pandas one, so the
# guard keeps this cell safe to re-run: a pandas DataFrame has no to_pandas(), and
# the unguarded call raised AttributeError on a second execution.
if not isinstance(final_df, pd.DataFrame):
    final_df = final_df.to_pandas()

In [93]:
# Display the pandas frame (the notebook renders only the first and last rows)
final_df

Unnamed: 0,MENU_ITEM_ID,ORDER_ID,QUANTITY,UNIT_PRICE,COST_OF_GOODS_USD,PRICE,ORDER_TS,ORDER_TOTAL,MENU_TYPE,TRUCK_BRAND_NAME,MENU_ITEM_NAME,ITEM_CATEGORY,ITEM_SUBCATEGORY
0,51,446720707,1,17.25,8.00,17.25,2022-08-06 20:11:52,17.25,Ramen,Kitakata Ramen Bar,Creamy Chicken Ramen,Main,Hot Option
1,52,446720708,4,17.25,7.00,69.00,2022-08-06 20:11:54,69.00,Ramen,Kitakata Ramen Bar,Spicy Miso Vegetable Ramen,Main,Hot Option
2,53,446720709,2,17.25,7.00,34.50,2022-08-06 20:14:28,109.50,Ramen,Kitakata Ramen Bar,Tonkotsu Ramen,Main,Hot Option
3,52,446720709,1,17.25,7.00,17.25,2022-08-06 20:14:28,109.50,Ramen,Kitakata Ramen Bar,Spicy Miso Vegetable Ramen,Main,Hot Option
4,56,446720709,2,3.00,0.75,6.00,2022-08-06 20:14:28,109.50,Ramen,Kitakata Ramen Bar,Ice Tea,Beverage,Cold Option
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65483023,131,437374057,1,13.00,6.00,13.00,2021-01-07 17:53:32,61.00,Chinese,Peking Truck,Combo Lo Mein,Main,Hot Option
65483024,131,437374058,2,13.00,6.00,26.00,2021-01-07 17:54:59,60.00,Chinese,Peking Truck,Combo Lo Mein,Main,Hot Option
65483025,132,437374058,2,11.00,5.00,22.00,2021-01-07 17:54:59,60.00,Chinese,Peking Truck,Combo Fried Rice,Main,Warm Option
65483026,133,437374058,2,6.00,2.00,12.00,2021-01-07 17:54:59,60.00,Chinese,Peking Truck,Wonton Soup,Main,Warm Option


In [102]:
# Load the MENU table into a pandas DataFrame for the client-side processing below
menu_df = menu_table.to_pandas()

In [103]:
# Flatten MENU_ITEM_HEALTH_METRICS_OBJ into one 0/1 column per dietary flag.

# Convert the string JSON data to nested dictionaries. json.loads is used instead
# of ast.literal_eval because the payload is JSON, not Python source: literal_eval
# raises on the JSON literals true / false / null.
health_metrics = menu_df['MENU_ITEM_HEALTH_METRICS_OBJ'].apply(json.loads)

# The column-wise concat further down pairs rows purely by position, which is only
# correct when every menu item carries exactly one metrics record. Fail loudly
# instead of silently attaching flags to the wrong menu item.
records_per_item = health_metrics.map(lambda obj: len(obj['menu_item_health_metrics']))
if not records_per_item.eq(1).all():
    raise ValueError(
        "Expected exactly one 'menu_item_health_metrics' record per menu item"
    )

# Use json_normalize to flatten the nested JSON data (one output row per record)
menu_item_metrics = pd.json_normalize(
    health_metrics.tolist(), record_path='menu_item_health_metrics'
)

# Give the flattened rows the same index as menu_df so the concat aligns row by row
menu_item_metrics.index = menu_df.index

# Rename the columns
menu_item_metrics = menu_item_metrics.rename(columns={
    'is_dairy_free_flag': 'DAIRY_FREE',
    'is_gluten_free_flag': 'GLUTEN_FREE',
    'is_healthy_flag': 'HEALTHY',
    'is_nut_free_flag': 'NUT_FREE'
})

# Encode the flags numerically: replace 'Y' with 1 and 'N' with 0 in the DataFrame
menu_item_metrics = menu_item_metrics.replace({'Y': 1, 'N': 0})

# Concatenate the flattened DataFrame with the original DataFrame
menu_df = pd.concat([menu_df, menu_item_metrics], axis=1)

# Drop the original 'MENU_ITEM_HEALTH_METRICS_OBJ' and 'ingredients' column
menu_df = menu_df.drop(columns=['MENU_ITEM_HEALTH_METRICS_OBJ', 'ingredients'])

In [104]:
# Keep only the join key and the four dietary flag columns of the menu
menu_filtered_df = menu_df.loc[
    :, ["MENU_ITEM_ID", "DAIRY_FREE", "GLUTEN_FREE", "HEALTHY", "NUT_FREE"]
]

In [106]:
# Attach the dietary flags to every order line: inner merge of 'menu_filtered_df'
# (left) with 'final_df' (right) on their shared 'MENU_ITEM_ID' column
final_merged_df = menu_filtered_df.merge(final_df, how='inner', on='MENU_ITEM_ID')

In [108]:
# Preview the first rows of the merged dataset
final_merged_df.head()

Unnamed: 0,MENU_ITEM_ID,DAIRY_FREE,GLUTEN_FREE,HEALTHY,NUT_FREE,ORDER_ID,QUANTITY,UNIT_PRICE,COST_OF_GOODS_USD,PRICE,ORDER_TS,ORDER_TOTAL,MENU_TYPE,TRUCK_BRAND_NAME,MENU_ITEM_NAME,ITEM_CATEGORY,ITEM_SUBCATEGORY
0,10,1,1,0,1,446735776,1,3.5,0.65,3.5,2022-08-07 08:04:02,42.5,Ice Cream,Freezing Point,Lemonade,Beverage,Cold Option
1,10,1,1,0,1,446735805,1,3.5,0.65,3.5,2022-08-07 08:16:16,21.5,Ice Cream,Freezing Point,Lemonade,Beverage,Cold Option
2,10,1,1,0,1,446735818,1,3.5,0.65,3.5,2022-08-07 08:23:23,21.5,Ice Cream,Freezing Point,Lemonade,Beverage,Cold Option
3,10,1,1,0,1,446735821,1,3.5,0.65,3.5,2022-08-07 08:24:41,27.5,Ice Cream,Freezing Point,Lemonade,Beverage,Cold Option
4,10,1,1,0,1,446735822,3,3.5,0.65,10.5,2022-08-07 08:25:16,35.5,Ice Cream,Freezing Point,Lemonade,Beverage,Cold Option


In [109]:
# Write the merged dataset as CSV (without the index) to the current user's
# Downloads folder.
# The destination is derived from the home directory instead of the hard-coded
# 'C:/Users/donsu/...' path, which only exists on one machine; on that machine it
# should resolve to the same file.
# NOTE(review): the file name has no '.csv' extension; it is kept unchanged so that
# anything already reading this file keeps working.
output_path = Path.home() / "Downloads" / "product_team_power_bi_dataset"
output_path.parent.mkdir(parents=True, exist_ok=True)
final_merged_df.to_csv(output_path, index=False)