In [None]:
# most of the imports we will need
import json

import snowflake.snowpark.functions as F
import snowflake.snowpark.types as T
from snowflake.snowpark.functions import udf
from snowflake.snowpark.session import Session

In [None]:
# set up Snowflake session / connection
#
# creds.json must provide the keys read below:
# user, password, role, account and warehouse.
with open('creds.json') as f:
    data = json.load(f)

USERNAME = data['user']
PASSWORD = data['password']
ROLE = data['role']
SF_ACCOUNT = data['account']
SF_WH = data['warehouse']

# The warehouse from creds.json used to be read but never handed to the
# session; pass it along so the session starts with a current warehouse.
CONNECTION_PARAMETERS = {
    "account": SF_ACCOUNT,
    "user": USERNAME,
    "password": PASSWORD,
    "role": ROLE,
    "warehouse": SF_WH,
}

session = Session.builder.configs(CONNECTION_PARAMETERS).create()

In [None]:
# initialize environment:
# every statement below is a CREATE OR REPLACE, so an existing object with the
# same name is replaced

# database for the lab; use_database() makes it the active one in this session
create_db_sql = 'CREATE OR REPLACE DATABASE HOL_DB'
session.sql(create_db_sql).collect()
session.use_database('HOL_DB')

# stage pointing at the S3 bucket with the sample data
create_stage_sql = ''' CREATE OR REPLACE STAGE FROSTBYTE_RAW_STAGE
                URL = 's3://sfquickstarts/data-engineering-with-snowpark-python/'
                ;'''
session.sql(create_stage_sql).collect()

# medium size warehouse with auto-suspend / auto-resume enabled
create_wh_sql = 'CREATE OR REPLACE WAREHOUSE HOL_WH WAREHOUSE_SIZE = MEDIUM, AUTO_SUSPEND = 300, AUTO_RESUME= TRUE;'
session.sql(create_wh_sql).collect()

In [None]:
# names of tables & targets for loading

# point-of-sale tables
POS_TABLES = [
    'country',
    'franchise',
    'location',
    'menu',
    'truck',
    'order_header',
    'order_detail',
]

# customer tables
CUSTOMER_TABLES = ['customer_loyalty']

# each key doubles as the stage sub-directory the tables are read from;
# "schema" is the target schema and "tables" the tables to load into it
TABLE_DICT = dict(
    pos=dict(schema="PUBLIC", tables=POS_TABLES),
    customer=dict(schema="PUBLIC", tables=CUSTOMER_TABLES),
)

# SNOWFLAKE ADVANTAGE: Schema detection
# SNOWFLAKE ADVANTAGE: Data ingestion with COPY
# SNOWFLAKE ADVANTAGE: Snowflake Tables (not file-based)

def load_raw_table(session, tname=None, s3dir=None, year=None, schema=None):
    """Copy one raw Parquet data set from the stage into the table ``tname``.

    Args:
        session: Snowpark session used for reading and loading.
        tname: Name of the stage sub-folder and of the target table.
        s3dir: Folder under ``@frostbyte_raw_stage`` that contains ``tname``.
        year: If given, only the ``year=<year>`` sub-folder is loaded.
        schema: Schema made current on the session before loading.
    """
    session.use_schema(schema)

    # start from the table folder and narrow to one year when requested
    location = f"@frostbyte_raw_stage/{s3dir}/{tname}"
    if year is not None:
        print(f'\tLoading year {year}')
        location = f"{location}/year={year}"

    # we can infer schema using the parquet read option
    reader = session.read.option("compression", "snappy")
    staged_df = reader.parquet(location)
    staged_df.copy_into_table(f"{tname}")

# SNOWFLAKE ADVANTAGE: Warehouse elasticity (dynamic scaling)

def load_all_raw_tables(session):
    """Load every table listed in TABLE_DICT from the raw stage.

    The HOL_WH warehouse is resized to X2LARGE before loading and back to
    MEDIUM afterwards. The resize back to MEDIUM sits in a ``finally`` block
    so that a failed load does not leave the warehouse at the larger size;
    the original exception still propagates to the caller.

    Args:
        session: Snowpark session used to run the ALTER WAREHOUSE statements
            and passed through to load_raw_table.
    """
    session.sql("ALTER WAREHOUSE HOL_WH SET WAREHOUSE_SIZE = X2LARGE WAIT_FOR_COMPLETION = TRUE").collect()

    try:
        for s3dir, data in TABLE_DICT.items():
            schema = data['schema']
            for tname in data['tables']:
                print("Loading {}".format(tname))
                # Only load 1 year of data for the order tables at this point
                # We will load the 2022 data later in the lab
                if tname in ['order_header', 'order_detail']:
                    for year in ['2021']:
                        load_raw_table(session, tname=tname, s3dir=s3dir, year=year, schema=schema)
                else:
                    load_raw_table(session, tname=tname, s3dir=s3dir, schema=schema)
    finally:
        # always scale back down, even when one of the loads raised
        session.sql("ALTER WAREHOUSE HOL_WH SET WAREHOUSE_SIZE = MEDIUM").collect()

def validate_raw_tables(session):
    """Print the column names reported for each loaded raw table.

    Walks POS_TABLES first and CUSTOMER_TABLES second, looking every table up
    in the PUBLIC schema, so the inferred schemas can be checked by eye.

    Args:
        session: Snowpark session used to look the tables up.
    """
    for table_group in (POS_TABLES, CUSTOMER_TABLES):
        for tname in table_group:
            qualified_name = 'PUBLIC.{}'.format(tname)
            column_names = session.table(qualified_name).columns
            print('{}: \n\t{}\n'.format(tname, column_names))

# run the load (which temporarily resizes HOL_WH), then print the column
# names of every loaded table
load_all_raw_tables(session)
validate_raw_tables(session)

In [None]:
# create Point-of-Sale view

# ORDER_HEADER: keep specific columns & add a date based on the timestamp
order_header = session.table("ORDER_HEADER").select(
    F.col("ORDER_ID"),
    F.col("TRUCK_ID"),
    F.col("ORDER_TS"),
    F.to_date(F.col("ORDER_TS")).alias("ORDER_TS_DATE"),
    F.col("ORDER_AMOUNT"),
    F.col("ORDER_TAX_AMOUNT"),
    F.col("ORDER_DISCOUNT_AMOUNT"),
    F.col("LOCATION_ID"),
    F.col("ORDER_TOTAL"),
)

# FRANCHISE: keep the id and rename the first/last name columns
franchise = session.table("FRANCHISE").select(
    F.col("FRANCHISE_ID"),
    F.col("FIRST_NAME").alias("FRANCHISEE_FIRST_NAME"),
    F.col("LAST_NAME").alias("FRANCHISEE_LAST_NAME"),
)

# the remaining tables are pulled in with all of their columns
order_detail = session.table("ORDER_DETAIL")
truck = session.table("TRUCK")
menu = session.table("MENU")
location = session.table("LOCATION")

# join franchise to truck
t_with_f = truck.join(
    franchise,
    truck['FRANCHISE_ID'] == franchise['FRANCHISE_ID'],
    rsuffix='_f',
)

# add in order header and location
oh_w_t = order_header.join(
    t_with_f,
    order_header['TRUCK_ID'] == t_with_f['TRUCK_ID'],
    rsuffix='_t',
)
oh_w_t_and_l = oh_w_t.join(
    location,
    order_header['LOCATION_ID'] == location['LOCATION_ID'],
    rsuffix='_l',
)

# add in order detail, and menu
od_w_oh = order_detail.join(
    oh_w_t_and_l,
    order_detail['ORDER_ID'] == oh_w_t_and_l['ORDER_ID'],
    rsuffix='_oh',
)
joined_df = od_w_oh.join(
    menu,
    order_detail['MENU_ITEM_ID'] == menu['MENU_ITEM_ID'],
    rsuffix='_m',
)

# itemize final column list (in the order the view exposes them)
final_columns = [
    "ORDER_ID",
    "TRUCK_ID",
    "ORDER_TS",
    'ORDER_TS_DATE',
    "ORDER_DETAIL_ID",
    "LINE_NUMBER",
    "TRUCK_BRAND_NAME",
    "MENU_TYPE",
    "PRIMARY_CITY",
    "REGION",
    "COUNTRY",
    "FRANCHISE_FLAG",
    "FRANCHISE_ID",
    "FRANCHISEE_FIRST_NAME",
    "FRANCHISEE_LAST_NAME",
    "LOCATION_ID",
    "MENU_ITEM_ID",
    "MENU_ITEM_NAME",
    "QUANTITY",
    "UNIT_PRICE",
    "PRICE",
    "ORDER_AMOUNT",
    "ORDER_TAX_AMOUNT",
    "ORDER_DISCOUNT_AMOUNT",
    "ORDER_TOTAL",
]
final_df = joined_df.select(*(F.col(name) for name in final_columns))

# create a view based on the above
final_df.create_or_replace_view('POS_FLATTENED_V')

In [None]:
# have a quick look at the result: show 5 rows of the flattened view
tv = session.table('POS_FLATTENED_V')
tv_preview = tv.limit(5)
tv_preview.show()

In [None]:
# get (and peek at) historical weather data for Spain:
# average of AVG_TEMPERATURE_FEELSLIKE_2M_F per country / city / day
weather_es = session.table('FROSTBYTE_WEATHERSOURCE.ONPOINT_ID.HISTORY_DAY').filter(
    F.col('COUNTRY') == 'ES'
)
weather_cols = weather_es.select(
    F.col('COUNTRY').alias('W_COUNTRY'),
    F.col('CITY_NAME'),
    F.col('DATE_VALID_STD'),
    F.col('AVG_TEMPERATURE_FEELSLIKE_2M_F'),
)
weather = weather_cols.group_by(['W_COUNTRY', 'CITY_NAME', 'DATE_VALID_STD']).agg(
    F.avg('AVG_TEMPERATURE_FEELSLIKE_2M_F').as_('AVG_TEMP_F')
)
weather.show()

In [None]:
# create a UDF for converting from fahrenheit to celsius
@udf(name="fahrenheit_to_celsius", is_permanent=True, stage_location="@~", replace=True)
def fahrenheit_to_celsius(temp_f: float) -> float:
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Args:
        temp_f: Temperature in degrees Fahrenheit, or None for a SQL NULL.

    Returns:
        The temperature in degrees Celsius, or None when ``temp_f`` is None.
    """
    # A SQL NULL reaches the handler as None; float(None) would raise a
    # TypeError and fail the whole query, so pass the NULL through instead.
    if temp_f is None:
        return None
    return (float(temp_f) - 32) * (5/9)

In [None]:
# review Spanish daily sales per city
pos_flat = session.table('POS_FLATTENED_V')
pos_cols = pos_flat.select(
    F.col('ORDER_TS_DATE'),
    F.col('COUNTRY'),
    F.col('PRIMARY_CITY'),
    F.col('PRICE'),
)
pos_spain = pos_cols.where(F.col('COUNTRY') == 'Spain')

# total PRICE per country / city / order date
pos_data = pos_spain.group_by('COUNTRY', 'PRIMARY_CITY', 'ORDER_TS_DATE').agg(
    F.sum('PRICE').as_('TOTAL_SALES')
)
pos_data.show()

In [None]:
# combine POS and weather data for Spain
#
# `weather` is already aggregated to one row per
# (W_COUNTRY, CITY_NAME, DATE_VALID_STD) with columns
# W_COUNTRY, CITY_NAME, DATE_VALID_STD and AVG_TEMP_F, so it is joined as-is.
# Grouping it again by the same keys and summing AVG_TEMP_F (as was done
# before) returned the same rows while reading like a "sum of averages".
join_condition = (
    (pos_data['COUNTRY'] == 'Spain')
    & (weather['W_COUNTRY'] == 'ES')
    & (pos_data['PRIMARY_CITY'] == weather['CITY_NAME'])
    & (pos_data['ORDER_TS_DATE'] == weather['DATE_VALID_STD'])
)

pos_and_temp = pos_data.join(weather, join_condition).select(
    F.col('COUNTRY'),
    F.col('CITY_NAME'),
    F.col('DATE_VALID_STD').as_('DATE'),
    F.col('TOTAL_SALES'),
    # temperatures are rounded to one decimal place
    F.round(F.col('AVG_TEMP_F'), 1).alias('AVG_TEMP_F'),
    F.round(fahrenheit_to_celsius('AVG_TEMP_F'), 1).alias('AVG_TEMP_C'),
)
pos_and_temp.show()

In [None]:
# materialize the results as a table
# mode='overwrite' replaces an existing table, so re-running this cell does
# not fail once DAILY_SALES_AND_TEMP_SPAIN already exists
pos_and_temp.write.save_as_table('DAILY_SALES_AND_TEMP_SPAIN', mode='overwrite')

In [None]:
###### next step, left to the reader: update the materialized table daily

## hints:
## create a stream on one of the views
## create a stored procedure to update the materialized table
## create a task to run the procedure daily