In [None]:
import modin.pandas as pd
import snowflake.snowpark.modin.plugin
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION

import json

In [None]:

################################################################################################################
#  You can also use the SnowSQL Client to configure your connection params:
#  https://docs.snowflake.com/en/user-guide/snowsql-install-config.html
#
#  >>> from snowflake.ml.utils import connection_params
#  >>> session = Session.builder.configs(connection_params.SnowflakeLoginOptions()
#  >>> ).create()   
#
#  NOTE: If you have named connection params then specify the connection name
#  Example:
#  
#  >>> session = Session.builder.configs(
#  >>> connection_params.SnowflakeLoginOptions(connection_name='connections.snowml')
#  >>> ).create()
#
#################################################################################################################

# Edit connection.json before creating the session object below.
# The file is read inside a context manager so its handle is closed as soon as
# the parameters are parsed, rather than being left open until garbage collection.
with open('connection.json', encoding='utf-8') as connection_file:
    connection_parameters = json.load(connection_file)

# Create Snowflake Session object
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# Single-row result: column 0 is the current user, column 1 the Snowflake version.
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

# Resize the PANDAS_WH warehouse to XLARGE and wait for the resize to finish
# before continuing. The result is bound to `_` so the cell shows no output.
_ = session.sql("ALTER WAREHOUSE PANDAS_WH SET WAREHOUSE_SIZE = XLARGE WAIT_FOR_COMPLETION = TRUE").collect()

In [None]:
# Stage location of the 2021 partition of the order-detail parquet files.
order_detail_2021_path = "@external.frostbyte_raw_stage/pos/order_detail/year=2021"
df_2021 = pd.read_parquet(order_detail_2021_path)

In [None]:
# First element of the shape tuple is the number of rows.
row_count = df_2021.shape[0]
print(f"Dataframe has {row_count} records")

# Preview the first few rows.
df_2021.head()

In [None]:
# Summary statistics for the 2021 order-detail data.
df_2021.describe()

So, we have a dataframe with over 225M records in it. How much memory is that data taking up on our local machine? We can use `df.memory_usage(deep=True)` to see exactly this.

In [None]:
# Per-column memory footprint as reported for this frame; deep=True asks for
# object contents to be included in the measurement.
df_2021.memory_usage(deep=True)

Notice that it's 0! This is because even though we are using familiar Pandas syntax, all of our computations are pushed to Snowflake, and the data itself actually isn't even present on our local machine.

Now, let's take a look at how easy it is to do some basic pandas operations.

Suppose I want to get all of the records for a specific `MENU_ITEM_ID`:

In [None]:
# Boolean mask selecting the rows for menu item 71, then preview the matches.
is_item_71 = df_2021['MENU_ITEM_ID'] == 71
df_2021[is_item_71].head()

Or maybe I want to do some basic aggregations across `QUANTITY` and `PRICE` for a particular item:

In [None]:
# Select both columns with a list (double brackets). Writing 'QUANTITY', 'PRICE'
# inside single brackets is parsed as one tuple key and raises KeyError on a
# frame with flat column labels.
df_2021[df_2021['MENU_ITEM_ID'] == 71][['QUANTITY', 'PRICE']].agg(['min', 'max', 'mean'])

Or perhaps I want to look at the average number of distinct items per order:

In [None]:
# nunique() counts the distinct MENU_ITEM_ID values within each order;
# count() would tally every non-null row, so an item appearing on several
# lines of the same order would be counted more than once.
df_2021.groupby(['ORDER_ID'])['MENU_ITEM_ID'].nunique().mean()

In [None]:
# Total PRICE per order.
order_price_totals = df_2021.groupby('ORDER_ID')['PRICE'].sum()
order_price_totals

You can of course do some column-based operations:

In [None]:
# Derive a new column by scaling PRICE with a fixed multiplier.
sales_tax_multiplier = 1.07
df_2021['PRICE_WITH_SALES_TAX'] = df_2021['PRICE'] * sales_tax_multiplier

# Preview the frame with the new column.
df_2021.head()

A lot of times, Pandas users will use `.apply(lambda x: ...)` type syntax. Snowpark Pandas also supports this:

In [None]:
# Same calculation expressed element-wise with .apply and a lambda;
# each value is cast to float before being scaled.
df_2021['PRICE_WITH_SALES_TAX_LAMBDA'] = df_2021['PRICE'].apply(
    lambda price: float(price) * 1.07
)

# Preview the frame with the new column.
df_2021.head()