In [1]:
import modin.pandas as pd
import snowflake.snowpark.modin.plugin
from snowflake.snowpark import Session
from snowflake.snowpark.version import VERSION

import json



In [4]:

################################################################################################################
#  You can also use the SnowSQL Client to configure your connection params:
#  https://docs.snowflake.com/en/user-guide/snowsql-install-config.html
#
#  >>> from snowflake.ml.utils import connection_params
#  >>> session = Session.builder.configs(connection_params.SnowflakeLoginOptions()
#  >>> ).create()   
#
#  NOTE: If you have named connection params then specify the connection name
#  Example:
#  
#  >>> session = Session.builder.configs(
#  >>> connection_params.SnowflakeLoginOptions(connection_name='connections.snowml')
#  >>> ).create()
#
#################################################################################################################

# Edit connection_caleb.json with your own connection settings before creating
# the session object below.
# The file is read inside a `with` block so the handle is closed as soon as the
# JSON has been parsed, rather than staying open for the life of the kernel.
with open('connection_caleb.json') as connection_file:
    connection_parameters = json.load(connection_file)

# Create Snowflake Session object
session = Session.builder.configs(connection_parameters).create()
session.sql_simplifier_enabled = True

# One round trip returns a single row: (current user, Snowflake version).
snowflake_environment = session.sql('SELECT current_user(), current_version()').collect()
snowpark_version = VERSION

# Current Environment Details
print('\nConnection Established with the following parameters:')
print('User                        : {}'.format(snowflake_environment[0][0]))
print('Role                        : {}'.format(session.get_current_role()))
print('Database                    : {}'.format(session.get_current_database()))
print('Schema                      : {}'.format(session.get_current_schema()))
print('Warehouse                   : {}'.format(session.get_current_warehouse()))
print('Snowflake version           : {}'.format(snowflake_environment[0][1]))
print('Snowpark for Python version : {}.{}.{}'.format(snowpark_version[0],snowpark_version[1],snowpark_version[2]))

# Resize the PANDAS_WH warehouse to XLARGE and wait for the resize to finish
# before continuing. The result rows are not needed, hence the throwaway name.
# NOTE(review): the warehouse name is hard-coded rather than taken from the
# session, and nothing visible in this notebook scales it back down - confirm
# whether a clean-up step is wanted.
_ = session.sql("ALTER WAREHOUSE PANDAS_WH SET WAREHOUSE_SIZE = XLARGE WAIT_FOR_COMPLETION = TRUE").collect()

 * To change owner, run `chown $USER "/Users/cbaechtold/.snowflake/config.toml"`.
 * To restrict permissions, run `chmod 0600 "/Users/cbaechtold/.snowflake/config.toml"`.




Connection Established with the following parameters:
User                        : CALEB
Role                        : "ACCOUNTADMIN"
Database                    : "PANDAS_DB"
Schema                      : "EXTERNAL"
Warehouse                   : "PANDAS_WH"
Snowflake version           : 8.19.1
Snowpark for Python version : 1.15.0a1


In [6]:
# Read the 2021 partition of the order-detail Parquet files from the stage.
order_detail_2021_path = "@external.frostbyte_raw_stage/pos/order_detail/year=2021"
df_2021 = pd.read_parquet(order_detail_2021_path)

In [27]:
# Report how many rows the frame holds, then preview the first few of them.
row_count = len(df_2021)
record_message = "Dataframe has {} records".format(row_count)
print(record_message)
df_2021.head()

Dataframe has 226710782 records


Unnamed: 0,ORDER_DETAIL_ID,ORDER_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT,ORDER_TS
0,95033128,35178099,73,,0,1,12.0,12.0,,2021-06-03 20:36:16
1,95033129,35178099,76,,1,1,3.0,3.0,,2021-06-03 20:36:16
2,95033130,35178099,71,,2,1,9.0,9.0,,2021-06-03 20:36:16
3,95033194,35178123,73,,0,1,12.0,12.0,,2021-06-03 20:48:45
4,95033195,35178123,71,,1,1,9.0,9.0,,2021-06-03 20:48:45


In [28]:
# Display summary statistics for the columns of df_2021.
df_2021.describe()

Unnamed: 0,ORDER_DETAIL_ID,ORDER_ID,MENU_ITEM_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_TS
count,226710800.0,226710800.0,226710800.0,226710800.0,226710800.0,226710800.0,226710800.0,226710782
mean,466370100.0,233056500.0,84.31278,1.265844,1.53484,9.468883,15.24814,2021-07-17 20:49:21.297000
min,35384140.0,12987420.0,10.0,0.0,1.0,2.0,2.0,2021-01-01 08:00:00
25%,195897500.0,72572960.0,43.0,0.0,1.0,5.0,6.0,2021-04-20 21:07:44
50%,366748200.0,135442300.0,84.0,1.0,1.0,10.0,12.0,2021-07-24 20:27:34
75%,744312300.0,400508400.0,122.0,2.0,2.0,12.5,20.0,2021-10-17 22:07:49.749000
max,900429800.0,457935300.0,156.0,9.0,22.0,21.0,380.0,2021-12-31 22:59:58
std,287384700.0,166530100.0,43.2631,1.32092,0.8495558,5.144056,13.22047,


So, we have a dataframe with over 225M records in it. How much memory is that data taking up on our local machine? We can use `df.memory_usage(deep=True)` to see exactly this.

In [30]:
# Report the memory usage of the index and of each column of df_2021.
df_2021.memory_usage(deep=True)

Index                         0
ORDER_DETAIL_ID               0
ORDER_ID                      0
MENU_ITEM_ID                  0
DISCOUNT_ID                   0
LINE_NUMBER                   0
QUANTITY                      0
UNIT_PRICE                    0
PRICE                         0
ORDER_ITEM_DISCOUNT_AMOUNT    0
ORDER_TS                      0
dtype: int64

Notice that it's 0! This is because even though we are using familiar Pandas syntax, all of our computations are pushed to Snowflake, and the data itself actually isn't even present on our local machine.

Now, let's take a look at how easy it is to do some basic pandas operations.

Suppose I want to get all of the records for a specific `MENU_ITEM_ID`:

In [35]:
# Build the boolean mask first so the filter condition has a readable name.
is_menu_item_71 = df_2021['MENU_ITEM_ID'] == 71
df_2021[is_menu_item_71].head()

Unnamed: 0,ORDER_DETAIL_ID,ORDER_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT,ORDER_TS
2,95033130,35178099,71,,2,1,9.0,9.0,,2021-06-03 20:36:16
4,95033195,35178123,71,,1,1,9.0,9.0,,2021-06-03 20:48:45
7,95033204,35178126,71,,2,3,9.0,27.0,,2021-06-03 20:50:17
9,95033366,35178191,71,,1,1,9.0,9.0,,2021-06-03 21:26:02
584,95054368,35185935,71,,2,2,9.0,18.0,,2021-06-04 09:50:42


Or maybe I want to do some basic aggregations across `QUANTITY` and `PRICE` for a particular item:

In [43]:
# Min / max / mean of QUANTITY and PRICE for menu item 71.
# Several columns must be selected with a *list* (double brackets). A bare
# `frame['QUANTITY', 'PRICE']` passes a tuple, which pandas treats as a single
# column label and rejects with a KeyError.
df_2021[df_2021['MENU_ITEM_ID'] == 71][['QUANTITY', 'PRICE']].agg(['min', 'max', 'mean'])

Unnamed: 0,QUANTITY,PRICE
min,1.0,9.0
max,21.0,189.0
mean,1.708128,15.373154


Or perhaps I want to look at the average number of line items per order:

In [47]:
# count() tallies the non-null MENU_ITEM_ID rows in each order (i.e. line items,
# not distinct items); the mean of those per-order counts is then displayed.
items_per_order = df_2021.groupby(['ORDER_ID'])['MENU_ITEM_ID'].count()
items_per_order.mean()

2.703605

In [48]:
# Total PRICE per order: one value for each ORDER_ID.
order_totals = df_2021.groupby(['ORDER_ID'])['PRICE'].sum()
order_totals

ORDER_ID
12987419     25.0
12987420     29.0
12987421     15.0
12987422     40.0
12987423     26.0
             ... 
457935306    33.0
457935307    33.0
457935308    11.0
457935309    44.0
457935310    72.0
Name: PRICE, Length: 83854997, dtype: float64

You can of course do some column-based operations:

In [51]:
# Multiplying by 1.07 adds 7% on top of PRICE; the result is stored as a new column.
sales_tax_multiplier = 1.07
df_2021['PRICE_WITH_SALES_TAX'] = df_2021['PRICE'] * sales_tax_multiplier
df_2021.head()

Unnamed: 0,ORDER_DETAIL_ID,ORDER_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT,ORDER_TS,PRICE_WITH_SALES_TAX
0,95033128,35178099,73,,0,1,12.0,12.0,,2021-06-03 20:36:16,12.84
1,95033129,35178099,76,,1,1,3.0,3.0,,2021-06-03 20:36:16,3.21
2,95033130,35178099,71,,2,1,9.0,9.0,,2021-06-03 20:36:16,9.63
3,95033194,35178123,73,,0,1,12.0,12.0,,2021-06-03 20:48:45,12.84
4,95033195,35178123,71,,1,1,9.0,9.0,,2021-06-03 20:48:45,9.63


pandas users often reach for `.apply(lambda x: ...)`-style syntax. Snowpark pandas supports this as well:

In [52]:
# Same 7% mark-up as a plain column multiplication, but computed element by
# element through .apply with a lambda to demonstrate that style.
price_with_tax_via_apply = df_2021['PRICE'].apply(lambda price: float(price) * 1.07)
df_2021['PRICE_WITH_SALES_TAX_LAMBDA'] = price_with_tax_via_apply
df_2021.head()

Unnamed: 0,ORDER_DETAIL_ID,ORDER_ID,MENU_ITEM_ID,DISCOUNT_ID,LINE_NUMBER,QUANTITY,UNIT_PRICE,PRICE,ORDER_ITEM_DISCOUNT_AMOUNT,ORDER_TS,PRICE_WITH_SALES_TAX,PRICE_WITH_SALES_TAX_LAMBDA
0,95033128,35178099,73,,0,1,12.0,12.0,,2021-06-03 20:36:16,12.84,12.84
1,95033129,35178099,76,,1,1,3.0,3.0,,2021-06-03 20:36:16,3.21,3.21
2,95033130,35178099,71,,2,1,9.0,9.0,,2021-06-03 20:36:16,9.63,9.63
3,95033194,35178123,73,,0,1,12.0,12.0,,2021-06-03 20:48:45,12.84,12.84
4,95033195,35178123,71,,1,1,9.0,9.0,,2021-06-03 20:48:45,9.63,9.63
