# Snowpark - Why to Avoid Pandas DataFrames


By comparing the execution times of Pandas dataframes and Snowpark dataframes on the same dataset, it is possible to analyze the performance differences between these two approaches. 

In [2]:
# Import required libraries
# Snowpark
from snowflake.snowpark.functions import sproc
from snowflake.snowpark.session import Session
from snowflake.snowpark import version as v
import json

import pandas as pd
import numpy as np


# Read the Snowflake connection settings from the local JSON file.
with open('credentials.json') as credentials_file:
    connection_parameters = json.load(credentials_file)

# Open the Snowpark session and print the role / warehouse / database / schema in effect.
session = Session.builder.configs(connection_parameters).create()
context_rows = session.sql(
    'select current_role(),current_warehouse(), current_database(), current_schema()'
).collect()
print(context_rows)

# Register session-level package dependencies.
session.add_packages('snowflake-snowpark-python', 'pandas', 'numpy')



[Row(CURRENT_ROLE()='POLARSLED_SQLSERVER', CURRENT_WAREHOUSE()='COE_PRACTISE_WH', CURRENT_DATABASE()='SQLSERVER_DEV', CURRENT_SCHEMA()='SNOWPARK_TEST')]


In [5]:
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE STAGE raises an
# error once the stage already exists (e.g. on Restart Kernel -> Run All).
session.sql('create stage if not exists test_stage;').collect()

[Row(status='Stage area TEST_STAGE successfully created.')]

Create a stored procedure that removes duplicates using a Snowpark DataFrame, and analyse how the resulting SQL is run in your Snowflake account.

In [7]:
%%time
import snowflake.snowpark
from snowflake.snowpark.functions import sproc

@sproc(
    name="write_distinct_df_to_table",
    is_permanent=True,
    stage_location="@test_stage",
    replace=True,
    packages=["snowflake-snowpark-python"],
)
def write_distinct_df_to_table(session: snowflake.snowpark.Session, x: str) -> str:
    """Write the de-duplicated LINEITEM rows to the ``temp_lineitem`` table.

    The whole pipeline is expressed with Snowpark DataFrame operations
    (table read, ``dropDuplicates``, ``save_as_table``); no pandas DataFrame
    is created.

    Args:
        session: Snowpark session the procedure runs with.
        x: Not used by the body; the call below passes ``'test'``.

    Returns:
        A fixed status message.
    """
    (
        session.table('SFC_SAMPLES_SAMPLE_DATA.TPCH_SF1.LINEITEM')
        .dropDuplicates()
        .write.mode("overwrite")
        .save_as_table("temp_lineitem")
    )
    return 'Table Created Sucessfully'


# Invoke the procedure that was just registered.
session.sql("call write_distinct_df_to_table('test')").collect()

CPU times: user 164 ms, sys: 8.54 ms, total: 172 ms
Wall time: 22.1 s


[Row(WRITE_DISTINCT_DF_TO_TABLE='Table Created Sucessfully')]

Create a stored procedure that removes duplicates using a pandas DataFrame.

In [8]:
%%time
import snowflake.snowpark
from snowflake.snowpark.functions import sproc

# Session-level packages. Note that the `packages=` argument passed to @sproc
# below takes precedence over these for that procedure.
session.add_packages("snowflake-snowpark-python", "pandas")


@sproc(
    name="write_distinct_df_to_table",
    is_permanent=True,
    stage_location="@test_stage",
    replace=True,
    # pandas is declared here because a procedure-level package list overrides
    # the session-level one, and the body relies on to_pandas()/write_pandas().
    packages=["snowflake-snowpark-python", "pandas"],
)
def write_distinct_df_to_table(session: snowflake.snowpark.Session) -> str:
    """Write the de-duplicated LINEITEM rows to ``temp_lineitem`` via pandas.

    The table is pulled into a pandas DataFrame with ``to_pandas()``,
    de-duplicated with ``drop_duplicates()``, and written back with
    ``write_pandas``.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        A fixed status message.
    """
    lineitem_df = session.table('SFC_SAMPLES_SAMPLE_DATA.TPCH_SF1.LINEITEM')
    lineitem_pandasdf = lineitem_df.to_pandas().drop_duplicates()
    # overwrite=True mirrors mode("overwrite") in the Snowpark variant; without
    # it write_pandas appends, so re-running the cell would add the rows again.
    session.write_pandas(
        lineitem_pandasdf,
        "temp_lineitem",
        auto_create_table=True,
        overwrite=True,
    )
    return 'Table Created Sucessfully'


# Invoke the procedure that was just registered.
session.sql("call write_distinct_df_to_table()").collect()

CPU times: user 222 ms, sys: 15.5 ms, total: 237 ms
Wall time: 2min 47s


[Row(WRITE_DISTINCT_DF_TO_TABLE='Table Created Sucessfully')]

## <b>Snowpark df is roughly 8x faster than Pandas df (22.1 s vs. 2 min 47 s wall time).</b>