In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import time

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

from snowflake.snowpark import Session
from snowflake.snowpark import DataFrame
from snowflake.snowpark.functions import col, current_timestamp, lit
from datetime import datetime

# Benchmark: Snowpark-only pipeline (read -> filter -> timestamp -> write).
# Uses the `session` obtained above from get_active_session().
start_time = datetime.now()

# Point a Snowpark DataFrame at the source table.
input_df = session.table('SUBSCRIPTION_ACCEL.PYTHON_TESTING.INPUT_3RD_LEVEL_TEST3')

# Keep only the rows whose ARR is greater than 3000.
arr_above_threshold = col("ARR") > 3000
filtered_df = input_df.filter(arr_above_threshold)

# Append a "Processed_At" column populated from current_timestamp().
result_df = filtered_df.with_column("Processed_At", current_timestamp())

# Save the result, replacing any existing contents of the target table.
overwrite_writer = result_df.write.mode("overwrite")
overwrite_writer.save_as_table('SUBSCRIPTION_ACCEL.PYTHON_TESTING.PATAGONIA_INPUT_TEST_SNOW1')

# Report the elapsed wall-clock time for the whole cell.
end_time = datetime.now()
print(f"Total time taken: {end_time - start_time}")


In [None]:
from snowflake.snowpark.functions import col
import time

# Benchmark: Snowpark-only filter and write, with a timer around each stage.
# Relies on the notebook-level `session`.
start_time = time.time()

# Point a Snowpark DataFrame at the source table.
source_table = 'SUBSCRIPTION_ACCEL.PYTHON_TESTING.INPUT_3RD_LEVEL_TEST3'
input_df = session.table(source_table)

# Stage 1: apply the ARR > 1000 filter.
# NOTE(review): Snowpark DataFrames are lazily evaluated, so this interval
# presumably covers only building the query plan, not running the filter in
# Snowflake - confirm before comparing it with eager engines.
cache_s_time = time.time()
arr_over_1000 = col("ARR") > 1000
filtered_df = input_df.filter(arr_over_1000)
cache_e_time = time.time()
print(f"⏱️ Caching df filter snowflake: {cache_e_time - cache_s_time:.2f} seconds")

# Stage 2: overwrite the comparison table with the filtered rows.
target_table = 'SUBSCRIPTION_ACCEL.PYTHON_TESTING.PATAGONIA_INPUT_TEST_SNOW2'
cache_s_time = time.time()
overwrite_writer = filtered_df.write.mode("overwrite")
overwrite_writer.save_as_table(target_table)
cache_e_time = time.time()
print(f"⏱️ Caching df save snowflake: {cache_e_time - cache_s_time:.2f} seconds")

# Overall wall-clock time for the cell, in seconds.
end_time = time.time()
runtime = end_time - start_time

print(f"Time taken to process the data: {runtime} seconds")


In [None]:
# Import necessary packages
import polars as pl
import time
from datetime import datetime
from snowflake.snowpark import Session

# Benchmark: Snowflake -> pandas -> Polars -> pandas -> Snowflake round trip.
# Relies on the notebook-level `session` created in an earlier cell.
start_time = datetime.now()

# Fetch the whole source table to the client as a pandas DataFrame.
input_df_snowpark = session.table('SUBSCRIPTION_ACCEL.PYTHON_TESTING.PATAGONIA_INPUT_TEST').to_pandas()

# Convert to a Polars DataFrame (same converter the other cells use).
input_df = pl.from_pandas(input_df_snowpark)

# Simple transformation: keep rows with ARR > 1000
filtered_df = input_df.filter(pl.col("ARR") > 1000)

# Optional: add a processing timestamp column. The value comes from the
# client clock and is the same literal for every row.
filtered_df = filtered_df.with_columns([pl.lit(datetime.now()).alias("Processed_At")])

# Write the output back to Snowflake using Snowpark.
# write_pandas() quotes `table_name` as a single identifier by default, so a
# dotted "DB.SCHEMA.TABLE" string would not address the intended table.
# The database and schema are therefore passed separately.
# auto_create_table=True makes overwrite drop and recreate the table, so a
# pre-existing table with a different column layout does not break the load.
# NOTE(review): this is the same table the Snowpark-only cell at the top of
# the notebook writes to - confirm that overwriting it here is intended.
output_table = 'SUBSCRIPTION_ACCEL.PYTHON_TESTING.PATAGONIA_INPUT_TEST_SNOW1'
output_database, output_schema, output_table_name = output_table.split(".")
session.write_pandas(
    filtered_df.to_pandas(),
    output_table_name,
    database=output_database,
    schema=output_schema,
    auto_create_table=True,
    overwrite=True,
)

# End time
end_time = datetime.now()

# Print duration
print(f"Total time taken: {end_time - start_time}")


In [None]:
# Import necessary packages
# `time` is imported here because this cell calls time.time(); without it the
# cell only works if an earlier cell happened to import the module.
import time

import streamlit as st
import polars as pl
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark import functions as F
from datetime import datetime

# Benchmark: Polars filter on data pulled from Snowflake, with per-stage timers.

# Start time
start_time = datetime.now()

# Get the active Snowpark session
session = get_active_session()

# Point a Snowpark DataFrame at the input table
input_df = session.table('SUBSCRIPTION_ACCEL.PYTHON_TESTING.INPUT_3RD_LEVEL_TEST3')

# Download the table as pandas, then convert it to a Polars DataFrame.
# This transfer is outside the stage timers below; only the total includes it.
polars_df = pl.from_pandas(input_df.to_pandas())

cache_s_time = time.time()

# Simple transformation: keep rows with ARR > 1000
filtered_df = polars_df.filter(pl.col("ARR") > 1000)
cache_e_time = time.time()
print(f"⏱️ Caching df filter polar: {cache_e_time - cache_s_time:.2f} seconds")

# Optional: add a processing timestamp column (client clock, one literal value)
result_df = filtered_df.with_columns(pl.lit(datetime.now()).alias("Processed_At"))

# Write the output to a new table.
# The Polars result is converted to pandas and then to a Snowpark DataFrame
# so it can be saved with the Snowpark writer; the timer covers both steps.
cache_s_time = time.time()
result_snowpark_df = session.create_dataframe(result_df.to_pandas())
result_snowpark_df.write.mode("overwrite").save_as_table('SUBSCRIPTION_ACCEL.PYTHON_TESTING.PATAGONIA_INPUT_TEST_POLAR')
cache_e_time = time.time()
print(f"⏱️ Caching df write polar: {cache_e_time - cache_s_time:.2f} seconds")

# End time
end_time = datetime.now()

# Print duration
print(f"Total time taken: {end_time - start_time}")


In [None]:
from snowflake.snowpark.functions import col
import time
import duckdb
import polars as pl
import pandas as pd

# Benchmark: compare the same ARR > 1000 filter in Snowflake, DuckDB and Polars.
# Relies on the notebook-level `session`.

# Measure full pipeline start time
pipeline_start = time.time()

# Snowflake processing
sf_start = time.time()
input_df = session.table("INPUT_3RD_LEVEL_TEST3")

# Filter in Snowflake
# NOTE(review): Snowpark DataFrames are lazily evaluated, so this interval
# presumably covers only building the query plan; the filter actually runs
# as part of the save below - confirm before comparing the numbers.
cache_s_time = time.time()
filtered_sf_df = input_df.filter(col("ARR") > 1000)
cache_e_time = time.time()
print(f"⏱️ Time taken for snowflake filtering: {cache_e_time - cache_s_time:.2f} seconds")

# Save filtered result to Snowflake
cache_s_time = time.time()
filtered_sf_df.write.mode("overwrite").save_as_table("PATAGONIA_INPUT_SNOW")
cache_e_time = time.time()

sf_end = time.time()
print(f"⏱️ Snowflake filter+write: {sf_end - sf_start:.2f}s, Caching only: {cache_e_time - cache_s_time:.2f}s")

# Export full Snowflake table to Pandas (once) for reuse
export_start = time.time()
pandas_df = input_df.to_pandas()
export_end = time.time()
print(f"⏱️ Time to export Snowflake table to Pandas: {export_end - export_start:.2f}s")

# DuckDB filtering
# query_df() returns a lazy relation; .to_df() forces the query to execute
# inside the timed window so the measurement reflects the actual filtering.
duck_start = time.time()
duck_df = duckdb.query_df(pandas_df, "df", "SELECT * FROM df WHERE ARR > 1000").to_df()
duck_end = time.time()
print(f"🦆 Time taken for DuckDB filtering: {duck_end - duck_start:.2f}s")

# Polars filtering
# The pandas -> Polars conversion happens before the timer starts, so only
# the filter itself is measured here.
pl_df = pl.from_pandas(pandas_df)
polars_start = time.time()
filtered_polars_df = pl_df.filter(pl.col("ARR") > 1000)

polars_end = time.time()
print(f"🦾 Time taken for Polars filtering: {polars_end - polars_start:.2f}s")

# Total
pipeline_end = time.time()
print(f"✅ Total pipeline time: {pipeline_end - pipeline_start:.2f}s")


In [None]:
from snowflake.snowpark.functions import col
import time
import duckdb
import polars as pl
import pandas as pd

# Benchmark: run the same ARR > 1000 filter in Snowflake, DuckDB and Polars,
# and write each engine's result back to its own Snowflake table.
# Relies on the notebook-level `session`.
pipeline_start = time.time()

# --- Snowflake (Snowpark) leg -------------------------------------------
sf_start = time.time()
input_df = session.table("PATAGONIA_INPUT_TEST")

# NOTE(review): Snowpark DataFrames are lazily evaluated, so this interval
# presumably covers only building the query plan, not running the filter -
# confirm before comparing it with the DuckDB/Polars numbers.
snow_start = time.time()
arr_over_1000 = col("ARR") > 1000
filtered_sf_df = input_df.filter(arr_over_1000)
snow_end = time.time()
print(f"⏱️ Snowflake filter: {snow_end - snow_start:.2f}s,")

# Overwrite the Snowflake-side result table; this interval is what the
# message below reports as "Caching only".
cache_s_time = time.time()
snow_writer = filtered_sf_df.write.mode("overwrite")
snow_writer.save_as_table("PATAGONIA_INPUT_SNOW")
cache_e_time = time.time()

sf_end = time.time()
print(f"⏱️ Snowflake filter+write: {sf_end - sf_start:.2f}s, Caching only: {cache_e_time - cache_s_time:.2f}s")

# --- One-off export of the unfiltered table for the client-side engines --
export_start = time.time()
pandas_df = input_df.to_pandas()
export_end = time.time()
print(f"⏱️ Time to export Snowflake table to Pandas: {export_end - export_start:.2f}s")

# --- DuckDB leg -----------------------------------------------------------
# The relation is converted with .to_df() inside the timed window.
duck_start = time.time()
duck_relation = duckdb.query_df(pandas_df, "df", "SELECT * FROM df WHERE ARR > 1000")
duck_df = duck_relation.to_df()
duck_end = time.time()
print(f"🦆 Time taken for DuckDB filtering: {duck_end - duck_start:.2f}s")

# Upload the DuckDB result.
duck_save_start = time.time()
session.write_pandas(duck_df, "PATAGONIA_INPUT_DUCK", overwrite=True)
duck_save_end = time.time()
print(f"🦆 Time taken to save DuckDB result to Snowflake: {duck_save_end - duck_save_start:.2f}s")

# --- Polars leg -----------------------------------------------------------
# Here the pandas -> Polars conversion is part of the timed interval.
polars_start = time.time()
pl_df = pl.from_pandas(pandas_df)
filtered_polars_df = pl_df.filter(pl.col("ARR") > 1000)
polars_end = time.time()
print(f"🦾 Time taken for Polars filtering: {polars_end - polars_start:.2f}s")

# Upload the Polars result; the Polars -> pandas conversion is timed too.
polars_save_start = time.time()
polars_result_pdf = filtered_polars_df.to_pandas()
session.write_pandas(polars_result_pdf, "PATAGONIA_INPUT_POLAR", overwrite=True)
polars_save_end = time.time()
print(f"🦾 Time taken to save Polars result to Snowflake: {polars_save_end - polars_save_start:.2f}s")

# --- Total ----------------------------------------------------------------
pipeline_end = time.time()
print(f"✅ Total pipeline time: {pipeline_end - pipeline_start:.2f}s")
