In [1]:
# Make the project's packages importable from this notebook.
import os
import sys

from dotenv import find_dotenv

# The folder that holds the project's .env file is used as the import root.
# find_dotenv() returns an empty string when no .env file is located; in that
# case nothing is added instead of pushing an empty entry onto sys.path.
# The membership check keeps re-runs of this cell from appending duplicates.
_dotenv_path = find_dotenv()
_project_root = os.path.dirname(_dotenv_path) if _dotenv_path else ""
if _project_root and _project_root not in sys.path:
    sys.path.append(_project_root)

In [2]:
from snowflake.snowpark import functions
import datetime

from app.snowpark_session.session import snowpark_session

In [3]:
# Create the session object that every later cell uses to query Snowflake;
# it is closed explicitly in the notebook's final cell.
sp_session = snowpark_session()

### Run SQL Commands

In [4]:
# Raw SQL text for the sample query.
# NOTE(review): LIMIT without ORDER BY does not pin down which 1000 rows are
# returned — confirm that an arbitrary sample is acceptable here.
sql = """
select *
from snowflake_sample_data.tpch_sf10.lineitem
limit 1000
"""

# Wrap the SQL text in a dataframe used by the next few cells.
df = sp_session.sql(sql)

### Show dataframe

In [5]:
# Print the SQL result as a text table (rendered below).
df.show()

---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"L_ORDERKEY"  |"L_PARTKEY"  |"L_SUPPKEY"  |"L_LINENUMBER"  |"L_QUANTITY"  |"L_EXTENDEDPRICE"  |"L_DISCOUNT"  |"L_TAX"  |"L_RETURNFLAG"  |"L_LINESTATUS"  |"L_SHIPDATE"  |"L_COMMITDATE"  |"L_RECEIPTDATE"  |"L_SHIPINSTRUCT"   |"L_SHIPMODE"  |"L_COMMENT"                                 |
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|51426183      |929063       |4091         |5               |12.00         |13104.24           |0.05          |0.02     |N               |O   

In [6]:
# Fetch the result to the client; the cell output is a list of Row objects.
df.collect()

[Row(L_ORDERKEY=51426183, L_PARTKEY=929063, L_SUPPKEY=4091, L_LINENUMBER=5, L_QUANTITY=Decimal('12.00'), L_EXTENDEDPRICE=Decimal('13104.24'), L_DISCOUNT=Decimal('0.05'), L_TAX=Decimal('0.02'), L_RETURNFLAG='N', L_LINESTATUS='O', L_SHIPDATE=datetime.date(1996, 4, 21), L_COMMITDATE=datetime.date(1996, 6, 26), L_RECEIPTDATE=datetime.date(1996, 5, 17), L_SHIPINSTRUCT='TAKE BACK RETURN', L_SHIPMODE='AIR', L_COMMENT='dencies are f'),
 Row(L_ORDERKEY=51426183, L_PARTKEY=747366, L_SUPPKEY=22388, L_LINENUMBER=6, L_QUANTITY=Decimal('4.00'), L_EXTENDEDPRICE=Decimal('5653.32'), L_DISCOUNT=Decimal('0.10'), L_TAX=Decimal('0.00'), L_RETURNFLAG='N', L_LINESTATUS='O', L_SHIPDATE=datetime.date(1996, 7, 4), L_COMMITDATE=datetime.date(1996, 5, 25), L_RECEIPTDATE=datetime.date(1996, 7, 11), L_SHIPINSTRUCT='COLLECT COD', L_SHIPMODE='MAIL', L_COMMENT='its haggle carefully. depos'),
 Row(L_ORDERKEY=51426208, L_PARTKEY=1254802, L_SUPPKEY=79815, L_LINENUMBER=1, L_QUANTITY=Decimal('23.00'), L_EXTENDEDPRICE=Decim

In [7]:
# Each collected Row exposes its columns as attributes, so individual fields
# can be read by name while walking the result.
collected_rows = df.collect()
for row in collected_rows:
    print(f"{row.L_ORDERKEY}: {row.L_EXTENDEDPRICE}")

51426183: 13104.24
51426183: 5653.32
51426208: 40405.02
51426208: 41019.42
51426208: 34977.92
51426208: 78705.48
51426208: 60014.85
51426208: 12353.25
51426208: 2374.28
51426209: 4579.47
51426210: 82098.96
51426210: 62099.73
51426210: 46204.62
51426211: 5565.55
51426211: 37692.77
51426211: 7186.76
51426211: 38499.52
51426211: 91459.65
51426211: 60573.44
51426211: 5375.50
51426212: 77896.31
51426213: 32531.20
51426214: 58183.61
51426214: 29883.69
51426214: 6661.00
51426214: 57618.36
51426215: 57724.94
51426215: 7303.68
51426215: 19876.16
51426215: 47364.46
51426240: 30683.38
51426240: 53556.75
51426240: 69833.96
51426240: 63113.50
51426240: 46621.27
51426240: 28900.80
51426241: 24293.85
51426241: 63039.14
51426241: 16074.24
51426242: 47745.28
51426242: 38205.44
51426242: 21428.48
51426242: 43972.04
51426242: 34632.60
51426242: 52432.20
51426242: 41682.96
51426243: 73305.60
51426243: 29916.27
51426243: 59946.52
51426243: 85180.96
51426243: 22402.92
51426243: 6174.06
51426243: 8464.64
514

### Set Database and Schema for Session

In [8]:
# Select the session's database, then the schema inside it, so that later
# cells can refer to the table by its unqualified name ("lineitem").
sp_session.use_database("snowflake_sample_data")

sp_session.use_schema("tpch_sf10")

### Create a Dataframe from a Table

In [21]:
# Rebinds df (until now the SQL-query result) to a dataframe over the
# "lineitem" table, addressed by its unqualified name.
df = sp_session.table("lineitem")

### Select from Dataframe and Filter

In [22]:
# Inclusive shipping-date window covering calendar year 1997.
from_date = datetime.date(year=1997, month=1, day=1)
to_date = datetime.date(year=1997, month=12, day=31)

# Columns kept for the revenue calculation in the following cells.
wanted_columns = [
    "L_SHIPDATE",
    "L_EXTENDEDPRICE",
    "L_SHIPMODE",
    "L_QUANTITY",
    "L_DISCOUNT",
]
shipped_in_window = functions.col("L_SHIPDATE").between(from_date, to_date)

# Project first, then keep only rows shipped inside the window.
df = df.select(wanted_columns).filter(shipped_in_window)

In [23]:
# Preview the projected, date-filtered rows.
df.show()

---------------------------------------------------------------------------------
|"L_SHIPDATE"  |"L_EXTENDEDPRICE"  |"L_SHIPMODE"  |"L_QUANTITY"  |"L_DISCOUNT"  |
---------------------------------------------------------------------------------
|1997-07-03    |5565.55            |SHIP          |5.00          |0.01          |
|1997-05-19    |37692.77           |RAIL          |19.00         |0.04          |
|1997-08-10    |7186.76            |RAIL          |4.00          |0.09          |
|1997-07-11    |38499.52           |FOB           |32.00         |0.00          |
|1997-06-11    |91459.65           |REG AIR       |47.00         |0.07          |
|1997-07-29    |60573.44           |AIR           |32.00         |0.10          |
|1997-05-20    |5375.50            |FOB           |5.00          |0.01          |
|1997-06-04    |77896.31           |SHIP          |41.00         |0.04          |
|1997-07-03    |47745.28           |AIR           |44.00         |0.00          |
|1997-07-01    |

### Create New Columns

In [24]:
# Derive REVENUE = (extended price - extended price * discount) * quantity.
# NOTE(review): in the TPC-H sample data L_EXTENDEDPRICE appears to already be
# a per-line total (quantity x unit price), so multiplying by L_QUANTITY again
# may overstate revenue — confirm the intended formula. The 1998 cell uses the
# same expression, so any change must be made to both together.
extended_price = functions.col("L_EXTENDEDPRICE")
discount_amount = extended_price * functions.col("L_DISCOUNT")
discounted_price = extended_price - discount_amount

df = df.with_column("REVENUE", discounted_price * functions.col("L_QUANTITY"))

In [25]:
# Preview the rows with the new REVENUE column appended.
df.show()

--------------------------------------------------------------------------------------------------
|"L_SHIPDATE"  |"L_EXTENDEDPRICE"  |"L_SHIPMODE"  |"L_QUANTITY"  |"L_DISCOUNT"  |"REVENUE"       |
--------------------------------------------------------------------------------------------------
|1997-07-03    |5565.55            |SHIP          |5.00          |0.01          |27549.472500    |
|1997-05-19    |37692.77           |RAIL          |19.00         |0.04          |687516.124800   |
|1997-08-10    |7186.76            |RAIL          |4.00          |0.09          |26159.806400    |
|1997-07-11    |38499.52           |FOB           |32.00         |0.00          |1231984.640000  |
|1997-06-11    |91459.65           |REG AIR       |47.00         |0.07          |3997701.301500  |
|1997-07-29    |60573.44           |AIR           |32.00         |0.10          |1744515.072000  |
|1997-05-20    |5375.50            |FOB           |5.00          |0.01          |26608.725000    |
|1997-06-0

### Grouping

In [26]:
# One output row per shipping mode, carrying the summed REVENUE for 1997.
revenue_sum = functions.sum("REVENUE").alias("TOTAL_REVENUE")
total_1997 = df.group_by("L_SHIPMODE").agg(revenue_sum)

In [27]:
# Display the per-ship-mode revenue totals for 1997.
total_1997.show()

---------------------------------------
|"L_SHIPMODE"  |"TOTAL_REVENUE"       |
---------------------------------------
|TRUCK         |1590733688302.036700  |
|AIR           |1588489012314.814700  |
|RAIL          |1593265749227.381000  |
|SHIP          |1587705642771.441000  |
|MAIL          |1589584697330.409300  |
|FOB           |1588274810649.676300  |
|REG AIR       |1588413005265.937100  |
---------------------------------------



### Joining Dataframes

In [29]:
# Build the 1998 counterpart of the 1997 aggregate.
# NOTE(review): apart from the dates and the output alias this repeats the
# 1997 cells step for step — a shared helper parameterised by year would
# remove the copy. The REVENUE formula is intentionally kept identical to the
# 1997 one so the two totals stay comparable.

# Inclusive shipping-date window covering calendar year 1998
# (rebinds the from_date / to_date names used for 1997 earlier).
from_date = datetime.date(year=1998, month=1, day=1)
to_date = datetime.date(year=1998, month=12, day=31)

shipped_in_1998 = functions.col("L_SHIPDATE").between(from_date, to_date)
df2 = (
    sp_session.table("lineitem")
    .select(
        [
            "L_SHIPDATE",
            "L_EXTENDEDPRICE",
            "L_SHIPMODE",
            "L_QUANTITY",
            "L_DISCOUNT",
        ]
    )
    .filter(shipped_in_1998)
)

# REVENUE = (extended price - extended price * discount) * quantity
extended_price_1998 = functions.col("L_EXTENDEDPRICE")
discounted_price_1998 = extended_price_1998 - (
    extended_price_1998 * functions.col("L_DISCOUNT")
)
df2 = df2.with_column(
    "REVENUE",
    discounted_price_1998 * functions.col("L_QUANTITY"),
)

# One row per shipping mode with the summed 1998 revenue.
revenue_sum_1998 = functions.sum("REVENUE").alias("TOTAL_REVENUE_1998")
total_1998 = df2.group_by("L_SHIPMODE").agg(revenue_sum_1998)

In [33]:
# Preview the 1997 totals with the revenue column renamed; total_1997 itself
# is left untouched because the renamed dataframe is only displayed here.
renamed_preview = total_1997.with_column_renamed(
    functions.col("TOTAL_REVENUE"),
    "TOTAL_REVENUE_1997",
)
renamed_preview.show()

---------------------------------------
|"L_SHIPMODE"  |"TOTAL_REVENUE_1997"  |
---------------------------------------
|TRUCK         |1590733688302.036700  |
|FOB           |1588274810649.676300  |
|AIR           |1588489012314.814700  |
|REG AIR       |1588413005265.937100  |
|SHIP          |1587705642771.441000  |
|MAIL          |1589584697330.409300  |
|RAIL          |1593265749227.381000  |
---------------------------------------



In [34]:
# Persist the rename so the 1997 and 1998 totals carry distinct column names
# before they are joined.
# The guard makes this cell safe to re-run: once total_1997 has been rebound,
# TOTAL_REVENUE no longer exists and an unguarded second rename would raise.
if "TOTAL_REVENUE" in total_1997.columns:
    total_1997 = total_1997.with_column_renamed(
        functions.col("TOTAL_REVENUE"),
        "TOTAL_REVENUE_1997",
    )

In [36]:
# Join the two yearly totals on shipping mode and display the raw result.
# Both inputs contribute an L_SHIPMODE column, which is why the output below
# shows two prefixed copies of it.
same_ship_mode = total_1997.L_SHIPMODE == total_1998.L_SHIPMODE
total_1997.join(total_1998, same_ship_mode).show()

-------------------------------------------------------------------------------------------
|"l_oshb_L_SHIPMODE"  |"TOTAL_REVENUE_1997"  |"r_y70y_L_SHIPMODE"  |"TOTAL_REVENUE_1998"  |
-------------------------------------------------------------------------------------------
|TRUCK                |1590733688302.036700  |TRUCK                |1197327971050.476000  |
|REG AIR              |1588413005265.937100  |REG AIR              |1197951763139.771600  |
|AIR                  |1588489012314.814700  |AIR                  |1198501334545.354000  |
|SHIP                 |1587705642771.441000  |SHIP                 |1198711220773.695500  |
|RAIL                 |1593265749227.381000  |RAIL                 |1196094347242.170000  |
|MAIL                 |1589584697330.409300  |MAIL                 |1198424237452.120300  |
|FOB                  |1588274810649.676300  |FOB                  |1192940262362.108000  |
--------------------------------------------------------------------------------

In [39]:
# Join the yearly totals on shipping mode, then keep a single ship-mode column
# (taken from the 1997 side and exposed as SHIPMODE) next to both totals.
ship_mode_match = total_1997.L_SHIPMODE == total_1998.L_SHIPMODE
joined_totals = total_1997.join(total_1998, ship_mode_match)

joined_dfs = joined_totals.select(
    total_1997.col("L_SHIPMODE").alias("SHIPMODE"),
    "TOTAL_REVENUE_1997",
    "TOTAL_REVENUE_1998",
)

In [40]:
# Display the joined totals, one row per shipping mode.
joined_dfs.show()

------------------------------------------------------------
|"SHIPMODE"  |"TOTAL_REVENUE_1997"  |"TOTAL_REVENUE_1998"  |
------------------------------------------------------------
|AIR         |1588489012314.814700  |1198501334545.354000  |
|TRUCK       |1590733688302.036700  |1197327971050.476000  |
|MAIL        |1589584697330.409300  |1198424237452.120300  |
|RAIL        |1593265749227.381000  |1196094347242.170000  |
|SHIP        |1587705642771.441000  |1198711220773.695500  |
|REG AIR     |1588413005265.937100  |1197951763139.771600  |
|FOB         |1588274810649.676300  |1192940262362.108000  |
------------------------------------------------------------



### Functions

In [43]:
# Pack both yearly totals into one array-valued column and preview the result;
# joined_dfs itself is not reassigned here.
revenue_pair = functions.array_construct(
    functions.col("TOTAL_REVENUE_1997"),
    functions.col("TOTAL_REVENUE_1998"),
)
joined_dfs.with_column("TOTALS", revenue_pair).show()

------------------------------------------------------------------------------------
|"SHIPMODE"  |"TOTAL_REVENUE_1997"  |"TOTAL_REVENUE_1998"  |"TOTALS"               |
------------------------------------------------------------------------------------
|TRUCK       |1590733688302.036700  |1197327971050.476000  |[                      |
|            |                      |                      |  1590733688302.0367,  |
|            |                      |                      |  1197327971050.476    |
|            |                      |                      |]                      |
|SHIP        |1587705642771.441000  |1198711220773.695500  |[                      |
|            |                      |                      |  1587705642771.441,   |
|            |                      |                      |  1198711220773.6955   |
|            |                      |                      |]                      |
|MAIL        |1589584697330.409300  |1198424237452.120300  |[    

In [47]:
# Preview three derived columns without reassigning joined_dfs.

# TOTALS: both yearly totals packed into one array.
totals_array = functions.array_construct(
    functions.col("TOTAL_REVENUE_1997"),
    functions.col("TOTAL_REVENUE_1998"),
)

# COMMENT: fixed text followed by the current user's name.
author_note = functions.concat(
    functions.lit("This logic was created by "),
    functions.current_user(),
)

# TRUST_LEVEL: 0.4 for REG AIR, 0.6 for AIR, 1.0 for every other mode.
trust_by_mode = (
    functions.when(functions.col("SHIPMODE") == "REG AIR", 0.4)
    .when(functions.col("SHIPMODE") == "AIR", 0.6)
    .otherwise(1.0)
)

preview = (
    joined_dfs.with_column("TOTALS", totals_array)
    .with_column("COMMENT", author_note)
    .with_column("TRUST_LEVEL", trust_by_mode)
)
preview.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"SHIPMODE"  |"TOTAL_REVENUE_1997"  |"TOTAL_REVENUE_1998"  |"TOTALS"               |"COMMENT"                             |"TRUST_LEVEL"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|FOB         |1588274810649.676300  |1192940262362.108000  |[                      |This logic was created by PAUL_LUCAS  |1.0            |
|            |                      |                      |  1588274810649.6763,  |                                      |               |
|            |                      |                      |  1192940262362.108    |                                      |               |
|            |                      |                      |]                      |                                      |               |
|TRUCK       |159073

In [48]:
# Same three derived columns as the preview cell, this time kept by rebinding
# joined_dfs to the extended dataframe.

# TOTALS: both yearly totals packed into one array.
totals_expr = functions.array_construct(
    functions.col("TOTAL_REVENUE_1997"),
    functions.col("TOTAL_REVENUE_1998"),
)

# COMMENT: fixed text followed by the current user's name.
comment_expr = functions.concat(
    functions.lit("This logic was created by "),
    functions.current_user(),
)

# TRUST_LEVEL: 0.4 for REG AIR, 0.6 for AIR, 1.0 for every other mode.
trust_expr = (
    functions.when(functions.col("SHIPMODE") == "REG AIR", 0.4)
    .when(functions.col("SHIPMODE") == "AIR", 0.6)
    .otherwise(1.0)
)

joined_dfs = (
    joined_dfs.with_column("TOTALS", totals_expr)
    .with_column("COMMENT", comment_expr)
    .with_column("TRUST_LEVEL", trust_expr)
)

In [49]:
# Display joined_dfs with the TOTALS, COMMENT and TRUST_LEVEL columns added.
joined_dfs.show()

-------------------------------------------------------------------------------------------------------------------------------------------
|"SHIPMODE"  |"TOTAL_REVENUE_1997"  |"TOTAL_REVENUE_1998"  |"TOTALS"               |"COMMENT"                             |"TRUST_LEVEL"  |
-------------------------------------------------------------------------------------------------------------------------------------------
|FOB         |1588274810649.676300  |1192940262362.108000  |[                      |This logic was created by PAUL_LUCAS  |1.0            |
|            |                      |                      |  1588274810649.6763,  |                                      |               |
|            |                      |                      |  1192940262362.108    |                                      |               |
|            |                      |                      |]                      |                                      |               |
|TRUCK       |159073

### Drop Column

In [51]:
# The per-year totals now live inside TOTALS, so remove both scalar columns
# in a single drop call.
joined_dfs = joined_dfs.drop("TOTAL_REVENUE_1997", "TOTAL_REVENUE_1998")

In [52]:
# Display joined_dfs after the two per-year total columns were dropped.
joined_dfs.show()

---------------------------------------------------------------------------------------------
|"SHIPMODE"  |"TOTALS"               |"COMMENT"                             |"TRUST_LEVEL"  |
---------------------------------------------------------------------------------------------
|FOB         |[                      |This logic was created by PAUL_LUCAS  |1.0            |
|            |  1588274810649.6763,  |                                      |               |
|            |  1192940262362.108    |                                      |               |
|            |]                      |                                      |               |
|AIR         |[                      |This logic was created by PAUL_LUCAS  |0.6            |
|            |  1588489012314.8147,  |                                      |               |
|            |  1198501334545.354    |                                      |               |
|            |]                      |                      

### Close Session

In [55]:
# Close the session opened at the top of the notebook now that the
# walkthrough is finished.
sp_session.close()