In [1]:
# !pip install Faker

## Creating Snowpark Session

In [1]:
from snowflake.snowpark.session import Session
from snowflake.snowpark import functions as F
from snowflake.snowpark import types as T
from snowflake.snowpark.types import *
from snowflake.snowpark.functions import udf, avg, col,lit,call_udf,sum,min,sql_expr
import snowflake.snowpark as snp

from config import snowflake_conn_prop_local as snowflake_conn_prop

# Report the Snowpark client version before connecting.
print(f"Using Snowpark for Python version: {snp.__version__}")

# Build the session from the connection properties imported from `config`.
session = Session.builder.configs(snowflake_conn_prop).create()

# Ask the server which warehouse / database / schema the session is using.
print(
    session.sql(
        'select current_warehouse(), current_database(), current_schema()'
    ).collect()
)

# The same context, read through the session's accessor methods.
print(
    f"Current schema: {session.get_fully_qualified_current_schema()}, "
    f"current role: {session.get_current_role()}, "
    f"current warehouse:  {session.get_current_warehouse()}"
)


Using Snowpark for Python version: 1.1.0
[Row(CURRENT_WAREHOUSE()='CLUSTER1', CURRENT_DATABASE()='NYCTAXI', CURRENT_SCHEMA()='TAXI')]
Current schema: "NYCTAXI"."TAXI", current role: "ACCOUNTADMIN", current warehouse:  "CLUSTER1"


In [3]:
from snowflake.snowpark.functions import udf, avg, col,lit,call_udf,sum,min,sql_expr

In [4]:
from faker import Faker

# Seed Faker's shared random generator so the generated names and salaries
# (and therefore every result derived from them) are the same on each
# Restart & Run All.
Faker.seed(42)
faker = Faker()


## Creating DataFrame

In [5]:
# Thirteen (name, department, salary) records. Every name comes from Faker;
# some salaries are fixed and the rest come from faker.random_int(5000, 10000).
allempData = (
    (faker.name(), "Sales", 4000),
    (faker.name(), "Sales", 5000),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Sales", 5000),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Finance", 3000),
    (faker.name(), "Finance", 4500),
    (faker.name(), "Finance", 7500),
    (faker.name(), "IT", faker.random_int(5000, 10000)),
    (faker.name(), "IT", faker.random_int(5000, 10000)),
    (faker.name(), "Marketing", 5500),
    (faker.name(), "Marketing", 5500),
    (faker.name(), "Marketing", faker.random_int(5000, 10000)),
)

# Column names are supplied as a plain list; the column types are not stated here.
columns = ["EmployeeName", "Department", "Salary"]
df = session.createDataFrame(data=allempData, schema=columns)
df.show()



----------------------------------------------
|"EMPLOYEENAME"    |"DEPARTMENT"  |"SALARY"  |
----------------------------------------------
|Gabriela Barber   |Sales         |4000      |
|Robert Hammond    |Sales         |5000      |
|Jeffrey Fleming   |Sales         |8141      |
|Jessica Campbell  |Sales         |5000      |
|Michele Morris    |Sales         |5568      |
|Brandi Bryant     |Finance       |3000      |
|Bryce Barr        |Finance       |4500      |
|Samantha Wood     |Finance       |7500      |
|David Smith       |IT            |7291      |
|Donald Reyes      |IT            |9976      |
----------------------------------------------



In [6]:
# Inspect the schema as a list of StructField entries (name, type, nullability).
df.schema.fields

[StructField('EMPLOYEENAME', StringType(), nullable=False),
 StructField('DEPARTMENT', StringType(), nullable=False),
 StructField('SALARY', LongType(), nullable=False)]

In [7]:
# Column names paired with their type names.
df.dtypes

[('EMPLOYEENAME', 'string'), ('DEPARTMENT', 'string'), ('SALARY', 'bigint')]

In [11]:
# Convert the Snowpark DataFrame into a pandas DataFrame.
pd_df=df.toPandas()

In [12]:
# Confirm the type of the converted object.
type(pd_df)

pandas.core.frame.DataFrame

In [8]:
 #GroupBy and Agg 
    
# Per-department sum, average and maximum salary. The three aggregates
# reference the column in three different ways: by name, F.col, and attribute.
(
    df.groupBy(['Department'])
    .agg(
        F.sum('Salary').alias("SumofSalary"),
        F.avg(F.col('Salary')).alias("AvgofSalary"),
        F.max(df.Salary).alias("MaxofSalary"),
    )
    .show()
)

----------------------------------------------------------------
|"DEPARTMENT"  |"SUMOFSALARY"  |"AVGOFSALARY"  |"MAXOFSALARY"  |
----------------------------------------------------------------
|Sales         |27709          |5541.800000    |8141           |
|Finance       |15000          |5000.000000    |7500           |
|IT            |17267          |8633.500000    |9976           |
|Marketing     |20653          |6884.333333    |9653           |
----------------------------------------------------------------



In [9]:
# The same per-department aggregation as a single call chain.
df.groupBy(['Department']).agg(
    F.sum('Salary').alias("SumofSalary"),
    F.avg(F.col('Salary')).alias("AvgofSalary"),
    F.max(df.Salary).alias("MaxofSalary"),
).show()

----------------------------------------------------------------
|"DEPARTMENT"  |"SUMOFSALARY"  |"AVGOFSALARY"  |"MAXOFSALARY"  |
----------------------------------------------------------------
|Sales         |27709          |5541.800000    |8141           |
|Finance       |15000          |5000.000000    |7500           |
|IT            |17267          |8633.500000    |9976           |
|Marketing     |20653          |6884.333333    |9653           |
----------------------------------------------------------------



## Using Window functions

In [13]:
# from pyspark.sql.window import Window
# from pyspark.sql.functions import row_number,rank,dense_rank,percent_rank,ntile,lead,lag,col

from snowflake.snowpark import functions as F
from snowflake.snowpark.window import Window
from snowflake.snowpark.functions import row_number,rank,dense_rank,percent_rank,ntile,lead,lag,col,call_builtin,call_udf,avg,min,max,sum,array_agg
from snowflake.snowpark.types import IntegerType,StringType,DoubleType,StructField,StructType,FloatType

# Window over each department's rows, ordered by salary.
windowSpec = Window.partition_by("Department").order_by("Salary")
# Partition-only window (NOB = no ORDER BY) for whole-department aggregates.
windowSpec_NOB = Window.partition_by("Department")

# Number the rows inside each department following the salary ordering.
df.with_column("row_number", row_number().over(windowSpec)).to_pandas()

Unnamed: 0,EMPLOYEENAME,DEPARTMENT,SALARY,ROW_NUMBER
0,David Smith,IT,7291,1
1,Donald Reyes,IT,9976,2
2,Craig Ramirez,Marketing,5500,1
3,Michael Anderson,Marketing,5500,2
4,George Wilson,Marketing,9653,3
5,Brandi Bryant,Finance,3000,1
6,Bryce Barr,Finance,4500,2
7,Samantha Wood,Finance,7500,3
8,Gabriela Barber,Sales,4000,1
9,Robert Hammond,Sales,5000,2


In [14]:
# Add rank and dense_rank per department, then keep only the rows whose
# dense rank is 1 or 2.
(
    df.withColumn("rank", rank().over(windowSpec))
    .withColumn("dense_rank", dense_rank().over(windowSpec))
    .filter(col("dense_rank") <= 2)
    .show()
)

----------------------------------------------------------------------
|"EMPLOYEENAME"    |"DEPARTMENT"  |"SALARY"  |"RANK"  |"DENSE_RANK"  |
----------------------------------------------------------------------
|David Smith       |IT            |7291      |1       |1             |
|Donald Reyes      |IT            |9976      |2       |2             |
|Craig Ramirez     |Marketing     |5500      |1       |1             |
|Michael Anderson  |Marketing     |5500      |1       |1             |
|George Wilson     |Marketing     |9653      |3       |2             |
|Brandi Bryant     |Finance       |3000      |1       |1             |
|Bryce Barr        |Finance       |4500      |2       |2             |
|Gabriela Barber   |Sales         |4000      |1       |1             |
|Robert Hammond    |Sales         |5000      |2       |2             |
|Jessica Campbell  |Sales         |5000      |2       |2             |
----------------------------------------------------------------------



In [15]:
# Percent rank of each salary within its department.
(
    df.withColumn("percentage_rank", percent_rank().over(windowSpec))
    .toPandas()
)

Unnamed: 0,EMPLOYEENAME,DEPARTMENT,SALARY,PERCENTAGE_RANK
0,David Smith,IT,7291,0.0
1,Donald Reyes,IT,9976,1.0
2,Craig Ramirez,Marketing,5500,0.0
3,Michael Anderson,Marketing,5500,0.0
4,George Wilson,Marketing,9653,1.0
5,Brandi Bryant,Finance,3000,0.0
6,Bryce Barr,Finance,4500,0.5
7,Samantha Wood,Finance,7500,1.0
8,Gabriela Barber,Sales,4000,0.0
9,Robert Hammond,Sales,5000,0.25


In [16]:
# Salary of the previous (lag) and next (lead) row within each department
# window; an offset of 1 means the immediately adjacent row.
(
    df.withColumn("lag", lag("salary", 1).over(windowSpec))
    .withColumn("lead", lead("salary", 1).over(windowSpec))
    .show()
)

---------------------------------------------------------------
|"EMPLOYEENAME"    |"DEPARTMENT"  |"SALARY"  |"LAG"  |"LEAD"  |
---------------------------------------------------------------
|David Smith       |IT            |7291      |NULL   |9976    |
|Donald Reyes      |IT            |9976      |7291   |NULL    |
|Craig Ramirez     |Marketing     |5500      |NULL   |5500    |
|Michael Anderson  |Marketing     |5500      |5500   |9653    |
|George Wilson     |Marketing     |9653      |5500   |NULL    |
|Brandi Bryant     |Finance       |3000      |NULL   |4500    |
|Bryce Barr        |Finance       |4500      |3000   |7500    |
|Samantha Wood     |Finance       |7500      |4500   |NULL    |
|Gabriela Barber   |Sales         |4000      |NULL   |5000    |
|Robert Hammond    |Sales         |5000      |4000   |5000    |
---------------------------------------------------------------



In [28]:
# Department-wide average via call_builtin("avg", ...) next to the
# department-wide sum via the imported `sum` function. Both use the
# partition-only window.
(
    df.withColumn("AvgSal", call_builtin("avg", col("salary")).over(windowSpec_NOB))
    .withColumn("SumSal", sum(col("salary")).over(windowSpec_NOB))
    .toPandas()
)

Unnamed: 0,EMPLOYEENAME,DEPARTMENT,SALARY,AVGSAL,SUMSAL
0,Christina Pollard,Sales,4000,5714.0,28570
1,Christopher Walker,Sales,5000,5714.0,28570
2,Collin Brooks,Sales,7732,5714.0,28570
3,Zachary Lopez,Sales,5000,5714.0,28570
4,Amanda Briggs,Sales,6838,5714.0,28570
5,Michael Valdez,Finance,3000,5000.0,15000
6,Brian Brown,Finance,4500,5000.0,15000
7,Steven Nixon,Finance,7500,5000.0,15000
8,Dr. Jessica Jordan MD,IT,5114,7502.0,15004
9,Jennifer Robinson,IT,9890,7502.0,15004


In [29]:
#Without orderby in window function

#windowSpec_NOB  = Window.partition_by("Department")


# Average and sum over the whole department, using the window without ORDER BY.
(
    df.withColumn("AvgSal", avg("salary").over(windowSpec_NOB))
    .withColumn("SumSal", sum(col("salary")).over(windowSpec_NOB))
    .toPandas()
)

Unnamed: 0,EMPLOYEENAME,DEPARTMENT,SALARY,AVGSAL,SUMSAL
0,Christina Pollard,Sales,4000,5714.0,28570
1,Christopher Walker,Sales,5000,5714.0,28570
2,Collin Brooks,Sales,7732,5714.0,28570
3,Zachary Lopez,Sales,5000,5714.0,28570
4,Amanda Briggs,Sales,6838,5714.0,28570
5,Michael Valdez,Finance,3000,5000.0,15000
6,Brian Brown,Finance,4500,5000.0,15000
7,Steven Nixon,Finance,7500,5000.0,15000
8,Dr. Jessica Jordan MD,IT,5114,7502.0,15004
9,Jennifer Robinson,IT,9890,7502.0,15004


In [30]:

# All three windows partition by Department and order by Salary; they differ
# only in the row frame.

# Frame: start of the partition through the current row.
windowSpec_unboundedpre = (
    Window.partitionBy(col("Department"))
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Frame: the previous row and the current row.
windowSpec_prevrow = (
    Window.partitionBy(col("Department"))
    .orderBy("Salary")
    .rowsBetween(-1, Window.currentRow)
)

# Frame: the current row and the next row.
windowSpec_nextvrow = (
    Window.partitionBy(col("Department"))
    .orderBy("Salary")
    .rowsBetween(Window.currentRow, 1)
)

In [31]:
# Running totals per department.
# The frame runs from the first row of the partition to the current row, so
# each aggregate accumulates along the salary ordering.
windowSpec_unboundedpre = (
    Window.partitionBy(col("Department"))
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

(
    df.withColumn("AvgSal", avg("Salary").over(windowSpec_unboundedpre))
    .withColumn("RunningTotal", sum(col("Salary")).over(windowSpec_unboundedpre))
    .show(11)
)

-------------------------------------------------------------------------------
|"EMPLOYEENAME"         |"DEPARTMENT"  |"SALARY"  |"AVGSAL"  |"RUNNINGTOTAL"  |
-------------------------------------------------------------------------------
|Dean Evans             |Marketing     |5500      |5500.000  |5500            |
|Aaron Young            |Marketing     |5500      |5500.000  |11000           |
|Megan Riley            |Marketing     |9338      |6779.333  |20338           |
|Michael Valdez         |Finance       |3000      |3000.000  |3000            |
|Brian Brown            |Finance       |4500      |3750.000  |7500            |
|Steven Nixon           |Finance       |7500      |5000.000  |15000           |
|Dr. Jessica Jordan MD  |IT            |5114      |5114.000  |5114            |
|Jennifer Robinson      |IT            |9890      |7502.000  |15004           |
|Christina Pollard      |Sales         |4000      |4000.000  |4000            |
|Christopher Walker     |Sales         |

In [32]:
# Sum of the current salary and the next row's salary within the department.
(
    df.withColumn("SumSal", sum(col("Salary")).over(windowSpec_nextvrow))
    .show(11)
)

-----------------------------------------------------------
|"EMPLOYEENAME"      |"DEPARTMENT"  |"SALARY"  |"SUMSAL"  |
-----------------------------------------------------------
|Collin Brooks       |Sales         |7732      |7732      |
|Amanda Briggs       |Sales         |6838      |14570     |
|Christopher Walker  |Sales         |5000      |11838     |
|Zachary Lopez       |Sales         |5000      |10000     |
|Christina Pollard   |Sales         |4000      |9000      |
|Megan Riley         |Marketing     |9338      |9338      |
|Dean Evans          |Marketing     |5500      |14838     |
|Aaron Young         |Marketing     |5500      |11000     |
|Steven Nixon        |Finance       |7500      |7500      |
|Brian Brown         |Finance       |4500      |12000     |
|Michael Valdez      |Finance       |3000      |7500      |
-----------------------------------------------------------



In [33]:
# Sum of the current salary and the previous row's salary within the department.
(
    df.withColumn("SumSal", sum(col("Salary")).over(windowSpec_prevrow))
    .show(11)
)

-----------------------------------------------------------
|"EMPLOYEENAME"      |"DEPARTMENT"  |"SALARY"  |"SUMSAL"  |
-----------------------------------------------------------
|Christina Pollard   |Sales         |4000      |4000      |
|Christopher Walker  |Sales         |5000      |9000      |
|Zachary Lopez       |Sales         |5000      |10000     |
|Amanda Briggs       |Sales         |6838      |11838     |
|Collin Brooks       |Sales         |7732      |14570     |
|Dean Evans          |Marketing     |5500      |5500      |
|Aaron Young         |Marketing     |5500      |11000     |
|Megan Riley         |Marketing     |9338      |14838     |
|Michael Valdez      |Finance       |3000      |3000      |
|Brian Brown         |Finance       |4500      |7500      |
|Steven Nixon        |Finance       |7500      |12000     |
-----------------------------------------------------------



In [34]:


# Fourteen employee records with random salaries; one Marketing record has
# None as its name.
newempData = (
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Sales", faker.random_int(5000, 10000)),
    (faker.name(), "Finance", faker.random_int(5000, 10000)),
    (faker.name(), "Finance", faker.random_int(5000, 10000)),
    (faker.name(), "Finance", faker.random_int(5000, 10000)),
    (faker.name(), "IT", faker.random_int(5000, 10000)),
    (faker.name(), "IT", faker.random_int(5000, 10000)),
    (faker.name(), "Marketing", faker.random_int(5000, 10000)),
    (faker.name(), "Marketing", faker.random_int(5000, 10000)),
    (None, "Marketing", faker.random_int(5000, 10000)),
    (faker.name(), "Marketing", faker.random_int(5000, 10000)),
)

# Explicit schema: every field is declared nullable.
empSchema = StructType(
    [
        StructField("EmployeeName", StringType(), True),
        StructField("Department", StringType(), True),
        StructField("Salary", LongType(), True),
    ]
)

df_udf = session.createDataFrame(data=newempData, schema=empSchema)


In [35]:
# Inspect the fields of the explicitly-typed DataFrame's schema.
df_udf.schema.fields

[StructField('EMPLOYEENAME', StringType(), nullable=True),
 StructField('DEPARTMENT', StringType(), nullable=False),
 StructField('SALARY', LongType(), nullable=False)]

In [36]:
# Display the DataFrame as a pandas DataFrame.
df_udf.toPandas()

Unnamed: 0,EMPLOYEENAME,DEPARTMENT,SALARY
0,Kristina Johnson,Sales,6452
1,Alison Graham,Sales,9965
2,Kirk James,Sales,8230
3,Bryce Daniels,Sales,9122
4,Robert Berry,Sales,5555
5,Angela Davidson,Finance,7194
6,Roger Johnson,Finance,7707
7,Christine Martin,Finance,9703
8,Christopher Berry,IT,9180
9,Oscar Walter,IT,9385


In [37]:
# Confirm the type of df_udf.
type(df_udf)

snowflake.snowpark.dataframe.DataFrame

In [38]:
# Expose the DataFrame to SQL as a temporary view named "emp".
df_udf.createOrReplaceTempView("emp")

[Row(status='View EMP successfully created.')]

In [39]:
# Query the "emp" view with plain SQL.
session.sql("select * from emp").show()

-----------------------------------------------
|"EMPLOYEENAME"     |"DEPARTMENT"  |"SALARY"  |
-----------------------------------------------
|Kristina Johnson   |Sales         |6452      |
|Alison Graham      |Sales         |9965      |
|Kirk James         |Sales         |8230      |
|Bryce Daniels      |Sales         |9122      |
|Robert Berry       |Sales         |5555      |
|Angela Davidson    |Finance       |7194      |
|Roger Johnson      |Finance       |7707      |
|Christine Martin   |Finance       |9703      |
|Christopher Berry  |IT            |9180      |
|Oscar Walter       |IT            |9385      |
-----------------------------------------------



## Reading from External Storage

In [40]:
# CREATE OR REPLACE STORAGE INTEGRATION azure_integration
#   TYPE = EXTERNAL_STAGE
#   STORAGE_PROVIDER = AZURE
#   ENABLED = TRUE
#   AZURE_TENANT_ID = ''
#   STORAGE_ALLOWED_LOCATIONS = ('azure://phanidemostgacct.blob.core.windows.net/staging','azure://phanidemostgacct.blob.core.windows.net/curated');

In [17]:
# List the files under parquetfiles/ on the azure_stage stage.
session.sql("list @azure_stage/parquetfiles/").show(max_width=100)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"name"                                                                                                |"size"   |"md5"                             |"last_modified"                |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|azure://rawdemodatastorage.blob.core.windows.net/sourcedata/parquetfiles/part-00000-tid-787730706...  |1232348  |71c12069a63d65a0babd7d16808731f4  |Mon, 10 Apr 2023 14:00:08 GMT  |
|azure://rawdemodatastorage.blob.core.windows.net/sourcedata/parquetfiles/part-00001-tid-787730706...  |1232940  |3225ca79a5ccaf1591c17839fe13745c  |Mon, 10 Apr 2023 14:00:08 GMT  |
|azure://rawdemodatastorage.blob.core.windows.net/sourcedata/parquetfiles/part-00002-tid-7

### Reading CSV Files

In [18]:
from snowflake.snowpark.functions import sql_expr,json_extract_path_text,parse_json,to_json,get_path

In [19]:
# Column layout applied to the customer CSV files.
cust_schema = StructType(
    [
        StructField("C_CUSTKEY", IntegerType()),
        StructField("C_NAME", StringType()),
        StructField("C_ADDRESS", StringType()),
        StructField("C_NATIONKEY", ShortType()),
        StructField("C_PHONE", StringType()),
        StructField("C_ACCTBAL", DoubleType()),
        StructField("C_MKTSEGMENT", StringType()),
        StructField("C_COMMENT", StringType()),
    ]
)

# Read the staged CSV files: skip one header line, split on commas, and allow
# fields to be optionally enclosed in double quotes.
df_cust = (
    session.read.option("skip_header", 1)
    .option("field_delimiter", ",")
    .option("FIELD_OPTIONALLY_ENCLOSED_BY", '"')
    .schema(cust_schema)
    .csv("@azure_stage/csvfiles/")
)

df_cust.limit(5).toPandas()

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
0,48186,Customer#000048186,"uDRQH,RXtNl",0,10-212-919-2237,2830.4,MACHINERY,"s near the regular, bold requests cajole fluff..."
1,34309,Customer#000034309,"fDbr,ZUNaWAWiZfJbotLRUFUN3UWM69aZBzX3mdf",0,10-206-864-6173,6716.09,FURNITURE,ccounts. blithely pending deposit
2,33724,Customer#000033724,8CyHIl0gQbNv4ZcvnNe0M7McGUBGS,0,10-351-808-2162,3462.85,FURNITURE,"sits. special, regular theodolites wake blithe..."
3,39854,Customer#000039854,V2RNPnRPekf,0,10-639-634-3099,9051.31,MACHINERY,aggle among the theodolites. carefully even acco
4,38217,Customer#000038217,"Br4PsGcCSII7JmCS7DE,PWCCcZC8DUJY",0,10-898-625-7344,-446.46,AUTOMOBILE,beans. blithely pending deposits sleep quickly...


### Reading and writing to Parquet

In [20]:
# Read the staged parquet files and preview three of their columns.
(
    session.read.parquet("@azure_stage/parquetfiles/")
    .select("C_NAME", "C_PHONE", "C_ACCTBAL")
    .show(5)
)

------------------------------------------------------
|"C_NAME"            |"C_PHONE"        |"C_ACCTBAL"  |
------------------------------------------------------
|Customer#000022643  |10-745-420-1529  |1055.58      |
|Customer#000000896  |10-425-565-3199  |7659.72      |
|Customer#000006495  |10-465-937-3117  |6679.86      |
|Customer#000011029  |10-796-436-4320  |-663.86      |
|Customer#000018365  |10-836-214-8530  |1939.80      |
------------------------------------------------------



In [21]:
# Unload the customer DataFrame to the stage as parquet, without partitioning.
# OVERWRITE=True is passed through as a copy option.
df_cust.write.copy_into_location(
    '@azure_stage/custNonPartitionedParquetFiles/',
    file_format_type="parquet",
    header=True,
    OVERWRITE=True,
)

[Row(rows_unloaded=150000, input_bytes=13981426, output_bytes=13981426)]

In [22]:
# List the files under custNonPartitionedParquetFiles on the stage.
session.sql('ls @azure_stage/custNonPartitionedParquetFiles').collect()

[Row(name='azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custNonPartitionedParquetFiles', size=0, md5=None, last_modified='Tue, 11 Apr 2023 11:48:30 GMT'),
 Row(name='azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custNonPartitionedParquetFiles/data_0_0_0.snappy.parquet', size=1399763, md5='da0c64a8712b1e7201782e78ff4cd44e', last_modified='Tue, 11 Apr 2023 11:48:30 GMT'),
 Row(name='azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custNonPartitionedParquetFiles/data_0_2_0.snappy.parquet', size=1398006, md5='7aac192137c792411e048c4b6b54d454', last_modified='Tue, 11 Apr 2023 11:48:30 GMT'),
 Row(name='azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custNonPartitionedParquetFiles/data_0_3_0.snappy.parquet', size=1400813, md5='255c589cc3b44417c0c07730e60ddfc9', last_modified='Tue, 11 Apr 2023 11:48:30 GMT'),
 Row(name='azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custNonPartitionedParquetFiles/data_0_4_0.snappy.parquet', size

In [23]:
# Unload the customer DataFrame to the stage as parquet, partitioned by path.
# The partition expression builds a sub-path of the form
# 'C_MKTSEGMENT=<segment>/C_NATIONKEY=<nation key>' for each row.
# The stage location carries the '@' prefix, matching every other stage
# reference in this notebook.
df_cust.write.copy_into_location(
    '@azure_stage/custPartitionedParquetFiles/',
    partition_by="'C_MKTSEGMENT=' || C_MKTSEGMENT || '/C_NATIONKEY=' || to_varchar(C_NATIONKEY)",
    file_format_type="parquet",
    header=True,
)

[Row(rows_unloaded=150000, input_bytes=13892815, output_bytes=13892815)]

In [24]:

# List the files under custPartitionedParquetFiles on the stage.
session.sql("list @azure_stage/custPartitionedParquetFiles").show(max_width=1000)

-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|"name"                                                                                                                                                                                          |"size"   |"md5"                             |"last_modified"                |
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|azure://rawdemodatastorage.blob.core.windows.net/sourcedata/custPartitionedParquetFiles                                                                                                

### Reading JSON Data

In [25]:
# Collect the listing of the staged JSON files and print each file's name.
# Note that df_stages is a collected list of rows, not a DataFrame.
df_stages = session.sql(" list @azure_stage/jsonfiles/").collect()
for stage_file in df_stages:
    print(stage_file.name)

azure://rawdemodatastorage.blob.core.windows.net/sourcedata/jsonfiles/Vehicle1.json
azure://rawdemodatastorage.blob.core.windows.net/sourcedata/jsonfiles/Vehicle2.json
azure://rawdemodatastorage.blob.core.windows.net/sourcedata/jsonfiles/Vehicle3.json
azure://rawdemodatastorage.blob.core.windows.net/sourcedata/jsonfiles/Vehicle4.json
azure://rawdemodatastorage.blob.core.windows.net/sourcedata/jsonfiles/Vehicle5.json


In [26]:
# Read the staged JSON files and project four attributes from element 0 of
# each document ($1 is the single column holding the document).
df_json = session.read.json("@azure_stage/jsonfiles/")
df_json.select(
    sql_expr("$1[0]:Fuel").alias("Fuel"),
    sql_expr("$1[0]:Transmission").alias("Transmission"),
    sql_expr("$1[0]:about").alias("about"),
    sql_expr("$1[0]:cost").alias("Cost"),
).show()

--------------------------------------------------------------------------------------------------
|"FUEL"      |"TRANSMISSION"  |"ABOUT"                                             |"COST"       |
--------------------------------------------------------------------------------------------------
|"diesel"    |"automatic"     |"In minim irure ipsum exercitation elit ea ea v...  |"$8,267.32"  |
|"petrol"    |"manual"        |"Adipisicing aliquip eu qui anim. Eu pariatur l...  |"$5,677.19"  |
|"petrol"    |"manual"        |"Amet adipisicing est dolore nulla labore conse...  |"$5,960.22"  |
|"diesel"    |"automatic"     |"Aute exercitation deserunt ea consequat qui vo...  |"$6,635.44"  |
|"Electric"  |"manual"        |"Ullamco tempor ipsum officia excepteur culpa s...  |"$9,387.00"  |
--------------------------------------------------------------------------------------------------



### Writing to Table

In [27]:
# Write the customer DataFrame to the table "NewCustomers" using append mode.
# NOTE(review): with mode "append", re-running this cell presumably adds the
# same rows again — confirm that is intended, or switch to "overwrite".
df_cust.write.mode("append").saveAsTable("NewCustomers")

In [28]:
# Preview five rows of the NewCustomers table.
session.table('NewCustomers').limit(5).toPandas()

Unnamed: 0,C_CUSTKEY,C_NAME,C_ADDRESS,C_NATIONKEY,C_PHONE,C_ACCTBAL,C_MKTSEGMENT,C_COMMENT
0,22643,Customer#000022643,6q1mH2NDbbSZD2tOliowe3pT9,0,10-745-420-1529,1055.58,FURNITURE,lyly quickly regular packages.
1,896,Customer#000000896,gPb5nP15yi7urmO,0,10-425-565-3199,7659.72,AUTOMOBILE,affix carefully unusual requests. furiously fin
2,6495,Customer#000006495,WlTJB854exrQqhPqI2uaJlupnKfoJyB,0,10-465-937-3117,6679.86,AUTOMOBILE,deposits. theodolites hang blithely after the ...
3,11029,Customer#000011029,"O,I97tQae9jyp,C1SVaVFr5C913w3SYAQkTsB1",0,10-796-436-4320,-663.86,FURNITURE,carefully according to the blithely even depos...
4,18365,Customer#000018365,"yFRP1S3n,uOsQ0hHPrkNtKBmkQbPnX 0c9i",0,10-836-214-8530,1939.8,BUILDING,kly. quickly even deposits cajole blithely aft...


In [29]:
# Select a subset of columns from NewCustomers with SQL and preview four rows.
session.sql('''
select C_CUSTKEY ,C_NAME, C_PHONE , C_ACCTBAL, C_MKTSEGMENT from NewCustomers''').limit(4).toPandas()

Unnamed: 0,C_CUSTKEY,C_NAME,C_PHONE,C_ACCTBAL,C_MKTSEGMENT
0,22643,Customer#000022643,10-745-420-1529,1055.58,FURNITURE
1,896,Customer#000000896,10-425-565-3199,7659.72,AUTOMOBILE
2,6495,Customer#000006495,10-465-937-3117,6679.86,AUTOMOBILE
3,11029,Customer#000011029,10-796-436-4320,-663.86,FURNITURE


In [30]:
# (Re)create the stage "sprocstage".
session.sql("create or replace stage sprocstage").collect()

[Row(status='Stage area SPROCSTAGE successfully created.')]

In [64]:
# session.table("taxi_trips_mat_view").limit(10).show()

## Creating Python Stored Procedures

In [31]:
import snowflake.snowpark as snowpark
from snowflake.snowpark.functions import *
from snowflake.snowpark.types import StringType, StructType, StructField, IntegerType,DoubleType,FloatType

# @sproc(packages=["snowflake-snowpark-python", "pandas"])
@sproc(
    name="usp_loadVehicleTranmissionDate",
    return_type=StringType(),
    is_permanent=True,
    stage_location="@sprocstage",
    replace=True,
    packages=["snowflake-snowpark-python"],
)
def main(session: Session):
    """Body of the permanent stored procedure ``usp_loadVehicleTranmissionDate``.

    Steps, in order:

    1. Create or replace the view ``tax_raw_data`` over ``taxi_trips_mat_view``.
    2. Register (replacing any previous version) the permanent UDF
       ``udf_DoubleCost``, which returns the product of its two arguments.
    3. Overwrite ``TotalAmount_TaxiType_Borough`` with the sum and average of
       ``total_amount`` per pickup borough and taxi type.
    4. Read the JSON files under ``@azure_stage/jsonfiles/``, extract the fuel,
       transmission, description and numeric cost of element 0 of each
       document, and overwrite ``Vehicle_Transmission_Data`` with them.

    NOTE(review): ``udf_DoubleCost`` is called with the cost as both arguments,
    so the ``DoubleCost`` column holds cost * cost rather than 2 * cost —
    confirm which one is intended.

    Args:
        session: Snowpark session the procedure runs with.

    Returns:
        The literal string ``'Done!!'``.
    """
    trips = session.table("taxi_trips_mat_view")
    trips.createOrReplaceView("tax_raw_data")

    @udf(
        name="udf_DoubleCost",
        return_type=FloatType(),
        is_permanent=True,
        stage_location="@pythonudfstage",
        replace=True,
    )
    def DoubleCost(col1: float, col2: float) -> float:
        return col1 * col2

    # Totals and averages per (pickup borough, taxi type).
    (
        trips.select("TAXI_TYPE", "total_amount", "PICKUP_BOROUGH")
        .groupBy('PICKUP_BOROUGH', 'TAXI_TYPE')
        .agg(
            [
                sum(col('total_amount')).alias("TotalAmount"),
                avg(col('total_amount')).alias("AvgAmount"),
            ]
        )
        .write.mode("overwrite")
        .saveAsTable("TotalAmount_TaxiType_Borough")
    )

    # The cost is stored as text such as "$8,267.32": strip '$' and ',' and
    # cast to float. The same expression feeds the Cost column and both UDF
    # arguments.
    cost_as_float = sql_expr("cast(replace(replace($1[0]:cost,'$',''),',','') as float)")

    vehicles = session.read.json("@azure_stage/jsonfiles/")
    vehicles.select(
        sql_expr("$1[0]:Fuel::VARCHAR").alias("Fuel"),
        sql_expr("$1[0]:Transmission::VARCHAR").alias("Transmission"),
        sql_expr("$1[0]:about::VARCHAR").alias("about"),
        cost_as_float.alias("Cost"),
        call_builtin('udf_DoubleCost', cost_as_float, cost_as_float).alias('DoubleCost'),
    ).write.mode("overwrite").saveAsTable("Vehicle_Transmission_Data")

    return 'Done!!'

In [32]:
# Invoke the stored procedure by its registered name.
session.call('usp_loadVehicleTranmissionDate')

'Done!!'

In [33]:
# Preview up to ten rows of the Vehicle_Transmission_Data table.
session.table('Vehicle_Transmission_Data').limit(10).show()

-----------------------------------------------------------------------------------------------------------------
|"FUEL"    |"TRANSMISSION"  |"ABOUT"                                             |"COST"   |"DOUBLECOST"        |
-----------------------------------------------------------------------------------------------------------------
|diesel    |automatic       |In minim irure ipsum exercitation elit ea ea vo...  |8267.32  |68348579.9824       |
|          |                |                                                    |         |                    |
|petrol    |manual          |Amet adipisicing est dolore nulla labore conseq...  |5960.22  |35524222.448400006  |
|          |                |                                                    |         |                    |
|petrol    |manual          |Adipisicing aliquip eu qui anim. Eu pariatur la...  |5677.19  |32230486.296099994  |
|          |                |                                                    |      

In [34]:
# Show the FUEL and DOUBLECOST columns of Vehicle_Transmission_Data via SQL.
session.sql('select FUEL, DOUBLECOST from Vehicle_Transmission_Data').limit(10).show()

---------------------------------
|"FUEL"    |"DOUBLECOST"        |
---------------------------------
|diesel    |68348579.9824       |
|petrol    |32230486.296099994  |
|petrol    |35524222.448400006  |
|diesel    |44029063.993599996  |
|Electric  |88115769.0          |
---------------------------------

