# Locate Spark in Virtual Environment

In [None]:
# Prior to executing this code below:
# 1. Install Conda
# 2. Create an environment by running this from your command line
#        conda env create -f environment.yml
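# 3. Make sure you're using the 3.11.6 iceberg-demo Python kernel
#    (NOTE: step 5 activates `iceberg-lab`; use the environment name defined in environment.yml -- TODO confirm)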
# 4. Export environment variables from spark_env_variables.txt
# 5. Activate the environment and open jupyter notebooks by running this from your command line
#        conda activate iceberg-lab
#        jupyter notebook

In [None]:
import os
from dotenv import load_dotenv
import findspark
# Load environment variables via python-dotenv (with no arguments it looks for a `.env` file).
load_dotenv()
# Locate the Spark installation and make pyspark importable from this kernel.
findspark.init()
# Last expression of the cell: displays the Spark location findspark resolved.
findspark.find()

# Run Spark 

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [None]:
# Build (or reuse) the SparkSession for the AWS variant of the lab.
spark = (
    SparkSession.builder
    .appName('iceberg_lab')
    # Extra jar coordinates are supplied through the PACKAGES environment variable.
    .config('spark.jars.packages', os.environ['PACKAGES'])
    # Register the Iceberg SQL extensions with the session.
    .config(
        'spark.sql.extensions',
        'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
    )
    .getOrCreate()
)

### Spark configurations
Set the following configurations for Spark

In [None]:
# Short handles for the session's runtime configuration and the process environment.
runtime_conf = spark.conf
env = os.environ

# --- Catalog: make the Snowflake-backed Iceberg catalog the session default ---
runtime_conf.set("spark.sql.defaultCatalog", "snowflake_catalog")
runtime_conf.set("spark.sql.catalog.snowflake_catalog", "org.apache.iceberg.spark.SparkCatalog")
runtime_conf.set("spark.sql.catalog.snowflake_catalog.catalog-impl", "org.apache.iceberg.snowflake.SnowflakeCatalog")

# --- Snowflake connection details, all read from environment variables ---
runtime_conf.set("spark.sql.catalog.snowflake_catalog.uri", env['SNOWFLAKE_CATALOG_URI'])
runtime_conf.set("spark.sql.catalog.snowflake_catalog.jdbc.role", env['SNOWFLAKE_ROLE'])
runtime_conf.set("spark.sql.catalog.snowflake_catalog.jdbc.user", env['SNOWFLAKE_USERNAME'])
runtime_conf.set("spark.sql.catalog.snowflake_catalog.jdbc.password", env['SNOWFLAKE_PASSWORD'])

# --- Iceberg read path and file IO implementation ---
runtime_conf.set("spark.sql.iceberg.vectorization.enabled", "false")
runtime_conf.set("spark.sql.catalog.snowflake_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

# --- S3A filesystem: implementation, credentials (from the environment), endpoint ---
runtime_conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
runtime_conf.set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
runtime_conf.set("spark.hadoop.fs.s3a.access.key", env['AWS_ACCESS_KEY_ID'])
runtime_conf.set("spark.hadoop.fs.s3a.secret.key", env['AWS_SECRET_ACCESS_KEY'])
runtime_conf.set("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
runtime_conf.set("spark.hadoop.fs.s3a.endpoint.region", env['AWS_REGION'])

# Read Snowflake-managed Iceberg Tables

In [None]:
# Switch the session's current namespace to DEMO.PUBLIC in the default catalog.
spark.sql("USE DEMO.PUBLIC")

In [None]:
# Load the product_reviews table as a DataFrame and preview it
# (show() with no arguments prints at most the first 20 rows).
df_product_reviews = spark.table("demo.public.product_reviews")
df_product_reviews.show()

In [None]:
# Number of distinct review ids for each review date.
df_reviews_per_day = (
    df_product_reviews
    .groupBy("review_date")
    .agg(F.countDistinct("id").alias("num_reviews"))
)
# Newest dates first. The keyword must be spelled `ascending`: orderBy() only
# looks up that exact name, so a misspelt keyword leaves the default ascending order.
df_reviews_per_day.orderBy("review_date", ascending=False).show(truncate=False)

In [None]:
# Mean sentiment score for each product.
df_product_sentiment = (
    df_product_reviews
    .groupBy("product_name")
    .agg(F.avg("sentiment").alias("avg_sentiment"))
)
# Highest average sentiment first; print full column values without truncation.
df_product_sentiment.orderBy("avg_sentiment", ascending=False).show(truncate=False)

In [None]:
# Load the reviews table again (the same table read earlier in the notebook) and
# register it as the temp view `product_reviews` so it can be queried with spark.sql().
df_product_reviews = spark.table("demo.public.product_reviews")
df_product_reviews.createOrReplaceTempView("product_reviews")

In [None]:
# Average sentiment per product for reviews whose review_date falls in January.
# The filter checks the month only, so January of every year in the data is included.
jan_query = """
    SELECT
        product_name,
        avg(sentiment) as avg_sentiment
    FROM product_reviews
    WHERE MONTH(review_date) = 1
    GROUP BY product_name
    ORDER BY avg_sentiment DESC
"""
jan_df = spark.sql(jan_query)
jan_df.show()

In [None]:
# Average sentiment per product for reviews whose review_date falls in February.
# The filter checks the month only, so February of every year in the data is included.
feb_query = """
    SELECT
        product_name,
        avg(sentiment) as avg_sentiment
    FROM product_reviews
    WHERE MONTH(review_date) = 2
    GROUP BY product_name
    ORDER BY avg_sentiment DESC
"""
feb_df = spark.sql(feb_query)
feb_df.show()

In [None]:
# Compare each product's average sentiment between January and February.
#
# jan_df and feb_df are both built from the same `product_reviews` temp view, so
# DataFrame-attribute references such as `jan_df.product_name` and
# `feb_df.product_name` can resolve to the same underlying column. Every column
# is therefore addressed through the explicit "jan" / "feb" aliases instead.
result_df = (
    jan_df.alias("jan")
    .join(
        feb_df.alias("feb"),
        F.col("jan.product_name") == F.col("feb.product_name"),
        # Full outer join keeps products that were reviewed in only one of the months.
        how="full_outer",
    )
    .select(
        # Take the product name from whichever side is present.
        F.coalesce(F.col("jan.product_name"), F.col("feb.product_name")).alias("product_name"),
        F.col("jan.avg_sentiment").alias("jan_sentiment"),
        F.col("feb.avg_sentiment").alias("feb_sentiment"),
    )
    # Positive values mean sentiment improved from January to February; the
    # difference is null for products missing from either month.
    .withColumn("sentiment_diff", F.col("feb_sentiment") - F.col("jan_sentiment"))
    .orderBy("sentiment_diff", ascending=False)
)
result_df.show()

In [None]:
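# When complete, you can deactivate the environment and remove it by running this from the command line
# (NOTE: the setup steps activate `iceberg-lab`; remove whichever environment name environment.yml defines -- TODO confirm)
#       conda deactivate
#       conda remove -n iceberg-demo --all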