# Locate Spark in Virtual Environment

In [None]:
import os

In [None]:
# Tell Spark tooling where the pyspark package lives inside the conda environment.
# Replace "<your path>" with the directory that contains your anaconda3 install.
os.environ.update(
    {'SPARK_HOME': '/<your path>/anaconda3/envs/iceberg-lab/lib/python3.11/site-packages/pyspark'}
)

In [None]:
import findspark
# Initialise findspark so the pyspark installation can be imported
# (presumably located via the SPARK_HOME value assigned earlier - confirm against the findspark docs).
findspark.init()
# Bare expression at the end of the cell: its return value is what the notebook displays.
findspark.find()

# Environment Variables

### Environment variables for all clouds
Regardless of which cloud your Snowflake account is in, set the following environment variables, replacing the values with your own.

In [None]:
# Snowflake connection settings shared by every cloud.
# Replace each "<...>" placeholder with your own value before running.
os.environ.update({
    'SNOWFLAKE_CATALOG_URI': "jdbc:snowflake://<your snowflake account locator>.snowflakecomputing.com",
    'SNOWFLAKE_ROLE': "ICEBERG_LAB",
    'SNOWFLAKE_USERNAME': "ICEBERG_LAB",
    'SNOWFLAKE_PASSWORD': "<your password>",
    'PRIVATE_KEY_FILE': "/<your path>/rsa_key.p8",
})

### Environment variables for AWS
If your Snowflake account and object storage are on AWS, set these additional environment variables. 

In [None]:
# Comma-separated Maven coordinates handed to Spark for the AWS setup.
# Split across lines for readability; the pieces concatenate into one string.
aws_packages = (
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.1,"
    "net.snowflake:snowflake-jdbc:3.14.2,"
    "software.amazon.awssdk:bundle:2.20.160,"
    "software.amazon.awssdk:url-connection-client:2.20.160"
)

# PACKAGES is also assigned by the Azure and GCP cells, so run only the cell for your cloud.
os.environ.update({
    'PACKAGES': aws_packages,
    'AWS_REGION': "<your aws region>",
    'AWS_ACCESS_KEY_ID': "<your aws access key>",
    'AWS_SECRET_ACCESS_KEY': "<your aws secret access key>",
})

### Environment variables for Azure
If your Snowflake account and object storage are on Azure, set these additional environment variables.

In [None]:
# Comma-separated Maven coordinates handed to Spark for the Azure setup.
# Split across lines for readability; the pieces concatenate into one string.
azure_packages = (
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.1,"
    "net.snowflake:snowflake-jdbc:3.14.2,"
    "com.microsoft.azure:azure-storage:8.6.6,"
    "org.apache.hadoop:hadoop-azure:3.3.6"
)

# PACKAGES is also assigned by the AWS and GCP cells, so run only the cell for your cloud.
os.environ.update({
    'PACKAGES': azure_packages,
    'AZURE_ACCESS_KEY': "<your storage account access key>",
})

### Environment variables for GCP
If your Snowflake account and object storage are on Google Cloud, set these additional environment variables.

In [None]:
# Comma-separated Maven coordinates handed to Spark for the GCP setup.
# PACKAGES is also assigned by the AWS and Azure cells, so run only the cell for your cloud.
os.environ['PACKAGES'] = "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.1,net.snowflake:snowflake-jdbc:3.14.2,com.google.cloud:google-cloud-storage:2.29.1,org.apache.iceberg:iceberg-gcp:1.4.2"

# These names must match what the "Spark configurations for GCP" cell reads:
# os.environ['PROJECT_ID'] and os.environ['JSON_KEYFILE']. Using any other
# variable name here makes that cell fail with KeyError.
os.environ['PROJECT_ID'] = "<your project ID>"
os.environ['JSON_KEYFILE'] = "<path to your JSON keyfile>"

# Run Spark 

In [None]:
import pyspark
from pyspark.sql import SparkSession

In [None]:
# Spark session for AWS and Azure.
# Pulls the jars listed in the PACKAGES environment variable and enables the
# Iceberg SQL extensions; getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName('iceberg_lab')
    .config('spark.jars.packages', os.environ['PACKAGES'])
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .getOrCreate()
)

In [None]:
# Spark session for GCP only. GCS needs the shaded connector jar, as explained in:
# https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/INSTALL.md#troubleshooting-the-installation
# The shaded jar is passed via spark.jars; everything else comes from PACKAGES.
spark = (
    SparkSession.builder
    .appName('iceberg_lab')
    .config('spark.jars', 'https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.18/gcs-connector-hadoop3-2.2.18-shaded.jar')
    .config('spark.jars.packages', os.environ['PACKAGES'])
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .getOrCreate()
)

### Spark configurations for all clouds
Regardless of which cloud your Snowflake account is in, set the following configurations for Spark.

In [None]:
# Register "snowflake_catalog" as an Iceberg SparkCatalog backed by the
# Snowflake catalog implementation, and make it the session's default catalog.
spark.conf.set("spark.sql.defaultCatalog", "snowflake_catalog")
spark.conf.set("spark.sql.catalog.snowflake_catalog", "org.apache.iceberg.spark.SparkCatalog")
spark.conf.set("spark.sql.catalog.snowflake_catalog.catalog-impl", "org.apache.iceberg.snowflake.SnowflakeCatalog")

# Connection details all come from the environment variables set in the
# "Environment variables for all clouds" cell, so changing a value there is
# enough; role and user are no longer duplicated here as literals.
spark.conf.set("spark.sql.catalog.snowflake_catalog.uri", os.environ['SNOWFLAKE_CATALOG_URI'])
spark.conf.set("spark.sql.catalog.snowflake_catalog.jdbc.role", os.environ['SNOWFLAKE_ROLE'])
spark.conf.set("spark.sql.catalog.snowflake_catalog.jdbc.user", os.environ['SNOWFLAKE_USERNAME'])
# NOTE(review): both a password and a private key file are supplied; which one
# the JDBC driver uses presumably depends on the account setup - confirm.
spark.conf.set("spark.sql.catalog.snowflake_catalog.jdbc.password", os.environ['SNOWFLAKE_PASSWORD'])
spark.conf.set("spark.sql.catalog.snowflake_catalog.jdbc.private_key_file", os.environ['PRIVATE_KEY_FILE'])

# Iceberg vectorized reads are switched off for this lab.
spark.conf.set("spark.sql.iceberg.vectorization.enabled", "false")

### Spark configurations for AWS
If your Snowflake account and object storage are on AWS, set these additional Spark configurations.

In [None]:
# AWS-specific Spark settings, applied one by one in the same order as before.
set_conf = spark.conf.set

# Iceberg reads table files through its S3 FileIO implementation.
set_conf("spark.sql.catalog.snowflake_catalog.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")

# S3A filesystem settings; credentials and region come from the AWS_* environment variables.
set_conf("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
set_conf("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
set_conf("spark.hadoop.fs.s3a.access.key", os.environ['AWS_ACCESS_KEY_ID'])
set_conf("spark.hadoop.fs.s3a.secret.key", os.environ['AWS_SECRET_ACCESS_KEY'])
set_conf("spark.hadoop.fs.s3a.endpoint", "s3.amazonaws.com")
set_conf("spark.hadoop.fs.s3a.endpoint.region", os.environ['AWS_REGION'])

### Spark configurations for Azure
If your Snowflake account and object storage are on Azure, set these additional Spark configurations.

In [None]:
# Azure-specific Spark settings: Iceberg reads files through Hadoop's FileIO,
# and the storage account is accessed with its shared account key.
# NOTE(review): an earlier comment here said the storage account and container
# have anonymous access enabled, yet the code authenticates with the account
# key (SharedKey) - confirm which is intended.
#
# The storage account name is part of the Hadoop property names. It defaults to
# "snowflakeiceberg" (the value previously hard-coded here); set the
# AZURE_STORAGE_ACCOUNT environment variable to point at your own account.
azure_account_host = os.environ.get('AZURE_STORAGE_ACCOUNT', 'snowflakeiceberg') + ".blob.core.windows.net"

spark.conf.set("spark.sql.catalog.snowflake_catalog.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
spark.conf.set("spark.hadoop.fs.azure.account.key." + azure_account_host, os.environ['AZURE_ACCESS_KEY'])
spark.conf.set("spark.hadoop.fs.azure.account.auth.type." + azure_account_host, "SharedKey")

### Spark configurations for GCP
If your Snowflake account and object storage are on Google Cloud, set these additional Spark configurations.

In [None]:
# GCP-specific settings: Iceberg reads files through Hadoop's FileIO.
spark.conf.set("spark.sql.catalog.snowflake_catalog.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")

# The remaining options are written straight into the JVM-side Hadoop
# configuration of the running Spark context; fetch that object once and reuse it.
gcs_hadoop_conf = spark._jsc.hadoopConfiguration()

# Register the GCS connector classes for the gs:// scheme.
gcs_hadoop_conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
gcs_hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")

# Project and service-account credentials; requires the PROJECT_ID and
# JSON_KEYFILE environment variables to be set beforehand.
gcs_hadoop_conf.set("fs.gs.project.id", os.environ['PROJECT_ID'])
gcs_hadoop_conf.set("fs.gs.auth.type", "SERVICE_ACCOUNT_JSON_KEYFILE")
gcs_hadoop_conf.set("fs.gs.auth.service.account.enable", "true")
gcs_hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", os.environ['JSON_KEYFILE'])

# Read Snowflake-managed Iceberg Tables

In [None]:
# List the namespaces found in ICEBERG_LAB and print them.
namespaces = spark.sql("SHOW NAMESPACES IN ICEBERG_LAB")
namespaces.show()

In [None]:
# Switch the session to ICEBERG_LAB.ICEBERG_LAB so that later unqualified
# statements (such as SHOW TABLES) run against it.
# Left as a bare expression: the notebook displays the returned value for this cell.
spark.sql("USE ICEBERG_LAB.ICEBERG_LAB")

In [None]:
# List the tables visible in the current namespace and print them.
tables = spark.sql("SHOW TABLES")
tables.show()

In [None]:
# Load the customer_iceberg table by its fully qualified name and preview its rows.
customer_table_name = "iceberg_lab.iceberg_lab.customer_iceberg"
df = spark.table(customer_table_name)
df.show()