In [None]:
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkConf

from pathlib import Path

In [None]:
# Filesystem location of the Iceberg warehouse, kept under the user's home
# directory and passed to Spark as a plain string.
warehouse = str(Path.home().joinpath("temp", "iceberg", "spark-warehouse", "local"))

In [None]:
# Build the local Spark session used by the rest of the notebook. It registers
# an Iceberg catalog named "local" (Hadoop type) rooted at `warehouse`.
conf = (
    SparkConf()
    .setAppName("TEEHR")
    .setMaster("local[*]")
    .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .set("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider")
    .set("spark.sql.execution.arrow.pyspark.enabled", "true")
    .set("spark.sql.session.timeZone", "UTC")

    # The Iceberg runtime jar must be requested through `spark.jars.packages`.
    # `spark.sql.package` is not a Spark setting, so the jar was never fetched
    # and the Iceberg extension/catalog classes below could not be loaded.
    # The artifact suffix (3.5_2.12) has to match the installed Spark/Scala build.
    .set("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.0")
    .set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    # Disabled: Iceberg-backed session catalog using a Hive metastore.
    # .set("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    # .set("spark.sql.catalog.spark_catalog.type", "hive")

    # Iceberg catalog "local": tables are addressed as local.<namespace>.<table>.
    .set("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .set("spark.sql.catalog.local.type", "hadoop")
    .set("spark.sql.catalog.local.warehouse", warehouse)

    .set("spark.driver.host", "localhost")
)
# getOrCreate() reuses a session that is already running in this kernel, so
# restart the kernel after changing the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
# Query every column of the existing locations table in the "local" catalog.
# NOTE(review): there is no `.show()` here, so the cell result is the DataFrame
# object itself rather than printed rows — confirm that is intended.
locations_query = "SELECT * FROM local.db.locations;"
spark.sql(locations_query)

In [None]:
# Print the extended description of the locations table without truncating
# long cell values.
locations_description = spark.sql("DESCRIBE EXTENDED local.db.locations")
locations_description.show(truncate=False)


In [None]:
# Create the small demo table that the following cells insert into and read.
# The identifier is `local.db.table` (catalog.namespace.table) so it matches
# the INSERT/SELECT cells below; the previous `local.table` had no namespace.
# IF NOT EXISTS keeps the cell re-runnable, because the warehouse persists on
# disk between kernel restarts.
spark.sql("CREATE TABLE IF NOT EXISTS local.db.table (id bigint, data string) USING iceberg;")

In [None]:
# Append three sample rows to the demo table.
insert_rows_sql = "INSERT INTO local.db.table VALUES (1, 'a'), (2, 'b'), (3, 'c')"
spark.sql(insert_rows_sql)

In [None]:
# Read the demo table back and print its rows.
demo_rows = spark.sql("SELECT * FROM local.db.table")
demo_rows.show()

In [None]:
# Load the demo table through the DataFrame API and count its rows.
# Uses the same `local.db.table` identifier as the SQL cells; the previous
# `local.table` did not match the table those cells write to and query.
df = spark.table("local.db.table")
df.count()

In [None]:
# Print the `snapshots` metadata table of the demo table.
snapshot_rows = spark.sql("SELECT * FROM local.db.table.snapshots;")
snapshot_rows.show()

In [None]:
# Recreate the primary_timeseries table from scratch.
# IF EXISTS lets the cell run on a fresh warehouse, where there is no table to
# drop yet; a plain DROP TABLE raises in that case.
spark.sql("DROP TABLE IF EXISTS local.db.primary_timeseries;")
# The table is partitioned by the identity of configuration_name,
# variable_name and reference_time.
spark.sql("""
    CREATE TABLE local.db.primary_timeseries (
        reference_time timestamp,
        value_time timestamp,
        configuration_name string,
        unit_name string,
        variable_name string,
        value float,
        location_id string
    ) USING iceberg PARTITIONED BY (configuration_name, variable_name, reference_time);
""")

In [None]:
# Read the two_locations parquet test file and print its first rows.
# NOTE(review): this is an absolute, machine-specific path — presumably it
# should be resolved relative to the repository's tests/data directory; confirm.
spark.read.parquet("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations.parquet").show()

In [None]:
# Create the location table (id, name, binary geometry) if it is not there yet.
# IF NOT EXISTS keeps the cell re-runnable without dropping existing data,
# because the warehouse persists on disk between kernel restarts.
# To rebuild the table with a changed schema, drop it first with
# `DROP TABLE IF EXISTS local.db.location`.
spark.sql("""
    CREATE TABLE IF NOT EXISTS local.db.location (
        id string,
        name string,
        geom binary
    ) USING iceberg;
""")

In [None]:
# Shut down the Spark session created at the top of the notebook; cells that
# use `spark` cannot be run after this until the session cell is re-executed.
spark.stop()