In [None]:
import pyspark

# Record the directory of the installed PySpark package (used here as
# SPARK_HOME) and the PySpark release, then report both.
SPARK_HOME = pyspark.__path__[0]
SPARK_VERSION = pyspark.__version__
print(f"SPARK_HOME is: {SPARK_HOME}")
print(f"SPARK_VERSION is: {SPARK_VERSION}")

In [None]:
import teehr
from pathlib import Path
import shutil
import json

# Configure Bokeh so that plots are rendered inline in the notebook output
# instead of being written elsewhere.
from bokeh.io import output_notebook
output_notebook()

In [None]:
from sedona.spark import SedonaContext, ST_GeomFromWKB, ST_SetSRID, ST_AsEWKT

In [None]:


from pyspark.sql import SparkSession
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
import geopandas as gpd

# Maven coordinates resolved when the session starts: Sedona, the GeoTools
# wrapper, and the Hadoop S3A connector with its AWS SDK bundle. Kept as a
# list so that individual versions are easy to read and bump.
spark_jar_packages = ",".join(
    [
        "org.apache.sedona:sedona-spark-3.5_2.12:1.7.0",
        "org.datasyslab:geotools-wrapper:1.7.0-28.5",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.524",
    ]
)

# Local Spark session configured for TEEHR plus Sedona.
spark = (
    SparkSession.builder
    .appName("TEEHR")
    .master("local[*]")
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    # S3A file system with the anonymous credentials provider (no AWS keys).
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    )
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.session.timeZone", "UTC")
    .config("spark.jars.packages", spark_jar_packages)
    .config("spark.jars.repositories", "https://artifacts.unidata.ucar.edu/repository/unidata-all")
    # Kryo serialization with Sedona's registrator.
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")

# Register Sedona's SQL functions and types on the session.
# SedonaRegistrator.registerAll() is deprecated in favour of
# SedonaContext.create(); SedonaContext is imported in an earlier cell.
SedonaContext.create(spark)

In [None]:
# Directory that holds the Evaluation used by this notebook: ~/temp/10_sedona
test_eval_dir = Path.home() / "temp" / "10_sedona"
test_eval_dir

In [None]:
# Delete existing Evaluation (useful when testing)
# shutil.rmtree(test_eval_dir, ignore_errors=True)

# Create an Evaluation object and create the directory
# ev = teehr.Evaluation(dir_path=test_eval_dir, create_dir=True, spark=spark)

In [None]:
# Clone the e0_2_location_example evaluation from the S3 bucket
# ev.clone_from_s3("e0_2_location_example")

In [None]:
# Open the existing Evaluation stored at test_eval_dir, handing it the Spark
# session configured earlier in this notebook.
ev = teehr.Evaluation(
    dir_path=test_eval_dir,
    spark=spark,
)

In [None]:
# Fetch the Evaluation's locations table via to_sdf() and print every column
# in full (no truncation).
sdf = ev.locations.to_sdf()
sdf.show(truncate=False)

In [None]:
# Read the GeoParquet metadata from a single part file of the locations table.
# (A later cell reads the metadata from a whole folder, so a single file may
# not actually be required.)
# Note: files written by TEEHR v0.4.7 do not contain the geoparquet metadata.
#
# The part-file name embeds a write-specific UUID, so the file is discovered
# under test_eval_dir at run time instead of being hard-coded to an absolute
# path on one machine.
locations_part_files = sorted(
    Path(test_eval_dir, "dataset", "locations").glob("*.parquet")
)
if not locations_part_files:
    raise FileNotFoundError(
        f"No parquet files found under {Path(test_eval_dir, 'dataset', 'locations')}"
    )

sdf_meta = (
    ev.spark.read.format("geoparquet.metadata")
    .load(str(locations_part_files[0]))
)
sdf_meta.show(truncate=False)

In [None]:
# Print the schema of the GeoParquet metadata DataFrame read in the previous cell.
sdf_meta.printSchema()

In [None]:
# Make the locations geometry CRS-aware by attaching an SRID (4326).
# This can be done with SQL or with the DataFrame API; choose one.
# DataFrame API alternative (converts WKB to geometry, without setting an SRID):
# gdf =  sdf.withColumn("geometry2", ST_GeomFromWKB("geometry")).drop("geometry").withColumnRenamed("geometry2", "geometry")
locations_with_srid_sql = """
SELECT id, name, ST_SetSRID(ST_GeomFromWKB(geometry), 4326) as geometry FROM locations
"""
gdf = ev.sql(locations_with_srid_sql)
gdf.show(truncate=False)

In [None]:
# Print the schema of gdf to check the type of the geometry column.
gdf.printSchema()

In [None]:
# Register gdf as the temp view "two_locations", then query it with SQL.
# The EWKT output shows the SRID carried by the geometry.
gdf.createOrReplaceTempView("two_locations")
ewkt_sdf = ev.spark.sql("SELECT ST_AsEWKT(geometry) FROM two_locations")
ewkt_sdf.show(truncate=False)

In [None]:
# Take the CRS (serialized with to_json()) from a GeoParquet file that is
# conveniently available in the repo's test data.
# TODO: could live in a constants file, or be looked up from somewhere.
reference_gdf = gpd.read_parquet("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations.parquet")
projjson_crs = reference_gdf.crs.to_json()
# projjson_crs = json.loads(projjson_crs)
projjson_crs

In [None]:
# Write gdf out as GeoParquet, passing the CRS JSON through the
# "geoparquet.crs" writer option. repartition(1) collapses the data to a
# single partition before writing; any existing output is overwritten.
two_locations_geoparquet_path = "/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations_geoparquet"
geoparquet_writer = gdf.repartition(1).write.format("geoparquet").mode("overwrite")
geoparquet_writer.option("geoparquet.crs", projjson_crs).save(two_locations_geoparquet_path)

In [None]:
# Read back the GeoParquet output that was just saved, using the Spark
# "geoparquet" reader. Note that this rebinds gdf to the re-read DataFrame.
gdf = ev.spark.read.format("geoparquet").load(two_locations_geoparquet_path)
gdf.show(truncate=False)

In [None]:
# Read the GeoParquet metadata by pointing the "geoparquet.metadata" reader at
# the output "folder" rather than at a single file (which I had thought was
# not possible).
gdf_meta = ev.spark.read.format("geoparquet.metadata").load(two_locations_geoparquet_path)
gdf_meta.show(truncate=False)

In [None]:

# Can the CRS be read from the metadata and then applied to the geometry?
# This could be problematic because a dataset could, in principle, hold more
# than one CRS (although we do not save data in multiple CRS); only the first
# metadata row is inspected here.
geometry_column_meta = gdf_meta.toPandas()["columns"].values[0]["geometry"]
crs_projjson = json.loads(geometry_column_meta["crs"])
espg_code = crs_projjson["id"]["code"]
espg_code


In [None]:
# Expose the re-read DataFrame to SQL as the temp view "two_locations".
gdf.createOrReplaceTempView("two_locations")

# The EWKT output does not show an SRID, so I think the SRID lives only in the
# file metadata and not in the geometries themselves. I think this is also
# "known" to be true, or at least it was for shapefiles.
reread_ewkt_sdf = ev.spark.sql("SELECT ST_AsEWKT(geometry) FROM two_locations")
reread_ewkt_sdf.show(truncate=False)

In [None]:
# Rebuild the locations geometry with SRID 4326 set, then display it as EWKT
# through the DataFrame API (ST_AsEWKT) instead of SQL.
crs_locations_sql = """
SELECT id, name, ST_SetSRID(ST_GeomFromWKB(geometry), 4326) as geometry FROM locations
"""
crs_gdf = ev.sql(crs_locations_sql)
crs_ewkt_sdf = crs_gdf.select(ST_AsEWKT("geometry"))
crs_ewkt_sdf.show(truncate=False)

In [None]:
# Open a part file from the GeoParquet output "folder" with GeoPandas.
# A separate name holds the Path objects so that two_locations_geoparquet_path
# keeps its original string type for the Spark cells that use it. Sorting makes
# the chosen file deterministic, and an empty folder fails with a clear error.
two_locations_part_files = sorted(
    Path(two_locations_geoparquet_path).glob("*.parquet")
)
if not two_locations_part_files:
    raise FileNotFoundError(
        f"No parquet files found under {two_locations_geoparquet_path}"
    )

gpd.read_parquet(two_locations_part_files[0])

In [None]:
# String form of the Evaluation's locations table directory (ev.locations.dir).
path = str(ev.locations.dir)
path

In [None]:
# Load the two_locations test file with the Spark "geoparquet" reader and
# print it without truncation.
df = ev.spark.read.format("geoparquet").load("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations.parquet")
df.show(truncate=False)

In [None]:
# Read GeoParquet metadata from the Evaluation's locations directory (path)
# and print its schema. Note that this rebinds df.
df = ev.spark.read.format("geoparquet.metadata").load(path)
df.printSchema()

In [None]:
# Load the two_locations test file and expose it to SQL as "two_locations".
# (The empty .options() call that used to sit in this chain set no options and
# has been removed.)
two_locations_df = ev.spark.read.format("geoparquet").load("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations.parquet")
two_locations_df.createOrReplaceTempView("two_locations")

# Geometry as loaded: the EWKT output does not show an SRID.
ev.spark.sql("SELECT ST_AsEWKT(geometry) FROM two_locations").show(truncate=False)

# After applying ST_SetSrid in the query: the EWKT output shows the SRID.
ev.spark.sql("SELECT ST_AsEWKT(ST_SetSrid(geometry, 4326)) FROM two_locations").show(truncate=False)



In [None]:
# Add a "geometry2" column holding the geometry with SRID 4326 set via the
# DataFrame API, then re-register the temp view so SQL sees the new column.
two_locations_df = two_locations_df.withColumn("geometry2", ST_SetSRID("geometry", 4326))
two_locations_df.show()
two_locations_df.createOrReplaceTempView("two_locations")


In [None]:
# Display geometry2 (the column built with ST_SetSRID) as EWKT.
# NOTE(review): the original note here read "does not show SRID" — confirm
# against the actual output, since geometry2 was created with an SRID assigned.
ev.spark.sql("SELECT ST_AsEWKT(geometry2) FROM two_locations").show(truncate=False)

In [None]:
# Reproject the geometry column from EPSG:4326 to EPSG:3857 with ST_Transform.
reproject_sql = (
    "SELECT id, name, "
    "ST_Transform(geometry, 'EPSG:4326', 'EPSG:3857') as geometry "
    "FROM two_locations"
)
proj_sdf = ev.spark.sql(reproject_sql)
proj_sdf.show(truncate=False)

In [None]:
# Save the reprojected DataFrame as GeoParquet, overwriting any previous output.
# No "geoparquet.crs" option is passed on this write.
proj_sdf.write.format("geoparquet").mode("overwrite").save("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations_proj.parquet")

In [None]:
# Read the reprojected GeoParquet output back with Spark and print it in full.
df = ev.spark.read.format("geoparquet").load("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations_proj.parquet")
df.show(truncate=False)


In [None]:
# Read the GeoParquet metadata of the reprojected output and show only its
# "columns" field.
df = ev.spark.read.format("geoparquet.metadata").load("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations_proj.parquet")
df.select("columns").show(truncate=False)

In [None]:
# Load the original two_locations file with GeoPandas (this rebinds gdf to a
# GeoPandas object) and capture its CRS as JSON.
gdf = gpd.read_parquet("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations.parquet")
# gdf.to_crs("EPSG:3857", inplace=True)
projjson_crs = gdf.crs.to_json()
projjson_crs


In [None]:
# Convert the GeoPandas frame to a Spark DataFrame and display its geometry as EWKT.
spark.createDataFrame(gdf).select(ST_AsEWKT("geometry")).show(truncate=False)

In [None]:
# Load the reprojected output with GeoPandas and display the CRS it reports.
gdf = gpd.read_parquet("/Users/mdenno/repos/teehr/tests/data/two_locations/two_locations_proj.parquet")
gdf.crs

In [None]:
# Convert the reprojected GeoPandas frame to a Spark DataFrame and display its
# geometry as EWKT.
spark.createDataFrame(gdf).select(ST_AsEWKT("geometry")).show(truncate=False)

In [None]:
# Experiment: try running TEEHR metrics through Spark SQL using a pandas UDF
from pyspark.sql.functions import pandas_udf

In [None]:
# Wrap the Kling-Gupta Efficiency metric's function in a pandas UDF, using the
# return type declared in the metric's attrs, and register it with Spark SQL
# under the name "kling_gupta_efficiency".
kge = teehr.Metrics.KlingGuptaEfficiency()
func_pd = pandas_udf(f=kge.func, returnType=kge.attrs["return_type"])
spark.udf.register("kling_gupta_efficiency", func_pd)


In [None]:
%%time
# Time a plain SQL aggregation: apply the registered kling_gupta_efficiency
# UDF to primary/secondary values from joined_timeseries, grouped by
# primary_location_id.
ev.sql("""
    SELECT
        primary_location_id
        , kling_gupta_efficiency(primary_value, secondary_value) as kling_gupta_efficiency
    FROM
        joined_timeseries
    GROUP BY
        primary_location_id
""").show()

In [None]:
%%time
sdf = ev.metrics.query(
    group_by=["primary_location_id"],
    include_metrics=[kge]
).to_sdf()
sdf.show()

In [None]:
# What if we could group by polygon?
# Or work directly with the hydrofabric GeoPackage?