In [None]:
import pyspark

# Look up where the pyspark package is installed and which release it is,
# then report both so the environment is visible in the notebook output.
SPARK_HOME, SPARK_VERSION = pyspark.__path__[0], pyspark.__version__
print(f"SPARK_HOME is: {SPARK_HOME}")
print(f"SPARK_VERSION is: {SPARK_VERSION}")

In [None]:
import teehr
from pathlib import Path
import shutil

# Tell Bokeh to render plots inline in this notebook's output cells.
# This must run before any Bokeh plot is shown.
from bokeh.io import output_notebook
output_notebook()

In [None]:
from sedona.spark import SedonaContext, ST_GeomFromWKB

from pyspark.sql import SparkSession
# NOTE: SedonaRegistrator is deprecated (since Sedona 1.4.1) and is no longer
# used below; the import is retained only so any other cell that still
# references the name keeps resolving.
from sedona.register import SedonaRegistrator
from sedona.utils import SedonaKryoRegistrator, KryoSerializer
import geopandas as gpd

# Build a local Spark session that can
#   * read public S3 data anonymously through the s3a connector, and
#   * run Apache Sedona spatial functions (jars pulled via spark.jars.packages,
#     Kryo serializer/registrator required by Sedona geometry types).
spark = (
    SparkSession.builder
    .appName("TEEHR")
    .master("local[*]")
    # Only overwrite the partitions that are actually written
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    # S3 access without credentials
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider",
    )
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.sql.session.timeZone", "UTC")
    # Sedona + GeoTools + Hadoop-AWS jars, resolved at session start-up
    .config(
        "spark.jars.packages",
        "org.apache.sedona:sedona-spark-3.5_2.12:1.7.0,"
        "org.datasyslab:geotools-wrapper:1.7.0-28.5,"
        "org.apache.hadoop:hadoop-aws:3.3.4,"
        "com.amazonaws:aws-java-sdk-bundle:1.12.524",
    )
    .config(
        "spark.jars.repositories",
        "https://artifacts.unidata.ucar.edu/repository/unidata-all",
    )
    # Kryo serialization with Sedona's registrator
    .config("spark.serializer", KryoSerializer.getName)
    .config("spark.kryo.registrator", SedonaKryoRegistrator.getName)
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")

# Register Sedona's SQL functions and types on the session.
# SedonaContext.create() is the supported replacement for the deprecated
# SedonaRegistrator.registerAll(); it returns the same SparkSession.
spark = SedonaContext.create(spark)

In [5]:
# The Evaluation lives under the user's home directory: ~/temp/10_sedona
test_eval_dir = Path.home() / "temp" / "10_sedona"

# Start from a clean slate: remove anything left over from a previous run.
# Errors (e.g. the directory not existing yet) are ignored.
shutil.rmtree(test_eval_dir, ignore_errors=True)

# Create the Evaluation (and its directory), passing in the Spark session
# so the Evaluation uses it.
ev = teehr.Evaluation(
    dir_path=test_eval_dir,
    create_dir=True,
    spark=spark,
)

In [None]:
# Name of the example evaluation to clone from the S3 bucket
example_evaluation = "e0_2_location_example"
ev.clone_from_s3(example_evaluation)

In [None]:
# Fetch the Evaluation's locations table as a Spark DataFrame and preview it
locations_table = ev.locations
sdf = locations_table.to_sdf()
sdf.show()

In [None]:
# Parse the WKB in the "geometry" column with Sedona and store the result
# in a new "geometry2" column, then preview the rows
geometry_from_wkb = ST_GeomFromWKB("geometry")
df = sdf.withColumn("geometry2", geometry_from_wkb)
df.show()

In [None]:
# Print the column names and types of `df`; when the cells are run in order
# this should include the "geometry2" column added in the previous cell.
df.printSchema()

In [None]:
# `dir` attribute of the locations table converted to a plain string
# (presumably the directory where the table's files are stored; verify in TEEHR)
path = str(ev.locations.dir)
# Bare expression so the notebook shows the value as the cell output
path

In [None]:
# Read `path` through the "geoparquet.metadata" data source using the
# Evaluation's Spark session, then inspect the resulting schema
df = ev.spark.read.load(path, format="geoparquet.metadata")
df.printSchema()