# Geo Addressing Demo
In this notebook, the geocoder is configured, some sample data is created, and geocoding is run against that data.
Before running this notebook, make sure you have executed all cells in the Addressing Installation notebook.

In [0]:
%sh
# Make sure the Java version is at least JDK 11.
# From Databricks Runtime 15.* onward, you can add JNAME=zulu17-ca-amd64 as an environment variable and restart the cluster.
# Up to Databricks Runtime 15.*, you can add JNAME=zulu11-ca-amd64 as an environment variable and restart the cluster.
java -version

openjdk version "17.0.11" 2024-04-16 LTS
OpenJDK Runtime Environment Zulu17.50+19-CA (build 17.0.11+9-LTS)
OpenJDK 64-Bit Server VM Zulu17.50+19-CA (build 17.0.11+9-LTS, mixed mode, sharing)


In [0]:
# Import the required PySpark Libraries
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the SparkSession via the builder, setting the legacy flag
# spark.sql.legacy.allowUntypedScalaUDF to True on the way.
spark = (
    SparkSession.builder
    .config("spark.sql.legacy.allowUntypedScalaUDF", True)
    .getOrCreate()
)

In [0]:
# Root of the Geo Addressing SDK distribution; the other paths derive from it.
addressing_sdk_location = "/dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1"
# Resources directory later passed to AddressingBuilder.withResourcesLocation().
resources_location = f"{addressing_sdk_location}/resources"
# Zip archive of the PySpark SDK that is shipped to the SparkContext below.
sdk_zip_path = f"{addressing_sdk_location}/pyspark/sdk/lib/geo-addressing-bigdata-addressing-sdk-pyspark-5.2.1.zip"

# Adding the Python SDK files to the SparkContext.
# The context is taken from the `spark` session this notebook creates instead
# of the implicit `sc` global, which is never defined here and would raise a
# NameError on a fresh kernel that does not pre-populate it.
spark.sparkContext.addPyFile(sdk_zip_path)

# Import all the required classes
from addressing.DownloadManagerBuilder import DownloadManagerBuilder
from addressing.AddressingBuilder import AddressingBuilder
from addressing.HDFSDownloader import HDFSDownloader
from addressing.S3Downloader import S3Downloader
from addressing.LocalFilePassthroughDownloader import LocalFilePassthroughDownloader
from addressing.PreferencesBuilder import PreferencesBuilder
from addressing.HadoopConfiguration import HadoopConfiguration
from addressing.UDFExecutor import UDFExecutor

# Expressions selected from each geocode result, aliased to output column names.
outputFields = [
    "customFields['PB_KEY'] as PBKEY",
    "address.formattedStreetAddress as FormattedAddress",
    "location.feature.geometry.coordinates.x as LON",
    "location.feature.geometry.coordinates.y as LAT",
]

# Create the Geocode UDF: first point the addressing builder at the resources,
# then configure the UDF itself.
# If the addressing.yaml file is absent from the resources/config directory,
# also call .withDataLocations(*data_location) on this builder.
addressing_builder = AddressingBuilder().withResourcesLocation(resources_location)

geocodeUdf = (
    addressing_builder.udfBuilder()
    .withOutputFields(*outputFields)
    .withErrorField("error")
    .withResultAsJSON("resultJson")
    .forGeocode()
)

## Build some test data or create your own udf.

In [0]:
# Sample (address, country) rows to geocode.
data = [
    ("350 jordan rd ny 12180", "USA"),
    ("1 Global View troy ny 12180", "USA"),
    ("222 Jersey City Blvd Jersey City NJ 7305", "USA"),
]
df = spark.createDataFrame(data, ["ADDRESS", "COUNTRY"])
df.show(n=5, truncate=False)

+----------------------------------------+-------+
|ADDRESS                                 |COUNTRY|
+----------------------------------------+-------+
|350 jordan rd ny 12180                  |USA    |
|1 Global View troy ny 12180             |USA    |
|222 Jersey City Blvd Jersey City NJ 7305|USA    |
+----------------------------------------+-------+



## NOTE: Please run some warm-up calls to start the Geo Addressing Engine before running on large data.
Also, because the geocode operation can be computationally expensive, adding a call to "df.persist()" directly after the geocode function should ensure that the geocode function is called only once for each record.

In [0]:
# Build the geocode call as a column expression: the map keys are the
# addressing input field names, the values are the DataFrame columns.
udf_executor = UDFExecutor()
geocode_input = create_map(
    lit("addressLines[0]"), col("ADDRESS"),
    lit("country"), col("COUNTRY"),
)
result_column = udf_executor.apply(geocodeUdf, geocode_input)

# Persist directly after adding the geocode column, then expand the fields of
# "result" into top-level columns and drop the original column.
geocodeDf = (
    df.withColumn("result", result_column)
    .persist()
    .select("*", "result.*")
    .drop("result")
)

geocodeDf.show(5)

+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+
|             ADDRESS|COUNTRY|       PBKEY|    FormattedAddress|       LON|      LAT|          resultJson|error|
+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+
|350 jordan rd ny ...|    USA|P0000GL638OL|       350 JORDAN RD|-73.699929|42.678119|{"score":97,"addr...| NULL|
|1 Global View tro...|    USA|P0000GL41OME|         1 GLOBAL VW|-73.704443|42.682242|{"score":100,"add...| NULL|
|222 Jersey City B...|    USA|P0000FNYUEQH|222 JERSEY CITY BLVD|-74.054672|40.708214|{"score":97,"addr...| NULL|
+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+



# Registering the geo addressing functionalities as UDFs and running the SQL queries

You can now register the geo addressing functionalities as UDFs and start running SQL queries.

In [0]:
# Same geocode configuration as before, but forGeocode() is given the name
# "PreciselyGeocode", which is the function name the SQL query calls.
# If the addressing.yaml file is absent from the resources/config directory,
# also call .withDataLocations(*data_location) on this builder.
addressing_builder = AddressingBuilder().withResourcesLocation(resources_location)

geocodeUdf = (
    addressing_builder.udfBuilder()
    .withOutputFields(*outputFields)
    .withErrorField("error")
    .withResultAsJSON("resultJson")
    .forGeocode("PreciselyGeocode")
)

# Make the sample DataFrame queryable from SQL under the name "inputTable".
df.createOrReplaceTempView("inputTable")

# Geocode through SQL, persist, then expand the fields of "result" into
# top-level columns and drop the original column.
geocode_query = "select *, PreciselyGeocode(map('addressLines[0]', ADDRESS, 'country', COUNTRY)) as result from inputTable"
geocodeSqlDf = (
    spark.sql(geocode_query)
    .persist()
    .select("*", "result.*")
    .drop("result")
)

geocodeSqlDf.show(5)

+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+
|             ADDRESS|COUNTRY|       PBKEY|    FormattedAddress|       LON|      LAT|          resultJson|error|
+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+
|350 jordan rd ny ...|    USA|P0000GL638OL|       350 JORDAN RD|-73.699929|42.678119|{"score":97,"addr...| NULL|
|1 Global View tro...|    USA|P0000GL41OME|         1 GLOBAL VW|-73.704443|42.682242|{"score":100,"add...| NULL|
|222 Jersey City B...|    USA|P0000FNYUEQH|222 JERSEY CITY BLVD|-74.054672|40.708214|{"score":97,"addr...| NULL|
+--------------------+-------+------------+--------------------+----------+---------+--------------------+-----+

