# Addressing Demo
In this notebook the geocoder will be configured, some sample data will be created, and geocodes will be executed using the data.
Before running this notebook, make sure you have run all cells in the Addressing Installation notebook.

In [None]:
# Import the required PySpark Libraries
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Obtain the Spark session (an existing one is reused if present) with the
# "spark.sql.legacy.allowUntypedScalaUDF" option switched on.
# NOTE(review): presumably the addressing UDF needs this legacy flag - confirm.
spark = (
    SparkSession.builder
    .config("spark.sql.legacy.allowUntypedScalaUDF", True)
    .getOrCreate()
)

In [None]:
# Root folder of the addressing installation; must be the same path that was
# given to the installation script.
addressingRootDBFS = "/addressing"

# Destination of the preferences file generated later in this notebook.
preferencesFileDBFS = f"{addressingRootDBFS}/addressing.yaml"

# Everything below is derived from the two settings above and normally needs
# no changes. The "/dbfs"-prefixed values are the same locations expressed as
# local file paths.
sdkLocationLocal = f"/dbfs{addressingRootDBFS}/sdk"
dataLocationLocal = f"/dbfs{addressingRootDBFS}/data"
preferencesFileLocal = f"/dbfs{preferencesFileDBFS}"

# Passed to the geocoder as its extraction location.
extractLocationLocal = "/precisely/data"

## Geocode Preferences

In [None]:
# Geocoder preferences: return all available info and use EPSG:4326 as the
# client coordinate system name.
preferencesYaml = """
---
config:
  default:
    preferences:
      returnAllInfo: true
      clientCoordSysName: "epsg:4326"
"""

# Write the preferences file (the trailing True is the overwrite flag).
dbutils.fs.put(preferencesFileDBFS, preferencesYaml, True)

# Read the file back (up to 10 MiB) so the cell output shows what was written.
dbutils.fs.head(preferencesFileDBFS, 10 * 1024 * 1024)

In [None]:
# import OS module
import os


def _first_entry_with_prefix(directory, prefix):
    """Return the first entry in `directory` whose name starts with `prefix`.

    Entries are examined in the order `os.listdir` returns them.

    Raises:
        FileNotFoundError: if no entry matches. This replaces the bare
            IndexError a `[...][0]` lookup would give, so a missing or
            incomplete SDK installation is reported with the path and
            prefix that were searched.
    """
    match = next((entry for entry in os.listdir(directory) if entry.startswith(prefix)), None)
    if match is None:
        raise FileNotFoundError(
            f"No entry starting with '{prefix}' found in '{directory}'. "
            "Make sure all cells of the Addressing Installation notebook have been run."
        )
    return match


# Locate the extracted SDK folder and the PySpark SDK zip inside it
addressing_dir = _first_entry_with_prefix(sdkLocationLocal, "spectrum-bigdata-addressing")
python_lib_dir = f"{sdkLocationLocal}/{addressing_dir}/pyspark/sdk/lib"
python_zip_location = python_lib_dir + "/" + _first_entry_with_prefix(python_lib_dir, "spectrum-bigdata-addressing-sdk-pyspark")
print("SDK Location Local:", python_zip_location)
# Adding the Python SDK files to the SparkContext
# (must happen before the `addressing` imports that follow)
sc.addPyFile(python_zip_location)

# Resources Location
resourcesLocationLocal = f"{sdkLocationLocal}/{addressing_dir}/resources/"
print("Resource Location Local:", resourcesLocationLocal)

# Import all the required classes
from addressing.DownloadManagerBuilder import DownloadManagerBuilder
from addressing.AddressingBuilder import AddressingBuilder
from addressing.HDFSDownloader import HDFSDownloader
from addressing.S3Downloader import S3Downloader
from addressing.LocalFilePassthroughDownloader import LocalFilePassthroughDownloader
from addressing.PreferencesBuilder import PreferencesBuilder
from addressing.HadoopConfiguration import HadoopConfiguration
from addressing.UDFExecutor import UDFExecutor

# Fields of the geocode result that the UDF should expose as output columns
geocodeOutputFields = [
    "customFields['PB_KEY']",
    "address.formattedStreetAddress",
    "address.formattedLocationAddress",
    "location.feature.geometry.coordinates.x",
    "location.feature.geometry.coordinates.y",
]

# Build the geocode UDF: point the builder at the SDK resources, the reference
# data and the extraction location, then configure the UDF with the preferences
# file, the output fields above and an "error" field.
geocodeUdf = (
    AddressingBuilder()
    .withResourcesLocation(resourcesLocationLocal)
    .withDataLocations(dataLocationLocal)
    .withExtractionLocation(extractLocationLocal)
    .udfBuilder()
    .withPreferencesFile(preferencesFileLocal)
    .withOutputFields(*geocodeOutputFields)
    .withErrorField("error")
    .forGeocode()
)

## Build some test data

In [None]:
# Sample addresses as (address, city, state, postcode) tuples.
# Postcodes are kept as strings so leading zeros survive
# (the Jersey City ZIP is "07305", not "7305").
data = [
    ("350 jordan rd", "troy", "ny", "12180"),
    ("1 Global View", "troy", "ny", "12180"),
    ("222 Jersey City Blvd", "Jersey City", "NJ", "07305"),
]
df = spark.createDataFrame(data=data, schema=["address", "city", "state", "postcode"])
df.show(5, truncate=False)

## Execute the Geocode
Note: Because of how the Spark query execution planner works, the geocode function may be executed multiple times for each record. Since geocoding can be computationally expensive, this should be avoided. Calling `persist()` on the DataFrame directly after applying the geocode function should ensure that the function is called only once per record.

In [None]:
# Map of geocoder input field name -> value, built from the DataFrame columns;
# the country is a constant "USA" for every row.
geocodeInput = create_map(
    lit("street"), col("address"),
    lit("city"), col("city"),
    lit("admin1"), col("state"),
    lit("postalCode"), col("postcode"),
    lit("country"), lit("USA"),
)

# Apply the geocode UDF into a "result" column, persist right away (see the
# note above this cell), then expand the result's fields into top-level
# columns and drop the "result" column itself.
df = (
    df.withColumn("result", UDFExecutor().apply(geocodeUdf, geocodeInput))
    .persist()
    .select("*", "result.*")
    .drop("result")
)

df.show(5, truncate=False)