# Geo Addressing SDK For Big Data in Databricks - Installation
This is a installation file having information about how to install all the required configurations.

# Configuration and Creating Environment Variables

In [0]:
# Update the following with the API and Secret keys for your DataExperience account; these can be generated by visiting https://data.precisely.com/autodownload.
PDX_API_KEY = "YOUR_PDX_API_KEY"
PDX_SECRET = "YOUR_PDX_API_SECRET"

# SDK Download URL
SDK_URL = "YOUR_PRESIGNED_SDK_URL"

# The SDK jar is copied to this user's workspace for installing it in cluster as a Library. (/Workspace/Users/<username>/$SDK_EXTRACT_LOCATION)
USERNAME = "Shardul.Rajhans@precisely.com"

# We will be installing to the following directory. You can change the directory to suit your environment - you will need to use the same value in the Geocoding Demo Workspace.
GEO_ADDRESSING_ROOT_DBFS = "/precisely/addressing"

# SDK Release Version
SDK_RELEASE_VERSION = "5.2.1"

# This is the version of the latest vintage whenever updated to current. The required format is "(YEAR.MONTH)".
DATA_VINTAGE = "2024.9"

# Configure the datasets to be downloaded from data.precisely.com.
GEOCODING_SPDS = [f"Geocoding MLD US#United States#All USA#Spectrum Platform Data#1.0.0#{DATA_VINTAGE}",
                  f"Geocoding TT Street US#United States#All USA#Spectrum Platform Data#1.0.0#{DATA_VINTAGE}",
                  f"Geocoding Reverse PRECISELYID#United States#All USA#Spectrum Platform Data#1.0.0#{DATA_VINTAGE}"]

# Local Environment Setup - The remaining lines should not need to be modified
DBFS_SDK_EXTRACT_LOCATION = f"{GEO_ADDRESSING_ROOT_DBFS}/geo-addressing-bigdata-distribution-{SDK_RELEASE_VERSION}"
DBFS_DATA_LOCATION = f"{GEO_ADDRESSING_ROOT_DBFS}/data/{DATA_VINTAGE}"

# SPD files will be present here.
LOCAL_DATA_ZIPPED = f"{GEO_ADDRESSING_ROOT_DBFS}/zip"

# Add the pdx sdk jar from github to your Filestore.
PDX_SDK_URL = "https://raw.githubusercontent.com/PreciselyData/big-data/dev/databricks-geocoding/lib/precisely-bigdata-pdx-sdk3.0.1-full.jar"

PDX_CLASSNAME = "com.precisely.pdx.sdkexample.SampleDemoApp"
DBFS_PDX_SDK_JAR = f"{DBFS_SDK_EXTRACT_LOCATION}/pdx-sdk.jar"

# We did this in any %sh command to ensure variables are available in the environment.
dbutils.fs.put("file:///dbricks_env.sh", f"""#!/bin/bash

export SDK_URL="{SDK_URL}"
export PDX_API_KEY={PDX_API_KEY}
export PDX_SECRET={PDX_SECRET}
export DATA_VINTAGE={DATA_VINTAGE}
export SDK_EXTRACT_LOCATION={DBFS_SDK_EXTRACT_LOCATION}
export DBFS_SDK_EXTRACT_LOCATION=/dbfs{DBFS_SDK_EXTRACT_LOCATION}
export DBFS_DATA_LOCATION=/dbfs{DBFS_DATA_LOCATION}
export LOCAL_DATA_ZIPPED={LOCAL_DATA_ZIPPED}
export USERNAME={USERNAME}
export PDX_SDK_URL={PDX_SDK_URL}
export PDX_CLASSNAME={PDX_CLASSNAME}
export DBFS_PDX_SDK_JAR=/dbfs{DBFS_PDX_SDK_JAR}
export GEOCODING_SPDS={"({})".format(" ".join(list(map(lambda x: '"{}"'.format(x), GEOCODING_SPDS))))}

""", True)

Wrote 2863 bytes.


True

# Download and Extract Geo Addressing SDK For Big Data Distribution Zip

In [0]:
%sh . /dbricks_env.sh

rm -rf $DBFS_SDK_EXTRACT_LOCATION
mkdir -p $DBFS_SDK_EXTRACT_LOCATION

if [ ! -z "$SDK_URL" ]
then
  echo "Installing addressing SDK..."
  curl -o addressing-sdk.zip "$SDK_URL"
  unzip -d $DBFS_SDK_EXTRACT_LOCATION addressing-sdk.zip
else
  echo "Not installing addressing SDK"
fi

Installing addressing SDK...


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0  302M    0 15824    0     0   1874      0 47:02:47  0:00:08 47:02:39  1874  5  302M    5 16.0M    0     0  1887k      0  0:02:44  0:00:08  0:02:36 1886k 10  302M   10 32.0M    0     0  3380k      0  0:01:31  0:00:09  0:01:22 3380k 18  302M   18 56.0M    0     0  5372k      0  0:00:57  0:00:10  0:00:47 5372k 26  302M   26 79.9M    0     0  7028k      0  0:00:44  0:00:11  0:00:33 7028k 36  302M   36  110M    0     0  8829k      0  0:00:35  0:00:12  0:00:23 25.1M 41  302M   41  126M    0     0  9329k      0  0:00:33  0:00:13  0:00:20 21.3M 47  302M   47  143M    0     0   9.7M      0  0:00:30  0:00:14  0:00:16 22.2M 57  302M   57  172M    0     0  11.0M      0  0:00:27  0:00:15  0:00:12 23.5M 63  302M   63  192M    0     0  11.4M      0  0:00

Archive:  addressing-sdk.zip
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/spark2/
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/spark2/sdk/
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/spark2/sdk/lib/
  inflating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/spark2/sdk/lib/geo-addressing-bigdata-addressing-sdk-spark2_2.12-5.2.1.jar  
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark/
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark/sdk/
   creating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark/sdk/lib/
  inflating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark/sdk/lib/geo-addressing-bigdata-addressing-sdk-pyspark-5.2.1.zip  
  inflating: /dbfs/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark

# Download the Precisely Data Experience Jar
This jar is required for downloading the Geo Addressing Reference Data using PDX jar.

In [0]:
%sh . /dbricks_env.sh

if [ ! -z "$PDX_SDK_URL" ]
then
  echo "Installing PDX SDK..."
  curl -o $DBFS_PDX_SDK_JAR "$PDX_SDK_URL"
else
  echo "Not installing geocoding SDK"
fi

Installing PDX SDK...


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100 5578k  100 5578k    0     0  12.9M      0 --:--:-- --:--:-- --:--:-- 12.9M


## Download the Geo Addressing Reference Data using PDX

In [0]:
%sh . /dbricks_env.sh
rm -rf $LOCAL_DATA_ZIPPED
mkdir -p $LOCAL_DATA_ZIPPED
printf '%s\n' "${GEOCODING_SPDS[@]}" | xargs -P 4 -I {spd} java -cp $DBFS_PDX_SDK_JAR $PDX_CLASSNAME -a $PDX_API_KEY -s $PDX_SECRET -d $LOCAL_DATA_ZIPPED -dd \"{spd}\"

Downloading file 1 of 1 KLR092024.spd to /precisely/addressing/zip
Progress for KLR092024.spd: 0%
Progress for KLR092024.spd: 2%
Progress for KLR092024.spd: 5%
Progress for KLR092024.spd: 8%
Progress for KLR092024.spd: 9%
Progress for KLR092024.spd: 11%
Progress for KLR092024.spd: 13%
Progress for KLR092024.spd: 14%
Progress for KLR092024.spd: 16%
Progress for KLR092024.spd: 19%
Progress for KLR092024.spd: 22%
Progress for KLR092024.spd: 24%
Progress for KLR092024.spd: 25%
Progress for KLR092024.spd: 26%
Progress for KLR092024.spd: 27%
Progress for KLR092024.spd: 28%
Progress for KLR092024.spd: 31%
Progress for KLR092024.spd: 32%
Progress for KLR092024.spd: 35%
Progress for KLR092024.spd: 37%
Progress for KLR092024.spd: 39%
Progress for KLR092024.spd: 42%
Progress for KLR092024.spd: 44%
Progress for KLR092024.spd: 47%
Progress for KLR092024.spd: 49%
Progress for KLR092024.spd: 51%
Progress for KLR092024.spd: 54%
Downloading file 1 of 1 KLD092024.spd to /precisely/addressing/zip
Progres

# Extract the Reference Data using Geo Addressing CLI Utility

In [0]:
%sh . /dbricks_env.sh
rm -rf $DBFS_DATA_LOCATION
mkdir -p $DBFS_DATA_LOCATION
$DBFS_SDK_EXTRACT_LOCATION/cli/cli.sh extract --s $LOCAL_DATA_ZIPPED --d $DBFS_DATA_LOCATION --t 8
ls $DBFS_DATA_LOCATION | sed 's/\/dbfs/dbfs:/'

Oct 14, 2024 10:41:50 AM com.pb.geocoding.config.extractor.SPDExtractor extract
INFO: Extractor found 2 source locations
Oct 14, 2024 10:41:50 AM com.pb.geocoding.config.extractor.SPDExtractor extract
INFO: extracting using 8 extractors
Oct 14, 2024 10:45:53 AM com.pb.geocoding.config.extractor.SPDExtractor extract
INFO: extraction took 4 minutes and 3 seconds
Oct 14, 2024 10:45:53 AM org.springframework.shell.core.AbstractShell handleExecutionResult
INFO: Successfully extracted: /dbfs/precisely/addressing/data/2024.9


USA-MasterLocationDataRevPreciselyID_202409
USA-MasterLocationData_202409
USA-TomTom_202409


# Configure Geo Addressing

In [0]:
%sh . /dbricks_env.sh
(cd $DBFS_SDK_EXTRACT_LOCATION/cli; sh ./cli.sh configure --s $DBFS_DATA_LOCATION --d $DBFS_SDK_EXTRACT_LOCATION/resources/config)
cat $DBFS_SDK_EXTRACT_LOCATION/resources/config/addressing.yaml

Oct 14, 2024 10:46:12 AM org.springframework.shell.core.AbstractShell handleExecutionResult
INFO: Successfully created configuration


# Addressing SDK settings---
config:
  default:
    # default preferences
    preferences:
      maxResults: 1 # maximum number of results per api request
      returnAllInfo: true # return all possible result data. (does not include content that requires explicit call from another preference)
      clientLocale: en
      clientCoordSysName: 'epsg:4326'
      distance:
        value: 150
        distanceUnit: METER
      streetOffset:
        value: 7
        distanceUnit: METER
      cornerOffset:
        value: 7
        distanceUnit: METER
      fallbackToPostal: true
      fallbackToGeographic: true
      matchMode: null
      originXY: null
      returnOfAdditionalFields: # return specific additional fields
      #  - "PB_KEY"
      customPreferences:
        "RETURN_PARSED_INPUT": "true" # return parsed address if possible
        
# data sources for Addressing SDK
dataSources:
- fileDataSource: "/dbfs/precisely/addressing/data/2024.9/USA-MasterLocationDataRevPreciselyID_202409"


# Add Geo Addressing SDK JAR as a Library in your Databricks Cluster

In [0]:
%sh . /dbricks_env.sh

echo "If Databricks Runtime Version is less than 15.*, you should attach the following jar as Library to your cluster:"
ls $DBFS_SDK_EXTRACT_LOCATION/pyspark/sdk/lib/geo-addressing-bigdata-addressing-sdk-spark*.jar | sed 's/\/dbfs/dbfs:/'

If Databricks Runtime Version is less than 15.*, you should attach the following jar as Library to your cluster:
dbfs:/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/pyspark/sdk/lib/geo-addressing-bigdata-addressing-sdk-spark2_2.12-5.2.1.jar


# For Databricks Version 15.1 and Above

From Databricks Version above 14.3 LTS, DBFS libraries support is being removed.
Copy the SDK Jar which needs to be added as a Library in your User's Workspace.

In [0]:
%sh . /dbricks_env.sh

rm -rf /Workspace/Users/$USERNAME$SDK_EXTRACT_LOCATION/geo-addressing-bigdata-addressing-sdk-spark*.jar
mkdir -p /Workspace/Users/$USERNAME$SDK_EXTRACT_LOCATION
cp -r $DBFS_SDK_EXTRACT_LOCATION/pyspark/sdk/lib/geo-addressing-bigdata-addressing-sdk-spark*.jar /Workspace/Users/$USERNAME$SDK_EXTRACT_LOCATION/
echo "You can now attach the following SKD jar as a Databricks Library to any cluster."
ls /Workspace/Users/$USERNAME$SDK_EXTRACT_LOCATION/geo-addressing-bigdata-addressing-sdk-spark*.jar

You can now attach the following SKD jar as a Databricks Library to any cluster.
/Workspace/Users/Shardul.Rajhans@precisely.com/precisely/addressing/geo-addressing-bigdata-distribution-5.2.1/geo-addressing-bigdata-addressing-sdk-spark2_2.12-5.2.1.jar
