In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.functions import broadcast
from pyspark import StorageLevel 

import re

In [2]:
# Initialize Spark with optimized configurations
spark = SparkSession.builder.appName("NAD_Geocoding").getOrCreate()

In [3]:
# 1. Load the processed transaction data
processed_path = '/mmfs1/projects/f8d7c0/2024-25/Gate City Bank/Agustin/Gate_City_processed_completed_TEST.csv'

df = spark.read.csv(processed_path, header=True, inferSchema=True)

# 2. Load the cleaned NAD data
nad_path = '/mmfs1/projects/f8d7c0/2024-25/Gate City Bank/Agustin/NAD_r18_clean.csv'
nad_df = spark.read.csv(nad_path, header=True)

In [4]:
# 2. Data preparation and cleaning
print(f"NAD DataFrame size: {nad_df.count()} rows")

nad_df.cache()

NAD DataFrame size: 77767367 rows


DataFrame[NAD_No: string, NAD_st: string, NAD_county: string, NAD_City: string, NAD_state: string, Zip_Code: string, NAD_longitude: string, NAD_latitude: string]

In [5]:
# 3. Extract street number and name from `term_street` (using NAD column names)
df_prepared = df.withColumn(
    "term_street_number", regexp_extract(col("term_street"), r"^(\d+)", 1)
).withColumn(
    "term_street_name",
    regexp_replace(
        # First remove the house number and spaces
        regexp_replace(col("term_street"), r"^\d+\s*", ""),
        # Then remove ALL street types, directions, and special characters
        r"(?i)[.#]|\b(st|street|rd|road|ave|avenue|blvd|boulevard|dr|drive|ln|lane|way|ct|court|cir|circle|ter|terrace|pl|place|pkwy|parkway|aly|alley|expy|expressway|hwy|highway|sq|square|tpke|turnpike|n|s|e|w|north|south|east|west)(\.?)\b",
        ""
    )
).withColumn(
    "term_street_name", 
    trim(regexp_replace(col("term_street_name"), r"^\W+|\W+$", ""))  # Remove leading/trailing non-word chars
)

# Prepare NAD data using correct column names
nad_prepared = nad_df.withColumn(
    "NAD_streetNumber", trim(col("NAD_No"))  # Using NAD_No for street number
).withColumn(
    "NAD_streetName", trim(col("NAD_st"))    # Using NAD_st for street name
)

# Create join keys (state|city|street_number|street_name)
df_prepared = df_prepared.withColumn(
    "join_key",
    concat_ws("|",
        lower(trim(col("TERM_State"))),
        lower(trim(col("TERM_City"))),
        trim(col("term_street_number")),
        lower(trim(col("term_street_name")))
    )
)

nad_prepared = nad_prepared.withColumn(
    "join_key",
    concat_ws("|",
        lower(trim(col("NAD_state"))),
        lower(trim(col("NAD_City"))),
        trim(col("NAD_streetNumber")),
        lower(trim(col("NAD_streetName")))
    )
).select("join_key", "NAD_longitude", "NAD_latitude")

# Perform the join
result_df = df_prepared.join(
    nad_prepared,
    on="join_key",
    how="left"
).drop("join_key", "term_street_number", "term_street_name")

# Handle records with term_street_flag < 1 (no street data)
non_street_records = df.filter(col("term_street_flag") < 1)
final_result = result_df.unionByName(
    non_street_records.withColumn("NAD_longitude", lit(None))
                     .withColumn("NAD_latitude", lit(None))
)

In [6]:
# 4. Save results (updated with pre-processing for Tableau compatibility)
result_df = df_prepared.join(
    nad_prepared,
    on="join_key",
    how="left"
).select(
    df_prepared["*"],  # Keep all original columns
    nad_prepared["NAD_longitude"],
    nad_prepared["NAD_latitude"]
)

# Drop the unnecessary columns before saving
result_df = result_df.drop("2022_NAICS_Title","term_street_number", "term_street_name", "join_key")

# Ensure consistent column order, clean headers, and proper CSV formatting
output_path = '/mmfs1/projects/f8d7c0/2024-25/Gate City Bank/Agustin/Gate_City_with_NAD_FINAL.csv'

# Write with explicit CSV settings to avoid Tableau union issues
(
    result_df
    .coalesce(20)  # Reduce partitions to minimize file splits (optional)
    .write
    .csv(
        output_path,
        header=True,
        mode="overwrite",
        quote='"',       # Force consistent quoting
        escape='"',      # Handle escaped quotes uniformly
        encoding="UTF-8", # Standard encoding
        lineSep="\n",    # Explicit line separator
        emptyValue=""    # Replace nulls with empty strings
    )
)

In [7]:
print("\nFinal dataset columns:")
print(result_df.columns)
print("\nGeocoding completed successfully!")
print(f"Total records processed: {df.count()}")
print(f"Records with coordinates matched: {result_df.filter(col('NAD_longitude').isNotNull()).count()}")
print(f"Output saved to: {output_path}")

# Sample output
#print("\nSample geocoded records:")
#result_df.select("TERM_City", "TERM_State", "term_street", "NAD_longitude", "NAD_latitude") \
#         .filter(col("NAD_longitude").isNotNull()) \
#         .show(5, truncate=False)

spark.stop()


Final dataset columns:
['SICSUBCD', 'TXN_DESCRIPTION', 'TERM_ADDR', 'HASHED_MERCH_ID', 'Related_SIC_Code_Description', 'Online_status', 'State', 'Country', 'Parent_Company', 'Parent_Company_Flag', 'City', 'Address_Flag', 'Address', 'TERM_State', 'TERM_City', 'term_street', 'term_street_flag', 'Company', 'COMPANY_KEY', 'NAD_longitude', 'NAD_latitude']

Geocoding completed successfully!
Total records processed: 10208188
Records with coordinates matched: 704826
Output saved to: /mmfs1/projects/f8d7c0/2024-25/Gate City Bank/Agustin/Gate_City_with_NAD_FINAL.csv
