In [2]:
import re
import os
import zipfile
import random
import string
import polars as pl
import numpy as np
from pathlib import Path
from typing import List, Tuple, Dict
from geopy.geocoders import Nominatim

print(f'polars version: {pl.__version__}')

geolocator = Nominatim(user_agent="geocoder_llm_project", timeout=300)

project_dir = Path(os.getcwd())

polars version: 1.9.0


In [9]:
# !wget https://nationaladdressdata.s3.amazonaws.com/NAD_r18_TXT.zip

In [10]:
# zip_data_file_path = project_dir / 'NAD_r18_TXT.zip'

# with zipfile.ZipFile(zip_data_file_path, 'r') as zip_ref:
#     zip_ref.extractall(".")

In [7]:
raw_file_path = project_dir / 'TXT/NAD_r18.txt'

Dataset schema. Everything is a string here.

In [38]:
schema_overrides = {
    "OID_": pl.Int64,
    "AddNum_Pre": pl.Utf8,
    "Add_Number": pl.Int64,
    "AddNum_Suf": pl.Utf8,
    "AddNo_Full": pl.Int64,
    "St_PreMod": pl.Utf8,
    "St_PreDir": pl.Utf8,
    "St_PreTyp": pl.Utf8,
    "St_PreSep": pl.Utf8,
    "St_Name": pl.Utf8,
    "St_PosTyp": pl.Utf8,
    "St_PosDir": pl.Utf8,
    "St_PosMod": pl.Utf8,
    "StNam_Full": pl.Utf8,
    "Building": pl.Utf8,
    "Floor": pl.Utf8,
    "Unit": pl.Utf8,
    "Room": pl.Utf8,
    "Seat": pl.Utf8,
    "Addtl_Loc": pl.Utf8,
    "SubAddress": pl.Utf8,
    "LandmkName": pl.Utf8,
    "County": pl.Utf8,
    "Inc_Muni": pl.Utf8,
    "Post_City": pl.Utf8,
    "Census_Plc": pl.Utf8,
    "Uninc_Comm": pl.Utf8,
    "Nbrhd_Comm": pl.Utf8,
    "NatAmArea": pl.Utf8,
    "NatAmSub": pl.Utf8,
    "Urbnztn_PR": pl.Utf8,
    "PlaceOther": pl.Utf8,
    "PlaceNmTyp": pl.Utf8,
    "State": pl.Utf8,
    "Zip_Code": pl.Int64,
    "Plus_4": pl.Int64,
    "UUID": pl.Utf8,
    "AddAuth": pl.Int64,
    "AddrRefSys": pl.Utf8,
    "Longitude": pl.Float64,
    "Latitude": pl.Float64,
    "NatGrid": pl.Utf8,
    "Elevation": pl.Utf8,
    "Placement": pl.Utf8,
    "AddrPoint": pl.Utf8,
    "Related_ID": pl.Utf8,
    "RelateType": pl.Utf8,
    "ParcelSrc": pl.Utf8,
    "Parcel_ID": pl.Utf8,
    "AddrClass": pl.Utf8,
    "Lifecycle": pl.Utf8,
    "Effective": pl.Utf8,
    "Expire": pl.Utf8,
    "DateUpdate": pl.Utf8,
    "AnomStatus": pl.Utf8,
    "LocatnDesc": pl.Utf8,
    "Addr_Type": pl.Utf8,
    "DeliverTyp": pl.Utf8,
    "NAD_Source": pl.Utf8,
    "DataSet_ID": pl.Utf8,
    "StreetAddress": pl.Utf8,
    "SecondaryAddress": pl.Utf8,
    "CityStateZip": pl.Utf8,
    "FullAddress": pl.Utf8,
}

Given the data is huge (31GB) and build over years, there are some inconsistencies. So we ignore errors, infer schema and provide null values as well.

In [64]:
df = pl.read_csv(
    raw_file_path, 
    ignore_errors=True, 
    separator=",", 
    infer_schema_length=0, 
    quote_char=None, 
    schema_overrides=schema_overrides,
    truncate_ragged_lines=True,
    null_values=["Not stated"]
)

Filter out states which are not null

In [65]:
df = df.filter(pl.col('State').is_not_null())

These are the states available in the data. We have data from 47 states.

In [66]:
valid_states = [
    'TX', 'LA', 'ME', 'WY', 'KY', 'MI', 'WA', 'VT', 'ND', 'TN',
    'IN', 'WV', 'MN', 'RI', 'DE', 'IL', 'SD', 'AK', 'MS', 'OK',
    'PA', 'WI', 'NY', 'KS', 'NM', 'AZ', 'SC', 'FL', 'NC', 'MD',
    'UT', 'NE', 'NH', 'VA', 'GA', 'AL', 'CA', 'MA', 'CT', 'AR',
    'CO', 'MT', 'DC', 'ID', 'IA', 'OH', 'MO'
]

In [67]:
df = df.filter(
    pl.col('State').is_in(valid_states)
)

In [68]:
print(f'Number of records: {len(df)}')

Number of records: 80044721


Concatinating different columns into single strings to get street and country information, and finally build the FullAddress column.

In [7]:
df = df.with_columns(
    pl.concat_str(
        [
            pl.col("AddNum_Pre"),
            pl.col("Add_Number").cast(str),
            pl.col("AddNum_Suf"),
            pl.col("St_PreMod"),
            pl.col("St_PreDir"),
            pl.col("St_PreTyp"),
            pl.col("St_Name"),
            pl.col("St_PosTyp"),
            pl.col("St_PosDir"),
            pl.col("St_PosMod"),
        ],
        separator=" ",
        ignore_nulls=True
    ).alias("StreetAddress"),
    pl.concat_str(
        [
            pl.col("Building"),
            pl.col("Floor"),
            pl.col("Unit"),
            pl.col("Room"),
            pl.col("Seat"),
            pl.col("Addtl_Loc"),
            pl.col("SubAddress")
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("SecondaryAddress"),
    pl.concat_str(
        [
            pl.col("LandmkName"),
            pl.col("County"),
            pl.col("Inc_Muni"),
            pl.col("Post_City"),
            pl.col("State"),
            pl.concat_str(
                [pl.col("Zip_Code").cast(str), pl.col("Plus_4").cast(str)],
                separator="-",
                ignore_nulls=True
            )
        ],
        separator=", ",
        ignore_nulls=True
    ).alias("CityStateZip")
)

df = df.with_columns(
    pl.concat_str(
        [
            pl.col("StreetAddress"),
            pl.col("SecondaryAddress"),
            pl.col("CityStateZip")
        ],
        separator="\n",
        ignore_nulls=True
    ).alias("FullAddress")
)

Some inconsistencies found in the `Inc_Muni` columns

In [None]:
['UNKN', '250240201300', '510000105900', '**PREVIOUS NAME REMOVED BY FDC. (MAYBERRY COURT)', '631332003700', 'SWWJDU 15-5']

In [None]:
'Mh Sw' = 'south west', 'Mm 100.8 I95 Sb Hwy', 'Hwy' = 'highway', 'Lti' = '', 

In [None]:
remove_chars = '?'

In [12]:
idx = 10
print(df['FullAddress'][idx])

617 MEXBORO Road

Monroe, UNINCORPORATED, FRISCO CITY, AL, 36445


Mean & Median address length: 41-42 characters.

In [70]:
mean_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].mean()
median_add_len = df.with_columns(pl.col('FullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].median()

print(f'Mean Address Length: {mean_add_len} | Median Address Length: {median_add_len}')

Mean Address Length: 42.41529813065374 | Median Address Length: 41.0


State full name and abbreviations

In [82]:
state_name_abbr_tuples = [
    ("Alabama", "AL"),
    ("Alaska", "AK"),
    ("Arizona", "AZ"),
    ("Arkansas", "AR"),
    ("California", "CA"),
    ("Colorado", "CO"),
    ("Connecticut", "CT"),
    ("Delaware", "DE"),
    ("District of Columbia", "DC"),
    ("Florida", "FL"),
    ("Georgia", "GA"),
    ("Idaho", "ID"),
    ("Illinois", "IL"),
    ("Indiana", "IN"),
    ("Iowa", "IA"),
    ("Kansas", "KS"),
    ("Kentucky", "KY"),
    ("Louisiana", "LA"),
    ("Maine", "ME"),
    ("Maryland", "MD"),
    ("Massachusetts", "MA"),
    ("Michigan", "MI"),
    ("Minnesota", "MN"),
    ("Mississippi", "MS"),
    ("Missouri", "MO"),
    ("Montana", "MT"),
    ("Nebraska", "NE"),
    ("New Hampshire", "NH"),
    ("New Mexico", "NM"),
    ("New York", "NY"),
    ("North Carolina", "NC"),
    ("North Dakota", "ND"),
    ("Ohio", "OH"),
    ("Oklahoma", "OK"),
    ("Pennsylvania", "PA"),
    ("Rhode Island", "RI"),
    ("South Carolina", "SC"),
    ("South Dakota", "SD"),
    ("Tennessee", "TN"),
    ("Texas", "TX"),
    ("Utah", "UT"),
    ("Vermont", "VT"),
    ("Virginia", "VA"),
    ("Washington", "WA"),
    ("West Virginia", "WV"),
    ("Wisconsin", "WI"),
    ("Wyoming", "WY"),
]

Sampling 10K addresses for each state, from the entire dataset. We do sampling without replacement as we don't want duplicates in the dataset. So we have a dataset of 470K records.

In [79]:
address_per_state = 10_000

In [91]:
def get_state_df(df: pl.DataFrame, state_abv: str, samples: int = address_per_state) -> pl.DataFrame:
    state_df = df.filter(pl.col('State') == state_abv)

    sample_with_replacement = True if len(state_df) < samples else False

    return state_df.sample(n=samples, seed=0, with_replacement=sample_with_replacement, shuffle=True) 

def build_dataset(df: pl.DataFrame, states: List[Tuple[str, str]]) -> pl.DataFrame:
    dfs = [get_state_df(df, state_abv) for state, state_abv in states]
    return pl.concat(dfs)

In [92]:
sampled_df = build_dataset(df, state_name_abbr_tuples)

In [94]:
print(f'Number of samples: {len(sampled_df)}')

Number of samples: 470000


In [95]:
print(f'Number of unique states: {len(sampled_df["State"].unique())}')

Number of unique states: 47


We store the dataset into a parquet format because it is a columnar store and compresses efficiently. 

In [98]:
sampled_df.write_parquet(project_dir / 'nad_sample_address.parquet', compression='gzip')

## Sequence to Sequence Dataset prep

In [140]:
project_dir = Path(os.getcwd()).parent
data_dir = project_dir / 'Data'
null_values = ["unkn", "unincorporated", "unknown", "null", "nan", "null", "nill", "na", "none"]

In [141]:
df = pl.read_parquet(data_dir / 'address_dataset.parquet')
print(f'Shape of data: {df.shape}')
print(f"States: {df['State'].unique().to_list()}")

Shape of data: (470000, 64)
States: ['SD', 'IL', 'VA', 'MD', 'MT', 'NM', 'AZ', 'NY', 'CA', 'WY', 'AL', 'TN', 'ND', 'MS', 'MA', 'WI', 'MI', 'KY', 'WA', 'VT', 'FL', 'ME', 'GA', 'MO', 'AK', 'OH', 'ID', 'OK', 'CO', 'CT', 'IA', 'IN', 'PA', 'RI', 'NC', 'NE', 'AR', 'DE', 'LA', 'KS', 'TX', 'DC', 'MN', 'UT', 'SC', 'WV', 'NH']


In [142]:
state_mapping = {
    'TX': 'Texas',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'WY': 'Wyoming',
    'KY': 'Kentucky',
    'MI': 'Michigan',
    'WA': 'Washington',
    'VT': 'Vermont',
    'ND': 'North Dakota',
    'TN': 'Tennessee',
    'IN': 'Indiana',
    'WV': 'West Virginia',
    'MN': 'Minnesota',
    'RI': 'Rhode Island',
    'DE': 'Delaware',
    'IL': 'Illinois',
    'SD': 'South Dakota',
    'AK': 'Alaska',
    'MS': 'Mississippi',
    'OK': 'Oklahoma',
    'PA': 'Pennsylvania',
    'WI': 'Wisconsin',
    'NY': 'New York',
    'KS': 'Kansas',
    'NM': 'New Mexico',
    'AZ': 'Arizona',
    'SC': 'South Carolina',
    'FL': 'Florida',
    'NC': 'North Carolina',
    'MD': 'Maryland',
    'UT': 'Utah',
    'NE': 'Nebraska',
    'NH': 'New Hampshire',
    'VA': 'Virginia',
    'GA': 'Georgia',
    'AL': 'Alabama',
    'CA': 'California',
    'MA': 'Massachusetts',
    'CT': 'Connecticut',
    'AR': 'Arkansas',
    'CO': 'Colorado',
    'MT': 'Montana',
    'DC': 'District of Columbia',
    'ID': 'Idaho',
    'IA': 'Iowa',
    'OH': 'Ohio',
    'MO': 'Missouri'
}

First lower case all the string columns, and then replace the occurances of null value strings with np.nan and the replace them with empty strings ``

In [143]:
for column in df.select(pl.col(pl.Utf8)).columns:
    df = df.with_columns(
        pl.col(column).str.to_lowercase().alias(column)
    )

In [144]:
for column in df.select(pl.col(pl.Utf8)).columns:
    df = df.with_columns(
        pl.when(pl.col(column).is_in(null_values)).then(np.nan).otherwise(pl.col(column)).alias(column)
    )

In [145]:
df = df.fill_null('')
df = df.fill_nan('')

In [146]:
for column in df.select(pl.col(pl.Utf8)).columns:
    x = df.filter(
        pl.col(column) == 'nan'
    )
    if len(x) > 0:
        print(x)

Function to build the full address and format it appropriately. I try to follow the address format from geopy's `Nominatim` class. This class connects with the openstreet maps data and provides the latitude, longitude and the full address.

In [147]:
def format_usdot_to_freeform_granular(data: dict, state_map: dict) -> str:
    # Custom null-like values to filter
    NULL_STRINGS = {"", None, "nan", "null"}

    def safe_get(key):
        val = data.get(key)
        if isinstance(val, str):
            val = val.lower()
        return None if (val in NULL_STRINGS or str(val).strip() in NULL_STRINGS) else str(val).strip()

    def safe_title(key):
        val = safe_get(key)
        return val.title() if val else None

    # House number
    number = " ".join(filter(None, [safe_get("AddNum_Pre"),
                                    safe_get("Add_Number"),
                                    safe_get("AddNum_Suf")]))

    # Street full
    street_parts = [
        safe_get("St_PreDir"),
        safe_title("St_Name"),
        safe_title("St_PosTyp"),
        safe_get("St_PosDir")
    ]
    street = " ".join(part for part in street_parts if part)

    # Unit/building details
    sub_parts = []
    if safe_get("Building"): sub_parts.append(f"Bldg {safe_get('Building')}")
    if safe_get("Floor"): sub_parts.append(f"Floor {safe_get('Floor')}")
    if safe_get("Unit"): sub_parts.append(f"Unit {safe_get('Unit')}")
    if safe_get("Room"): sub_parts.append(f"Room {safe_get('Room')}")

    sub_address = ", ".join(sub_parts)

    # Town/City
    town = safe_title("Uninc_Comm") or safe_title("Inc_Muni")

    # County
    county = safe_title("County")

    # State
    state_abbr = safe_get("State")
    state_full = state_map.get(state_abbr.upper(), state_abbr) if state_abbr else None

    # ZIP
    zip_raw = safe_get("Zip_Code")
    zip_code = zip_raw.zfill(5) if zip_raw and zip_raw.isdigit() else None

    # Compose full address
    components = [number, street]
    if sub_address:
        components.append(sub_address)
    components.extend([
        town,
        f"{county} County" if county else None,
        state_full,
        zip_code
    ])

    return ", ".join([c for c in components if c])

In [148]:
formatted_addresses = [
    format_usdot_to_freeform_granular(r, state_mapping) 
    for r in df.rows(named=True)
    ]

In [149]:
df = df.with_columns(
    pl.Series("FormattedFullAddress", formatted_addresses)
)

The mean and median address lengths have increased to 65 characters now. This new formatting makes the address strings more clear and easier to read

In [150]:
mean_add_len = df.with_columns(pl.col('FormattedFullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].mean()
median_add_len = df.with_columns(pl.col('FormattedFullAddress').str.len_chars().alias('AddressLength'))['AddressLength'].median()

print(f'Mean Address Length: {mean_add_len} | Median Address Length: {median_add_len}')

Mean Address Length: 64.9411170212766 | Median Address Length: 64.0


In [151]:
idx = 0
df.select(['OID_', 'FullAddress', 'FormattedFullAddress', 'Latitude', 'Longitude'])[idx].to_dict(as_series=False)

{'OID_': [72099617],
 'FullAddress': ['472 south main street\n\ncamp hill, al, 36850'],
 'FormattedFullAddress': ['472, south Main Street, Camp Hill, Tallapoosa County, Alabama, 36850'],
 'Latitude': ['32.79596540137694'],
 'Longitude': ['-85.6535596907001']}

Validating the Formatted Full Address with the Full Address from Nominatim using reverse geocoding

In [152]:
# location = geolocator.geocode("13, John Daniels Place, New Haven County, Connecticut, 06511")
# location = geolocator.geocode({"postalcode": int("06511"), "country": "US"})
location = geolocator.reverse(['41.317783902424', '-72.9320178229665'])

if location is None:
    print("Location not found.")
else:
    print(location.address)
    print(location.latitude, location.longitude)

13, Daniels Place, Dixwell, New Haven, Connecticut, 06511, United States
41.3177839 -72.932018


In [153]:
idx = 100
print(df[idx]['FormattedFullAddress'].item())
print(df[idx]['Latitude'].item(), df[idx]['Longitude'].item())

4524, Old Caldwell Mill Road, Shelby County, Alabama, 35242
33.41236637208968 -86.73952124099591


In [None]:
# df.write_parquet(data_dir / 'new_formatted_addresses.parquet', compression='gzip')

Clearly the newly formatted address matches more closely with the standard open street maps address.

Now building the source-target pairs for supervised fine tuning. The source is the unnormalized / address with mistakes and target is the cleaned address. 
To generate noisy source addresses, we inject the following noise: <br>
 Noise Types Introduced

| Noise Type              | Field           | Description                                                                 |
|-------------------------|------------------|-----------------------------------------------------------------------------|
| Street Number Removal   | `Add_Number`     | 50% chance to remove the house/building number (`None`)                    |
| Character Corruption    | `St_Name`        | 20% per character: replace characters randomly (simulating typos)          |
| City Dropping           | `Post_City`      | 30% chance to remove city field                                             |
| ZIP Code Truncation     | `Zip_Code`       | 20% chance to truncate ZIP (e.g., `36078` → `3607`)                         |

In [127]:
def build_clean_address(row: Dict) -> str:
    """Construct address from structured fields with type consistency"""
    parts = []
    
    # Street component
    street = []
    if row.get('Add_Number') is not None:
        street.append(str(int(row['Add_Number'])))
    if row.get('St_Name'):
        street.append(str(row['St_Name']).lower())
    if street:
        parts.append(' '.join(street))
    
    # Location component
    location = []
    if row.get('Post_City'):
        location.append(str(row['Post_City']).lower())
    if row.get('State'):
        location.append(str(row['State']).lower())
    if row.get('Zip_Code') is not None:
        location.append(f"{int(row['Zip_Code']):05d}"[:5])
    if location:
        parts.append(', '.join(location))
    
    return ', '.join(parts)

def add_character_noise(component: str) -> str:
    """Add character noise while maintaining string type"""
    return ''.join([
        random.choice(string.ascii_lowercase) 
        if c.isalpha() and random.random() < 0.2 
        else c
        for c in component
    ]) if component else component

def generate_noisy_address(row: Dict) -> str:
    """Generate noisy address with type-safe modifications"""
    modified = row.copy()
    
    # 50% chance to remove street number (set to None)
    if random.random() < 0.5:
        modified['Add_Number'] = None
    
    # Add noise to street name (keep as string)
    if modified.get('St_Name'):
        modified['St_Name'] = add_character_noise(str(modified['St_Name']))
    
    # 30% chance to remove city (set to None)
    if random.random() < 0.3:
        modified['Post_City'] = None
    
    # 20% chance to modify zip code (keep as integer)
    if modified.get('Zip_Code') and random.random() < 0.2:
        zip_code = int(modified['Zip_Code'])
        if 10000 <= zip_code <= 99999:
            modified['Zip_Code'] = zip_code // 10  # Truncate last digit
    
    return build_clean_address(modified)

def create_address_pairs(df: pl.DataFrame, n_noisy_varient_per_add: int = 3) -> pl.DataFrame:
    """Generate address pairs with schema consistency"""
    results = []

    for row in df.to_dicts():
        oid = row['OID_']
        state = row['State']

        # Original clean target
        clean_target = build_clean_address(row)
        
        # Add clean pair
        results.append({
            'oid': oid,
            'source': clean_target,
            'target': clean_target,
            'state': state
        })
        
        # Generate n noisy variants
        for _ in range(n_noisy_varient_per_add):
            noisy_source = generate_noisy_address(row)
            results.append({
                'oid': oid,
                'source': noisy_source,
                'target': clean_target,
                'state': state
            })
    # Ensure schema consistency
    return pl.DataFrame(results).unique()