In [28]:
import pandas as pd
import re
from rapidfuzz import fuzz, process
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import symspellpy
from symspellpy import SymSpell

In [29]:
addresses_df = pd.read_csv('list_of_real_usa_addresses.csv')
print(addresses_df)


                      street            city state    zip
0        777 Brockton Avenue        Abington    MA   2351
1           30 Memrial Drive            Avon    MA   2322
2        250 Hartford Avenue      Bellingham    MA   2019
3        777 brockton Avenue        Abington    MA   2351
4              700 Oak Stret        Brockton    MA   2301
..                       ...             ...   ...    ...
230      1501 Skyland Blvd E      Tuscaloosa    AL  35405
231             3501 20th Av          Valley    AL  36854
232  1300 Montgomery Highway  Vestavia Hills    AL  35216
233          4538 Us Hwy 231        Wetumpka    AL  36092
234           2575 Us Hwy 43        Winfield    AL  35594

[235 rows x 4 columns]


In [30]:
# Step 2: Preprocessing the text
def preprocess_address(addresses_df):
    addresses_df["street"] = addresses_df["street"].str.strip().str.lower()
    addresses_df["city"] = addresses_df["city"].str.strip().str.lower()
    addresses_df["state"] = addresses_df["state"].str.strip().str.upper()
    addresses_df["zip"] = addresses_df["zip"].astype(str).str.strip()
    print("Preprocessed Data:")
    print(addresses_df.head())
    return addresses_df
addresses_df = preprocess_address(addresses_df)

Preprocessed Data:
                street        city state   zip
0  777 brockton avenue    abington    MA  2351
1     30 memrial drive        avon    MA  2322
2  250 hartford avenue  bellingham    MA  2019
3  777 brockton avenue    abington    MA  2351
4        700 oak stret    brockton    MA  2301


In [31]:
print(addresses_df.isnull().sum())

street    0
city      0
state     0
zip       0
dtype: int64


In [32]:
# autocorrecta popravq greshno sushkrashteniq na addresi(kato Rd, Dr) i gradove

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = "frequency_dictionary_en_82_765.txt"
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

valid_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA",
    "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK",
    "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY"
]
#protected terms mai ne bachka
protected_terms = {"Rd", "Dr", "Ave", "St", "Blvd", "Ln", "Ct", "Pl", "Pkwy", "Cir", "Terr", "Way"}

def correct_street(address):
    # Split the address into words
    words = address.split()

    # Correct only non-numeric and non-protected words
    corrected_words = []
    for word in words:
        # Check if the word is in the protected terms
        if word in protected_terms:
            corrected_words.append(word)
        elif word.isalpha():
            # Apply symspell correction for alphabetic words not in protected terms
            suggestions = sym_spell.lookup(word, symspellpy.Verbosity.CLOSEST, max_edit_distance=2)
            corrected_words.append(suggestions[0].term if suggestions else word)
        else:
            # Keep numbers or mixed words as-is
            corrected_words.append(word)

    # Rejoin the words
    return " ".join(corrected_words)

def correct_city(city):
    # Split the city into words
    words = city.split()
    corrected_words = []
    for word in words:
        if word in protected_terms:  # Protect city terms if necessary
            corrected_words.append(word)
        elif word.isalpha():
            # Apply symspell correction
            suggestions = sym_spell.lookup(word, symspellpy.Verbosity.CLOSEST, max_edit_distance=2)
            corrected_words.append(suggestions[0].term if suggestions else word)
        else:
            corrected_words.append(word)
    return " ".join(corrected_words)

def correct_typos(addresses_df):
    # Correct state abbreviations
    addresses_df["state"] = addresses_df["state"].apply(
        lambda x: process.extractOne(x, valid_states, scorer=fuzz.token_sort_ratio)[0] if x not in valid_states else x
    )

    # Replace street abbreviations
    addresses_df["street"] = addresses_df["street"].str.replace(r"\b(st)\b", "Street", regex=True)
    addresses_df["street"] = addresses_df["street"].str.replace(r"\b(ave)\b", "Avenue", regex=True)

    # Apply corrections to street and city columns
    addresses_df["street"] = addresses_df["street"].apply(correct_street)
    addresses_df["city"] = addresses_df["city"].apply(correct_city)

    # Convert to title case
    addresses_df["street"] = addresses_df["street"].str.title()
    addresses_df["city"] = addresses_df["city"].str.title()
    print(addresses_df.head())
    return addresses_df

# Usage
addresses_df = correct_typos(addresses_df)


                street        city state   zip
0  777 Brockton Avenue    Abingdon    MA  2351
1    30 Memorial Drive        Avon    MA  2322
2  250 Hartford Avenue  Bellingham    MA  2019
3  777 Brockton Avenue    Abingdon    MA  2351
4       700 Oak Street    Brockton    MA  2301


In [33]:
state_full_names = {
    "AL": "Alabama", "AK": "Alaska", "AZ": "Arizona", "AR": "Arkansas", "CA": "California", "CO": "Colorado", "CT": "Connecticut", "DE": "Delaware", "FL": "Florida", "GA": "Georgia", "HI": "Hawaii", "ID": "Idaho", "IL": "Illinois", "IN": "Indiana", "IA": "Iowa", "KS": "Kansas", "KY": "Kentucky", "LA": "Louisiana", "ME": "Maine", "MD": "Maryland", "MA": "Massachusetts", "MI": "Michigan", "MN": "Minnesota", "MS": "Mississippi", "MO": "Missouri", "MT": "Montana", "NE": "Nebraska", "NV": "Nevada", "NH": "New Hampshire", "NJ": "New Jersey", "NM": "New Mexico", "NY": "New York", "NC": "North Carolina", "ND": "North Dakota", "OH": "Ohio", "OK": "Oklahoma", "OR": "Oregon", "PA": "Pennsylvania", "RI": "Rhode Island", "SC": "South Carolina", "SD": "South Dakota", "TN": "Tennessee", "TX": "Texas", "UT": "Utah", "VT": "Vermont", "VA": "Virginia", "WA": "Washington", "WV": "West Virginia", "WI": "Wisconsin", "WY": "Wyoming"
}

def standardize_address(addresses_df):
    addresses_df["zip"] = addresses_df["zip"].str.zfill(5)  # Ensure all ZIP codes are 5 digits
    addresses_df["state"] = addresses_df["state"].map(state_full_names).fillna(addresses_df["state"])
    print("Standardized Addresses:")
    print(addresses_df.head())
    return addresses_df
addresses_df = standardize_address(addresses_df)

Standardized Addresses:
                street        city          state    zip
0  777 Brockton Avenue    Abingdon  Massachusetts  02351
1    30 Memorial Drive        Avon  Massachusetts  02322
2  250 Hartford Avenue  Bellingham  Massachusetts  02019
3  777 Brockton Avenue    Abingdon  Massachusetts  02351
4       700 Oak Street    Brockton  Massachusetts  02301


In [34]:
def deduplicate_addresses(addresses_df):
    addresses_df = addresses_df.drop_duplicates(subset=["street", "city", "state", "zip"], keep="first")
    print("Deduplicated Data:")
    print(addresses_df.head())
    return addresses_df
addresses_df = deduplicate_addresses(addresses_df)

Deduplicated Data:
                street        city          state    zip
0  777 Brockton Avenue    Abingdon  Massachusetts  02351
1    30 Memorial Drive        Avon  Massachusetts  02322
2  250 Hartford Avenue  Bellingham  Massachusetts  02019
4       700 Oak Street    Brockton  Massachusetts  02301
5    66-4 Pankhurst Re  Chelmsford  Massachusetts  01824


In [26]:
def validate_and_geocode(addresses_df):
    geolocator = Nominatim(user_agent="address_cleaner")

    def geocode(row):
        try:
            address = f"{row['street']}, {row['city']}, {row['state']}, {row['zip']}"
            location = geolocator.geocode(address, timeout=4)
            if location:
                return pd.Series([location.latitude, location.longitude])
            else:
                return pd.Series([None, None])
        except GeocoderTimedOut:
            return pd.Series([None, None])

    addresses_df[["latitude", "longitude"]] = addresses_df.apply(geocode, axis=1)
    print("Geocoded Data:")
    print(addresses_df.head())
    return addresses_df

addresses_df = validate_and_geocode(addresses_df)

Geocoded Data:
                street        city          state    zip   latitude  longitude
0  777 Brockton Avenue    Abingdon  Massachusetts  02351        NaN        NaN
1    30 Memorial Drive        Avon  Massachusetts  02322  42.121302 -71.030164
2  250 Hartford Avenue  Bellingham  Massachusetts  02019  42.116332 -71.466154
4       700 Oak Street    Brockton  Massachusetts  02301  42.098118 -71.056756
5    66-4 Pankhurst Re  Chelmsford  Massachusetts  01824        NaN        NaN
