In [None]:
# Use two packages to open Excel files
#!pip install xlrd --break-system-packages

#!pip install openpyxl --break-system-packages

# Install the disambiguation package (no need for a master list of firm names)
#pip install disamby --break-system-packages

# Load package for data manipulation purposes
import pandas as pd
import re

In [None]:
# Read the June 2015 NRED HOA workbook and keep only the associations whose name
# does not mention "condo". The match is case-insensitive, and rows with a missing
# name count as "no match", so they are kept.
june15_df = (
    pd.read_excel("../Data/NRED/2015/NRED HOA - 06 2015.xls")
    .loc[lambda frame: ~frame['Name'].str.contains('condo', case=False, na=False)]
)

june15_df

In [None]:
# Build the subset of associations whose first address line names a management
# company (marked with "C/O"), sorted alphabetically by that raw address line.
# NOTE(review): the 'C/O' match is case-sensitive, so a row written as "c/o"
# would be left out of the subset — confirm that is intended.
june15_subset = (
    june15_df[june15_df['Address1'].str.contains('C/O', na=False)]
    .sort_values(by='Address1')
    .copy()  # explicit copy so the column assignments below never touch june15_df
)

# Title-case the free-text columns so that case differences do not create variants.
for text_col in ['City', 'Address2', 'Name']:
    june15_subset[text_col] = june15_subset[text_col].str.title()

# Remove the "C/O" marker, then collapse and trim the whitespace it leaves behind.
# Without this, "C/O Foo" becomes " Foo" and "Bar C/O Foo" becomes "Bar  Foo",
# which count as different spellings when the most common name is picked later.
june15_subset['Address1'] = (
    june15_subset['Address1']
    .str.replace(r'\bC/O\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.title()
)

june15_subset

In [None]:
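# Group entries that share the same (normalized) mailing address in Address2 and
# give each group one standardized management-company name: the most common
# Address1 value within that group.
# Addresses are normalized first so that formatting differences (abbreviations,
# plural/singular, punctuation, case) do not keep matching addresses apart.
# NOTE: phone numbers are not used for matching in the code below.

# TODO: How else can this be refined to catch mistakes?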

def singularize_word(word):
    """Crudely singularize a word by dropping one trailing 's'.

    The word is returned unchanged when it has three characters or fewer,
    when it does not end in 's', or when it ends in 'ss'.
    """
    if len(word) <= 3:
        return word
    if word.endswith('ss') or not word.endswith('s'):
        return word
    return word[:-1]

def normalize_address(address):
    """Reduce an address string to a lowercase key suitable for grouping.

    Steps, in order: lowercase the text, collapse common road/unit words onto
    their abbreviations, delete periods and commas, squeeze whitespace, and
    pass every remaining word through ``singularize_word``.

    Parameters
    ----------
    address : str or missing value
        The raw address text.

    Returns
    -------
    str
        The normalized key; an empty string when ``address`` is null.
    """
    if pd.isnull(address):
        return ''

    # (pattern, replacement) pairs applied sequentially. Each pattern folds the
    # plural, the spelled-out word and the abbreviation onto the abbreviation.
    # Note that a standalone "se" is also mapped to "ste".
    abbreviation_rules = (
        (r'\b(streets|street|st)\b', 'st'),
        (r'\b(roads|road|rd)\b', 'rd'),
        (r'\b(avenues|avenue|ave)\b', 'ave'),
        (r'\b(boulevards|boulevard|blvd)\b', 'blvd'),
        (r'\b(suites|suite|ste|se)\b', 'ste'),
        (r'\b(apartments|apartment|apt)\b', 'apt'),
        (r'\b(forts|fort|ft)\b', 'ft'),
    )

    cleaned = address.lower()
    for pattern, short_form in abbreviation_rules:
        cleaned = re.sub(pattern, short_form, cleaned)

    # Periods and commas are deleted outright (not replaced by a space), then
    # runs of whitespace are squeezed to a single space and the ends trimmed.
    cleaned = re.sub(r'[.,]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Singularize word by word; the result is only a matching key, so mangled
    # proper nouns are acceptable as long as they are mangled consistently.
    return ' '.join(map(singularize_word, cleaned.split()))

# Derive the grouping key: one normalized address string per row (missing -> '').
june15_subset['Address2_Normalized'] = june15_subset['Address2'].map(normalize_address)

def get_mode(series):
    """Return the most frequent value of a pandas Series.

    Ties are resolved by taking the first entry of ``Series.mode()``. If the
    series holds only missing values, its first element (a missing value) is
    returned. An empty series yields ``pd.NA`` instead of raising ``IndexError``.

    Parameters
    ----------
    series : pandas.Series
        Values to summarize.

    Returns
    -------
    object
        The modal value, or a missing value when no mode exists.
    """
    mode = series.mode()
    if not mode.empty:
        return mode.iloc[0]
    # No mode: the series is either empty or entirely missing.
    if series.empty:
        return pd.NA
    return series.iloc[0]

# Pick one standardized management-company name per normalized address: the most
# common Address1 spelling among the rows that share that address.
# Rows with a missing Address2 all normalize to '' but are unrelated to one
# another, so they are excluded from the grouping; otherwise every such row
# would be assigned the same company name.
has_address_key = june15_subset['Address2_Normalized'].ne('')

canonical_info = (
    june15_subset.loc[has_address_key]
    .groupby('Address2_Normalized')
    .agg({
        'Address1': get_mode,
    })
    .reset_index()
    .rename(columns={
        'Address1': 'ManagementCompany_Standardized',
    })
)

# Drop a standardized column left over from an earlier run of this cell first;
# merging onto it would produce ManagementCompany_Standardized_x / _y columns.
# validate= asserts that each address key maps to exactly one canonical name.
june15_subset = (
    june15_subset
    .drop(columns='ManagementCompany_Standardized', errors='ignore')
    .merge(canonical_info, on='Address2_Normalized', how='left', validate='many_to_one')
)

# Rows without a usable address key found no match above; they keep their own name.
june15_subset['ManagementCompany_Standardized'] = (
    june15_subset['ManagementCompany_Standardized'].fillna(june15_subset['Address1'])
)

june15_subset.to_csv('../Data/Cleaned files/june15_subset.csv', index = False)

june15_subset

In [None]:
# Read the June 2025 NRED HOA workbook and keep only the associations whose name
# does not mention "condo". The match is case-insensitive, and rows with a missing
# name count as "no match", so they are kept.
june25_df = (
    pd.read_excel("../Data/NRED/2025/NRED HOA - 06 2025.xlsx")
    .loc[lambda frame: ~frame['Name'].str.contains('condo', case=False, na=False)]
)

june25_df

In [None]:
# Build the June 2025 subset of associations whose first address line names a
# management company (marked with "C/O"), sorted by that raw address line.
# NOTE(review): the 'C/O' match is case-sensitive, so a row written as "c/o"
# would be left out of the subset — confirm that is intended.
june25_subset = (
    june25_df[june25_df['Address1'].str.contains('C/O', na=False)]
    .sort_values(by='Address1')
    .copy()  # explicit copy so the column assignments below never touch june25_df
)

# Title-case the free-text columns so that case differences do not create variants.
for text_col in ['City', 'Address2', 'Name']:
    june25_subset[text_col] = june25_subset[text_col].str.title()

# Remove the "C/O" marker, then collapse and trim the whitespace it leaves behind.
# Without this, "C/O Foo" becomes " Foo" and "Bar C/O Foo" becomes "Bar  Foo",
# which count as different spellings when the most common name is picked later.
june25_subset['Address1'] = (
    june25_subset['Address1']
    .str.replace(r'\bC/O\b', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.title()
)

june25_subset

In [None]:
# Repeat the disambiguation for June 2025 using the functions defined above.

# Grouping key: one normalized address string per row (missing -> '').
june25_subset['Address2_Normalized'] = june25_subset['Address2'].apply(normalize_address)

# Rows with a missing Address2 all normalize to '' but are unrelated to one
# another, so they are excluded from the grouping; otherwise every such row
# would be assigned the same company name.
has_address_key = june25_subset['Address2_Normalized'].ne('')

# One standardized name per address: the most common Address1 spelling.
canonical_info = (
    june25_subset.loc[has_address_key]
    .groupby('Address2_Normalized')
    .agg({
        'Address1': get_mode,
    })
    .reset_index()
    .rename(columns={
        'Address1': 'ManagementCompany_Standardized',
    })
)

# Drop a standardized column left over from an earlier run of this cell first;
# merging onto it would produce ManagementCompany_Standardized_x / _y columns.
# validate= asserts that each address key maps to exactly one canonical name.
june25_subset = (
    june25_subset
    .drop(columns='ManagementCompany_Standardized', errors='ignore')
    .merge(canonical_info, on='Address2_Normalized', how='left', validate='many_to_one')
)

# Rows without a usable address key found no match above; they keep their own name.
june25_subset['ManagementCompany_Standardized'] = (
    june25_subset['ManagementCompany_Standardized'].fillna(june25_subset['Address1'])
)

june25_subset.to_csv('../Data/Cleaned files/june25_subset.csv', index = False)

june25_subset