In [None]:
# Use two packages to open Excel files
#!pip install xlrd --break-system-packages

!pip install openpyxl --break-system-packages

# Install the disambiguation package (no need for a master list of firm names)
#pip install disamby --break-system-packages
!pip install rapidfuzz

# Load package for data manipulation purposes
import pandas as pd
import re
from rapidfuzz import fuzz, process 

In [None]:
# Load in June 2015 data to make a complete set of data; get rid of associations that list condo in name and those with zero listed units

june15_df = pd.read_excel("../Data/NRED/2015/NRED HOA - 06 2015.xls")
june15_df = june15_df[~june15_df['Name'].str.contains('condo', case=False, na=False)]
june15_df = june15_df[~(june15_df['# of Units'] == 0)]

june15_df 

In [None]:
# Create a subset of observations that have management companies associated with them 
# Then sort in ascending alphabetical order according to the management company names
june15_subset = june15_df[june15_df['Address1'].str.contains('C/O', na=False)].sort_values(by='Address1')
june15_subset['City'] = june15_subset['City'].str.title()
june15_subset['Address1'] = june15_subset['Address1'].str.replace(r'\bC/O\b', '', regex=True).str.title()
june15_subset['Address2'] = june15_subset['Address2'].str.title()
june15_subset['Name'] = june15_subset['Name'].str.title()

june15_subset

In [None]:
# Kept this the way it was (old, less accurate fuzzy) - see looping for correct code

import pandas as pd
import re
from rapidfuzz import process, fuzz

def singularize_word(word):
    if len(word) > 3 and word.endswith('s') and not word.endswith('ss'):
        return word[:-1]
    return word

def normalize_address(address):
    if pd.isnull(address):
        return ''
    address = address.lower()
    
    # Normalize common road types
    address = re.sub(r'\b(streets|street|st)\b', 'st', address)
    address = re.sub(r'\b(roads|road|rd)\b', 'rd', address)
    address = re.sub(r'\b(avenues|avenue|ave)\b', 'ave', address)
    address = re.sub(r'\b(boulevards|boulevard|blvd)\b', 'blvd', address)
    address = re.sub(r'\b(suites|suite|ste|se)\b', 'ste', address) 
    address = re.sub(r'\b(apartments|apartment|apt)\b', 'apt', address)
    address = re.sub(r'\b(forts|fort|ft)\b', 'ft', address)
    
    # Standardize street directions
    address = re.sub(r'\b(south|s)\b', 's', address)
    address = re.sub(r'\b(north|n)\b', 'n', address)
    address = re.sub(r'\b(east|e)\b', 'e', address)
    address = re.sub(r'\b(west|w)\b', 'w', address)
    address = re.sub(r'\b(southeast|se)\b', 'se', address)
    address = re.sub(r'\b(northeast|ne)\b', 'ne', address)
    address = re.sub(r'\b(southwest|sw)\b', 'sw', address)
    address = re.sub(r'\b(northwest|nw)\b', 'nw', address)
    
    # Eliminate firm office indicators
    address = re.split(r'\b(suite|ste|apt|apartment|unit|site)\b', address)[0]
    
    # Remove punctuation and number sign
    address = re.sub(r'[.,]', '', address)
    address = re.sub(r'\s+', ' ', address).strip()
    address = re.sub(r'#', '', address)

    # Singularize words
    address = ' '.join(singularize_word(word) for word in address.split())

    return address

# Do a similar process for the firm names
def normalize_firm_name(name):
    if pd.isnull(name):
        return ''
    name = name.lower().strip()
    
    # Remove suffixes
    name = re.sub(r'\b(llc|inc|corp|co|ltd)\b', '', name)
    
    # Remove commas etc.
    name = re.sub(r'[^\w\s]', '', name)
    
    # Remove extra spaces
    name = re.sub(r'\s+', ' ', name)
    
    # Singularize words
    name = ' '.join(singularize_word(word) for word in name.split())
    
    return name

# Normalize the phone
def normalize_phone(phone):
    if pd.isnull(phone):
        return ''
    return re.sub(r'\D', '', str(phone))

def get_mode(series):
    mode = series.mode()
    return mode.iloc[0] if not mode.empty else series.iloc[0]

june15_subset['Address2_Normalized'] = june15_subset['Address2'].apply(normalize_address)
june15_subset['Firm_Normalized'] = june15_subset['Address1'].apply(normalize_firm_name)
june15_subset['Phone_Normalized'] = june15_subset['Telephone'].apply(normalize_phone)

june15_subset['Firm_Normalized'] = june15_subset['Firm_Normalized'].str.replace(
    r'\bmangement\b', 'management', regex=True
)

# Most common phone and address by firm name
canonical_phone = (
    june15_subset.groupby('Phone_Normalized')
    .agg({'Firm_Normalized': get_mode})
    .rename(columns={'Firm_Normalized': 'Firm_ByPhone'})
    .reset_index()
)

canonical_address = (
    june15_subset.groupby('Address2_Normalized')
    .agg({'Firm_Normalized': get_mode})
    .rename(columns={'Firm_Normalized': 'Firm_ByAddress'})
    .reset_index()
)

june15_subset = june15_subset.merge(canonical_phone, on='Phone_Normalized', how='left')
june15_subset = june15_subset.merge(canonical_address, on='Address2_Normalized', how='left')

# Build the hierarchy
def pick_canonical(row):
    if pd.notnull(row['Firm_ByPhone']):
        return row['Firm_ByPhone']
    elif pd.notnull(row['Firm_ByAddress']):
        return row['Firm_ByAddress']
    else:
        return row['Firm_Normalized']

june15_subset['Firm_Canonical'] = june15_subset.apply(pick_canonical, axis=1)

# Fuzzy matching within the canonical deduplication
def fuzzy_within_group(group, threshold=90):
    unique_names = group['Firm_Canonical'].unique()
    name_map = {}
    for name in unique_names:
        if name in name_map:
            continue
        matches = process.extract(name, unique_names, scorer=fuzz.token_set_ratio, limit=None)
        close_matches = [m[0] for m in matches if m[1] >= threshold]
        for cm in close_matches:
            name_map[cm] = name
    group['Firm_Final'] = group['Firm_Canonical'].map(name_map).fillna(group['Firm_Canonical'])
    return group

# Apply fuzzy matching **within each phone group first** to avoid cross-company merges
june15_subset = june15_subset.groupby('Phone_Normalized').apply(fuzzy_within_group).reset_index(drop=True)

# Fuzzy across all names to catch edge cases
all_names = june15_subset['Firm_Final'].unique()
name_map_global = {}

for name in all_names:
    if name in name_map_global:
        continue
    matches = process.extract(name, all_names, scorer=fuzz.token_set_ratio, limit=None)
    close_matches = [m[0] for m in matches if m[1] >= 90]  
    for cm in close_matches:
        name_map_global[cm] = name

june15_subset['Firm_Final'] = june15_subset['Firm_Final'].map(name_map_global).fillna(june15_subset['Firm_Final'])

# Save data
june15_subset.to_csv('../Data/Cleaned files/june15_subset.csv', index=False)

In [None]:
# Load in June 2025 data to make a complete set of data; get rid of associations that list condo in name and zero units cases

june25_df = pd.read_excel("../Data/NRED/2025/NRED HOA - 06 2025.xlsx")
june25_df = june25_df[~june25_df['Name'].str.contains('condo', case=False, na=False)]
june25_df = june25_df[~(june25_df['# of Units'] == 0)]

june25_df 

In [None]:
# Create another subset
june25_subset = june25_df[june25_df['Address1'].str.contains('C/O', na=False)].sort_values(by='Address1')
june25_subset['City'] = june25_subset['City'].str.title()
june25_subset['Address1'] = june25_subset['Address1'].str.replace(r'\bC/O\b', '', regex=True).str.title()
june25_subset['Address2'] = june25_subset['Address2'].str.title()
june25_subset['Name'] = june25_subset['Name'].str.title()

june25_subset

In [None]:
# Repeat process for second static period
# Kept this the way it was (old, less accurate fuzzy) - see looping for correct code

june25_subset['Address2_Normalized'] = june25_subset['Address2'].apply(normalize_address)
june25_subset['Firm_Normalized'] = june25_subset['Address1'].apply(normalize_firm_name)
june25_subset['Phone_Normalized'] = june25_subset['Telephone'].apply(normalize_phone)

june25_subset['Firm_Normalized'] = june25_subset['Firm_Normalized'].str.replace(
    r'\bmangement\b', 'management', regex=True
)

canonical_phone = (
    june25_subset.groupby('Phone_Normalized')
    .agg({'Firm_Normalized': get_mode})
    .rename(columns={'Firm_Normalized': 'Firm_ByPhone'})
    .reset_index()
)

canonical_address = (
    june25_subset.groupby('Address2_Normalized')
    .agg({'Firm_Normalized': get_mode})
    .rename(columns={'Firm_Normalized': 'Firm_ByAddress'})
    .reset_index()
)

june25_subset = june25_subset.merge(canonical_phone, on='Phone_Normalized', how='left')
june25_subset = june25_subset.merge(canonical_address, on='Address2_Normalized', how='left')

june25_subset['Firm_Canonical'] = june25_subset.apply(pick_canonical, axis=1)

june25_subset = june25_subset.groupby('Phone_Normalized').apply(fuzzy_within_group).reset_index(drop=True)

all_names = june25_subset['Firm_Final'].unique()
name_map_global = {}

for name in all_names:
    if name in name_map_global:
        continue
    matches = process.extract(name, all_names, scorer=fuzz.token_set_ratio, limit=None)
    close_matches = [m[0] for m in matches if m[1] >= 90] 
    for cm in close_matches:
        name_map_global[cm] = name

june25_subset['Firm_Final'] = june25_subset['Firm_Final'].map(name_map_global).fillna(june25_subset['Firm_Final'])

june25_subset.to_csv('../Data/Cleaned files/june25_subset.csv', index=False)

In [6]:
!pip install pyxlsb

import pandas as pd
import glob
import os

nred_folder = '../Data/NRED/'

# Find all Excel files within the folders
all_files = glob.glob(os.path.join(nred_folder, '**', '*.xls*'), recursive=True)
print(f"Found {len(all_files)} Excel files")

# Create new list to store
month_data = []

for file in all_files:
    try:
        # Read Excel, handle .xlsb and .xls/.xlsx
        if file.endswith('.xlsb'):
            df = pd.read_excel(file, engine='pyxlsb')
        else:
            df = pd.read_excel(file)
        
        if df.empty:
            continue

        # Try to extract month and year from file name (safer)
        # Example file name: "NRED HOA - 06 2015.xls"
        base_name = os.path.splitext(os.path.basename(file))[0]
        parts = base_name.split('-')[-1].strip().split()
        if len(parts) != 2:
            print(f"Skipping {file}, cannot extract month/year")
            continue
        month = int(parts[0])
        year = int(parts[1])

        # Filter out condos
        df = df[~df['Name'].str.contains('condo', case=False, na=False)]

        # Filter zero units using proper column name
        df['# of Units'] = pd.to_numeric(df['# of Units'], errors='coerce')
        df = df[df['# of Units'] > 0]

        # Count total and professionally managed
        total_rows = df.shape[0]
        professional_rows = (
            df['Address1']
            .fillna('')
            .str.replace(r'\bC/O\b', '', regex=True)
            .str.strip()
            .ne('')
            .sum()
        )

        month_data.append({
            'Year': year,
            'Month': month,
            'Total_HOAs': total_rows,
            'Professional_HOAs': professional_rows
        })

    except Exception as e:
        print(f"Error reading {file}: {e}")

if not month_data:
    raise ValueError("No valid files found to process!")

# Convert to DataFrame
summary_df = pd.DataFrame(month_data)

# Sort by Year and Month
summary_df = summary_df.sort_values(['Year', 'Month']).reset_index(drop=True)

# Save for Stata
summary_df.to_csv('../Data/Misc/HOA_summary_monthly_for_stata.csv', index=False)
print("Monthly summary CSV saved!")

Found 121 Excel files


  warn("""Cannot parse header or footer so it will be ignored""")


Monthly summary CSV saved!


In [1]:
# Build managed-firm subset ONLY

import pandas as pd
import glob
import os
import re

def singularize_word(word):
    if len(word) > 3 and word.endswith('s') and not word.endswith('ss'):
        return word[:-1]
    return word

def normalize_address(address):
    if pd.isnull(address):
        return ''
    address = address.lower()
    address = re.sub(r'\b(streets|street|st)\b', 'st', address)
    address = re.sub(r'\b(roads|road|rd)\b', 'rd', address)
    address = re.sub(r'\b(avenues|avenue|ave)\b', 'ave', address)
    address = re.sub(r'\b(boulevards|boulevard|blvd)\b', 'blvd', address)
    address = re.sub(r'\b(suites|suite|ste|se)\b', 'ste', address)
    address = re.sub(r'\b(apartments|apartment|apt)\b', 'apt', address)
    address = re.sub(r'\b(forts|fort|ft)\b', 'ft', address)
    address = re.sub(r'\b(south|s)\b', 's', address)
    address = re.sub(r'\b(north|n)\b', 'n', address)
    address = re.sub(r'\b(east|e)\b', 'e', address)
    address = re.sub(r'\b(west|w)\b', 'w', address)
    address = re.sub(r'\b(southeast|se)\b', 'se', address)
    address = re.sub(r'\b(northeast|ne)\b', 'ne', address)
    address = re.sub(r'\b(southwest|sw)\b', 'sw', address)
    address = re.sub(r'\b(northwest|nw)\b', 'nw', address)
    address = re.split(r'\b(suite|ste|apt|apartment|unit|site)\b', address)[0]
    address = re.sub(r'[.,#]', '', address)
    address = re.sub(r'\s+', ' ', address).strip()
    address = ' '.join(singularize_word(w) for w in address.split())
    return address

def normalize_firm_name(name):
    if pd.isnull(name):
        return ''
    name = name.strip().lower()
    name = re.sub(r'\b(llc|inc|corp|corporation|co|company|ltd|limited)\b', '', name)
    name = re.sub(r'[^\w\s]', '', name)
    name = re.sub(r'\s+', ' ', name)
    name = ' '.join(singularize_word(w) for w in name.split())
    return name

def normalize_phone(phone):
    if pd.isnull(phone):
        return ''
    phone_str = str(phone)
    phone_str = re.split(r'[xX]', phone_str)[0]
    return re.sub(r'\D', '', phone_str)

def get_mode(series):
    mode = series.mode()
    return min(mode, key=len) if not mode.empty else series.iloc[0]

def pick_canonical(row):
    if pd.notnull(row.get('Firm_ByPhone')):
        return row['Firm_ByPhone']
    elif pd.notnull(row.get('Firm_ByAddress')):
        return row['Firm_ByAddress']
    return row['Firm_Normalized']

root_folder = "../Data/NRED/"
output_folder = "../Data/Cleaned files/"
os.makedirs(output_folder, exist_ok=True)

excel_files = glob.glob(os.path.join(root_folder, '**', '*.xls*'), recursive=True)
print(f"Found {len(excel_files)} Excel files")


for file in excel_files:
    try:
        df = pd.read_excel(file, dtype=str)

        for col in ['Name', 'Address1', 'Address2', 'Telephone', 'City']:
            if col in df.columns:
                df[col] = (
                    df[col].fillna('')
                    .str.replace('\xa0', ' ', regex=False)
                    .str.strip()
                    .str.replace(r'\s+', ' ', regex=True)
                )

        df = df[~df['Name'].str.contains('condo', case=False, na=False)]
        df = df[df['# of Units'] != '0']
        
        df['Address1_clean'] = df['Address1'].str.replace(r'\bC/O\b', '', regex=True).str.strip()
        subset = df[df['Address1_clean'].str.strip() != ''].copy()
        
        if subset.empty:
            continue

        subset['City'] = subset['City'].str.title()
        subset['Address1'] = subset['Address1_clean'].str.title()
        subset['Address2'] = subset['Address2'].str.title()
        subset['Name'] = subset['Name'].str.title()

        subset['Address2_Normalized'] = subset['Address2'].apply(normalize_address)
        subset['Firm_Normalized'] = subset['Address1'].apply(normalize_firm_name)
        subset['Phone_Normalized'] = subset['Telephone'].apply(normalize_phone).replace('', pd.NA)

        subset['Firm_Normalized'] = subset['Firm_Normalized'].replace({
            "ams mangement": "ams management group",
            "firstservice residential": "firstservice residential nevada",
            "first service residential": "firstservice residential nevada",
            "level community management": "level property management",
            "ccmc": "capital consultant management",
            "seabeeze management": "seabreeze management",
            "westward 360": "westward360",
            "terra west property management": "terra west management service",
            "nicklin community management": "nicklin community management service",
            "nicklin property management": "nicklin community management service",
            "associa nevada south": "associa",
            "associa sierra north": "associa",
            "nicklin property management investment": "nicklin community management service"
        })

        canonical_phone = (
            subset.groupby('Phone_Normalized')['Firm_Normalized']
            .agg(get_mode)
            .rename('Firm_ByPhone')
            .reset_index()
        )

        canonical_address = (
            subset.groupby('Address2_Normalized')['Firm_Normalized']
            .agg(get_mode)
            .rename('Firm_ByAddress')
            .reset_index()
        )

        subset = subset.merge(canonical_phone, on='Phone_Normalized', how='left')
        subset = subset.merge(canonical_address, on='Address2_Normalized', how='left')

        subset['Firm_Final'] = subset.apply(pick_canonical, axis=1)

        firm_frequency = subset['Firm_Normalized'].value_counts()
        STABLE_THRESHOLD = 10

        subset['Firm_Final'] = subset.apply(
            lambda r: r['Firm_Normalized']
            if r['Firm_Normalized'] != r['Firm_Final']
            and firm_frequency.get(r['Firm_Normalized'], 0) >= STABLE_THRESHOLD
            else r['Firm_Final'],
            axis=1
        )

        subset['Name_Changed_Flag'] = subset['Firm_Normalized'] != subset['Firm_Final']

        base = os.path.splitext(os.path.basename(file))[0]
        subset.to_csv(os.path.join(output_folder, f"{base}_subset.csv"), index=False)

        print(f"Subset written: {base}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

Found 121 Excel files
Subset written: NRED HOA - 06 2015
Subset written: NRED HOA - 07 2015
Subset written: NRED HOA - 08 2015
Subset written: NRED HOA - 09 2015
Subset written: NRED HOA - 10 2015
Subset written: NRED HOA - 11 2015
Subset written: NRED HOA - 12 2015
Subset written: NRED HOA - 01 2016
Subset written: NRED HOA - 02 2016
Subset written: NRED HOA - 03 2016
Subset written: NRED HOA - 04 2016
Subset written: NRED HOA - 05 2016
Subset written: NRED HOA - 06 2016
Subset written: NRED HOA - 07 2016
Subset written: NRED HOA - 08 2016
Subset written: NRED HOA - 09 2016
Subset written: NRED HOA - 10 2016
Subset written: NRED HOA - 11 2016
Subset written: NRED HOA - 12 2016
Subset written: NRED HOA - 01 2017
Subset written: NRED HOA - 02 2017
Subset written: NRED HOA - 03 2017
Subset written: NRED HOA - 04 2017
Subset written: NRED HOA - 05 2017
Subset written: NRED HOA - 06 2017
Subset written: NRED HOA - 07 2017
Subset written: NRED HOA - 08 2017
Subset written: NRED HOA - 09 201

  warn("""Cannot parse header or footer so it will be ignored""")


Subset written: NRED HOA - 07 2018
Subset written: NRED HOA - 08 2018
Subset written: NRED HOA - 09 2018
Subset written: NRED HOA - 10 2018
Subset written: NRED HOA - 11 2018
Subset written: NRED HOA - 12 2018
Subset written: NRED HOA - 01 2019
Subset written: NRED HOA - 02 2019
Subset written: NRED HOA - 03 2019
Subset written: NRED HOA - 04 2019
Subset written: NRED HOA - 05 2019
Subset written: NRED HOA - 06 2019
Subset written: NRED HOA - 07 2019
Subset written: NRED HOA - 08 2019
Subset written: NRED HOA - 09 2019
Subset written: NRED HOA - 10 2019
Subset written: NRED HOA - 11 2019
Subset written: NRED HOA - 12 2019
Subset written: NRED HOA - 01 2020
Subset written: NRED HOA - 02 2020
Subset written: NRED HOA - 03 2020
Subset written: NRED HOA - 04 2020
Subset written: NRED HOA - 05 2020
Subset written: NRED HOA - 06 2020
Subset written: NRED HOA - 07 2020
Subset written: NRED HOA - 08 2020
Subset written: NRED HOA - 09 2020
Subset written: NRED HOA - 10 2020
Subset written: NRED

In [2]:
# CELL 2 â€” Build FULL file by appending unmanaged rows

import pandas as pd
import glob
import os

root_folder = "../Data/NRED/"
cleaned_folder = "../Data/Cleaned files/"

excel_files = glob.glob(os.path.join(root_folder, '**', '*.xls*'), recursive=True)

for file in excel_files:
    try:
        base = os.path.splitext(os.path.basename(file))[0]

        df_raw = pd.read_excel(file, dtype=str)

        for col in ['Name', 'Address1', 'Address2', 'Telephone', 'City']:
            if col in df_raw.columns:
                df_raw[col] = (
                    df_raw[col].fillna('')
                    .str.replace('\xa0', ' ', regex=False)
                    .str.strip()
                    .str.replace(r'\s+', ' ', regex=True)
                )

        unmanaged = df_raw[df_raw['Address1'].str.strip() == ''].copy()

        subset_path = os.path.join(cleaned_folder, f"{base}_subset.csv")
        if not os.path.exists(subset_path):
            continue

        subset = pd.read_csv(subset_path, dtype=str)

        for col in subset.columns:
            if col not in unmanaged.columns:
                unmanaged[col] = pd.NA

        for col in unmanaged.columns:
            if col not in subset.columns:
                subset[col] = pd.NA

        full_df = pd.concat(
            [subset, unmanaged],
            ignore_index=True,
            sort=False
        )

        full_df.to_csv(
            os.path.join(cleaned_folder, f"{base}_full_plus_subset.csv"),
            index=False
        )

        print(f"Full file written: {base}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

Full file written: NRED HOA - 06 2015
Full file written: NRED HOA - 07 2015
Full file written: NRED HOA - 08 2015
Full file written: NRED HOA - 09 2015
Full file written: NRED HOA - 10 2015
Full file written: NRED HOA - 11 2015
Full file written: NRED HOA - 12 2015
Full file written: NRED HOA - 01 2016
Full file written: NRED HOA - 02 2016
Full file written: NRED HOA - 03 2016
Full file written: NRED HOA - 04 2016
Full file written: NRED HOA - 05 2016
Full file written: NRED HOA - 06 2016
Full file written: NRED HOA - 07 2016
Full file written: NRED HOA - 08 2016
Full file written: NRED HOA - 09 2016
Full file written: NRED HOA - 10 2016
Full file written: NRED HOA - 11 2016
Full file written: NRED HOA - 12 2016
Full file written: NRED HOA - 01 2017
Full file written: NRED HOA - 02 2017
Full file written: NRED HOA - 03 2017
Full file written: NRED HOA - 04 2017
Full file written: NRED HOA - 05 2017
Full file written: NRED HOA - 06 2017
Full file written: NRED HOA - 07 2017
Full file wr

  warn("""Cannot parse header or footer so it will be ignored""")


Full file written: NRED HOA - 07 2018
Full file written: NRED HOA - 08 2018
Full file written: NRED HOA - 09 2018
Full file written: NRED HOA - 10 2018
Full file written: NRED HOA - 11 2018
Full file written: NRED HOA - 12 2018
Full file written: NRED HOA - 01 2019
Full file written: NRED HOA - 02 2019
Full file written: NRED HOA - 03 2019
Full file written: NRED HOA - 04 2019
Full file written: NRED HOA - 05 2019
Full file written: NRED HOA - 06 2019
Full file written: NRED HOA - 07 2019
Full file written: NRED HOA - 08 2019
Full file written: NRED HOA - 09 2019
Full file written: NRED HOA - 10 2019
Full file written: NRED HOA - 11 2019
Full file written: NRED HOA - 12 2019
Full file written: NRED HOA - 01 2020
Full file written: NRED HOA - 02 2020
Full file written: NRED HOA - 03 2020
Full file written: NRED HOA - 04 2020
Full file written: NRED HOA - 05 2020
Full file written: NRED HOA - 06 2020
Full file written: NRED HOA - 07 2020
Full file written: NRED HOA - 08 2020
Full file wr

In [12]:
# Build wide version

import pandas as pd
import glob
import os
import re

input_folder = "../Data/Cleaned files/"
output_file = os.path.join(input_folder, "HOA_Firm_Wide_Decembers.csv")

all_full_files = glob.glob(os.path.join(input_folder, "*_full_plus_subset.csv"))

csv_files = [f for f in all_full_files if re.search(r'- 12 ', os.path.basename(f))]
print(f"Found {len(csv_files)} December CSV files")

all_dfs = []

for file in csv_files:
    base_name = os.path.basename(file)
    
    year_search = re.search(r'(\d{4})', base_name)
    if year_search:
        year = int(year_search.group(1))
    else:
        print(f"Warning: Could not detect year from filename {base_name}, skipping")
        continue
    
    df = pd.read_csv(file, dtype=str)
    
    df = df[['Name', 'Firm_Final']].copy()
    
    df['Exists'] = 1

    df['Name'] = df['Name'].str.strip().str.title()
    
    df['Year'] = year
    
    all_dfs.append(df)

long_df = pd.concat(all_dfs, ignore_index=True)

wide_firm = long_df.pivot_table(
    index='Name',
    columns='Year',
    values='Firm_Final',
    aggfunc='first'
)

wide_exists = long_df.pivot_table(
    index='Name',
    columns='Year',
    values='Exists',
    aggfunc='first'
)

wide_firm.columns = wide_firm.columns.map(str)
wide_exists.columns = wide_exists.columns.map(str)

wide_firm = wide_firm.reset_index()
wide_exists = wide_exists.reset_index()

wide_df_complete = wide_firm.merge(
    wide_exists,
    on='Name',
    suffixes=('', '_exists')
)

year_columns = sorted(
    [col for col in wide_firm.columns if col != 'Name']
)

def count_changes(row):
    firms = [row[year] for year in year_columns]
    changes = 0
    for f1, f2 in zip(firms[:-1], firms[1:]):
        # Count change if values differ
        # AND at least one period has a non-missing firm
        if f1 != f2 and (pd.notna(f1) or pd.notna(f2)):
            changes += 1
    return changes

wide_df_complete['Number of changes'] = wide_df_complete.apply(
    count_changes, axis=1
)

wide_df_complete['Frequent changer'] = (
    wide_df_complete['Number of changes'] >= 2
)

wide_df_complete.to_csv(output_file, index=False)

print(f"Wide HOA dataset saved to: {output_file}")
print("Shape:", wide_df_complete.shape)

dec_counts = long_df.groupby('Year')['Name'].nunique().reset_index()
dec_counts.columns = ['Year', 'December_HOAs']
dec_counts['Wide_panel_rows'] = wide_df_complete.shape[0]

print("\nDiagnostic: December snapshot vs wide panel")
print(dec_counts)

# -----------------------------
# Year-by-year wide panel growth
# -----------------------------
years_sorted = sorted(long_df['Year'].unique())
rows_over_time = []

current_hoas = set()

for year in years_sorted:
    # HOAs in this December
    dec_hoas = set(long_df.loc[long_df['Year'] == year, 'Name'])
    
    # Add to the running set of all HOAs included so far
    current_hoas.update(dec_hoas)
    
    rows_over_time.append({
        'Year': year,
        'December_HOAs': len(dec_hoas),
        'Wide_panel_rows_so_far': len(current_hoas)
    })

wide_growth_df = pd.DataFrame(rows_over_time)

print("\nWide panel growth year by year:")
print(wide_growth_df)

####### Summary stats
num_rows = wide_df_complete.shape[0]
num_frequent = wide_df_complete['Frequent changer'].sum()
pct_frequent = 100 * num_frequent / num_rows

print(f"\nPercentage of frequent changers in the wide dataset: {pct_frequent:.2f}% ({num_frequent} of {num_rows} rows)")

year_columns = [str(y) for y in range(2015, 2025)]
rows_all_firm = wide_df_complete[year_columns].notna().all(axis=1).sum()

print(f"Number of HOAs with a listed firm in every December 2015-2024: {rows_all_firm} of {num_rows} rows")

Found 10 December CSV files
Wide HOA dataset saved to: ../Data/Cleaned files/HOA_Firm_Wide_Decembers.csv
Shape: (3643, 23)

Diagnostic: December snapshot vs wide panel
   Year  December_HOAs  Wide_panel_rows
0  2015           3070             3643
1  2016           3129             3643
2  2017           3168             3643
3  2018           3265             3643
4  2019           3346             3643
5  2020           3386             3643
6  2021           3395             3643
7  2022           3541             3643
8  2023           3608             3643
9  2024           3682             3643

Wide panel growth year by year:
   Year  December_HOAs  Wide_panel_rows_so_far
0  2015           3070                    3070
1  2016           3129                    3153
2  2017           3168                    3211
3  2018           3265                    3329
4  2019           3346                    3441
5  2020           3386                    3529
6  2021           3395        