In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2

# Directory that holds the input CSVs. Kept in one place so the notebook can
# be pointed at a different location by editing a single line instead of
# every read_csv call.
DATA_DIR = Path(r'C:\Users\clint\Desktop\compstak-analysis\Data')

# Load compstak.csv and DOE.csv from DATA_DIR as DataFrames
compstak_df = pd.read_csv(DATA_DIR / 'compstak.csv')
doe_df = pd.read_csv(DATA_DIR / 'DOE.csv')

In [2]:
# DOE property types whose every subtype maps to one Compstak property type.
# These become rows with DOE Subtype "All".
whole_type_categories = {
    "Industrial": "Industrial",
    "Multi-Family": "Multi-Family",
    "Office": "Office",
    "Retail": "Retail",
    "General Retail": "Retail",
    "Hospitality": "Hotel",
}

# DOE property types that are mapped subtype by subtype:
# {DOE property type: {DOE subtype: Compstak property type}}.
subtype_categories = {
    "Flex": {
        "Light Distribution": "Industrial",
        "Light Manufacturing": "Industrial",
        "R&D": "Industrial",
        "Showroom": "Retail",
        "Telecom Hotel/Data Hosting": "Other",
        "All Others": "Industrial",
    },
    "Health Care": {
        "Assisted Living": "Other",
        "Congregate Senior Housing": "Other",
        "Continuing Care Retirement Community": "Other",
        # NOTE(review): Hospital is mapped to Hotel while the other Health
        # Care subtypes go to Other — confirm this is intentional.
        "Hospital": "Hotel",
        "Rehabilitation Center": "Other",
        "Skilled Nursing Facility": "Other",
    },
    "Specialty": {
        "Airplane Hangar": "Other",
        "Airport": "Other",
        "Auto Salvage Facility": "Other",
        "Car Wash": "Retail",
        "Cement/Gravel Plant": "Industrial",
        "Cemetery/Mausoleum": "Other",
        "Chemical/Oil Refinery": "Industrial",
        "Contractor Storage Yard": "Industrial",
        "Correctional Facility": "Other",
        "Drive-in Movie": "Other",
        "Landfill": "Other",
        "Lodge/Meeting Hall": "Hotel",
        "Lumberyard": "Industrial",
        "Marina": "Other",
        "Movie/Radio/TV Studio": "Other",
        "Parking Garage": "Other",
        "Parking Lot": "Other",
        "Police/Fire Station": "Other",
        "Post Office": "Other",
        "Public Library": "Other",
        "Radio/TV Transmission Facilities": "Other",
        "Railroad Yard": "Industrial",
        "Recycling Center": "Industrial",
        "Religious Facility": "Other",
        "Residential Income": "Multi-Family",
        "Schools": "Other",
        "Self-Storage": "Other",
        "Shelter": "Other",
        "Shipyard": "Industrial",
        "Sorority/Fraternity House": "Other",
        "Trailer/Camper Park": "Other",
        "Utility Sub-Station": "Other",
        "Water Retention Facility": "Other",
        "Water Treatment Facility": "Other",
        "Winery/Vineyard": "Other",
        "All Others": "Other",
    },
    "Sports & Entertainment": {
        "Amusement Park": "Other",
        "Baseball Field": "Other",
        "Casino": "Hotel",
        "Golf Course/Driving Range": "Other",
        "Horse Stables": "Other",
        "Race Track": "Other",
        "Skating Rink": "Other",
        "Swimming Pool": "Other",
        "Theater/Concert Hall": "Other",
        "All Others": "Other",
    },
}

# Flatten into [DOE property type, DOE subtype, Compstak property type] rows:
# whole-type rows first, then the per-subtype rows, then the Unknown catch-all.
mapping_data = [
    [doe_type, "All", compstak_type]
    for doe_type, compstak_type in whole_type_categories.items()
]
mapping_data += [
    [doe_type, doe_subtype, compstak_type]
    for doe_type, by_subtype in subtype_categories.items()
    for doe_subtype, compstak_type in by_subtype.items()
]
mapping_data.append(["Unknown", "All", "Other"])

# Tabular form of the mapping, one row per (DOE property type, DOE subtype).
mapping_df = pd.DataFrame(
    mapping_data,
    columns=[
        "DOE Property Type",
        "DOE Subtype",
        "Mapped Compstak Property Type",
    ],
)

In [3]:
# Define the mapping function
def map_compstak_category(row):
    """Return the Compstak category for one DOE row.

    The row's ``reported_propertytype`` and ``reported_propertysubtype`` are
    looked up in the module-level ``mapping_df``. Lookups are tried in this
    order and the first hit wins:

    1. the property type with DOE Subtype 'All';
    2. the property type with the row's own subtype;
    3. only when the subtype is missing or blank, the property type with
       DOE Subtype 'All Others'.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by 'reported_propertytype' and
        'reported_propertysubtype' (e.g. a DataFrame row from ``apply``).

    Returns
    -------
    The 'Mapped Compstak Property Type' of the first matching mapping row,
    or the string 'Unmapped' when no lookup matches.
    """
    prop_type = row['reported_propertytype']
    prop_subtype = row['reported_propertysubtype']

    # Every lookup is restricted to this property type.
    type_mask = mapping_df['DOE Property Type'] == prop_type

    def candidate_subtypes():
        # Generated lazily so the blank/NaN check only runs once the first
        # two lookups have failed.
        yield 'All'
        yield prop_subtype
        if pd.isna(prop_subtype) or str(prop_subtype).strip() == '':
            yield 'All Others'

    for subtype_label in candidate_subtypes():
        hits = mapping_df.loc[
            type_mask & (mapping_df['DOE Subtype'] == subtype_label),
            'Mapped Compstak Property Type',
        ]
        if len(hits) > 0:
            return hits.iloc[0]

    # If nothing matches
    return 'Unmapped'

In [4]:
# Apply the mapping function to the DOE DataFrame.
# map_compstak_category reads only the property type and subtype of a row, so
# it is evaluated once per distinct (type, subtype) pair and the result is
# joined back onto every row, instead of being called once per row.
key_columns = ['reported_propertytype', 'reported_propertysubtype']
distinct_pairs = doe_df[key_columns].drop_duplicates()
category_by_pair = distinct_pairs.assign(
    compstak_equivalent_category=distinct_pairs.apply(map_compstak_category, axis=1)
)

# The left merge keeps doe_df's row order, and pandas matches null keys with
# each other, so rows with a missing subtype still find their pair.
# to_numpy() assigns positionally because the merge result has a fresh index.
doe_df['compstak_equivalent_category'] = (
    doe_df[key_columns]
    .merge(category_by_pair, on=key_columns, how='left')
    ['compstak_equivalent_category']
    .to_numpy()
)
doe_df

Unnamed: 0,statecode,reported_propertytype,reported_propertysubtype,compstak_equivalent_category
0,CT,Flex,Light Manufacturing,Industrial
1,CT,Industrial,Warehouse,Industrial
2,CT,Industrial,,Industrial
3,CT,Multi-Family,Apartments,Multi-Family
4,CT,Multi-Family,Apartments,Multi-Family
...,...,...,...,...
2246480,WI,Retail,Storefront Retail/Office,Retail
2246481,WI,Retail,Storefront Retail/Office,Retail
2246482,WI,Retail,,Retail
2246483,WI,Retail,,Retail


In [5]:
# Write the DOE frame to CSV without the index column.
# NOTE(review): this relative path resolves against the kernel's working
# directory and has the same file name as the input DOE.csv — confirm that
# overwriting the source file (if the notebook runs from the Data folder) is
# intended.
output_csv = 'DOE.csv'
doe_df.to_csv(output_csv, index=False)