In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn2

# Directory that holds the input CSV files. It defaults to the original local
# location so existing runs behave the same; set the COMPSTAK_DATA_DIR
# environment variable to point the notebook at the data on another machine.
DATA_DIR = Path(os.environ.get("COMPSTAK_DATA_DIR", r'C:\Users\clint\Desktop\compstak-analysis\Data'))

# Load the CSV files as DataFrames
compstak_df = pd.read_csv(DATA_DIR / 'compstak.csv')
doe_df = pd.read_csv(DATA_DIR / 'DOE.csv')

In [2]:
# DOE property types treated as ambiguous in this analysis
ambiguous_types = ["Flex", "Health Care", "Specialty", "Sports & Entertainment", "Unknown"]

# Keep only the DOE rows whose reported_propertytype is one of the ambiguous types
is_ambiguous = doe_df['reported_propertytype'].isin(ambiguous_types)
ambiguous_df = doe_df[is_ambiguous]

# Work on the subtype column of those rows: distinct non-null values (as
# strings) and the number of missing entries
subtype_col = ambiguous_df['reported_propertysubtype']
unique_subtypes = list(map(str, subtype_col.dropna().unique()))
nan_count = subtype_col.isna().sum()

# Report the distinct subtypes in sorted order, then how many there are
print("Unique 'reported_propertysubtype' values for ambiguous 'reported_propertytype' entries:")
print(sorted(unique_subtypes))
print("\nNumber of unique values (excluding NaN):", len(unique_subtypes))

# Report how many of the ambiguous rows have no subtype at all
print("\nNumber of NaN values in 'reported_propertysubtype' for ambiguous 'reported_propertytype' entries:", nan_count)


Unique 'reported_propertysubtype' values for ambiguous 'reported_propertytype' entries:
['Airplane Hangar', 'Airport', 'Amusement Park', 'Assisted Living', 'Auto Salvage Facility', 'Baseball Field', 'Car Wash', 'Casino', 'Cement/Gravel Plant', 'Cemetery/Mausoleum', 'Chemical/Oil Refinery', 'Congregate Senior Housing', 'Continuing Care Retirement Community', 'Contractor Storage Yard', 'Correctional Facility', 'Drive-in Movie', 'Golf Course/Driving Range', 'Horse Stables', 'Hospital', 'Landfill', 'Light Distribution', 'Light Manufacturing', 'Lodge/Meeting Hall', 'Lumberyard', 'Marina', 'Movie/Radio/TV Studio', 'Parking Garage', 'Parking Lot', 'Police / Fire Station', 'Post Office', 'Public Library', 'R&D', 'Race Track', 'Radio/TV Transmission Facilities', 'Railroad Yard', 'Recycling Center', 'Rehabilitation Center', 'Religious Facility', 'Residential Income', 'Schools', 'Self-Storage', 'Shelter', 'Shipyard', 'Showroom', 'Skating Rink', 'Skilled Nursing Facility', 'Sorority / Fraternity H

In [3]:

# Among the ambiguous rows, isolate those with no subtype recorded and count
# how many fall under each 'reported_propertytype'
missing_subtype = ambiguous_df['reported_propertysubtype'].isna()
nan_subtype_df = ambiguous_df[missing_subtype]

type_counts = nan_subtype_df['reported_propertytype'].value_counts()
print("Counts of 'reported_propertytype' for rows with NaN in 'reported_propertysubtype':")
print(type_counts)


Counts of 'reported_propertytype' for rows with NaN in 'reported_propertysubtype':
reported_propertytype
Flex                      38472
Specialty                 12593
Health Care                3288
Sports & Entertainment     1531
Unknown                       6
Name: count, dtype: int64


In [4]:
# DOE property types treated as ambiguous in this analysis
ambiguous_types = ["Flex", "Health Care", "Specialty", "Sports & Entertainment", "Unknown"]

# Restrict the DOE data to rows reporting one of those types
ambiguous_df = doe_df[doe_df['reported_propertytype'].isin(ambiguous_types)]

# For every ambiguous type, list the distinct non-null subtypes seen with it.
# Each type prints a header line, the sorted list, and a blank separator line.
for prop_type in ambiguous_types:
    rows_of_type = ambiguous_df['reported_propertytype'] == prop_type
    subtypes = ambiguous_df.loc[rows_of_type, 'reported_propertysubtype'].dropna().unique()
    subtype_labels = sorted(map(str, subtypes))
    print(f"Subtypes for '{prop_type}':", subtype_labels, "", sep="\n")

Subtypes for 'Flex':
['Light Distribution', 'Light Manufacturing', 'R&D', 'Showroom', 'Telecom Hotel/Data Hosting']

Subtypes for 'Health Care':
['Assisted Living', 'Congregate Senior Housing', 'Continuing Care Retirement Community', 'Hospital', 'Rehabilitation Center', 'Skilled Nursing Facility']

Subtypes for 'Specialty':
['Airplane Hangar', 'Airport', 'Auto Salvage Facility', 'Car Wash', 'Cement/Gravel Plant', 'Cemetery/Mausoleum', 'Chemical/Oil Refinery', 'Contractor Storage Yard', 'Correctional Facility', 'Drive-in Movie', 'Landfill', 'Lodge/Meeting Hall', 'Lumberyard', 'Marina', 'Movie/Radio/TV Studio', 'Parking Garage', 'Parking Lot', 'Police / Fire Station', 'Post Office', 'Public Library', 'Radio/TV Transmission Facilities', 'Railroad Yard', 'Recycling Center', 'Religious Facility', 'Residential Income', 'Schools', 'Self-Storage', 'Shelter', 'Shipyard', 'Sorority / Fraternity House', 'Trailer / Camper Park', 'Utility Sub-Station', 'Water Retention Facility', 'Water Treatment F

In [5]:
import pandas as pd

# Mapping rules from DOE (property type, subtype) to a Compstak property type,
# grouped by DOE property type. Each entry is (DOE subtype, mapped Compstak type).
# "All" and "All Others" are stored as literal subtype labels — presumably
# catch-alls; confirm how downstream code interprets them.
# NOTE(review): "Health Care" / "Hospital" maps to "Hotel" while every other
# Health Care subtype maps to "Other" — confirm this is intended.
# NOTE(review): "Casino" is listed under "Specialty", but the Specialty subtype
# listing printed earlier in this notebook does not contain it — confirm.
rules_by_doe_type = {
    "Industrial": [("All", "Industrial")],
    "Multi-Family": [("All", "Multi-Family")],
    "Office": [("All", "Office")],
    "Retail": [("All", "Retail")],
    "General Retail": [("All", "Retail")],
    "Flex": [
        ("Light Distribution", "Industrial"),
        ("Light Manufacturing", "Industrial"),
        ("R&D", "Industrial"),
        ("Showroom", "Retail"),
        ("Telecom Hotel/Data Hosting", "Other"),
    ],
    "Health Care": [
        ("Assisted Living", "Other"),
        ("Congregate Senior Housing", "Other"),
        ("Continuing Care Retirement Community", "Other"),
        ("Hospital", "Hotel"),
        ("Rehabilitation Center", "Other"),
        ("Skilled Nursing Facility", "Other"),
    ],
    "Hospitality": [("All", "Hotel")],
    "Specialty": [
        ("Car Wash", "Retail"),
        ("Cement/Gravel Plant", "Industrial"),
        ("Chemical/Oil Refinery", "Industrial"),
        ("Contractor Storage Yard", "Industrial"),
        ("Recycling Center", "Industrial"),
        ("Shipyard", "Industrial"),
        ("Lodge/Meeting Hall", "Hotel"),
        ("Casino", "Hotel"),
        ("Residential Income", "Multi-Family"),
        ("All Others", "Other"),
    ],
    "Sports & Entertainment": [
        ("Casino", "Hotel"),
        ("Theater/Concert Hall", "Other"),
        ("All Others", "Other"),
    ],
    "Unknown": [("All", "Other")],
}

# Flatten the grouped rules into [DOE type, DOE subtype, Compstak type] rows,
# preserving the order in which the types and their rules are declared above
mapping_data = [
    [doe_type, doe_subtype, compstak_type]
    for doe_type, rules in rules_by_doe_type.items()
    for doe_subtype, compstak_type in rules
]

# Build the lookup table, one row per rule
mapping_df = pd.DataFrame(
    mapping_data,
    columns=["DOE Property Type", "DOE Subtype", "Mapped Compstak Property Type"],
)

# Display the DataFrame
mapping_df

Unnamed: 0,DOE Property Type,DOE Subtype,Mapped Compstak Property Type
0,Industrial,All,Industrial
1,Multi-Family,All,Multi-Family
2,Office,All,Office
3,Retail,All,Retail
4,General Retail,All,Retail
5,Flex,Light Distribution,Industrial
6,Flex,Light Manufacturing,Industrial
7,Flex,R&D,Industrial
8,Flex,Showroom,Retail
9,Flex,Telecom Hotel/Data Hosting,Other
