In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from matplotlib_venn import venn2
import plotly.express as px
import os


In [2]:
# Folder holding the two mapped input CSVs. It defaults to the original author's
# local folder; set the COMPSTAK_DATA_DIR environment variable to point the
# notebook at another location without editing this cell.
DATA_DIR = os.environ.get('COMPSTAK_DATA_DIR') or r'C:\Users\clint\Desktop\compstak-analysis\Data'

# Load both datasets; the later cells read them as `compstak_df` and `doe_df`.
compstak_df = pd.read_csv(os.path.join(DATA_DIR, 'compstak_mapped.csv'))
doe_df = pd.read_csv(os.path.join(DATA_DIR, 'DOE_mapped.csv'))

In [3]:
# Show the loaded CompStak frame as this cell's output.
compstak_df

Unnamed: 0,Property Type,Property Subtype,Property Id,State,DOE_Compliant_Property_Type
0,Retail,Parking,1,NY,Other
1,Retail,Apartments,2,NY,Multi-Family
2,Office,,3,NY,Office
3,Retail,,4,NY,Retail
4,Office,Mixed-Use,5,NY,Other
...,...,...,...,...,...
759618,,,3611385,NY,Other
759619,Industrial,Flex/R&D,3611389,FL,Industrial
759620,Industrial,Flex/R&D,3611390,FL,Industrial
759621,,,3611391,CA,Other


In [4]:
# Show the loaded DOE frame as this cell's output.
doe_df

Unnamed: 0,statecode,reported_propertytype,reported_propertysubtype,compstak_equivalent_category
0,CT,Flex,Light Manufacturing,Industrial
1,CT,Industrial,Warehouse,Industrial
2,CT,Industrial,,Industrial
3,CT,Multi-Family,Apartments,Multi-Family
4,CT,Multi-Family,Apartments,Multi-Family
...,...,...,...,...
2246480,WI,Retail,Storefront Retail/Office,Retail
2246481,WI,Retail,Storefront Retail/Office,Retail
2246482,WI,Retail,,Retail
2246483,WI,Retail,,Retail


## Coverage Rate Calculation
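Calculate the overall coverage rate as the number of rows in `compstak_df` divided by the number of rows in `doe_df`. This is a ratio of dataset sizes; individual records are not matched between the two sources.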

In [5]:
# Overall coverage: the CompStak row count expressed as a fraction of the DOE row count.
# Only the sizes of the two frames are compared; no records are matched between them.
n_compstak_rows, n_doe_rows = len(compstak_df), len(doe_df)
coverage_rate = n_compstak_rows / n_doe_rows
print(f"Coverage rate: {coverage_rate:.2%}")

Coverage rate: 33.81%


## Segmentation by Category
Compare the segmentation of both datasets by their equivalent property type/category.

In [6]:
# Row counts per category for each source, as two-column frames keyed on 'Category'.
compstak_counts = (
    compstak_df['DOE_Compliant_Property_Type']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='Compstak_Count')
)
doe_counts = (
    doe_df['compstak_equivalent_category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='DOE_Count')
)

# Left-join onto the DOE categories so every DOE category is kept; missing values
# produced by the join are filled with 0 before the rate is computed.
category_coverage = (
    doe_counts
    .merge(compstak_counts, on='Category', how='left')
    .fillna(0)
    .assign(Coverage_Rate=lambda counts: counts['Compstak_Count'] / counts['DOE_Count'])
)
category_coverage

Unnamed: 0,Category,DOE_Count,Compstak_Count,Coverage_Rate
0,Retail,931077,195522,0.209996
1,Industrial,438534,183106,0.417541
2,Office,388432,88768,0.228529
3,Multi-Family,253361,114427,0.451636
4,Other,161804,167732,1.036637
5,Hotel,73277,10068,0.137396


## Sunburst Visualization (DOE Inner, Compstak Outer)
The inner ring shows DOE category counts, and the outer ring shows Compstak counts for the same categories.

In [7]:
# Node table for the two-level sunburst: one root node per DOE category and one
# child node per CompStak category, attached to the DOE node of the same name.
# The nodes are built straight from the count columns instead of iterating rows.
sunburst_hier = [
    {
        'id': f"DOE_{category}",
        'label': category,
        'parent': '',  # empty parent marks a root (inner-ring) node
        'value': count,
    }
    for category, count in zip(doe_counts['Category'], doe_counts['DOE_Count'])
]
sunburst_hier += [
    {
        'id': f"Compstak_{category}",
        'label': category,
        'parent': f"DOE_{category}",
        'value': count,
    }
    for category, count in zip(compstak_counts['Category'], compstak_counts['Compstak_Count'])
]
# Explicit columns keep the frame well-formed even when there are no nodes.
sunburst_hier_df = pd.DataFrame(sunburst_hier, columns=['id', 'label', 'parent', 'value'])

# NOTE(review): `branchvalues` is not set, so plotly's default ('remainder') applies
# and each inner wedge is sized by its DOE count plus its CompStak child's count,
# not by the DOE count alone -- confirm this is the intended reading of the chart.
fig = px.sunburst(
    sunburst_hier_df,
    ids='id',
    names='label',  # px.sunburst takes `names` (not `labels`) for the sector text
    parents='parent',
    values='value',
    title='DOE (Inner) and Compstak (Outer) Coverage by Property Category',
)

# Colour by source. Match on the id prefix rather than a substring so that a
# CompStak node whose category name happens to contain "DOE" is still orange.
node_colors = [
    "lightblue" if node_id.startswith('DOE_') else "orange"
    for node_id in sunburst_hier_df['id']
]
fig.update_traces(
    sort=False,  # keep the sectors in the order of the node table
    marker=dict(colors=node_colors)
)
fig.show()

## Coverage Rate Visualizations
The following visualizations show the coverage rate of Compstak data against DOE data, both overall and by property category.

In [8]:
# 1. Coverage rate per category as a bar chart, highest rate first
coverage_ranked = category_coverage.sort_values('Coverage_Rate', ascending=False)
fig_bar = px.bar(
    coverage_ranked,
    x='Category',
    y='Coverage_Rate',
    color='Coverage_Rate',
    color_continuous_scale='Viridis',
    text_auto='.1%',
    labels={'Coverage_Rate': 'Coverage Rate (%)'},
    title='Compstak Coverage Rate by Property Category',
)
# Place the value labels outside the bars and format the y axis as whole percentages.
fig_bar.update_traces(textposition='outside')
fig_bar.update_layout(yaxis_tickformat='.0%')
fig_bar.show()

# 2. Donut chart of the overall row-count split: CompStak rows vs. the DOE remainder
total = len(doe_df)
covered = len(compstak_df)
uncovered = total - covered

fig_pie = px.pie(
    names=['Covered by Compstak', 'Not Covered'],
    values=[covered, uncovered],
    hole=0.4,
    color_discrete_sequence=['orange', 'lightblue'],
    title='Overall Coverage: DOE Data Covered by Compstak',
)
# Label each slice with its name and percentage, drawn outside the ring.
fig_pie.update_traces(textposition='outside', textinfo='percent+label')
fig_pie.show()

# 3. Grouped bar chart comparing DOE and CompStak row counts per category
#    (barmode='group' places the two sources side by side rather than stacking them)
category_volume = category_coverage.melt(
    id_vars=['Category'],
    value_vars=['DOE_Count', 'Compstak_Count'],
    var_name='Source',
    value_name='Count',
)

# Same source colours as the other charts: DOE light blue, CompStak orange.
source_colors = {'DOE_Count': 'lightblue', 'Compstak_Count': 'orange'}
fig_stack = px.bar(
    category_volume,
    x='Category',
    y='Count',
    color='Source',
    color_discrete_map=source_colors,
    barmode='group',
    title='DOE vs Compstak Volume by Property Category',
)
fig_stack.show()

In [10]:
# Output folder for the exported figures. It defaults to the original author's
# local folder; set the COMPSTAK_IMAGE_DIR environment variable to export elsewhere.
save_dir = os.environ.get('COMPSTAK_IMAGE_DIR') or r"C:\Users\clint\Desktop\compstak-analysis\Images\Corrected Data"
# Create the folder (and any missing parents) if it doesn't exist
os.makedirs(save_dir, exist_ok=True)

# File stem -> figure, so both export passes work from a single list.
figures_to_save = {
    "sunburst_coverage": fig,
    "category_coverage_bar": fig_bar,
    "overall_coverage_pie": fig_pie,
    "category_volume_comparison": fig_stack,
}

# Save all plots as HTML files (interactive)
for stem, figure in figures_to_save.items():
    figure.write_html(os.path.join(save_dir, f"{stem}.html"))

# Also save as static image files, after every HTML file has been written.
# NOTE(review): plotly's write_image needs a static-export engine (kaleido)
# installed in the kernel environment -- confirm before relying on the PNGs.
for stem, figure in figures_to_save.items():
    figure.write_image(os.path.join(save_dir, f"{stem}.png"))

print(f"All visualizations saved to {save_dir}")

All visualizations saved to C:\Users\clint\Desktop\compstak-analysis\Images\Corrected Data
