In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Allow autoreload as we develop the GMT in parallel
%load_ext autoreload
%autoreload 2

dataset_path = Path().resolve() / "datasets"
# get the full path
print(f"Dataset path: {dataset_path}")

data_path = dataset_path / "comstock"
# get the full path
print(f"Data path: {data_path}")

figures_path = Path().resolve() / "figures" / "cec"
if not figures_path.exists():
    figures_path.mkdir(parents=True)
print(f"Figures path: {figures_path}")

warnings.filterwarnings("ignore", category=FutureWarning)
# ignore SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# ComStock

### Pull down ComStock data

To download the data, run the `comstock_processor.py` file (for example from within VS Code). The data will be saved into
the ComStock subfolder. The download takes a while to run (roughly 10 minutes).

In [None]:
# Left commented out so that re-running the notebook does not trigger the
# processor script again; uncomment the next line to run it from this cell.
# %run comstock_processor.py

### Read in ComStock data

In [None]:
# Load the ComStock metadata export from its CSV file.
# TODO: the parquet file could be read directly instead of the CSV export.
metadata_csv = data_path / "All-All-All-0-selected_metadata.csv"
df_all = pd.read_csv(metadata_csv)

In [None]:
# Dimensions of the full table as (rows, columns)
print(df_all.shape)

# Distinct values of the state column, then of the building-type column
for column_name in ("in.state", "in.comstock_building_type"):
    print(df_all[column_name].unique())

In [None]:
# Export every column name of the metadata table to fields.txt, one name per
# line, so the available fields can be browsed outside the notebook.
fields = df_all.columns
fields_file = figures_path / "fields.txt"
names_text = "\n".join(fields)
with fields_file.open("w") as sink:
    sink.write(names_text)

In [None]:
# Select the California buildings. Note that this filters on the state column,
# not on a climate zone.
# An exact comparison is used instead of a substring match so that only rows
# whose state is exactly "CA" are kept and rows with a missing state are simply
# excluded. .copy() makes df_ca an independent frame, so columns added to it in
# later cells are not written into a slice of df_all.
df_ca = df_all[df_all["in.state"] == "CA"].copy()

# Quick look at what the California subset contains
print(df_ca["in.building_subtype"].unique())
print(df_ca["in.comstock_building_type"].unique())
print(df_ca["in.county_name"].unique())

# Compare the size of the full table with the California subset
print(f"all: {df_all.shape}")
print(f"CA: {df_ca.shape}")

In [None]:
# Number of California buildings in each 'in.comstock_building_type'
print(df_ca["in.comstock_building_type"].value_counts())

# Number of buildings per county as a two-column frame. The columns are named
# explicitly because the names produced by value_counts().reset_index() differ
# between pandas versions, and the mapping cell reads the "count" column.
county_counts = (
    df_ca["in.county_name"]
    .value_counts()
    .rename_axis("in.county_name")
    .reset_index(name="count")
)

# Reduce the county labels to the bare county name: drop the literal "CA,"
# and "County" parts (plain-text replacement, not regex) and trim the
# whitespace that is left over.
county_counts["in.county_name"] = (
    county_counts["in.county_name"]
    .str.replace("CA,", "", regex=False)
    .str.replace("County", "", regex=False)
    .str.strip()
)

display(county_counts)

In [None]:
import geopandas as gpd

# County boundaries for the whole US, downloaded as GeoJSON (needs geopandas
# and network access).
COUNTIES_GEOJSON_URL = "https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json"
counties_gdf = gpd.read_file(COUNTIES_GEOJSON_URL)

# Keep the California counties only (FIPS state code for CA is '06').
# .copy() makes this an independent frame so that the "count" column below is
# added to it directly rather than to a filtered view of counties_gdf.
california_counties = counties_gdf[counties_gdf["STATE"] == "06"].copy()

# Look up the building count of each county by its cleaned name. Counties with
# no entry in county_counts get NaN and are drawn in light grey.
county_mapping = dict(zip(county_counts["in.county_name"], county_counts["count"]))
california_counties["count"] = california_counties["NAME"].map(county_mapping)


def plot_county_counts(counties, title):
    """Draw a choropleth of the ``count`` column and show it.

    The colour scale is linear; no log normalisation is applied.

    Args:
        counties: GeoDataFrame holding the county geometries and a ``count``
            column.
        title: Title displayed above the map.

    Returns:
        The ``(fig, ax)`` pair the map was drawn on.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 12))
    counties.plot(
        column="count",
        cmap="Oranges",
        legend=True,
        missing_kwds={"color": "lightgrey"},
        ax=ax,
    )
    ax.set_title(title, fontsize=16)
    plt.show()
    return fig, ax


# Map of all California counties
fig, ax = plot_county_counts(california_counties, "California Buildings with Counts")

# Same map without Los Angeles, so that the remaining counties are not
# compressed into the low end of the colour scale.
fig, ax = plot_county_counts(
    california_counties[california_counties["NAME"] != "Los Angeles"],
    "California Buildings (without LA) with Counts",
)

In [None]:
# Multiplier used for both conversions below (kWh -> kBtu)
KWH_TO_KBTU = 3.412

site_eui_col = "out.site_energy.total.energy_consumption_intensity"
distcool_kwh_col = "calc.enduse_group.district_cooling.hvac.energy_consumption..kwh"

# Total site energy intensity scaled by the conversion factor
# (presumably kWh/ft2 in the source column -- TODO confirm the unit)
df_ca["EUI_kBTUft2"] = df_ca[site_eui_col].mul(KWH_TO_KBTU)

# Summary statistics (count, mean, quartiles, ...) of the converted column and
# of the column it was derived from
print(df_ca["EUI_kBTUft2"].describe())
print(df_ca[site_eui_col].describe())

# District-cooling HVAC energy per square foot, from kwh/ft2 to kbtu/ft2
distcool_kwh_per_sqft = df_ca[distcool_kwh_col].div(df_ca["in.sqft"])
df_ca["EUI_distcool_kBTUft2"] = distcool_kwh_per_sqft.mul(KWH_TO_KBTU)
print(df_ca["EUI_distcool_kBTUft2"].describe())

# Density plot of the district-cooling intensity only, with a fixed x-range
sns.kdeplot(df_ca["EUI_distcool_kBTUft2"])
plt.xlim(-10, 25)