## Problem statement

### Resources
* Pre-existing IO simulations from a computational model
$$ D^{sim}=\{(\theta_i, x_i), y_i\}_{i=1}^{N} $$
* A surrogate or high-fidelity simulator (for incremental sampling strategy)

$$y = M(x;\theta) $$

* Empirical data: output responses measured on the real system
$$ D^{emp}=\{y_i\}_{i=1}^{N} $$
* Possibly conditioned by some known quantity (design/building type) $x$
$$ D^{emp}=\{x_k ; \{ y_{j,k} \}_{j=1}^N \}_{k=1}^{n_x} $$

### Objective

- Use existing $D^{sim}$ and possibly the surrogate (efficiently) to calibrate a plausible set or a distribution for the uncertain model parameters $\theta$



In [None]:
from resources.AIRMODE.load_helpers import *
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
import seaborn as sbn

In [None]:
# ------------------------------------------------
#  READ THE SIMULATION TABLE AND NAME ITS COLUMN GROUPS
# ------------------------------------------------
D_sim = pd.read_pickle('../resources/ENERGY_PLUS/energy_plus_simulations.pkl')

# Earlier flat list of inputs, kept for reference only (not used by this cell):
# inputs = [ "num_persons", "u_roof", "u_wall", "u_ground", 'rh_out_level_hourxdayxzone'   "schedul_temps", "u_wind", "S_V", "wwr",  'rectangularity', 'perimeter', 'height' ]

# Output columns: the annual demand first, then one demand column per month 1..12.
# (A shorter variant with range(1, 3) keeps only the first two months.)
output_names = ["annual_demand_kwh_m2_year"]
output_names.extend(f"{month}_demand_kwh/mq" for month in range(1, 13))

# Column group stored as `x_names` (the two transmittance columns) ...
x_names = ['u_ground', 'u_roof']
# ... and column group stored as `theta_names`.
theta_names = [
    "num_persons",
    'rh_out_level_hourxdayxzone',
    "S_V",
    "wwr",
    "perc_adiabatic",
    'u_wall',
    'u_wind',
]


In [None]:
# Display the column labels of the simulation table D_sim as the cell output.
D_sim.columns

In [None]:
# DEFINE THE "BUILDING TYPE" USING TRANSMITTANCE:
# a boolean mask selecting the simulated buildings whose `u_ground` and
# `u_roof` values are both below 0.5.
building_type = (D_sim['u_ground'] < 0.5) & (D_sim['u_roof'] < 0.5)

# Settings of the synthetic data set, named once so the replicate count used
# for the noise matrix and for the repeated conditioning rows cannot diverge.
N_BUILDINGS = 5       # number of buildings of this type used as hidden truth
N_REPLICATES = 100    # noisy output replicates generated per building
NOISE_STD = 0.1       # standard deviation of the Gaussian noise (adjust noise level)

# GENERATE A SYNTHETIC DATA SET OF OUTPUT ENERGY CONSUMPTION RESPONSES
# from the first N_BUILDINGS buildings of this type.
sel = D_sim[building_type].iloc[:N_BUILDINGS, :]
samples = []
observed_data = []
for _, base in sel.iterrows():
    # A row taken from a mixed-dtype frame may be an object Series, so both the
    # outputs and the conditioning values are coerced to float explicitly.
    base_num = pd.to_numeric(base[output_names], errors='coerce').astype(float)
    x_num = pd.to_numeric(base[x_names], errors='coerce').astype(float)

    # N_REPLICATES noisy copies of this building's outputs (one row each).
    pert = np.random.normal(
        loc=base_num.values,
        scale=NOISE_STD,
        size=(N_REPLICATES, len(output_names)),
    )

    # Same conditioning values repeated on every replicate row.
    rep = pd.DataFrame(np.tile(x_num.values, (N_REPLICATES, 1)), columns=x_names)
    rep[output_names] = pert
    samples.append(rep)

    observed_data.append({'Y_emp': pert, 'X': x_num.values})

# One long table holding the replicates of all selected buildings.
D_emp = pd.concat(samples, ignore_index=True)

In [None]:
# Simulation table split into plain arrays, one per column group.
theta_sim = D_sim[theta_names].to_numpy()
xi_sim = D_sim[x_names].to_numpy()
y_sim = D_sim[output_names].to_numpy()

# Passed to the estimator in this order: outputs, parameters, conditioning values.
simulated_data = [y_sim, theta_sim, xi_sim]

# One estimator call per observed building, each with that building's noisy
# outputs and conditioning values.
# NOTE(review): estimate_p_theta_knn is not defined in this notebook (presumably
# it comes from the star import); its return is stacked row-wise below, so it is
# assumed to be a 2-D array of theta samples - confirm.
theta_set_posterior = [
    estimate_p_theta_knn(
        observed_data=obs['Y_emp'],
        simulated_data=simulated_data,
        xi_star=obs['X'],
        knn=10,
        a_tol=0.25,
    )
    for obs in observed_data
]

# Pool the per-building results into a single array.
theta_set_posterior_all = np.vstack(theta_set_posterior)

In [None]:

# Wrap both parameter arrays in DataFrames and tag each row with its origin.
df_post = pd.DataFrame(theta_set_posterior_all, columns=theta_names).assign(type="posterior")
df_sim = pd.DataFrame(theta_sim, columns=theta_names).assign(type="simulated")

# Stack the two tables so one figure can colour them by the "type" column.
df_all = pd.concat([df_post, df_sim], ignore_index=True)

# Lower-triangle pair plot with KDEs on the diagonal.
sbn.pairplot(
    df_all,
    hue="type",
    diag_kind="kde",
    plot_kws={"alpha": 0.4},
    corner=True,
)


In [None]:
# Example figure: empirical outputs against simulated outputs.

# Copy the output columns of each table and add a 'source' label column.
data_all_labeled = D_sim[output_names].assign(source='simulated')
Y_emp_labeled = D_emp[output_names].assign(source='empirical')

# Row-wise stack of the two labelled tables.
combined = pd.concat([data_all_labeled, Y_emp_labeled])

# Lower-triangle pair plot coloured by 'source'.
sbn.pairplot(combined, hue='source', corner=True, plot_kws={'alpha':0.5})


In [None]:

# Independent copies of the empirical table's output columns (Y_emp) and of its
# conditioning columns (X_emp).
Y_emp, X_emp = (D_emp[columns].copy() for columns in (output_names, x_names))

In [None]:
#import numpy as np
#Y_emp = Y_sim[np.logical_and(Y_sim['annual_demand_kwh_m2_year']>35 , Y_all['annual_demand_kwh_m2_year']<40)]
#X_target = X_sim[np.logical_and(Y_all['annual_demand_kwh_m2_year']>35 , Y_all['annual_demand_kwh_m2_year']<40)]


In [None]:
# ------------------------------------------------
# 6.  RUN DATA-DRIVEN CALIBRATION METHOD
# ------------------------------------------------
# NOTE(review): no earlier cell of this notebook defines X_sim / Y_sim, so this
# cell used to stop with a NameError. They are rebuilt here from D_sim, assuming
# the quantities to calibrate are the `theta_names` columns - confirm.
X_sim = D_sim[theta_names]
Y_sim = D_sim[output_names]
X_selected, Y_selected, Y_emp_array = X_sim.to_numpy(), Y_sim.to_numpy(), Y_emp.to_numpy()

# Fit the standard scaler on the simulated output responses; the same scaler is
# then applied to the empirical outputs so both live on one scale.
scaler = StandardScaler()
scaler.fit(Y_selected)

# For every empirical output row, look up its N_knn nearest simulated output
# rows in the scaled output space.
N_knn = 10
neigh = NearestNeighbors(n_neighbors=N_knn)
neigh.fit(scaler.transform(Y_selected))
_, knn_idx2 = neigh.kneighbors(scaler.transform(Y_emp_array))

# knn_idx2 has one row of neighbour indices per empirical row; flattening it
# row by row gives the same ordering as stacking the per-row selections.
flat_idx = knn_idx2.ravel()
theta_set_2 = X_selected[flat_idx]          # parameters of the selected simulations
Y_calibrated_knn_2 = Y_selected[flat_idx]   # outputs of the selected simulations
Y_calibrated_knn_pd = pd.DataFrame(Y_calibrated_knn_2, columns=output_names)



In [None]:
# --- Scatter of one output pair: simulation DB, k-NN selection and empirical data ---
namex, namey = 'annual_demand_kwh_m2_year', '5_demand_kwh/mq'

plt.figure(figsize=(6,4))

# (table, scatter keyword arguments) for each layer, added in this order.
scatter_layers = (
    (Y_sim, {'c': 'b', 'label': 'sim DB'}),
    (Y_calibrated_knn_pd, {'s': 5, 'alpha': 0.6, 'c': 'g', 'label': f'{N_knn}-knn'}),
    (Y_emp, {'c': 'r', 'alpha': 0.6, 'label': 'emp'}),
)
for table, style in scatter_layers:
    plt.scatter(table[namex], table[namey], **style)

plt.title("Simulated vs Empirical Outputs")
plt.xlabel(namex)
plt.ylabel(namey)
plt.legend()
plt.yticks([])  # y tick marks are removed; only the axis label remains
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# NOTE(review): `inputs`, `X_all` and `X_target` are not defined by any earlier
# cell of this notebook, so this cell used to stop with a NameError. They are
# rebuilt here under these assumptions - confirm:
#   inputs   -> the calibrated parameter columns (`theta_names`)
#   X_all    -> those columns for the whole simulation table
#   X_target -> those columns for `sel`, the rows the synthetic empirical data
#               was generated from
inputs = theta_names
X_all = D_sim[theta_names]
X_target = sel[theta_names]

X_calibrated_knn_pd = pd.DataFrame(theta_set_2, columns=inputs)

n_vars = len(inputs)
# squeeze=False keeps `axes` two-dimensional even for a single variable, where
# plt.subplots would otherwise return a bare Axes that cannot be indexed.
fig, axes = plt.subplots(n_vars, 1, figsize=(8, 3*n_vars), squeeze=False)

# One row of the figure per parameter, with three overlaid density histograms.
for ax, var in zip(axes[:, 0], inputs):
    sns.histplot(X_all[var], bins=15, color='b', alpha=0.3, kde=True, stat='density', ax=ax, label='sim')
    sns.histplot(X_calibrated_knn_pd[var], bins=15, color='g', alpha=0.5, kde=True, stat='density', ax=ax, label='calibrated (knn)')
    sns.histplot(X_target[var], bins=15, color='r', alpha=0.6, kde=True, stat='density', ax=ax, label='target (unknown)')

    ax.set_xlabel(var)
    ax.set_ylabel('Density')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import itertools

# NOTE(review): `inputs`, `X_all` and `X_target` are not defined by any earlier
# cell of this notebook, so this cell used to stop with a NameError. They are
# rebuilt here under these assumptions - confirm:
#   inputs   -> the calibrated parameter columns (`theta_names`)
#   X_all    -> those columns for the whole simulation table
#   X_target -> those columns for `sel`, the rows the synthetic empirical data
#               was generated from
inputs = theta_names
X_all = D_sim[theta_names]
X_target = sel[theta_names]

X_calibrated_knn_pd = pd.DataFrame(theta_set_2, columns=inputs)

# Generate all unique pairs of columns; one figure is produced per pair.
pairs = list(itertools.combinations(inputs, 2))
for px, py in pairs:
    # 2x2 grid: large scatter panel (top-left), marginal of px underneath it,
    # marginal of py to its right; the bottom-right cell stays unused.
    fig, axes = plt.subplots(
        2, 2, figsize=(10, 8),
        gridspec_kw={'height_ratios': [4, 1], 'width_ratios': [4, 1], 'hspace': 0.05, 'wspace': 0.05},
    )

    ax_scatter = axes[0, 0]
    ax_histx = axes[1, 0]
    ax_histy = axes[0, 1]

    # Scatter plot
    ax_scatter.scatter(X_all[px], X_all[py], s=10, alpha=0.6, c='b', label="sim")
    ax_scatter.scatter(X_calibrated_knn_pd[px], X_calibrated_knn_pd[py], s=10, alpha=0.8, c="g", label="calibrated (knn)")
    ax_scatter.scatter(X_target[px], X_target[py], s=40, alpha=0.9, c="r", label="target (unknown)")

    ax_scatter.set_xlabel(px)
    ax_scatter.set_ylabel(py)
    ax_scatter.legend()

    # Marginal histograms of px (values along the x axis).
    sns.histplot(X_all[px], bins=10, ax=ax_histx, color='b', alpha=0.3, kde=True, stat='density', fill=False)
    sns.histplot(X_target[px], bins=10, ax=ax_histx, color='r', alpha=0.3, kde=True, stat='density', fill=False)
    sns.histplot(X_calibrated_knn_pd[px], bins=10, ax=ax_histx, color='g', kde=True, alpha=0.5, stat='density', fill=False)
    ax_histx.set_xlabel(px)

    # Marginal histograms of py. histplot has no `orientation` argument: a
    # horizontal histogram is requested by assigning the data to `y`.
    sns.histplot(y=X_all[py], bins=10, ax=ax_histy, color='b', alpha=0.3, kde=True, stat='density')
    sns.histplot(y=X_target[py], bins=10, ax=ax_histy, color='r', alpha=0.3, kde=True, stat='density')
    sns.histplot(y=X_calibrated_knn_pd[py], bins=10, ax=ax_histy, color='g', alpha=0.5, kde=True, stat='density')
    ax_histy.set_ylabel(py)

    # Hide the empty subplot (bottom right)
    axes[1, 1].axis('off')

    plt.suptitle(f"Posterior Samples: {px} vs {py}", y=1.02)
    plt.show()

