In [1]:
import numpy as np
import pandas as pd
from resources.OTHERS.models import paraboloid_model as simulator
from src.forward import forward_propagation

from src.plot import plot_kde_2d
from src.backward import KNNCalibrator
import matplotlib.pyplot as plt
from src.plot import scatter_post

from src.doe import run_sequential_doe_knn, select_next_xi_by_eig

In [None]:
def sample_true_uncertainty_model_2d(N = 10, rng = None):
    """Draw ``N`` samples from the reference ("true") uncertainty model.

    Each returned row is ``[xa1, xa2, xe1, xe2]``: the two aleatoric entries
    are independent normal draws (mean 4.0, std 0.5) and the two epistemic
    entries are the fixed values ``(2.0, 0.0)`` repeated on every row.

    Parameters
    ----------
    N : int, default 10
        Number of samples (rows) to generate.
    rng : None, int or numpy.random.Generator, optional
        Source of randomness. ``None`` creates a fresh, OS-seeded generator on
        every call; an int is used as a seed; an existing ``Generator`` is
        used as-is.

    Returns
    -------
    numpy.ndarray of shape (N, 4)
    """
    # Resolve the generator at call time. A `default_rng()` placed in the
    # signature would be created once at definition time and silently shared
    # by every call.
    rng = np.random.default_rng(rng)
    xa = rng.normal(4.0, 0.5, size=(N, 2))   # aleatoric
    xe = np.tile([2.0, 0.0], (N, 1))         # epistemic (fixed, repeated)
    return np.hstack([xa, xe])

# Generate the reference empirical data set (CASE 3: a single design point).
N_emp = 200  # number of empirical samples drawn from the true uncertainty model
theta_true_cloud = sample_true_uncertainty_model_2d(N_emp)
xi_list_c3 = [-2.0]
observations_c3 = []
for xi in xi_list_c3:
    # One simulator response per row of theta_true_cloud at this design.
    # NOTE(review): y_emp is presumably (N_emp, 1) — confirm against the simulator.
    y_emp  = simulator(theta_true_cloud, xi)
    observations_c3.append((y_emp, xi))

print(f"CASE 3 - designs: {len(observations_c3)} samples: {observations_c3[0][0].shape[0]}")


In [None]:
class UM_theta:
    """Uncertainty model with normally distributed aleatoric inputs.

    Samples are rows ``[xa1, xa2, xe1, xe2]``: two aleatoric entries drawn
    from a normal distribution with mean ``a`` and standard deviation ``b``,
    followed by two fixed epistemic entries ``xe1`` and ``xe2``.
    """

    def __init__(self, a=None, b=None, xe1 = 2.0, xe2 = 0.0):
        """Store the model parameters.

        Parameters
        ----------
        a : float, optional
            Mean of the aleatoric normal distribution; ``None`` selects 4.0.
        b : float, optional
            Standard deviation of the aleatoric normal distribution; ``None``
            selects 0.5.
        xe1, xe2 : float
            Fixed epistemic values appended to every sample.
        """
        # `a` and `b` default to None so the fallbacks below are reachable
        # without callers having to pass None explicitly.
        self.a = 4.0 if a is None else a
        self.b = 0.5 if b is None else b
        self.xe1 = xe1
        self.xe2 = xe2

    def sample(self, n_samples=100, rng=None):
        """Return ``n_samples`` rows ``[xa1, xa2, xe1, xe2]``.

        Parameters
        ----------
        n_samples : int, default 100
            Number of rows to draw.
        rng : numpy.random.Generator, optional
            Generator used for the draws. When omitted the global
            ``np.random`` state is used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 4)
        """
        source = np.random if rng is None else rng
        xa = source.normal(self.a, self.b, size=(n_samples, 2))   # aleatoric
        xe = np.tile([self.xe1, self.xe2], (n_samples, 1))         # epistemic (fixed, repeated)
        return np.hstack([xa, xe])

    def update(self,a,b, xe1, xe2):
        """Overwrite all four model parameters in place."""
        self.a = a
        self.b = b
        self.xe1 = xe1
        self.xe2 = xe2

class UniformPrior:
    """Prior with uniformly distributed aleatoric inputs.

    Samples are rows ``[xa1, xa2, xe1, xe2]``: two aleatoric entries drawn
    uniformly between ``a`` and ``b``, followed by two fixed epistemic
    entries ``xe1`` and ``xe2``.
    """

    def __init__(self, a=-10 , b=10, xe1=2.0, xe2=0.0):
        """Store the bounds ``a`` (low) and ``b`` (high) and the fixed epistemic values."""
        self.a = a
        self.b = b
        self.xe1 = xe1
        self.xe2 = xe2

    def sample(self, n_samples=100, rng=None):
        """Return ``n_samples`` rows ``[xa1, xa2, xe1, xe2]``.

        Parameters
        ----------
        n_samples : int, default 100
            Number of rows to draw.
        rng : numpy.random.Generator, optional
            Generator used for the draws. When omitted the global
            ``np.random`` state is used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, 4)
        """
        source = np.random if rng is None else rng
        xa = source.uniform(self.a, self.b, size=(n_samples, 2))   # aleatoric
        xe = np.tile([self.xe1, self.xe2], (n_samples, 1))          # epistemic (fixed, repeated)
        return np.hstack([xa, xe])

    def update(self, a, b, xe1, xe2):
        """Overwrite both bounds and both epistemic values in place."""
        self.a = a
        self.b = b
        self.xe1 = xe1
        self.xe2 = xe2

In [None]:
# Build the simulation archive: prior draws pushed through the simulator at design xi0.
N = 500_000
xi0 = 1.0  # assume the first design was given
theta_uniform = UniformPrior(-20, 20).sample(N)
# Constant design column, one entry per prior sample. Built directly with
# np.full instead of drawing N random numbers and multiplying them by zero.
xi_uniform = np.full((N, 1), xi0)
Y_sim = simulator(theta_uniform, xi=xi_uniform)
df_sim = pd.DataFrame(
    np.hstack([xi_uniform, theta_uniform, Y_sim]),
    columns=['xc', 'xa1', 'xa2', 'xe1', 'xe2', 'y'],
)

# Generate empirical data from the "unknown" pdf at the same design.
pdf_theta = UM_theta(a=4.0, b=0.5)
y, samples = forward_propagation(pdf_theta, simulator, xi=xi0, n_samples=500)  # empirical data
D_emp = {'xi': xi0, 'y': y}

In [None]:
# Calibrate with the kNN calibrator to obtain posterior samples of the aleatoric inputs.

# Only the first two theta columns (the aleatoric inputs) are calibrated.
simulated_data = {"y": Y_sim, "theta": theta_uniform[:,:2], "xi": xi_uniform}

calib_comb = KNNCalibrator(knn=50, evaluate_model=False, a_tol=0.15)
# Use xi0 rather than a repeated literal so the archive and the observations
# cannot drift to different designs.
calib_comb.setup(simulated_data=simulated_data, xi_list=[xi0])

obs = [(y.reshape(-1,1), xi0)]
post_reuse = calib_comb.calibrate(obs, combine="stack", resample_n=10000)
xa_posterior = post_reuse["theta"]

fig, ax = plt.subplots(figsize=(6,5))
scatter_post(ax, xa_posterior, truth=theta_true_cloud[:,:2],
             title=r"Posterior $p(\theta|Y^e,\xi)$ via kNN + in-out simulations ")
plot_kde_2d(xa_posterior[:,:2], true_theta=theta_true_cloud[:,:2], ax=ax)
plt.show()

# Forward push: append the fixed epistemic columns and simulate y for the
# posterior p(\theta|Y^e,\xi) at xi0.
xe = np.tile([2.0, 0.0], (np.shape(xa_posterior)[0], 1))         # epistemic (fixed, repeated)
theta_posterior = np.hstack([xa_posterior, xe])
# NOTE(review): an array is passed here, whereas the empirical-data cell passes an
# object exposing .sample(); confirm forward_propagation accepts both.
y_post_xi0, samples_posterior_xi0 = forward_propagation(theta_posterior, simulator, xi=xi0, n_samples=500)  # posterior predictive

# Compare conditional prior vs conditional posterior; the histograms are
# semi-transparent so neither hides the other.
fig_cmp, ax_cmp = plt.subplots()
ax_cmp.hist(Y_sim, density=True, alpha=0.5, label=r'prior $p(y|\xi)$')
ax_cmp.hist(y_post_xi0, density=True, alpha=0.5, label=r'posterior $p(y|Y^e, \xi)$')
ax_cmp.set_xlabel("y")
ax_cmp.set_ylabel("density")
ax_cmp.legend()
ax_cmp.grid()
plt.show()

In [None]:

# Score every candidate design with the EIG proxy and pick the best one.
xi_candidates = np.linspace(-5, 5, 21)   # design grid to search over
eig_result = select_next_xi_by_eig(
    simulator=simulator,
    xa_posterior=xa_posterior,
    xi_candidates=xi_candidates,
    xe_fixed=(2.0, 0.0),
    n_eval=20000,
    k_mi=50,
    seed=0,
)
xi_next, eig_scores, diag = eig_result

print("Next xi:", xi_next)
print("Diagnostics:", diag)

# Score curve over the grid, with the selected design marked by a dashed line.
fig_eig, ax_eig = plt.subplots()
ax_eig.plot(xi_candidates, eig_scores, marker="o", markersize=3)
ax_eig.axvline(xi_next, linestyle="--")
ax_eig.set_title("EIG proxy (MI) vs xi")
ax_eig.set_xlabel("xi")
ax_eig.set_ylabel("I(xa ; y | xi)")
ax_eig.grid(True)
plt.show()


In [None]:
# Inputs for the sequential design-of-experiments run.
xi_candidates = np.linspace(-5, 5, 41)

pdf_theta_true = UM_theta(a=4.0, b=0.5)        # "real" uncertainty model
pdf_theta_sim  = UniformPrior(a=-20, b=20)     # simulator archive prior

# Tunable settings collected in one place, then forwarded as keyword arguments.
doe_settings = dict(
    xi0=1.0,
    nq=5,
    n_emp=100,
    n_sim=250_000,
    knn=50,
    a_tol=0.05,
    combine="stack",       # strongly recommended over intersect for stability
    resample_n=2_000,
    n_eval_eig=5_000,
    k_mi=30,
    seed=0,
)

hist = run_sequential_doe_knn(
    simulator=simulator,
    pdf_theta_true=pdf_theta_true,
    pdf_theta_prior=pdf_theta_sim,
    xi_candidates=xi_candidates,
    **doe_settings,
)


In [None]:
# One posterior figure per entry of hist['post'].
KDE_MAX_POINTS = 2500  # cap on the number of posterior samples passed to the KDE plot

for post_xa_i in hist['post']:

    fig, ax = plt.subplots(figsize=(6,5))
    scatter_post(ax, post_xa_i, truth=theta_true_cloud[:,:2],
                 title=r"Posterior $p(\theta|Y^e,\xi)$ via kNN + in-out simulations ")
    plot_kde_2d(post_xa_i[:KDE_MAX_POINTS,:2], true_theta=theta_true_cloud[:,:2], ax=ax)
    ax.set_xlim([-15,15])
    ax.set_ylim([-15,15])
    plt.show()
    # Release the figure so figures do not accumulate in memory across iterations.
    plt.close(fig)

In [None]:
# quick diagnostic plots
xi_candidates = np.linspace(-5, 5, 41)  # linspace already returns a float array

fig_diag, ax_diag = plt.subplots()
# One EIG-proxy curve per recorded step.
for sc in hist["scores"]:
    ax_diag.plot(xi_candidates, sc, alpha=0.4)

# Peak score of each step, prefixed with NaN so the list has one more entry
# than hist["scores"]; it is plotted against hist["xi"], so the first xi gets
# no marker.
best_scores = [np.nan] + [np.max(sc) for sc in hist["scores"]]
ax_diag.scatter(hist["xi"], best_scores, s=20)
ax_diag.set_title("EIG proxy curves over sequential steps")
ax_diag.set_xlabel("xi")
ax_diag.set_ylabel("I(xa ; y | xi)")
ax_diag.grid(True)
plt.show()

print("chosen xis:", hist["xi"])
# List-valued timing entries are summed; other values are printed as-is.
print("timing:", {k: (np.sum(v) if isinstance(v, list) else v) for k,v in hist["timing"].items()})


# Tasks:
Use the available information to select a new design set (a set of points) $X_c = \{ x_{c,q}, ~ q=1,...,n_q \}$, up to a maximum experimental budget $n_q$, at which to collect new model responses so as to maximize the uncertainty reduction in $x_e, x_a$. Each query generates a new empirical data set, in addition to the $q=0$ data set, which is already available.

* **CASE 1** Sequential design --> Assume we can query $q=1,...,n_q$ sequentially.

* **CASE 2** Non-sequential design --> Assume we select $q=1,...,n_q$ jointly and then gather the data all at once.



## Optimal Design of Experiments Formulation

Let $x = (x_a, x_e, x_c)$, where $x_a$ are aleatoric variables, $x_e$ epistemic parameters, and
$x_c$ design variables. Let $y$ denote the observed system response.
We use $p(\cdot)$ to denote probability density functions.
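A superscript $(q)$, where used, indicates conditioning on the design $x_{c,q}$; the subscript $q$ indexes the design points $x_{c,q}$ and the corresponding responses $y_q$.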

---

### CASE 2: Non-sequential (batch) experimental design

Let $X_c = \{ x_{c,1}, \ldots, x_{c,n_q} \}$ be a batch of design points.

#### CASE 2A: Conditional independence assumption

Assume that the observations $\{y_q\}_{q=1}^{n_q}$ are conditionally independent given
$x_e$ and $x_a$, i.e.

$$
p(y_{1:n_q} \mid x_e, x_a, X_c)
= \prod_{q=1}^{n_q} p(y_q \mid x_e, x_a, x_{c,q}).
$$

The batch design problem is formulated as maximization of the total expected information gain:

$$
X_c^\star
= \arg\max_{X_c}
\sum_{q=1}^{n_q}
\mathbb{E}_{p(y \mid x_{c,q})}
\left[
D_{\mathrm{KL}}
\left(
p(x_e \mid y, x_{c,q})
\;\|\;
p(x_e)
\right)
\right].
$$

Equivalently, using the joint expectation form:

$$
X_c^\star
= \arg\max_{X_c}
\sum_{q=1}^{n_q}
\mathbb{E}_{p(x_e)}
\mathbb{E}_{p(x_a)}
\mathbb{E}_{p(y \mid x_e, x_a, x_{c,q})}
\left[
\log
\frac{
p(y \mid x_e, x_{c,q})
}{
p(y \mid x_{c,q})
}
\right].
$$

---

#### CASE 2B: No independence assumption (joint entropy formulation)

If the observations are not conditionally independent, the batch design must be optimized
jointly by maximizing the mutual information between $x_e$ and the full observation vector
$y_{1:n_q}$:

$$
X_c^\star
= \arg\max_{X_c}
I(x_e ; y_{1:n_q} \mid X_c).
$$

This can be written explicitly as:

$$
X_c^\star
= \arg\max_{X_c}
\mathbb{E}_{p(x_e)}
\mathbb{E}_{p(y_{1:n_q} \mid x_e, X_c)}
\left[
\log
\frac{
p(y_{1:n_q} \mid x_e, X_c)
}{
p(y_{1:n_q} \mid X_c)
}
\right].
$$

This formulation accounts for dependencies across designs and does not decompose into
independent contributions.

---

### CASE 1: Sequential experimental design

Let $\mathcal{D}^{\mathrm{emp}}_{1:q-1}$ denote the empirical data collected up to step $q-1$.
At iteration $q$, the next design point is selected by maximizing the one-step expected
information gain:

$$
x_{c,q}^\star
=
\arg\max_{x_c \in \mathcal{X}_c}
\mathbb{E}_{p(y \mid x_c, \mathcal{D}^{\mathrm{emp}}_{1:q-1})}
\left[
D_{\mathrm{KL}}
\left(
p(x_e \mid y, x_c, \mathcal{D}^{\mathrm{emp}}_{1:q-1})
\;\|\;
p(x_e \mid \mathcal{D}^{\mathrm{emp}}_{1:q-1})
\right)
\right].
$$

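Equivalently, in joint-expectation form, with the prior on $x_e$ replaced by the current posterior given $\mathcal{D}^{\mathrm{emp}}_{1:q-1}$:

$$
x_{c,q}^\star
=
\arg\max_{x_c \in \mathcal{X}_c}
\mathbb{E}_{p(x_e \mid \mathcal{D}^{\mathrm{emp}}_{1:q-1})}
\mathbb{E}_{p(x_a)}
\mathbb{E}_{p(y \mid x_e, x_a, x_c)}
\left[
\log
\frac{
p(y \mid x_e, x_c)
}{
p(y \mid x_c, \mathcal{D}^{\mathrm{emp}}_{1:q-1})
}
\right].
$$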

After querying the system at $x_{c,q}^\star$, the posterior distribution of $x_e$ is updated,
and the procedure is repeated until the experimental budget is exhausted.

