In [None]:
# softmax_gp_composition_svgp.py
# Python >= 3.9; pip install numpy pandas scikit-learn tensorflow>=2.10 gpflow>=2.9
# This script:
#   1) loads sensory and ingredient CSVs (joined by 'sample_id'),
#   2) adds an 'Other' ingredient to preserve totals,
#   3) standardizes sensory inputs,
#   4) fits a Softmax-GP (SVGP multi-output) with a Monte-Carlo variational expectation,
#   5) predicts valid compositions (sum to 1, nonnegative),
#   6) (optional) evaluates Aitchison distance via CLR.

import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List, Dict, Tuple

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

import tensorflow as tf
import gpflow
from gpflow.kernels import RBF
from gpflow.inducing_variables import InducingPoints
from gpflow.kernels.multioutput import SeparateIndependent, SharedIndependent
gpflow.config.set_default_float(tf.float64)






In [32]:
# -----------------------------
# Utilities (simplex, CLR, metrics)
# -----------------------------
def to_simplex(rows: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    rows = np.clip(rows, eps, None)
    rows = rows / rows.sum(axis=1, keepdims=True)
    return rows

def clr(x: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    x = np.clip(x, eps, None)
    g = np.exp(np.mean(np.log(x), axis=1, keepdims=True))
    return np.log(x / g)

def aitchison_distance(y_true: np.ndarray, y_pred: np.ndarray, eps: float = 1e-12) -> np.ndarray:
    C1 = clr(y_true, eps)
    C2 = clr(y_pred, eps)
    return np.sqrt(np.sum((C1 - C2) ** 2, axis=1))

def softmax_tf(z, axis=-1):
    zmax = tf.reduce_max(z, axis=axis, keepdims=True)
    ez = tf.exp(z - zmax)
    return ez / tf.reduce_sum(ez, axis=axis, keepdims=True)

# -----------------------------
# Data handling
# -----------------------------
@dataclass
class DataSpec:
    sensory_csv: str
    ingredients_csv: str
    id_col: str = "sample_id"
    add_other: bool = True
    min_presence: int = 1     # keep an ingredient if present >= this many samples
    scale_X: bool = True

def load_and_align(spec: DataSpec) -> Tuple[pd.DataFrame, pd.DataFrame]:
    Xdf = pd.read_csv(spec.sensory_csv)
    Ydf = pd.read_csv(spec.ingredients_csv)
    df = Xdf.merge(Ydf, on=spec.id_col, how="inner")
    sensory_cols = [c for c in Xdf.columns if c != spec.id_col]
    ingredient_cols = [c for c in Ydf.columns if c != spec.id_col]
    X = df[[spec.id_col] + sensory_cols].copy()
    Y = df[[spec.id_col] + ingredient_cols].copy()
    return X, Y

def build_composition(Y: pd.DataFrame, spec: DataSpec) -> Tuple[np.ndarray, List[str]]:
    id_col = spec.id_col
    cols = [c for c in Y.columns if c != id_col]
    Yvals = Y[cols].to_numpy(dtype=float)

    # If given in 0..100, normalize to 0..1 by row sum
    row_sums = Yvals.sum(axis=1, keepdims=True)
    row_sums = np.where(row_sums == 0.0, 1.0, row_sums)
    Yvals = Yvals / row_sums

    # Presence filter
    present_counts = (Yvals > 0).sum(axis=0)
    keep_mask = present_counts >= spec.min_presence
    kept_cols_raw = [c for c, m in zip(cols, keep_mask) if m]
    Y_keep = Yvals[:, keep_mask]

    if spec.add_other:
        leftover = 1.0 - Y_keep.sum(axis=1)
        other = np.clip(leftover, 0.0, 1.0)
        Y_aug = np.concatenate([Y_keep, other[:, None]], axis=1)
        kept_cols = kept_cols_raw + ["Other"]
    else:
        # Renormalize kept subset
        s = Y_keep.sum(axis=1, keepdims=True)
        s = np.where(s == 0.0, 1.0, s)
        Y_aug = Y_keep / s
        kept_cols = kept_cols_raw

    Y_aug = to_simplex(Y_aug)
    return Y_aug, kept_cols

# -----------------------------
# Softmax-GP custom "likelihood"
# -----------------------------
class SoftmaxCompositional(gpflow.likelihoods.Likelihood):
    """
    Softmax-based compositional likelihood:
      log p(y | f) = tau * sum_k y_k * log softmax(f)_k
    Vector-valued: latent_dim = observation_dim = input_dim = D.
    """

    def __init__(self, D: int, tau: float = 100.0, eps: float = 1e-8):
        # GPflow 2.9+ requires dims
        super().__init__(input_dim=D, latent_dim=D, observation_dim=D)
        self.D = int(D)
        self.tau = tf.convert_to_tensor(tau, dtype=gpflow.default_float())
        self.eps = tf.convert_to_tensor(eps, dtype=gpflow.default_float())

    # ---------- helpers ----------
    @staticmethod
    def _softmax(z, axis=-1):
        zmax = tf.reduce_max(z, axis=axis, keepdims=True)
        ez = tf.exp(z - zmax)
        return ez / tf.reduce_sum(ez, axis=axis, keepdims=True)

    def _loglik_point(self, F, Y):
        # F, Y: (..., D)
        P = tf.clip_by_value(self._softmax(F), self.eps, 1.0)
        return self.tau * tf.reduce_sum(Y * tf.math.log(P), axis=-1)  # (...,)

    # ---------- required by GPflow Likelihood ----------
    def _log_prob(self, F, Y):
        return self._loglik_point(F, Y)

    def _variational_expectations(self, Fmu, Fvar, Y):
        # Monte-Carlo VE: E_q[f][ log p(Y|f) ]
        num_mc = 8
        eps = tf.random.normal(shape=(num_mc,) + tf.shape(Fmu), dtype=Fmu.dtype)
        F_sample = Fmu[None, ...] + tf.sqrt(Fvar[None, ...] + 1e-12) * eps  # (S,N,D)
        logp = self._loglik_point(F_sample, Y[None, ...])                  # (S,N)
        return tf.reduce_mean(logp, axis=0)                                 # (N,)

    def _predict_mean_and_var(self, Fmu, Fvar):
        # MC predictive moments in observation space
        num_mc = 64
        eps = tf.random.normal(shape=(num_mc,) + tf.shape(Fmu), dtype=Fmu.dtype)
        F_sample = Fmu[None, ...] + tf.sqrt(Fvar[None, ...] + 1e-12) * eps  # (S,N,D)
        P = self._softmax(F_sample, axis=-1)                                 # (S,N,D)
        mean = tf.reduce_mean(P, axis=0)                                     # (N,D)
        var = tf.math.reduce_variance(P, axis=0)                             # (N,D)
        # ensure simplex numerically
        mean = tf.clip_by_value(mean, self.eps, 1.0)
        mean = mean / tf.reduce_sum(mean, axis=-1, keepdims=True)
        return mean, var

    def _predict_log_density(self, Fmu, Fvar, Y):
        num_mc = 32
        eps = tf.random.normal(shape=(num_mc,) + tf.shape(Fmu), dtype=Fmu.dtype)
        F_sample = Fmu[None, ...] + tf.sqrt(Fvar[None, ...] + 1e-12) * eps
        logp = self._loglik_point(F_sample, Y[None, ...])  # (S,N)
        return tf.reduce_mean(logp, axis=0)                 # (N,)

    # convenience used by your predict() wrapper
    def predictive_mean_from_moments(self, Fmu, Fvar, mc: int = 64):
        eps = tf.random.normal(shape=(mc,) + tf.shape(Fmu), dtype=Fmu.dtype)
        F_sample = Fmu[None, ...] + tf.sqrt(Fvar[None, ...] + 1e-12) * eps
        P = self._softmax(F_sample, axis=-1)
        mean = tf.reduce_mean(P, axis=0)
        mean = tf.clip_by_value(mean, self.eps, 1.0)
        mean = mean / tf.reduce_sum(mean, axis=-1, keepdims=True)
        return mean


# -----------------------------
# Build & train the SVGP
# -----------------------------
@dataclass
class ModelConfig:
    num_inducing: int = 20
    ard: bool = True
    tau: float = 100.0
    mc_pred: int = 64
    max_iters: int = 5000
    lr: float = 0.01
    seed: int = 0

def build_svgp_softmax(X: np.ndarray, Y: np.ndarray, cfg: ModelConfig):
    tf.random.set_seed(cfg.seed)
    N, P = X.shape
    D = Y.shape[1]

    M = min(cfg.num_inducing, N)
    perm = np.random.RandomState(cfg.seed).permutation(N)[:M]
    Z = X[perm, :].copy()

    base_kern = RBF(lengthscales=np.ones(P), variance=1.0)  # ARD via vector lengthscales
    kern = SharedIndependent(base_kern, output_dim=D)

    lik = SoftmaxCompositional(D=D, tau=cfg.tau)  # <-- pass D here

    inducing = InducingPoints(Z.astype(np.float64))
    model = gpflow.models.SVGP(
        kernel=kern,
        likelihood=lik,
        inducing_variable=inducing,
        num_latent_gps=D,
        q_diag=True,
        whiten=True,
    )

    # small init for q_var improves stability
    # Zero mean
    model.q_mu.assign(tf.zeros_like(model.q_mu))

    # q_sqrt shape = (L, M) when q_diag=True
    model.q_sqrt.assign(1e-3 * tf.ones_like(model.q_sqrt))

    return model, lik, Z



def train_svgp(model, X, Y, cfg: ModelConfig):
    """
    Adam on the ELBO (negative objective in gpflow is -ELBO).
    """
    Xtf = tf.convert_to_tensor(X, dtype=tf.float64)
    Ytf = tf.convert_to_tensor(Y, dtype=tf.float64)

    opt = tf.optimizers.Adam(learning_rate=cfg.lr)

    @tf.function(autograph=False)
    def step():
        with tf.GradientTape() as tape:
            elbo = model.elbo((Xtf, Ytf))
            loss = -elbo
        grads = tape.gradient(loss, model.trainable_variables)
        opt.apply_gradients(zip(grads, model.trainable_variables))
        return elbo

    for it in range(cfg.max_iters):
        elbo_val = step()
        if (it + 1) % 500 == 0:
            tf.print("iter", it + 1, "ELBO", elbo_val)

# -----------------------------
# Predictions
# -----------------------------
def predict_composition(model, Xnew: np.ndarray, mc_samples: int = 64) -> np.ndarray:
    """
    Returns predictive mean composition (N*, D) by MC over q(f).
    """
    Xtf = tf.convert_to_tensor(Xnew, dtype=tf.float64)
    Fmu, Fvar = model.predict_f(Xtf, full_cov=False, full_output_cov=False)
    lik: SoftmaxCompositional = model.likelihood  # type: ignore
    Pmean = lik.predictive_mean_from_moments(Fmu, Fvar, mc=mc_samples).numpy()
    # Ensure valid simplex numerically
    Pmean = to_simplex(Pmean)
    return Pmean

# -----------------------------
# End-to-end pipeline
# -----------------------------
def run_pipeline(
    sensory_csv: str,
    ingredients_csv: str,
    id_col: str = "sample_id",
    min_presence: int = 1,
    add_other: bool = True,
    num_inducing: int = 20,
    tau: float = 100.0,
    max_iters: int = 5000,
    do_loocv: bool = True,
    seed: int = 0
) -> Dict:
    spec = DataSpec(
        sensory_csv=sensory_csv,
        ingredients_csv=ingredients_csv,
        id_col=id_col,
        add_other=add_other,
        min_presence=min_presence,
        scale_X=True
    )

    # Load
    Xdf, Ydf = load_and_align(spec)
    meds = Xdf.iloc[:,1:].median(axis=0)
    Xdf = Xdf.fillna(meds)
    X_cols = [c for c in Xdf.columns if c != id_col]
    X_raw = Xdf[X_cols].to_numpy(dtype=float)
    Y_mat, ing_names = build_composition(Ydf, spec)  # (N, D)

    # Standardize X
    xscaler = StandardScaler().fit(X_raw)
    X = xscaler.transform(X_raw)

    # Build + train model
    cfg = ModelConfig(
        num_inducing=num_inducing, tau=tau, max_iters=max_iters, seed=seed, lr=0.01, ard=True
    )
    model, lik, Z = build_svgp_softmax(X, Y_mat, cfg)
    train_svgp(model, X, Y_mat, cfg)

    artifacts = {
        "xscaler": xscaler,
        "model": model,
        "ingredient_names": ing_names,
        "X_cols": X_cols,
        "id_col": id_col,
        "config": cfg.__dict__,
    }

    # Optional LOOCV (or k-fold if you prefer)
    if do_loocv:
        N = X.shape[0]
        kf = KFold(n_splits=N, shuffle=True, random_state=seed)
        ad_all, l2_all = [], []
        for tr_idx, te_idx in kf.split(X):
            Xtr, Xte = X[tr_idx], X[te_idx]
            Ytr, Yte = Y_mat[tr_idx], Y_mat[te_idx]

            cfg_cv = ModelConfig(
                num_inducing=min(cfg.num_inducing, Xtr.shape[0]),
                tau=cfg.tau, max_iters=int(max_iters/2), seed=seed, lr=cfg.lr, ard=True
            )
            m_cv, _, _ = build_svgp_softmax(Xtr, Ytr, cfg_cv)
            train_svgp(m_cv, Xtr, Ytr, cfg_cv)

            Yhat = predict_composition(m_cv, Xte, mc_samples=cfg.mc_pred)
            ad = aitchison_distance(Yte, Yhat)
            l2 = np.sqrt(np.sum((Yte - Yhat) ** 2, axis=1))
            ad_all.extend(list(ad))
            l2_all.extend(list(l2))

        artifacts["cv"] = {
            "scheme": "LOOCV",
            "aitchison_mean": float(np.mean(ad_all)),
            "aitchison_std": float(np.std(ad_all)),
            "l2_mean": float(np.mean(l2_all)),
            "l2_std": float(np.std(l2_all)),
        }

    return artifacts

def predict_from_artifacts(artifacts: Dict, X_new: pd.DataFrame) -> pd.DataFrame:
    X_cols = artifacts["X_cols"]
    xscaler = artifacts["xscaler"]
    model = artifacts["model"]
    ing_names = artifacts["ingredient_names"]

    Xarr = X_new[X_cols].to_numpy(dtype=float)
    Xs = xscaler.transform(Xarr)
    Yhat = predict_composition(model, Xs, mc_samples=artifacts["config"].get("mc_pred", 64))
    Yhat_pct = 100 * Yhat
    return pd.DataFrame(Yhat_pct, columns=ing_names, index=X_new.index)

In [33]:
# -----------------------------
# Example usage
# -----------------------------

SENSORY_CSV = "../../data/recipes/data_sens.csv"      # columns: sample_id, s1..s8
ING_CSV     = "../../data/recipes/data_recipe.csv"  # columns: sample_id, many ingredients (0..100 or 0..1)

artifacts = run_pipeline(
    sensory_csv=SENSORY_CSV,
    ingredients_csv=ING_CSV,
    id_col="idx",
    min_presence=1,     # keep all ingredients as separate cols (rare ones flow into 'Other')
    add_other=True,
    num_inducing=20,
    tau=150.0,          # try 50..300; larger -> sharper fits to observed compositions
    max_iters=10,
    do_loocv=True,
    seed=0
)

if "cv" in artifacts:
    print("LOOCV Aitchison mean±std:",
            artifacts["cv"]["aitchison_mean"], "±", artifacts["cv"]["aitchison_std"])
    print("LOOCV L2 mean±std:",
            artifacts["cv"]["l2_mean"], "±", artifacts["cv"]["l2_std"])

# Demo predictions on the same X (replace with your new sensory rows)
Xdf, _ = load_and_align(DataSpec(SENSORY_CSV, ING_CSV))
preds = predict_from_artifacts(artifacts, Xdf.drop(columns=["sample_id"]))
print(preds.head())


TypeError: SoftmaxCompositional._variational_expectations() takes 4 positional arguments but 5 were given