# Machine learning example

In this notebook I will show all of the machine learning steps that are necessary for simulation based metabolic flux inference.

In [1]:
from sbmfi.models.small_models import spiro
from sbmfi.priors.uniform import UniformRoundedFleXchPrior
from sbmfi.core.polytopia import FluxCoordinateMapper, simplify_vertices
from sbmfi.settings import BASE_DIR

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
import math
import torch
import time
import tqdm
import pickle 

import numpy as np
import pandas as pd

import arviz as az
import holoviews as hv
hv.extension('bokeh')

file = os.path.join(BASE_DIR, 'spiro_50k.h5')
dataset_id = 'ds_1'



### ALWAYS LOAD THE CELL BELOW

In [2]:
model, kwargs = spiro(
    backend='torch',
    auto_diff=False,
    batch_size=1,
    add_biomass=True,
    v2_reversible=True,
    ratios=True,
    build_simulator=True,
    add_cofactors=True,
    which_measurements='lcms',
    seed=2,
    measured_boundary_fluxes = ('h_out', ),
    which_labellings=['A', 'B'],
    include_bom=True,
    v5_reversible=False,
    n_obs=0,
    kernel_id='svd',
    L_12_omega = 1.0,
    clip_min=None,
    transformation='ilr',
)
basebayes = kwargs['basebayes']

Set parameter Username
Academic license - for non-commercial use only - expires 2025-07-27


  _C._set_default_tensor_type(t)


## Small *spiro* model

In the cell below, we create the spiro model. We also automatically create a simulator that simulates labelling for 2 different labelling states named `'A'` and `'B'`. The simulator includes a boundary observation model for the boundary fluxes `['bm', 'd_out', 'h_out']` with errors drawn from a multivariate Gaussian. Note that in this incarnation of the model, we do not check whether the noisy boundary fluxes lie in the flux polytope.

Displayed below are the reactions of the model

In [154]:
model.flux_coordinate_mapper.labelling_fluxes_id

Index(['v1', 'v2', 'v3', 'v4', 'v5_rev', 'v6', 'v7', 'd_out', 'f_out', 'bm',
       'h_out', 'a_in', 'v2_rev'],
      dtype='object')

In [155]:
for reaction in model.reactions:
    print(reaction, reaction.bounds)

a_in:  --> A/ab (10.0, 10.0)
d_out: D/abc -->  (0.0, 100.0)
f_out: F/a -->  (0.0, 100.0)
h_out: H/ab -->  (0.0, 100.0)
v1: A/ab --> B/ab (0.0, 100.0)
v2: B/ab ==> E/ab (0.0, 100.0)
v3: B/ab + E/cd --> C/abcd + cof (0.0, 100.0)
v4: E/ab --> H/ab (0.0, 100.0)
v5: F/a + D/bcd <-- C/abcd (-100.0, 0.0)
v6: D/abc --> E/ab + F/c (0.0, 100.0)
v7: F/a + F/b --> H/ab (0.0, 100.0)
bm: 0.3 H/. + 0.6 B/. + 0.5 E/. + 0.1 C/. -->  (0.05, 1.5)
EX_cof: cof -->  (0.0, 1000.0)


These are the measurements that we assume to have access to for both labelling conditions.  

In [156]:
print(f"number of LC-MS signals for labelling condition A: {kwargs['annotation_df']['A'].shape}, and B {kwargs['annotation_df']['B'].shape}")

number of LC-MS signals for labelling condition A: (14, 9), and B (10, 9)


In [157]:
kwargs.keys()

dict_keys(['annotation_df', 'substrate_df', 'measured_boundary_fluxes', 'measurements', 'fluxes', 'true_theta', 'basebayes'])

In [158]:
kwargs['substrate_df']

Unnamed: 0,A/00,A/01,A/10,A/11
A,0.2,0.0,0.0,0.8
B,0.0,1.0,0.0,0.0


In [159]:
kwargs['true_theta']

theta_id,R_svd0,R_svd1,R_svd2,R_svd3,v2_xch
samples_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
v,-0.789858,2.06266,0.605007,1.745831,0.5


In [160]:
model.flux_coordinate_mapper.map_theta_2_fluxes(kwargs['true_theta'], pandalize=True)

Unnamed: 0_level_0,EX_cof,v1,v2,v3,v4,v5_rev,v6,v7,d_out,f_out,bm,h_out,a_in,v2_rev
samples_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
v,8.2,10.0,2.4,8.2,1.332268e-15,8.05,8.05,8.05,6.661338e-16,-1.776357e-15,1.5,7.6,10.0,1.5


In [161]:
kwargs['measurements']

labelling_id,A,A,A,A,A,A,A,A,A,B,B,B,B,B,BOM,BOM
data_id,ilr_C_0,ilr_C_1,ilr_D_0,ilr_D_1,ilr_H_0,ilr_L_0,ilr_L_1,ilr_L_2,"ilr_L|[1,2]_0",ilr_C_0,ilr_D_0,ilr_H_{M+Cl}_0,ilr_H_0,"ilr_L|[1,2]_0",h_out,bm
0,-1.984299,-1.854073,-2.222083,-1.663253,-0.174556,-1.594347,-2.142986,-2.912785,-1.470387,-4.456696,-2.810111,-0.381061,-0.381061,-1.648991,7.6,1.5


In [162]:
kwargs['basebayes'].to_partial_mdvs(kwargs['measurements'])

labelling_id,A,A,A,A,A,A,A,A,A,A,...,B,B,B,B,B,B,B,B,BOM,BOM
data_id,C+0,C+3,C+4,D+0,D+2,D+3,H+0,H+1,L+0,L+1,...,D+0,D+2,H_{M+Cl}+0,H_{M+Cl}+1,H+0,H+1,"L|[1,2]+0","L|[1,2]+1",h_out,bm
0,0.017559,0.29055,0.691892,0.016376,0.379294,0.604331,0.438596,0.561404,0.003744,0.035695,...,0.018449,0.981551,0.368443,0.631557,0.368443,0.631557,0.088505,0.911495,7.6,1.5


In [163]:
kwargs['annotation_df']['A']

Unnamed: 0,met_id,nC13,adduct_name,mz,rt,sigma,omega,total_I,formula
0,C,0,M-H,157.018955,4.0,0.02,,700000.0,C4H6N4OS
1,C,3,M-H,160.02902,4.0,0.02,,700000.0,C4H6N4OS
2,C,4,M-H,161.032375,4.0,0.02,,700000.0,C4H6N4OS
3,D,0,M-H,37.008374,5.0,0.01,,100000.0,C3H2
4,D,2,M-H,39.015083,5.0,0.01,,100000.0,C3H2
5,D,3,M-H,40.018438,5.0,0.01,,100000.0,C3H2
6,H,0,M-H,25.008374,1.0,0.01,,3000.0,C2H2
7,H,1,M-H,26.011728,1.0,0.01,,3000.0,C2H2
8,L,0,M-H,153.926096,6.0,0.01,,400000.0,C5KNaSH
9,L,1,M-H,154.92945,6.0,0.01,,400000.0,C5KNaSH


In [164]:
kwargs['annotation_df']['B']

Unnamed: 0,met_id,nC13,adduct_name,mz,rt,sigma,omega,total_I,formula
0,C,0,M-H,157.018955,4.0,0.02,,700000.0,C4H6N4OS
1,C,3,M-H,160.02902,4.0,0.02,,700000.0,C4H6N4OS
2,D,0,M-H,37.008374,5.0,0.01,,100000.0,C3H2
3,D,2,M-H,39.015083,5.0,0.01,,100000.0,C3H2
4,H,0,M-H,25.008374,1.0,0.01,,3000.0,C2H2
5,H,1,M-H,26.011728,1.0,0.01,,3000.0,C2H2
6,H,0,M+Cl,60.985051,1.0,0.03,,2000.0,C2H2
7,H,1,M+Cl,61.988406,1.0,0.03,,2000.0,C2H2
8,"L|[1,2]",0,M-H,136.972776,6.0,0.01,1.0,40000.0,C2H2O7
9,"L|[1,2]",1,M-H,137.976131,6.0,0.01,1.0,40000.0,C2H2O7


## Simulating a dataset

In [178]:
from sbmfi.core.simulator import DataSetSim

In [179]:
n = 50000

simulator = DataSetSim(
    model = model,
    substrate_df = kwargs['substrate_df'],
    mdv_observation_models = basebayes._obmods,
    boundary_observation_model = basebayes._bom,
    num_processes=0,  
    epsilon=1e-12,
)
prior = UniformRoundedFleXchPrior(model._fcm)

In [180]:
theta = prior.sample((n,))

In [181]:
labelling_fluxes = model.flux_coordinate_mapper.map_theta_2_fluxes(theta, rescale_val=None)
result = simulator.simulate_set(
    labelling_fluxes,
    n_obs=3,
    fluxes_per_task=None,
    what='all',
    break_i=-1,
    close_pool=True,
    show_progress=True,
)

100%|██████████████████████████████████████████████████████| 100000/100000 [12:09<00:00, 137.04it/s]


In [182]:
result.pop('running_time', None)
simulator.to_hdf(
    hdf=file,
    result=result,
    dataset_id=dataset_id,
    append=True,
    expectedrows_multiplier=3,
)

## Representing labelling measurements in a reduced latent space

As a back-of-the-envelope calculation, we can imagine that by LC-MS we can measure around 40 CCM metabolites in *E.coli*. Furthermore, lets imagine that on average we can measure 3 mass isotopomers per metabolite per labelling experiment. If we then do 3 labelling experiments (different substrate labellings), we have a total of `40 * 3 * 3 = 360` numbers to represent the labelling state that we use for inference. 

The first thing that we should notice is that MDVs are an inefficient way of representing labelling data. To represent the labelling state of acetate, `ac`, as an MDV we need three numbers `[ac+0, ac+1, ac+2]`. Since by definition an MDV is a point on a probability simplex, there are actually only 2 degrees of freedom for the acetate MDV, since we know it sums to 1. By applying the isometric log-ratio transform to the MDV, we can represent the labelling state using only 2 real (i.e. $\mathbb{R}$) numbers without any loss of information.

By applying the isometric log-ratio transform to all metabolite MDVs, we can now represent the labelling data with `40 * (3-1) * 3 = 240` numbers, and on top of that, these are uncorrelated real numbers unlike when using the MDV representation.

Another inefficiency is that different metabolites within a labelling experiment carry similar information. For example, Alanine is made from pyruvate and thus has a similar MDV as pyruvate. Differences can occur because of the functioning of the LC-MS. For instance `ala+1` might not be measured whereas `pyr+1` could be or there are vastly different noise levels between the two signals.

Generally, if we try to infer 20 free fluxes across many labelling experiments resulting in hundreds of independent mass isotopomer measurements, we should try to compress the data to roughly 20 dimensions.

Except for labelling measurements, we typically also have access to measurements of some boundary fluxes such as growth rate (i.e. biomass flux) and uptake of substrate / excretion of some fermentation products.

In [22]:
from sbmfi.inference.mdvae import MDVAE_Dataset, ray_train_MDVAE, MDFVAE
from sbmfi.core.simulator import _BaseSimulator

import torch
from torch.utils.data import Dataset, DataLoader, random_split

from torch import nn

create training and validation data-sets

In [23]:
model._fcm.fluxes_id

Index(['EX_cof', 'v1', 'v2', 'v3', 'v4', 'v5_rev', 'v6', 'v7', 'd_out',
       'f_out', 'bm', 'h_out', 'a_in', 'v2_rev'],
      dtype='object')

In [24]:
model.state_id

Index(['H+0', 'H+1', 'H+2', 'C+0', 'C+1', 'C+2', 'C+3', 'C+4', 'D+0', 'D+1',
       'D+2', 'D+3', 'L|[1,2]+0', 'L|[1,2]+1', 'L|[1,2]+2', 'L+0', 'L+1',
       'L+2', 'L+3', 'L+4', 'L+5'],
      dtype='object', name='mdv_id')

In [25]:
import tables as pt

hdf = pt.open_file(file, mode='r')
what_id = pd.Index(hdf.root[f'mdv_id'].read().astype(str), name=f'mdv_id')
what_id

Index(['H+0', 'H+1', 'H+2', 'C+0', 'C+1', 'C+2', 'C+3', 'C+4', 'D+0', 'D+1',
       'D+2', 'D+3', 'L|[1,2]+0', 'L|[1,2]+1', 'L|[1,2]+2', 'L+0', 'L+1',
       'L+2', 'L+3', 'L+4', 'L+5'],
      dtype='object', name='mdv_id')

In [27]:
DENOISE = True  # whether to feed denoised data (data without observation model noise added)
INCLUDE_BOM = False # whether to include the boundary fluxes in the VAE compression

if not simulator._la.backend == 'torch':
    raise ValueError
mdvs = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='mdv') if DENOISE else None
data = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='data')
fluxes = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='fluxes')
mu = basebayes.simulate(labelling_fluxes=fluxes, mdvs=mdvs, n_obs=0) if DENOISE else None

# here we check whether there are any data-dimensions that are independent of fluxes; these should be removed because otherwise the denoising MDVAE does not work (dividing by 0 std leads to inf values)
unchanging = mu[:, 0].std(0) < 1e-4
if unchanging.any(): 
    print(f'Found unchanging data-dimensions: {simulator.data_id[unchanging.numpy()]}')

# simulator.to_partial_mdvs(mu[:6, 0], pandalize=True)
show_unchanged = pd.DataFrame(mu[:6, 0].numpy(), columns=simulator.data_id).head(6)

mu = mu[..., ~unchanging]
data = data[..., ~unchanging]

if (simulator._bom is not None) and not INCLUDE_BOM:
    mu = mu[..., :-simulator._bomsize] if DENOISE else None
    data = data[..., :-simulator._bomsize]

dataset = MDVAE_Dataset(data, mu, standardize=True)

n_validate = math.ceil(0.10 * len(dataset))  # 10 % of the data are keps as validation

train_ds, val_ds = random_split(
    dataset,
    lengths=(len(dataset) - n_validate, n_validate),
    generator=simulator._la._BACKEND._rng  # makes sure we get the same split every time
)

torch.save(train_ds, os.path.join(BASE_DIR, 'train_ds.pt'))
torch.save(val_ds, os.path.join(BASE_DIR, 'val_ds.pt'))

show_unchanged

Found unchanging data-dimensions: MultiIndex([('A', 'ilr_L|[1,2]_0')],
           names=['labelling_id', 'data_id'])


labelling_id,A,A,A,A,A,A,A,A,A,B,B,B,B,B,BOM,BOM
data_id,ilr_C_0,ilr_C_1,ilr_D_0,ilr_D_1,ilr_H_0,ilr_L_0,ilr_L_1,ilr_L_2,"ilr_L|[1,2]_0",ilr_C_0,ilr_D_0,ilr_H_{M+Cl}_0,ilr_H_0,"ilr_L|[1,2]_0",h_out,bm
0,-0.193931,-2.245,-1.184312,-1.651334,0.0,-1.280541,-1.697968,-2.733157,-1.470387,-2.144961,-3.417525,-0.041343,-0.041343,-1.455631,2.24395,0.510387
1,-0.372081,-2.166727,-1.242657,-1.640742,0.0,-1.287965,-1.708803,-2.737239,-1.470387,-2.475584,-3.408142,-0.140248,-0.140248,-1.572628,1.76788,0.933142
2,0.264659,-2.467023,-1.089447,-1.671177,0.0,-1.183494,-1.548523,-2.685133,-1.470387,-1.301162,-3.436325,-0.25513,-0.25513,-1.10424,2.15235,0.161464
3,-0.142867,-2.268927,-1.194264,-1.649439,-0.110316,-1.589131,-2.058328,-2.875,-1.470387,-2.158167,-3.428856,0.752157,0.752157,-1.415794,2.379396,1.012409
4,0.27007,-2.469807,-1.091747,-1.670659,-0.081514,-1.532914,-2.000631,-2.849005,-1.470387,-1.31413,-3.437807,0.802178,0.802178,-1.0992,2.72213,0.38284
5,0.916097,-2.813323,-1.030584,-1.685064,-0.030222,-1.449411,-1.905989,-2.805549,-1.470387,-0.153183,-3.45278,0.793963,0.793963,-0.528556,4.065055,0.151502


Inspecting the data in the cell above we see that `ilr_L|[1,2]_0` in labelling condition `A` is unchanging, and therefore cannot carry useful information about fluxes. This dimension must be removed from the data. 

The observations of the boundary fluxes should lie in or close to the flux polytope projected onto the exchange flux dimensions; we might thus not want to include these as indicated by the `INCLUDE_BOM` flag.

## KANKER

In [15]:
# import holoviews as hv
# from holoviews.operation import gridmatrix

# hv.extension('bokeh')

In [16]:
# # n_plot = 10
# plot_df = pd.DataFrame(train_ds[:][0][:2000, :n_plot].numpy(), columns=simulator.data_id[~unchanging.numpy()][:n_plot].map('_'.join))

# ds = hv.Dataset(plot_df)
# grid = gridmatrix(ds, diagonal_type=hv.Scatter)
# # grid.opts(shared_axes=False, axiswise=True)

https://medium.com/@ragy202/addressing-posterior-collapse-in-chemical-vaes-151c0f210388

https://arxiv.org/abs/1903.10145

https://medium.com/@david.daeschler/insights-from-developing-a-vae-fbdb2e6ba31f

https://github.com/hubertrybka/vae-annealing

https://arxiv.org/abs/2309.13160

https://arxiv.org/abs/2004.12585

https://arxiv.org/pdf/2310.15440

https://arxiv.org/pdf/1602.02282.pdf

https://www.reddit.com/r/MachineLearning/comments/8wmbof/d_variational_autoencoder_confusion_am_i_wrong/

https://openreview.net/pdf/d8e0df2b7afeaa076f0e448e960df6d5365069c9.pdf

https://towardsdatascience.com/variational-inference-with-normalizing-flows-on-mnist-9258bbcf8810

In [17]:
# from normflows import NormalizingFlowVAE

THIS MIGHT BE GOLDEN!

https://github.com/VincentStimper/normalizing-flows/blob/master/examples/vae.ipynb

Variational auto-encoder

\begin{align*}
\text{ELBO}(x) &= \mathbb{E}_{q_{\phi}(z|x)}[\log p_{\theta}(x|z)] - KL\big[q_{\phi}(z|x) || p(z)\big]
\end{align*}


\begin{align*}
\text{ELBO}(x) &= \mathbb{E}_{q_{\phi}(z|x)}[\log p_{\theta}(x|z)] - KL\big[q_{\phi}(z|x) || f(z)\big]
\end{align*}


$\mathbb{E}_{q_{\phi}(z|x)}[-\frac{1}{2} \sum_{i=1}^D (x_i - \hat{x}_i)^2 / \sigma^2 - \frac{D}{2} \log(2\pi\sigma^2)]$



In [18]:
# n_bottleneck = 11
# b = torch.tensor(n_bottleneck // 2 * [0, 1] + n_bottleneck % 2 * [0])

# b

In [19]:
# n_steps = 1000
# steps = np.arange(n_steps)


# cyclical_beta = np.vectorize(lambda x: cyclical_annealing(x, n_steps, n_cycles=3, cycle_frac=0.8))(steps)
# # hv.Scatter((steps, cyclical_beta))

In [20]:
# N_LATENT = len(simulator.theta_id)  # we assume that the latent dimension equals the number of free fluxes!

# mdvae, losses = ray_train_MDVAE({
#     'n_epoch': 12,
#     'n_hidden':0.7, 
#     'n_latent': N_LATENT, 
#     'n_hidden_layers': 3, 
#     'learning_rate': 3e-4, 
#     'batch_size': 32,
#     'LR_gamma': 0.9,
#     'beta':0.001,
#     'beta_annealing': 'constant',
# }, cwd=BASE_DIR, show_progress=True)

In [21]:
# torch.save(mdvae, f'{BASE_DIR}\mdvae_LINANNEAL_hid_lay.p')

In [22]:
# losses.to_csv('losses_LINANNEAL_hid_lay.csv')

TODO: make a loss function that looks as follows:  `loss = mse + beta * KL`, but where `beta = 0 if mse>0.2 else KL is ` 

In [23]:
# plot_df = losses.loc[losses['train0_val1'] == 0].copy()
# plot_df['step'] = np.arange(plot_df.shape[0])
# hv.Scatter(plot_df, kdims=['step'], vdims=['loss'])

In [24]:
# torch.set_printoptions(linewidth=200)

# x_in, y_in = val_ds[[66,12,50]]
# with torch.no_grad():
#     x_hat, mean, log_var = mdvae.forward(x_in)
# print(y_in.round(decimals=4))
# print(x_hat.round(decimals=4))
# print((y_in -x_hat).round(decimals=4))

In [25]:
# x_in, y_in = val_ds[12]
# with torch.no_grad():
#     x_hat, mean, log_var = mdvae.forward(x_in)
# print(y_in)
# torch.round(x_hat, decimals=4)

In [26]:
# x_in, y_in = val_ds[50]
# with torch.no_grad():
#     x_hat, mean, log_var = mdvae.forward(x_in)
# print(y_in)
# torch.round(x_hat, decimals=4)

In [27]:
# # AUTOENCODER LATENT VARIABLE PLOT

# with torch.no_grad():
#     ae_latents = mdvae(val_ds[:][0])[1]

# plot_df = pd.DataFrame(ae_latents.numpy(), columns=[f'ae_{i}' for i in range(N_LATENT)])

# ds = hv.Dataset(plot_df)
# grid = gridmatrix(ds, diagonal_type=hv.Scatter)
# grid.opts(shared_axes=False, axiswise=True)


To know whether the machine learning approach works, first we need a bench-mark. 

## SMC base-truth

In [147]:
from sbmfi.inference.bayesian import SMC
from sbmfi.inference.complotting import SMC_PLOT

In [168]:
prior = UniformRoundedFleXchPrior(model._fcm, )
smc = SMC(
    model = model,
    substrate_df = kwargs['substrate_df'], 
    mdv_observation_models = basebayes._obmods, 
    boundary_observation_model = basebayes._bom, 
    prior=prior,
    num_processes=0,
)
smc.set_measurement(x_meas=kwargs['measurements'])
smc.set_true_theta(theta=kwargs['true_theta'])

In [280]:
smc

<sbmfi.inference.bayesian.SMC at 0x2723bfa2140>

In [172]:
smc_result = smc.run(
    n_smc_steps=12,
    n=5000,
    n_obs=3,
    n0_multiplier=1.5,
    population_batch=1000,
    distance_based_decay=True,
    epsilon_decay=0.8,
    kernel_std_scale=1.0,
    evaluate_prior=False,
    potentype='approx',
    return_data=True,
    potential_kwargs={},
    metric='rmse',
    chord_proposal='gauss',
    xch_proposal='gauss',
    xch_std=0.4,
    return_all_populations=True,
    return_az=True,
    debug=False,
)

100%|████████████████████████████████████████████████████████| 15000/15000 [01:23<00:00, 180.48it/s]
100%|█████████████████████████████████████████████| 5000/5000 [03:10<00:00, 26.23it/s, epsilon=2.53]
100%|█████████████████████████████████████████████| 5000/5000 [03:18<00:00, 25.18it/s, epsilon=1.59]
100%|█████████████████████████████████████████████| 5000/5000 [03:33<00:00, 23.39it/s, epsilon=1.14]
100%|████████████████████████████████████████████| 5000/5000 [03:14<00:00, 25.76it/s, epsilon=0.809]
100%|████████████████████████████████████████████| 5000/5000 [03:38<00:00, 22.86it/s, epsilon=0.592]
100%|████████████████████████████████████████████| 5000/5000 [04:08<00:00, 20.09it/s, epsilon=0.406]
100%|████████████████████████████████████████████| 5000/5000 [05:36<00:00, 14.85it/s, epsilon=0.307]
100%|████████████████████████████████████████████| 5000/5000 [10:06<00:00,  8.25it/s, epsilon=0.259]
100%|████████████████████████████████████████████| 5000/5000 [20:36<00:00,  4.05it/s, epsil

In [175]:
smc_result

In [176]:
file_nc = os.path.join(BASE_DIR, 'spiro_5000samples_12steps_alldata.nc')
smc_result.to_netcdf(file_nc)

data = az.InferenceData.from_netcdf(file_nc)

In [205]:
smc_plot = SMC_PLOT(
    fcm=model.flux_coordinate_mapper,  # this should be in the sampled basis!
    inference_data=data,
    v_rep = None,
    hv_backend='matplotlib',
)

In [184]:
smc_plot.theta_id

array(['R_svd0', 'R_svd1', 'R_svd2', 'R_svd3', 'v2_xch'], dtype='<U6')

In [185]:
smc_plot._v_rep.shape

(14, 4)

In [186]:
smc_plot._v_rep

Unnamed: 0,R_svd0,R_svd1,R_svd2,R_svd3
0,-0.106422,-1.74999,-1.822772,0.359923
1,-0.111375,-1.748715,-1.826703,0.367916
2,-0.31204,0.157326,-1.913153,-0.483613
3,-0.460627,0.195572,-2.03109,-0.243822
4,2.371109,0.945944,-0.538068,-0.677951
5,3.395793,-0.720636,-0.027923,0.106261
6,3.396136,-0.717805,-0.02914,0.113871
7,2.381393,1.030884,-0.574585,-0.449667
8,-1.14151,-1.413557,-1.44353,1.081092
9,-1.112611,0.407697,-1.788575,0.207555


In [187]:
smc_plot._ttdf

Unnamed: 0,R_svd0,R_svd1,R_svd2,R_svd3,v2_xch
0,-0.789858,2.06266,0.605007,1.745831,0.5


In [188]:
A = model.flux_coordinate_mapper._sampler._G.numpy()
h = model.flux_coordinate_mapper._sampler._h.numpy()

((A @ data.prior.theta.values[0, :, :4].T).T < h.T).all(1).sum()

7500

In [207]:
from holoviews import opts

plot_post = smc_plot.grand_theta_plot('R_svd2', 'R_svd3', 'posterior')
plot_prior = smc_plot.grand_theta_plot('R_svd2', 'R_svd3', 'prior')

# plott = plot_post.opts(opts.Points(size=4, color='#34eb4c')) + plot_prior.opts(opts.Points(size=4, color='#34eb4c'))
plott = plot_post + plot_prior
plott.opts(shared_axes=False)

In [281]:
# smc_plot.grand_data_plot(['A: ilr_C_0', 'A: ilr_C_1'])

In [282]:
data.posterior_predictive.data_id

AttributeError: 'Tensor' object has no attribute 'posterior_predictive'

## Prior flow

In [6]:
from sbmfi.inference.flow_trainer import flow_constructor, flow_trainer

In [7]:
fluxes = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='fluxes', pandalize=True)
# model.flux_coordinate_mapper.map_fluxes_2_thermo(fluxes, pandalize=True)
theta = model.flux_coordinate_mapper.map_fluxes_2_theta(fluxes, rescale_val=None, is_thermo=False, pandalize=True)  # rescaleval does not work
theta.head(2)

theta_id,R_svd0,R_svd1,R_svd2,R_svd3,v2_xch
samples_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-0.699581,0.212415,0.78667,-0.717144,0.34514
1,1.509984,-0.233915,-0.933755,-0.217233,0.608921


In [9]:
thermo_fluxes = model._fcm.map_theta_2_fluxes(theta, rescale_val=None, return_thermo=True, pandalize=True)

fcm = FluxCoordinateMapper(model=model)
cylinder_theta = fcm.map_fluxes_2_theta(thermo_fluxes, coordinate_id='cylinder', rescale_val=1.0, is_thermo=True)

pd.DataFrame(cylinder_theta.numpy(), columns=fcm.theta_id(coordinate_id='cylinder')).head(2)

theta_id,phi,C_svd_0,C_svd_1,R,v2_xch
0,-0.406167,0.732497,-0.555329,-0.406082,-0.299599
1,0.548921,-0.521442,-0.120428,0.87964,0.45406


In [14]:
prior_flow = flow_constructor(
    fcm=fcm,
    coordinate_id='cylinder',
    rescale_val=1.0,
    log_xch=False,
    embedding_net=None,
    num_context_channels=None,
    autoregressive=True,
    num_blocks=2,
    num_hidden_channels=64,
    num_bins=8,
    dropout_probability=0.1,
    num_transforms=10,
    init_identity=True,
    permute=None,  
    p=None,
)



# THIS WORKS:
# prior_flow = flow_constructor(  # IMPORTANT LARGE BATCH SIZES
#     fcm=cyl_fcm,
#     circular=True,
#     embedding_net=None,
#     num_context_channels=None,
#     autoregressive=True,
#     num_blocks=4,
#     num_hidden_channels=30,
#     num_bins=8,
#     dropout_probability=0.0,
#     num_transforms=12,
#     init_identity=True,
#     permute='shuffle',  
#     p=None,
#     scale=0.3,
# )

# learning_rate = 1e-4
# weight_decay = 1e-4
# batch_size = 2048
# optimizer = torch.optim.Adam(prior_flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
# losses=[]

```
prior_flow = flow_constructor(
    circular=True,
    autoregressive=True,
    permute=None,  
)  
```

Note that for the [circular neural spline example](https://github.com/VincentStimper/normalizing-flows/blob/master/examples/circular_nsf.ipynb), the parameter `permute_mask=True` and we do not manually add any permutations or LU decomposition of the input data. In the non-circular [neural spline flow example](https://github.com/VincentStimper/normalizing-flows/blob/master/examples/neural_spline_flow.ipynb), we do add a LULinear layer to mix the hidden channels.

So without the LU, we at least dont end up in the situation where the loss goes down drastically, but we end up with a spiky Gaussian distribution around 0 that does not resemble the target. I still have not found a setting that produces a good normalizing flow, learning stops around $KL_{div} \approx 3$.

GETTING DECENT RESULTS WITH LARGER BATCHSIZES, E.G. 1024 and 2048, relatively fast training and good posteriors.

In [15]:
# prepare data
batch_size = 2048
dataset = torch.utils.data.TensorDataset(cylinder_theta)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)  # WORKS WITH batch_size=2048, try 1024

In [10]:
learning_rate = 1e-4
weight_decay = 1e-4
optimizer = torch.optim.Adam(prior_flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9, last_epoch=-1)
losses=[]

NameError: name 'prior_flow' is not defined

In [30]:
def train_main(dataloader, flow, optimizer=None, losses=None, n_epoch=25, scheduler=None, learning_rate=1e-4, weight_decay=1e-4, LR_gamma=1.0):
    n_steps = n_epoch * len(dataloader)
    pbar = tqdm.tqdm(total=n_steps, ncols=120, position=0)

    if optimizer is None:
        optimizer = torch.optim.Adam(prior_flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    try: 
        get_val = lambda x: x.to('cpu').data.numpy()
        if losses is None:
            losses = []
        for epoch in range(n_epoch):
            for i, (chunk,)  in enumerate(dataloader):
                loss = flow.forward_kld(chunk)
                optimizer.zero_grad()
                if ~(torch.isnan(loss) | torch.isinf(loss)):
                    loss.backward()
                    optimizer.step()
                else:
                    raise ValueError(f'loss: {loss}')
                np_loss = get_val(loss)
                losses.append(float(np_loss))
                pbar.update()
                pbar.set_postfix(loss=np_loss.round(4))
            if scheduler is not None:
                scheduler.step()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
        raise e
    finally:
        pbar.close()
    return flow, losses


prior_flow, losses = train_main(dataloader, prior_flow, optimizer, losses, scheduler)


 49%|██████████████████████████████████▎                                   | 306/625 [23:44<24:45,  4.66s/it, loss=3.17]


In [18]:
hv.Scatter(losses)

In [19]:
with torch.no_grad():
    pf_samples, pf_log_q = prior_flow.sample(2000)

In [27]:
cylinder_theta.shape

torch.Size([50000, 5])

In [29]:
i1, i2 = 0, 2

theta_id = fcm.theta_id('cylinder')
randi = torch.as_tensor(np.random.choice(smc_data.shape[0], 3000))

xax = hv.Dimension(theta_id[i1], range=(-1.1,1.1))
yax = hv.Dimension(theta_id[i2], range=(-1.1,1.1))

randi = torch.as_tensor(np.random.choice(smc_data.shape[0], 3000))
pr = hv.Bivariate(cylinder_theta.numpy()[randi][:, [i1, i2]], kdims=[xax, yax]).opts(colorbar=True, cmap='Blues', filled=True)
pf = hv.Bivariate(pf_samples[:2000, [i1, i2]].detach().numpy(), kdims=[xax, yax]).opts(colorbar=True, cmap='Blues', filled=True)

(pf + pr).opts(shared_axes=False)

## Conditional flow


In [3]:
from sbmfi.inference.flow_trainer import flow_constructor, flow_trainer, Flow_Dataset
from sbmfi.inference.mdvae import MDVAE_Dataset

In [4]:
n = 50000 # accidentaly saved to hdf twice, which means that all data after 500000 is duplicated :'(
device = 'cuda:0'

fcm = model.flux_coordinate_mapper
fluxes = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='fluxes', pandalize=True)[:n]
cylinder_theta = fcm.map_fluxes_2_theta(fluxes, coordinate_id='cylinder', is_thermo=False, pandalize=False, rescale_val=1.0)
data = basebayes.read_hdf(hdf=file, dataset_id=dataset_id, what='data', pandalize=False)[:n]

unchanging_idx = [basebayes.data_id.to_list().index(x) for x in [('A', 'ilr_L|[1,2]_0'),]]  # remember this one does not change!
selecta = np.ones(data.shape[-1], dtype=bool)
selecta[unchanging_idx] = False
data = data[..., selecta]

In [5]:
file_nc = os.path.join(BASE_DIR, 'spiro_5000samples_12steps_alldata.nc')
smc_infdat = az.InferenceData.from_netcdf(file_nc)
smc_theta = torch.as_tensor(smc_infdat.posterior.theta[-1].values)

smc_thermo = fcm.map_theta_2_fluxes(smc_theta, return_thermo=True, pandalize=False, rescale_val=None)
smc_cylinder = fcm.map_fluxes_2_theta(smc_thermo, coordinate_id='cylinder', rescale_val=1.0, is_thermo=True, pandalize=False)

In [169]:
# n_data = data.shape[-1]
# n_hidden = math.ceil(n_data / 1.5)
# n_latent = math.ceil(n_data / 3)
# n_hidlay = 2
# embedding_net = [torch.nn.Linear(n_data, n_hidden), torch.nn.LeakyReLU(0.01)]
# for i in range(n_hidlay):
#     embedding_net.extend([torch.nn.Linear(n_hidden, n_hidden), torch.nn.LeakyReLU(0.01)])
# embedding_net.append(torch.nn.Linear(n_hidden, n_latent))
# embedding_net = torch.nn.Sequential(*embedding_net)

cond_flow = flow_constructor(
    fcm=fcm,
    coordinate_id='cylinder',
    rescale_val=1.0,
    log_xch=False,
    # embedding_net=embedding_net,
    num_context_channels=data.shape[-1],
    autoregressive=True,
    num_blocks=3,
    num_hidden_channels=128,
    num_bins=15,
    dropout_probability=0.02,
    num_transforms=8,
    init_identity=True,
    permute='shuffle',
    # permute=None,
    p=None,
    device=device
)




```
cond_flow = flow_constructor(
    fcm=fcm,
    coordinate_id='cylinder',
    rescale_val=1.0,
    log_xch=False,
    # embedding_net=embedding_net,
    num_context_channels=data.shape[-1],
    autoregressive=True,
    num_blocks=3,
    num_hidden_channels=128,
    num_bins=15,
    dropout_probability=0.02,
    num_transforms=8,
    init_identity=True,
    permute='shuffle',
    # permute=None,
    p=None,
    device=device
)

```
start this one with 5e-3 lr and scheduling and then after 50 epochs switch to 1e-4 without scheduling, then after another 100 epochs I switched to lr 2e-4, which jumped the loss back up.


In [170]:
if torch.cuda.is_available() and (device != 'cpu'):
    data=data.to(device)
    cylinder_theta=cylinder_theta.to(device)


batch_size = 8192 * 3
dataset = Flow_Dataset(data=data, theta=cylinder_theta)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)  # WORKS WITH batch_size=2048, try 1024  num_workers = 4 should be fastest?? https://www.reddit.com/r/MachineLearning/comments/kvs1ex/d_here_are_17_ways_of_making_pytorch_training/

In [171]:
# pickle.dump((dataset, fcm), open('dat_fcm.p', 'wb'))

In [172]:
learning_rate = 5e-3
weight_decay = 1e-2
gamma = 0.9
# optimizer = torch.optim.Adam(cond_flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
optimizer = torch.optim.AdamW(cond_flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma, last_epoch=-1)
losses=[]

In [238]:
# dataloader = torch.utils.data.DataLoader(dataset, batch_size=2048, shuffle=True, num_workers=0)
# optimizer = torch.optim.Adam(cond_flow.parameters(), lr=1e-3, weight_decay=weight_decay)  # manually reduce learning_rate to 1e-4
# scheduler = None

optimizer = torch.optim.AdamW(cond_flow.parameters(), lr=2e-4, weight_decay=1e-2)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=gamma, last_epoch=-1)
#

In [255]:
def train_main(dataloader, flow, optimizer=None, losses=None, n_epoch=50, scheduler=None, learning_rate=1e-4, weight_decay=1e-4, LR_gamma=1.0, device='cuda:0'):
    n_steps = n_epoch * len(dataloader)
    pbar = tqdm.tqdm(total=n_steps, ncols=120, position=0)

    if optimizer is None:
        optimizer = torch.optim.Adam(flow.parameters(), lr=learning_rate, weight_decay=weight_decay)
    
    try: 
        get_val = lambda x: x.to('cpu').data.numpy()
        if losses is None:
            losses = []
        for epoch in range(n_epoch):
            for i, (x_chunk, y_chunk)  in enumerate(dataloader):
                loss = flow.forward_kld(y_chunk, context=x_chunk)
                optimizer.zero_grad()
                if ~(torch.isnan(loss) | torch.isinf(loss)):
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(flow.parameters(), max_norm=1.0)
                    optimizer.step()
                else:
                    raise ValueError(f'loss: {loss}')
                np_loss = get_val(loss)
                losses.append(float(np_loss))
                pbar.update()
                pbar.set_postfix(loss=np_loss.round(4))
                if (scheduler is not None) and (i%9000 == 0):
                    scheduler.step()
    except KeyboardInterrupt:
        pass
    except Exception as e:
        print(e)
        raise e
    finally:
        pbar.close()
    return flow, losses


cond_flow, losses = train_main(dataloader, cond_flow, optimizer, losses, scheduler=scheduler)

 73%|██████████████████████████████████████████████████                   | 254/350 [45:26<17:10, 10.73s/it, loss=-8.51]


In [256]:
for param_group in optimizer.param_groups:
    print(param_group['lr'])

0.0002


In [257]:
# from torch.profiler import profile, record_function, ProfilerActivity

# activities = [ProfilerActivity.CPU, ProfilerActivity.CUDA]

# with torch.profiler.profile(
#     activities=activities,
#     schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
#     on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/COND_FLOW3'),
#     record_shapes=True,
#     profile_memory=True,
#     with_stack=False
# ) as prof:
#     for step, (x_chunk, y_chunk) in enumerate(dataloader):
#         prof.step()  # Need to call this at each step to notify profiler of steps' boundary.
#         if step >= 1 + 1 + 3:
#             break
#         cond_flow.forward_kld(y_chunk, context=x_chunk)
#     # prof.export_chrome_trace("trace.json")
# sort_by_keyword = "self_" + device + "_time_total"

# print(prof.key_averages(group_by_stack_n=5).table(sort_by=sort_by_keyword, row_limit=2))

In [258]:
hv.Scatter(losses)

In [243]:
# del cond_flow

In [244]:
# dataset[:]

In [259]:
flow_kwargs = dict(
    fcm=fcm,
    coordinate_id='cylinder',
    rescale_val=1.0,
    log_xch=False,
    # embedding_net=embedding_net,
    num_context_channels=data.shape[-1],
    autoregressive=True,
    num_blocks=3,
    num_hidden_channels=128,
    num_bins=15,
    dropout_probability=0.02,
    num_transforms=8,
    init_identity=True,
    permute='shuffle',
    # permute=None,
    p=None,
    device=device,
    batch_size = 8192 * 3,
    learning_rate = 5e-3,
    weight_decay = 1e-3,
    gamma = 0.9,
    comment = 'start this one with 5e-3 lr and scheduling and then after 420 steps switch to 1e-4 without scheduling, then after 1125 steps I switched to lr 2e-4 no scheduling, which jumped the loss back up. Somehow this thing is still learning at the time of saving. The current loss is -8.5, STILL LEARNING',
)
pickle.dump((cond_flow, optimizer, losses, flow_kwargs, dataset), open('sortofworking_cond_flow_flow_optimizer_losses_kwargs_dataset_FIVE_TEMP2.p', 'wb'))

In [246]:
i1, i2 = 1,3
randi = torch.as_tensor(np.random.choice(smc_cylinder.shape[0], 3000))

cylinder_theta_id = fcm.theta_id('cylinder')
xax = hv.Dimension(cylinder_theta_id[i1], range=(-1.1,1.1))
yax = hv.Dimension(cylinder_theta_id[i2], range=(-1.1,1.1))

true_theta = fcm.map_fluxes_2_theta(kwargs['fluxes'].to_frame().T, coordinate_id='cylinder', rescale_val=1.0, is_thermo=False, pandalize=False)
measurements = (torch.from_numpy(kwargs['measurements'].values)[:, selecta] - dataset.data_mean.to('cpu')) / dataset.data_std.to('cpu')
measurements = measurements.to(device)
biv_opts_kwargs = dict(show_grid=True, colorbar=False, cmap='Blues', filled=True)

In [247]:
measurements3, theta3 = dataset[65465]
with torch.no_grad():
    cond_samples3, log_q3 = cond_flow.sample(2000, context=measurements3[None,:])

cond_samples3, log_q3, theta3 = cond_samples3.to('cpu'), log_q3.to('cpu'), theta3.to('cpu')
meas_samples3 = hv.Bivariate(cond_samples3[:2000, [i1, i2]].numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)
true3 = hv.Points(theta3[None,[i1, i2]].numpy(), kdims=[xax, yax]).opts(color='red')
meas_samples3 * true3

In [248]:
measurements3, theta3 = dataset[8000]
with torch.no_grad():
    cond_samples3, log_q3 = cond_flow.sample(2000, context=measurements3[None,:])
cond_samples3, log_q3, theta3 = cond_samples3.to('cpu'), log_q3.to('cpu'), theta3.to('cpu')
meas_samples3 = hv.Bivariate(cond_samples3[:2000, [i1, i2]].detach().numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)
true3 = hv.Points(theta3[None,[i1, i2]], kdims=[xax, yax]).opts(color='red')
meas_samples3 * true3

In [249]:
measurements3, theta3 = dataset[5]
with torch.no_grad():
    cond_samples3, log_q3 = cond_flow.sample(2000, context=measurements3[None,:])
cond_samples3, log_q3, theta3 = cond_samples3.to('cpu'), log_q3.to('cpu'), theta3.to('cpu')
meas_samples3 = hv.Bivariate(cond_samples3[:2000, [i1, i2]].detach().numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)
true3 = hv.Points(theta3[None,[i1, i2]], kdims=[xax, yax]).opts(color='red')
meas_samples3 * true3

In [250]:
measurements2, theta2 = dataset[5000]

with torch.no_grad():
    cond_samples, log_q = cond_flow.sample(2000, context=measurements)
    cond_samples2, log_q2 = cond_flow.sample(2000, context=measurements2[None,:])

cond_samples, log_q = cond_samples.to('cpu'), log_q.to('cpu')
cond_samples2, log_q2, theta2 = cond_samples2.to('cpu'), log_q2.to('cpu'), theta2.to('cpu')

In [252]:
theta2

tensor([0.8758, 0.3845, 0.8076, 0.9712, 0.8007])

In [253]:
true_theta

tensor([[-0.1164,  0.2642,  0.6063,  1.0000,  0.1429]])

In [251]:
trth = hv.Points(true_theta[:, [i1, i2]], kdims=[xax, yax]).opts(color='red')

smc = hv.Bivariate(smc_cylinder[randi][:, [i1, i2]].numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)

meas_samples = hv.Bivariate(cond_samples[:2000, [i1, i2]].numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)

meas_samples2 = hv.Bivariate(cond_samples2[:2000, [i1, i2]].numpy(), kdims=[xax, yax]).opts(**biv_opts_kwargs)
true2 = hv.Points(theta2[None,[i1, i2]], kdims=[xax, yax]).opts(color='red')

(smc*trth + meas_samples * trth + meas_samples2 * true2).opts(shared_axes=False)


# NONSENSE!


we will sample `n` fluxes from a uniform prior and simulate `n_obs=3` observations per sampled flux-vector.

In [114]:
# dims = {
#     'theta': ['theta_id'],
# }
# coords = {
#     'theta_id': cyl_fcm.theta_id.tolist(),
# }

# ding = az.from_dict(
#     posterior={
#         'theta': samples[None, ...].numpy()  # chains x draws x param
#     },
#     prior={
#         'theta': pf_samples[None, ...].numpy(),  # add the 'chains' dimension
#     },
#     dims=dims,
#     coords=coords,
# )

In [191]:
prior_fluxes = cyl_fcm.map_theta_2_fluxes(samples, pandalize=True, return_thermo=True)
pf_fluxes = cyl_fcm.map_theta_2_fluxes(pf_samples, pandalize=True, return_thermo=True)
