In [1]:
%load_ext autoreload
%autoreload 2

In [32]:
import pandas as pd
full_data = pd.read_csv("../../data/processed/historical/train/lcl_data.csv")
df_100K = full_data.sample(10000, random_state=0)
df_100K.to_csv("../../data/processed/historical/train/lcl_data_100K.csv", index=False)

# Load Data

In [3]:
import torch
import numpy as np
import random
RANDOM_STATE = 0
torch.manual_seed(RANDOM_STATE)
torch.use_deterministic_algorithms(True)
g = torch.Generator()
g.manual_seed(RANDOM_STATE)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [4]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

data_path = Path("../../data/processed/historical/train/lcl_data_25K.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

dm = LCLDataModule(data_path=data_path, stats_path=stats_path, batch_size=25000, n_samples=25000)
dm.setup()

In [5]:
import torch
from opensynth.models.faraday import FaradayVAE
vae_model = torch.load("vae_model.pt")
vae_model.eval()

  vae_model = torch.load("vae_model.pt")


FaradayVAE(
  (encoder): Encoder(
    (encoder_layers): Sequential(
      (0): Linear(in_features=50, out_features=512, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=512, out_features=256, bias=True)
      (3): GELU(approximate='none')
      (4): Linear(in_features=256, out_features=128, bias=True)
      (5): GELU(approximate='none')
      (6): Linear(in_features=128, out_features=64, bias=True)
      (7): GELU(approximate='none')
      (8): Linear(in_features=64, out_features=32, bias=True)
      (9): GELU(approximate='none')
      (10): Linear(in_features=32, out_features=16, bias=True)
    )
  )
  (decoder): Decoder(
    (latent): Linear(in_features=18, out_features=16, bias=True)
    (latent_activations): GELU(approximate='none')
    (decoder_layers): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=32, out_features=64, bias=True)
      (3): GELU(approximate='no

In [6]:
from opensynth.models.faraday.gaussian_mixture.prepare_gmm_input import encode_data_for_gmm

next_batch = next(iter(dm.train_dataloader()))
input_tensor = encode_data_for_gmm(data=next_batch, vae_module=vae_model)
input_data = input_tensor.detach().numpy()
n_samples = len(input_tensor)

In [7]:
N_COMPONENTS = 250
REG_COVAR = 1e-4
EPOCHS = 25
IDX = 0
CONVERGENCE_TOL = 1e-2


In [8]:
input_tensor[0][0]

tensor(-0.9579, grad_fn=<SelectBackward0>)

# Init GMM

In [9]:
from opensynth.models.faraday.new_gmm import gmm_utils

labels_, means_, responsibilities_ = gmm_utils.initialise_centroids(
        X=input_data, n_components=N_COMPONENTS
    )
print(labels_.dtype, responsibilities_.dtype, means_.dtype)

torch.float32 torch.float32 torch.float32


In [10]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params

gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
print(gmm_init_params["precision_cholesky"][IDX][0][0])
print(gmm_init_params["weights"].sum())

tensor(1.4373)
tensor(1.)


# Torch GMM

In [11]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params, training_loop
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureModel


gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
torch_gmm = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
torch_gmm.initialise(gmm_init_params)
trained_model = training_loop(model=torch_gmm, data=input_tensor, max_iter=EPOCHS)

100%|██████████| 25/25 [00:07<00:00,  3.37it/s]

Converged: False. Number of iterations: 24





# SK Learn GMM Manual

In [12]:
import numpy as np
from scipy.special import logsumexp
from scipy import linalg

def _estimate_gaussian_parameters(X, resp, reg_covar=REG_COVAR):
    nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
    means = np.dot(resp.T, X) / nk[:, np.newaxis]
    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))
    for k in range(n_components):
        diff = X - means[k]
        covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k]
        covariances[k].flat[:: n_features + 1] += reg_covar
    return nk, means, covariances

def _compute_precision_cholesky(covariances):
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    n_components, n_features, _ = covariances.shape
    precisions_chol = np.empty((n_components, n_features, n_features))
    for k, covariance in enumerate(covariances):
        try:
            cov_chol = linalg.cholesky(covariance, lower=True)
        except linalg.LinAlgError:
            raise ValueError(estimate_precision_error_message)
        precisions_chol[k] = linalg.solve_triangular(
            cov_chol, np.eye(n_features), lower=True
        ).T
    return precisions_chol

def _compute_log_det_cholesky(matrix_chol, n_features):
    n_components, _, _ = matrix_chol.shape
    log_det_chol = np.sum(
        np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), 1
    )
    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol):
    n_samples, n_features = X.shape
    n_components, _ = means.shape

    log_det = _compute_log_det_cholesky(precisions_chol, n_features)

    log_prob = np.empty((n_samples, n_components))
    for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
        y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
        log_prob[:, k] = np.sum(np.square(y), axis=1)
    return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

def _estimate_log_weights(weights):
        return np.log(weights)

def _estimate_weighted_log_prob(X, means, precisions_chol, weights):
        return _estimate_log_gaussian_prob(X, means, precisions_chol) + _estimate_log_weights(weights)


def _estimate_log_prob_resp(X, means, precisions_chol, weights):
    weighted_log_prob = _estimate_weighted_log_prob(X, means, precisions_chol, weights)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    with np.errstate(under="ignore"):
        log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
    return log_prob_norm, log_resp

def _e_step(X,means, precisions_chol, weights):
    log_prob_norm, log_resp = _estimate_log_prob_resp(X, means, precisions_chol, weights)
    return np.mean(log_prob_norm), log_resp

def _m_step(X, log_reponsibilities, reg_covar=REG_COVAR):

    weights_, means_, covariances_ = _estimate_gaussian_parameters(X,np.exp(log_reponsibilities),reg_covar=reg_covar)
    weights_ /= weights_.sum()

    precision_cholesky_ = _compute_precision_cholesky(covariances=covariances_)

    return precision_cholesky_, weights_, means_, covariances_

In [13]:
means = gmm_init_params["means"].detach().numpy()
weights = gmm_init_params["weights"].detach().numpy()
prec_chol = gmm_init_params["precision_cholesky"].detach().numpy()

print(f"Initial prec chol: {prec_chol[IDX][0][0]}. Initial mean: {means[IDX][0]}")

converged = False
lower_bound = -np.inf

for i in range(EPOCHS):
    prev_lower_bound = lower_bound

    print(f"Old Prec Chol: {prec_chol[IDX][0][0]}. Old means: {means[IDX][0]}")
    log_prob, log_resp = _e_step(input_data, means, prec_chol, weights)
    prec_chol, weights, means, covar = _m_step(input_data, log_resp)

    print(f"New prec chol: {prec_chol[IDX][0][0]}. New means: {means[IDX][0]}")

    # Converegence
    lower_bound = log_prob
    change = abs(lower_bound - prev_lower_bound)
    print(f"Change: {change}")
    if change < CONVERGENCE_TOL:
        converged = True
        break

print(f'Converged: {converged}. Number of iterations: {i}')

Initial prec chol: 1.437282919883728. Initial mean: -0.6032145619392395
Old Prec Chol: 1.437282919883728. Old means: -0.6032145619392395
New prec chol: 1.356676582448056. New means: -0.6564950422496009
Change: inf
Old Prec Chol: 1.356676582448056. Old means: -0.6564950422496009
New prec chol: 1.343302477406987. New means: -0.7412085299427849
Change: 0.728874642245787
Old Prec Chol: 1.343302477406987. Old means: -0.7412085299427849
New prec chol: 1.376663885283913. New means: -0.8058698654684182
Change: 0.31081594079179986
Old Prec Chol: 1.376663885283913. Old means: -0.8058698654684182
New prec chol: 1.4099579201898336. New means: -0.8480703891338497
Change: 0.2314840705442336
Old Prec Chol: 1.4099579201898336. Old means: -0.8480703891338497
New prec chol: 1.41229630059536. New means: -0.8940405435379908
Change: 0.20597518011670957
Old Prec Chol: 1.41229630059536. Old means: -0.8940405435379908
New prec chol: 1.404418457405898. New means: -0.9452993465093092
Change: 0.170695578677182
O

# SK Learn GMM Epoch

In [14]:
from sklearn.mixture import GaussianMixture

init_weights = gmm_init_params["weights"]
init_means = gmm_init_params["means"]

skgmm = GaussianMixture(n_components=N_COMPONENTS, covariance_type='full', tol=CONVERGENCE_TOL, max_iter=EPOCHS, random_state=0, means_init = init_means, weights_init=init_weights, verbose=2, verbose_interval=1)
skgmm.fit(input_data)
skgmm_pred = skgmm.predict(input_data)

Initialization 0
  Iteration 1	 time lapse 3.90058s	 ll change inf
  Iteration 2	 time lapse 1.77832s	 ll change 0.66146
  Iteration 3	 time lapse 1.95460s	 ll change 0.34602
  Iteration 4	 time lapse 1.80817s	 ll change 0.26224
  Iteration 5	 time lapse 1.80745s	 ll change 0.22942
  Iteration 6	 time lapse 1.97844s	 ll change 0.18295
  Iteration 7	 time lapse 1.75865s	 ll change 0.13024
  Iteration 8	 time lapse 1.79453s	 ll change 0.10288
  Iteration 9	 time lapse 1.79862s	 ll change 0.08997
  Iteration 10	 time lapse 1.93086s	 ll change 0.07768
  Iteration 11	 time lapse 1.87863s	 ll change 0.06645
  Iteration 12	 time lapse 1.82267s	 ll change 0.05463
  Iteration 13	 time lapse 1.79395s	 ll change 0.04896
  Iteration 14	 time lapse 1.85815s	 ll change 0.04592
  Iteration 15	 time lapse 1.84846s	 ll change 0.04153
  Iteration 16	 time lapse 1.73301s	 ll change 0.03424
  Iteration 17	 time lapse 1.90994s	 ll change 0.02898
  Iteration 18	 time lapse 1.81359s	 ll change 0.02612
  Iter



In [15]:
skgmm.converged_, skgmm.n_iter_

(False, 25)

# Torch Lightning

In [16]:
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningDataModule
class CustomDataset(Dataset):
    def __init__(self, data_tensor: torch.Tensor):
        self.data = data_tensor
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]
    
class CustomDataModule(LightningDataModule):
    def __init__(self, data_tensor: torch.Tensor, batch_size: int):
        super().__init__()
        self.data_tensor = data_tensor
        self.batch_size = batch_size
    def setup(self, stage=""):
        self.custom_ds = CustomDataset(self.data_tensor)
    def train_dataloader(self):
        return DataLoader(self.custom_ds, batch_size=self.batch_size, shuffle=False, generator=g, worker_init_fn=seed_worker)
    
custom_dm = CustomDataModule(data_tensor=input_tensor, batch_size=25000)
custom_dm.setup(stage="")

In [17]:

for i in range(5):
    print(next(iter(custom_dm.train_dataloader()))[0][0])

tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)


In [20]:
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureLightningModule
gmm_module = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
gmm_module.initialise(gmm_init_params)
print(f"Initial prec chol: {gmm_module.precision_cholesky[IDX][0][0]}. Initial mean: {gmm_module.means[IDX][0]}")

gmm_lightning_module = GaussianMixtureLightningModule(
    gmm_module = gmm_module,
    vae_module = vae_model,
    num_components = gmm_module.num_components,
    num_features = gmm_module.num_features,
    reg_covar = gmm_module.reg_covar,
    convergence_tolerance = CONVERGENCE_TOL
)
trainer = pl.Trainer(max_epochs=EPOCHS, accelerator="cpu", deterministic=True )
trainer.fit(gmm_lightning_module, custom_dm)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/shengchai/.local/share/virtualenvs/OpenSynth-EhRIPYd3/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/shengchai/.local/share/virtualenvs/OpenSynth-EhRIPYd3/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/Users/shengchai/.local/share/virtualenvs/OpenSynth-EhRIPYd3/lib/python3.11/site-packages/pytorch_

Initial prec chol: 1.437282919883728. Initial mean: -0.6032145619392395
Epoch 0: 100%|██████████| 1/1 [00:00<00:00,  1.17it/s, v_num=171]Local weights, means: 0.0064, -0.6565
Reduced weights, means: 0.0064, -0.6565
Epoch 1: 100%|██████████| 1/1 [00:00<00:00,  2.74it/s, v_num=171]Local weights, means: 0.0070, -0.7386
Reduced weights, means: 0.0070, -0.7386
Epoch 2: 100%|██████████| 1/1 [00:00<00:00,  3.12it/s, v_num=171]Local weights, means: 0.0076, -0.8044
Reduced weights, means: 0.0076, -0.8044
Epoch 3: 100%|██████████| 1/1 [00:00<00:00,  2.82it/s, v_num=171]Local weights, means: 0.0082, -0.8461
Reduced weights, means: 0.0082, -0.8461
Epoch 4: 100%|██████████| 1/1 [00:00<00:00,  3.50it/s, v_num=171]Local weights, means: 0.0090, -0.8953
Reduced weights, means: 0.0090, -0.8953
Epoch 5: 100%|██████████| 1/1 [00:00<00:00,  2.75it/s, v_num=171]Local weights, means: 0.0097, -0.9427
Reduced weights, means: 0.0097, -0.9427
Epoch 6: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s, v_num=171]Local

`Trainer.fit` stopped: `max_epochs=25` reached.


Local weights, means: 0.0090, -1.1693
Reduced weights, means: 0.0090, -1.1693
Epoch 24: 100%|██████████| 1/1 [00:00<00:00,  3.58it/s, v_num=171]


In [21]:
gmm_lightning_module.gmm_module.weights[0], gmm_lightning_module.gmm_module.means[0][0]

(tensor(0.0090), tensor(-1.1693))

In [22]:
gmm_lightning_module.weight_metric.compute()[0], gmm_lightning_module.mean_metric.compute()[0][0]

(tensor(0.0090), tensor(-1.1693))

# Compare

In [23]:
IDX = 0

In [24]:
df_compare_means = pd.DataFrame()
df_compare_means["skgmm"] = skgmm.means_[IDX]
df_compare_means["numpy"] = means[IDX]
df_compare_means["torch"] = trained_model.means[IDX]
df_compare_means["lightning"] = gmm_lightning_module.gmm_module.means[IDX]
df_compare_means

Unnamed: 0,skgmm,numpy,torch,lightning
0,-1.19061,-1.206169,-1.169291,-1.169291
1,-0.794418,-0.881784,-0.804894,-0.804894
2,0.275553,0.236022,0.245125,0.245125
3,1.344647,1.472477,1.307897,1.307897
4,-1.193419,-1.263038,-1.154233,-1.154233
5,0.103049,0.038108,0.142454,0.142454
6,-1.369626,-1.498605,-1.346444,-1.346444
7,0.082094,-0.045775,0.106703,0.106703
8,0.060162,0.000991,0.058913,0.058913
9,0.830846,0.819398,0.856419,0.856419


In [25]:
gmm_init_params["means"][IDX]

tensor([-0.6032, -0.7337, -0.4493,  1.1442, -1.0089, -0.1294, -1.1521, -0.8871,
         0.1370,  0.3750,  0.4332, -2.9347, -0.0834,  0.2875, -3.3654,  0.9473,
         5.4575,  1.2549])

In [26]:
df_compare_covar = pd.DataFrame()
df_compare_covar["skgmm"] = skgmm.covariances_[IDX][0]
df_compare_covar["numpy"] = covar[IDX][0]
df_compare_covar["torch"] = trained_model.covariances.detach().numpy()[IDX][0]
df_compare_covar["lightning"] = gmm_lightning_module.gmm_module.covariances.detach().numpy()[IDX][0]
df_compare_covar

Unnamed: 0,skgmm,numpy,torch,lightning
0,0.419792,0.481448,0.416045,0.416045
1,-0.177586,-0.233607,-0.160366,-0.160366
2,-0.267074,-0.310524,-0.251109,-0.251109
3,-0.303474,-0.322696,-0.300622,-0.300622
4,0.332405,0.392857,0.325844,0.325844
5,-0.03242,-0.054914,-0.051546,-0.051546
6,0.098471,0.086881,0.104739,0.104739
7,-0.166122,-0.200558,-0.170621,-0.170621
8,-0.090187,-0.136983,-0.086941,-0.086941
9,-0.22673,-0.261106,-0.245132,-0.245132


In [27]:
df_compare_pre_chol = pd.DataFrame()
df_compare_pre_chol["skgmm"] = skgmm.precisions_cholesky_[IDX][0]
df_compare_pre_chol["numpy"] = prec_chol[IDX][0]
df_compare_pre_chol["torch"] = trained_model.precision_cholesky.detach().numpy()[IDX][0]
df_compare_pre_chol["lightning"] = gmm_lightning_module.gmm_module.precision_cholesky.detach().numpy()[IDX][0]
df_compare_pre_chol

Unnamed: 0,skgmm,numpy,torch,lightning
0,1.543417,1.441204,1.550351,1.550351
1,1.001877,1.095678,0.951004,0.951004
2,1.393014,1.204478,1.5364,1.5364
3,4.494201,3.887933,5.111315,5.111315
4,2.023389,1.134885,2.772165,2.772165
5,10.616157,10.091299,9.606372,9.606372
6,-7.828143,-7.525503,-6.897675,-6.897675
7,7.216097,5.8865,8.23031,8.23031
8,-17.839368,-14.657665,-19.755314,-19.755314
9,9.08298,7.242,8.462349,8.462349


In [28]:
df_compare_weights = pd.DataFrame()
df_compare_weights["skgmm"] = skgmm.weights_[:10]
df_compare_weights["numpy"] = weights[:10]
df_compare_weights["torch"] = trained_model.weights[:10]
df_compare_weights["lightning"] = gmm_lightning_module.gmm_module.weights.detach().numpy()[:10]
df_compare_weights

Unnamed: 0,skgmm,numpy,torch,lightning
0,0.00959,0.009647,0.008953,0.008953
1,0.0002,0.0002,0.0002,0.0002
2,0.012595,0.008783,0.014577,0.014577
3,0.0008,0.0008,0.0008,0.0008
4,0.011144,0.009512,0.010635,0.010635
5,0.009967,0.006116,0.009447,0.009447
6,0.010547,0.008836,0.011154,0.011154
7,0.001197,0.001198,0.001156,0.001156
8,0.00032,0.00032,0.00032,0.00032
9,0.00072,0.00072,0.00072,0.00072


# Sampling

In [29]:
def sample(means_, covariances_, weights_, n_samples):
    rng = np.random.RandomState(RANDOM_STATE)
    n_samples_comp = rng.multinomial(n_samples, weights_)

    X = np.vstack(
            [
                rng.multivariate_normal(mean, covariance, int(sample))
                for (mean, covariance, sample) in zip(
                    means_, covariances_, n_samples_comp
                )
            ]
        )
    
    y = np.concatenate(
        [np.full(sample, j, dtype=int) for j, sample in enumerate(n_samples_comp)]
    )
    return (X, y)

In [30]:
N_SAMPLES = 250

In [31]:
skgmm_samples = sample(skgmm.means_, skgmm.covariances_, skgmm.weights_, n_samples = N_SAMPLES)

skgmm_X, skgmm_y = skgmm_samples
skgmm_X[IDX], skgmm_y[IDX]

(array([-0.807662  , -0.89268031,  0.15431166,  1.1586377 , -0.89860792,
        -0.36512184, -1.34851914, -0.39931756, -0.25683488,  0.17882885,
         0.10511008, -2.35669724, -0.56101957,  0.787289  , -3.06863892,
         0.9408126 , 13.47886626,  4.96380435]),
 0)

In [None]:
samples = sample(means, covar, weights, n_samples = N_SAMPLES)

X, y = samples
X[IDX], y[IDX]

In [None]:
train_model_samples = sample(trained_model.means.detach().numpy(), trained_model.covariances.detach().numpy(), trained_model.weights.detach().numpy(), n_samples = N_SAMPLES)
train_model_X, train_model_y = train_model_samples
train_model_X[IDX], train_model_y[IDX]

In [None]:
gmm_lightning_samples = sample(gmm_lightning_module.gmm_module.means.detach().numpy(), gmm_lightning_module.gmm_module.covariances.detach().numpy(), gmm_lightning_module.gmm_module.weights.detach().numpy(), n_samples = N_SAMPLES)
gmm_lightning_X, gmm_lightning_y = train_model_samples
gmm_lightning_X[IDX], gmm_lightning_y[IDX]