In [1]:
%load_ext autoreload
%autoreload 2

In [130]:
import pandas as pd
full_data = pd.read_csv("../../data/processed/historical/train/lcl_data.csv")
df_25K = full_data.sample(25000, random_state=0)
df_25K.to_csv("../../data/processed/historical/train/lcl_data_25K.csv", index=False)

# Load Data

In [113]:
import torch
import numpy as np
import random
RANDOM_STATE = 0
torch.manual_seed(RANDOM_STATE)
torch.use_deterministic_algorithms(True)
g = torch.Generator()
g.manual_seed(RANDOM_STATE)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [3]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

data_path = Path("../../data/processed/historical/train/lcl_data_25K.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

dm = LCLDataModule(data_path=data_path, stats_path=stats_path, batch_size=25000, n_samples=25000)
dm.setup()

In [4]:
import torch
from opensynth.models.faraday import FaradayVAE
vae_model = torch.load("vae_model.pt")
vae_model.eval()

  vae_model = torch.load("vae_model.pt")


FaradayVAE(
  (encoder): Encoder(
    (encoder_layers): Sequential(
      (0): Linear(in_features=50, out_features=512, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=512, out_features=256, bias=True)
      (3): GELU(approximate='none')
      (4): Linear(in_features=256, out_features=128, bias=True)
      (5): GELU(approximate='none')
      (6): Linear(in_features=128, out_features=64, bias=True)
      (7): GELU(approximate='none')
      (8): Linear(in_features=64, out_features=32, bias=True)
      (9): GELU(approximate='none')
      (10): Linear(in_features=32, out_features=16, bias=True)
    )
  )
  (decoder): Decoder(
    (latent): Linear(in_features=18, out_features=16, bias=True)
    (latent_activations): GELU(approximate='none')
    (decoder_layers): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=32, out_features=64, bias=True)
      (3): GELU(approximate='no

In [5]:
from opensynth.models.faraday.gaussian_mixture.prepare_gmm_input import encode_data_for_gmm

next_batch = next(iter(dm.train_dataloader()))
input_tensor = encode_data_for_gmm(data=next_batch, vae_module=vae_model)
input_data = input_tensor.detach().numpy()
n_samples = len(input_tensor)

In [136]:
N_COMPONENTS = 250
REG_COVAR = 1e-4
EPOCHS = 25
IDX = 0


In [137]:
input_tensor[0][0]

tensor(-0.9579, grad_fn=<SelectBackward0>)

# Init GMM

In [138]:
from opensynth.models.faraday.new_gmm import gmm_utils

labels_, means_, responsibilities_ = gmm_utils.initialise_centroids(
        X=input_data, n_components=N_COMPONENTS
    )
print(labels_.dtype, responsibilities_.dtype, means_.dtype)

torch.float32 torch.float32 torch.float32


In [139]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params

gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
print(gmm_init_params["precision_cholesky"][IDX][0][0])
print(gmm_init_params["weights"].sum())

tensor(1.4373)
tensor(1.)


In [140]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params, training_loop
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureModel


gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
torch_gmm = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
torch_gmm.initialise(gmm_init_params)
trained_model = training_loop(model=torch_gmm, data=input_tensor, max_iter=EPOCHS)

100%|██████████| 25/25 [00:08<00:00,  2.91it/s]


# SK Learn GMM Manual

In [141]:
import numpy as np
from scipy.special import logsumexp
from scipy import linalg

def _estimate_gaussian_parameters(X, resp, reg_covar=REG_COVAR):
    nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
    means = np.dot(resp.T, X) / nk[:, np.newaxis]
    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))
    for k in range(n_components):
        diff = X - means[k]
        covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k]
        covariances[k].flat[:: n_features + 1] += reg_covar
    return nk, means, covariances

def _compute_precision_cholesky(covariances):
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    n_components, n_features, _ = covariances.shape
    precisions_chol = np.empty((n_components, n_features, n_features))
    for k, covariance in enumerate(covariances):
        try:
            cov_chol = linalg.cholesky(covariance, lower=True)
        except linalg.LinAlgError:
            raise ValueError(estimate_precision_error_message)
        precisions_chol[k] = linalg.solve_triangular(
            cov_chol, np.eye(n_features), lower=True
        ).T
    return precisions_chol

def _compute_log_det_cholesky(matrix_chol, n_features):
    n_components, _, _ = matrix_chol.shape
    log_det_chol = np.sum(
        np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), 1
    )
    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol):
    n_samples, n_features = X.shape
    n_components, _ = means.shape

    log_det = _compute_log_det_cholesky(precisions_chol, n_features)

    log_prob = np.empty((n_samples, n_components))
    for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
        y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
        log_prob[:, k] = np.sum(np.square(y), axis=1)
    return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

def _estimate_log_weights(weights):
        return np.log(weights)

def _estimate_weighted_log_prob(X, means, precisions_chol, weights):
        return _estimate_log_gaussian_prob(X, means, precisions_chol) + _estimate_log_weights(weights)


def _estimate_log_prob_resp(X, means, precisions_chol, weights):
    weighted_log_prob = _estimate_weighted_log_prob(X, means, precisions_chol, weights)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    with np.errstate(under="ignore"):
        log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
    return log_prob_norm, log_resp

def _e_step(X,means, precisions_chol, weights):
    log_prob_norm, log_resp = _estimate_log_prob_resp(X, means, precisions_chol, weights)
    return np.mean(log_prob_norm), log_resp

def _m_step(X, log_reponsibilities, reg_covar=REG_COVAR):

    weights_, means_, covariances_ = _estimate_gaussian_parameters(X,np.exp(log_reponsibilities),reg_covar=reg_covar)

    precision_cholesky_ = _compute_precision_cholesky(covariances=covariances_)

    return precision_cholesky_, weights_, means_

In [142]:
means = gmm_init_params["means"].detach().numpy()
weights = gmm_init_params["weights"].detach().numpy()
prec_chol = gmm_init_params["precision_cholesky"].detach().numpy()

print(f"Initial prec chol: {prec_chol[IDX][0][0]}. Initial mean: {means[IDX][0]}")

for i in range(EPOCHS):
    print(f"Old Prec Chol: {prec_chol[IDX][0][0]}. Old means: {means[IDX][0]}")
    log_prob, log_resp = _e_step(input_data, means, prec_chol, weights)
    prec_chol, weights, means = _m_step(input_data, log_resp)
    print(f"New prec chol: {prec_chol[IDX][0][0]}. New means: {means[IDX][0]}")

Initial prec chol: 1.437282919883728. Initial mean: -0.6032145619392395
Old Prec Chol: 1.437282919883728. Old means: -0.6032145619392395
New prec chol: 1.356676582448056. New means: -0.6564950422496009
Old Prec Chol: 1.356676582448056. Old means: -0.6564950422496009
New prec chol: 1.343302477406987. New means: -0.7412085299427853
Old Prec Chol: 1.343302477406987. Old means: -0.7412085299427853
New prec chol: 1.3766638852839175. New means: -0.8058698654684163
Old Prec Chol: 1.3766638852839175. Old means: -0.8058698654684163
New prec chol: 1.4099579201898764. New means: -0.8480703891338682
Old Prec Chol: 1.4099579201898764. Old means: -0.8480703891338682
New prec chol: 1.4122963005953943. New means: -0.8940405435380274
Old Prec Chol: 1.4122963005953943. Old means: -0.8940405435380274
New prec chol: 1.4044184574059342. New means: -0.9452993465093517
Old Prec Chol: 1.4044184574059342. Old means: -0.9452993465093517
New prec chol: 1.4028725597386835. New means: -0.99117607471704
Old Prec Ch

# SK Learn GMM Epoch

In [143]:
from sklearn.mixture import GaussianMixture

init_weights = gmm_init_params["weights"]
init_means = gmm_init_params["means"]

skgmm = GaussianMixture(n_components=N_COMPONENTS, covariance_type='full', max_iter=EPOCHS, random_state=0, means_init = init_means, weights_init=init_weights)
skgmm.fit(input_data)
skgmm_pred = skgmm.predict(input_data)



# Torch Lightning

In [144]:
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningDataModule
class CustomDataset(Dataset):
    def __init__(self, data_tensor: torch.Tensor):
        self.data = data_tensor
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]
    
class CustomDataModule(LightningDataModule):
    def __init__(self, data_tensor: torch.Tensor, batch_size: int):
        super().__init__()
        self.data_tensor = data_tensor
        self.batch_size = batch_size
    def setup(self, stage=""):
        self.custom_ds = CustomDataset(self.data_tensor)
    def train_dataloader(self):
        return DataLoader(self.custom_ds, batch_size=self.batch_size, shuffle=False, generator=g, worker_init_fn=seed_worker)
    
custom_dm = CustomDataModule(data_tensor=input_tensor, batch_size=25000)
custom_dm.setup(stage="")

In [145]:

for i in range(5):
    print(next(iter(custom_dm.train_dataloader()))[0][0])

tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)
tensor(-0.9579, grad_fn=<SelectBackward0>)


In [146]:
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureLightningModule
gmm_module = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
gmm_module.initialise(gmm_init_params)
gmm_lightning_module = GaussianMixtureLightningModule(
    gmm_module = gmm_module,
    vae_module = vae_model,
    num_components = gmm_module.num_components,
    num_features = gmm_module.num_features,
    reg_covar = gmm_module.reg_covar
)
trainer = pl.Trainer(max_epochs=EPOCHS, accelerator="cpu", deterministic=True )
trainer.fit(gmm_lightning_module, custom_dm)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/shengchai/.local/share/virtualenvs/OpenSynth-EhRIPYd3/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/shengchai/.local/share/virtualenvs/OpenSynth-EhRIPYd3/lib/python3.11/site-packages/pytorch_lightning/core/optimizer.py:182: `LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer

  | Name       | Type                 | Params | Mode 
------------------------------------------------------------
0 | gmm_module | GaussianMixtureModel | 0      | train
1 | vae_module | FaradayVAE           | 402 K  | eval 
------------------------------------------------------------
402 K     Trainable params
0         Non-trainable params
402 K     Total params
1.609     Total estimated model params size (MB)
1         Modules in train mode

Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] Encoded batch: -0.9578906893730164, Means: -0.6564921736717224
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.7386166453361511
Epoch 2:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.8044306039810181
Epoch 3:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.8460958003997803
Epoch 4:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.8952543139457703
Epoch 5:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.9427421689033508
Epoch 6:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.9695971012115479
Epoch 7:   0%|          | 0/1 [00:00<?, ?it/s, v_num=142]        Encoded batch: -0.9578906893730164, Means: -0.99069094657

`Trainer.fit` stopped: `max_epochs=25` reached.


Epoch 24: 100%|██████████| 1/1 [00:00<00:00,  2.03it/s, v_num=142]


# Compare

In [147]:
IDX = 0

In [148]:
skgmm.means_[IDX]

array([-1.19060958, -0.79441805,  0.27555317,  1.34464671, -1.19341921,
        0.10304883, -1.36962645,  0.08209358,  0.0601619 ,  0.83084592,
        0.06280524, -2.08669226, -0.60351705,  0.39965105, -3.02037693,
        1.20683265,  6.60475729,  3.06455994])

In [149]:
trained_model.means[IDX]

tensor([-1.1693, -0.8049,  0.2451,  1.3079, -1.1542,  0.1425, -1.3464,  0.1067,
         0.0589,  0.8564,  0.0671, -2.0898, -0.5871,  0.3574, -3.0001,  1.1950,
         6.7255,  3.0230])

In [150]:
means[IDX]

array([-1.20616921e+00, -8.81784003e-01,  2.36022237e-01,  1.47247706e+00,
       -1.26303836e+00,  3.81084423e-02, -1.49860506e+00, -4.57750754e-02,
        9.90944693e-04,  8.19398482e-01,  5.70338124e-02, -2.42086896e+00,
       -6.59784433e-01,  4.70582268e-01, -3.41993375e+00,  1.33218073e+00,
        6.77889108e+00,  2.97328036e+00])

In [151]:
gmm_lightning_module.gmm_module.means[IDX]

tensor([-1.1693, -0.8049,  0.2451,  1.3079, -1.1542,  0.1425, -1.3464,  0.1067,
         0.0589,  0.8564,  0.0671, -2.0898, -0.5871,  0.3574, -3.0001,  1.1950,
         6.7255,  3.0230])

In [152]:
gmm_init_params["means"][IDX]

tensor([-0.6032, -0.7337, -0.4493,  1.1442, -1.0089, -0.1294, -1.1521, -0.8871,
         0.1370,  0.3750,  0.4332, -2.9347, -0.0834,  0.2875, -3.3654,  0.9473,
         5.4575,  1.2549])

In [153]:
skgmm.precisions_cholesky_[IDX][0]

array([  1.5434166 ,   1.00187662,   1.39301369,   4.49420078,
         2.02338946,  10.61615705,  -7.82814269,   7.21609721,
       -17.83936845,   9.0829797 , -10.68403807,   0.5435214 ,
       -16.04140192,  11.54775514,  -5.85069145,   5.45892235,
        -3.08427163,   2.62940158])

In [154]:
trained_model.precision_cholesky[IDX][0]

tensor([  1.5504,   0.9510,   1.5364,   5.1113,   2.7722,   9.6064,  -6.8977,
          8.2303, -19.7553,   8.4623, -14.8302,   2.5196, -14.1206,   4.3731,
         -1.5201,   4.2449,  -0.9276,   2.1517])

In [155]:
prec_chol[IDX][0]

array([  1.4412036 ,   1.0956784 ,   1.20447844,   3.8879333 ,
         1.13488524,  10.09129875,  -7.52550288,   5.8865005 ,
       -14.65766528,   7.24199996,  -8.96899225,  -0.05576398,
       -12.99109356,  11.68677064,  -5.66755523,   4.46847986,
        -2.3947838 ,   1.66568229])

In [156]:
gmm_lightning_module.gmm_module.precision_cholesky[IDX][0]

tensor([  1.5504,   0.9510,   1.5364,   5.1113,   2.7722,   9.6064,  -6.8977,
          8.2303, -19.7553,   8.4623, -14.8302,   2.5196, -14.1206,   4.3731,
         -1.5201,   4.2449,  -0.9276,   2.1517])

In [157]:
gmm_init_params["precision_cholesky"][IDX][0]

tensor([  1.4373,   0.7445,   1.1892,   1.7533,   0.6190,   6.6631,  -4.2845,
          0.4039,  -7.5919,   0.9397,  -5.4953,   1.4045, -18.1528,   8.4870,
         -6.0902,  -0.6787,   3.8729,  -0.0469])