In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
full_data = pd.read_csv("../../data/processed/historical/train/lcl_data.csv")
df_25K = full_data.sample(25000, random_state=0)
df_25K.to_csv("../../data/processed/historical/train/lcl_data_25K.csv", index=False)

# Load Data

In [3]:
import torch
import numpy as np
import random
RANDOM_STATE = 0
torch.manual_seed(RANDOM_STATE)
torch.use_deterministic_algorithms(True)
g = torch.Generator()
g.manual_seed(RANDOM_STATE)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [4]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

data_path = Path("../../data/processed/historical/train/lcl_data_25K.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

dm = LCLDataModule(data_path=data_path, stats_path=stats_path, batch_size=25000, n_samples=25000)
dm.setup()

In [5]:
import torch
from opensynth.models.faraday import FaradayVAE
vae_model = torch.load("vae_model.pt")
vae_model.eval()

  vae_model = torch.load("vae_model.pt")


FaradayVAE(
  (encoder): Encoder(
    (encoder_layers): Sequential(
      (0): Linear(in_features=50, out_features=512, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=512, out_features=256, bias=True)
      (3): GELU(approximate='none')
      (4): Linear(in_features=256, out_features=128, bias=True)
      (5): GELU(approximate='none')
      (6): Linear(in_features=128, out_features=64, bias=True)
      (7): GELU(approximate='none')
      (8): Linear(in_features=64, out_features=32, bias=True)
      (9): GELU(approximate='none')
      (10): Linear(in_features=32, out_features=16, bias=True)
    )
  )
  (decoder): Decoder(
    (latent): Linear(in_features=18, out_features=16, bias=True)
    (latent_activations): GELU(approximate='none')
    (decoder_layers): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=32, out_features=64, bias=True)
      (3): GELU(approximate='no

In [6]:
from opensynth.models.faraday.gaussian_mixture.prepare_gmm_input import encode_data_for_gmm

next_batch = next(iter(dm.train_dataloader()))
input_tensor = encode_data_for_gmm(data=next_batch, vae_module=vae_model)
input_data = input_tensor.detach().numpy()
n_samples = len(input_tensor)

In [7]:
N_COMPONENTS = 250
REG_COVAR = 1e-4
EPOCHS = 25
IDX = 0
CONVERGENCE_TOL = 1e-2


In [8]:
input_tensor[0][0]

tensor(0.0195, grad_fn=<SelectBackward0>)

# Init GMM

In [9]:
from opensynth.models.faraday.new_gmm import gmm_utils

labels_, means_, responsibilities_ = gmm_utils.initialise_centroids(
        X=input_data, n_components=N_COMPONENTS
    )
print(labels_.dtype, responsibilities_.dtype, means_.dtype)

torch.float32 torch.float32 torch.float32


In [10]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params

gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
print(gmm_init_params["precision_cholesky"][IDX][0][0])
print(gmm_init_params["weights"].sum())

tensor(4.1672)
tensor(1.)


# Torch GMM

In [11]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params, training_loop
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureModel


gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
torch_gmm = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
torch_gmm.initialise(gmm_init_params)
trained_model = training_loop(model=torch_gmm, data=input_tensor, max_iter=EPOCHS)

 80%|████████  | 20/25 [00:07<00:01,  2.81it/s]

Converged: True. Number of iterations: 20





# SK Learn GMM Manual

In [12]:
import numpy as np
from scipy.special import logsumexp
from scipy import linalg

def _estimate_gaussian_parameters(X, resp, reg_covar=REG_COVAR):
    nk = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
    means = np.dot(resp.T, X) / nk[:, np.newaxis]
    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))
    for k in range(n_components):
        diff = X - means[k]
        covariances[k] = np.dot(resp[:, k] * diff.T, diff) / nk[k]
        covariances[k].flat[:: n_features + 1] += reg_covar
    return nk, means, covariances

def _compute_precision_cholesky(covariances):
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    n_components, n_features, _ = covariances.shape
    precisions_chol = np.empty((n_components, n_features, n_features))
    for k, covariance in enumerate(covariances):
        try:
            cov_chol = linalg.cholesky(covariance, lower=True)
        except linalg.LinAlgError:
            raise ValueError(estimate_precision_error_message)
        precisions_chol[k] = linalg.solve_triangular(
            cov_chol, np.eye(n_features), lower=True
        ).T
    return precisions_chol

def _compute_log_det_cholesky(matrix_chol, n_features):
    n_components, _, _ = matrix_chol.shape
    log_det_chol = np.sum(
        np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), 1
    )
    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol):
    n_samples, n_features = X.shape
    n_components, _ = means.shape

    log_det = _compute_log_det_cholesky(precisions_chol, n_features)

    log_prob = np.empty((n_samples, n_components))
    for k, (mu, prec_chol) in enumerate(zip(means, precisions_chol)):
        y = np.dot(X, prec_chol) - np.dot(mu, prec_chol)
        log_prob[:, k] = np.sum(np.square(y), axis=1)
    return -0.5 * (n_features * np.log(2 * np.pi) + log_prob) + log_det

def _estimate_log_weights(weights):
        return np.log(weights)

def _estimate_weighted_log_prob(X, means, precisions_chol, weights):
        return _estimate_log_gaussian_prob(X, means, precisions_chol) + _estimate_log_weights(weights)


def _estimate_log_prob_resp(X, means, precisions_chol, weights):
    weighted_log_prob = _estimate_weighted_log_prob(X, means, precisions_chol, weights)
    log_prob_norm = logsumexp(weighted_log_prob, axis=1)
    with np.errstate(under="ignore"):
        log_resp = weighted_log_prob - log_prob_norm[:, np.newaxis]
    return log_prob_norm, log_resp

def _e_step(X,means, precisions_chol, weights):
    log_prob_norm, log_resp = _estimate_log_prob_resp(X, means, precisions_chol, weights)
    return np.mean(log_prob_norm), log_resp

def _m_step(X, log_reponsibilities, reg_covar=REG_COVAR):

    weights_, means_, covariances_ = _estimate_gaussian_parameters(X,np.exp(log_reponsibilities),reg_covar=reg_covar)
    weights_ /= weights_.sum()

    precision_cholesky_ = _compute_precision_cholesky(covariances=covariances_)

    return precision_cholesky_, weights_, means_, covariances_

In [13]:
means = gmm_init_params["means"].detach().numpy()
weights = gmm_init_params["weights"].detach().numpy()
prec_chol = gmm_init_params["precision_cholesky"].detach().numpy()

print(f"Initial prec chol: {prec_chol[IDX][0][0]}. Initial mean: {means[IDX][0]}")

converged = False
lower_bound = -np.inf

for i in range(EPOCHS):
    prev_lower_bound = lower_bound

    print(f"Old Prec Chol: {prec_chol[IDX][0][0]}. Old means: {means[IDX][0]}")
    log_prob, log_resp = _e_step(input_data, means, prec_chol, weights)
    prec_chol, weights, means, covar = _m_step(input_data, log_resp)

    print(f"New prec chol: {prec_chol[IDX][0][0]}. New means: {means[IDX][0]}")

    # Converegence
    lower_bound = log_prob
    change = abs(lower_bound - prev_lower_bound)
    print(f"Change: {change}")
    if change < CONVERGENCE_TOL:
        converged = True
        break

print(f'Converged: {converged}. Number of iterations: {i}')

Initial prec chol: 4.167169570922852. Initial mean: 0.18039406836032867
Old Prec Chol: 4.167169570922852. Old means: 0.18039406836032867
New prec chol: 4.233167242590703. New means: 0.1601667862775156
Change: inf
Old Prec Chol: 4.233167242590703. Old means: 0.1601667862775156
New prec chol: 4.302608211404412. New means: 0.13654736888632887
Change: 0.5965802676941183
Old Prec Chol: 4.302608211404412. Old means: 0.13654736888632887
New prec chol: 4.170436562212954. New means: 0.11592101725647623
Change: 0.23705405366639276
Old Prec Chol: 4.170436562212954. Old means: 0.11592101725647623
New prec chol: 3.935564560399703. New means: 0.0963309089718066
Change: 0.15404550009157947
Old Prec Chol: 3.935564560399703. Old means: 0.0963309089718066
New prec chol: 3.804755944723549. New means: 0.08183459658313541
Change: 0.11730303257668773
Old Prec Chol: 3.804755944723549. Old means: 0.08183459658313541
New prec chol: 3.7495639777909733. New means: 0.06969489063703525
Change: 0.09750155304139752


# SK Learn GMM Epoch

In [14]:
from sklearn.mixture import GaussianMixture

init_weights = gmm_init_params["weights"]
init_means = gmm_init_params["means"]

skgmm = GaussianMixture(n_components=N_COMPONENTS, covariance_type='full', tol=CONVERGENCE_TOL, max_iter=EPOCHS, random_state=0, means_init = init_means, weights_init=init_weights)
skgmm.fit(input_data)
skgmm_pred = skgmm.predict(input_data)

# Torch Lightning

In [15]:
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import LightningDataModule
class CustomDataset(Dataset):
    def __init__(self, data_tensor: torch.Tensor):
        self.data = data_tensor
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        return self.data[idx]
    
class CustomDataModule(LightningDataModule):
    def __init__(self, data_tensor: torch.Tensor, batch_size: int):
        super().__init__()
        self.data_tensor = data_tensor
        self.batch_size = batch_size
    def setup(self, stage=""):
        self.custom_ds = CustomDataset(self.data_tensor)
    def train_dataloader(self):
        return DataLoader(self.custom_ds, batch_size=self.batch_size, shuffle=False, generator=g, worker_init_fn=seed_worker)
    
custom_dm = CustomDataModule(data_tensor=input_tensor, batch_size=25000)
custom_dm.setup(stage="")

In [16]:

for i in range(5):
    print(next(iter(custom_dm.train_dataloader()))[0][0])

tensor(0.0195, grad_fn=<SelectBackward0>)
tensor(0.0195, grad_fn=<SelectBackward0>)
tensor(0.0195, grad_fn=<SelectBackward0>)
tensor(0.0195, grad_fn=<SelectBackward0>)
tensor(0.0195, grad_fn=<SelectBackward0>)


In [17]:
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureLightningModule
gmm_module = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
gmm_module.initialise(gmm_init_params)
gmm_lightning_module = GaussianMixtureLightningModule(
    gmm_module = gmm_module,
    vae_module = vae_model,
    num_components = gmm_module.num_components,
    num_features = gmm_module.num_features,
    reg_covar = gmm_module.reg_covar,
    convergence_tolerance = CONVERGENCE_TOL
)
trainer = pl.Trainer(max_epochs=EPOCHS, accelerator="cpu", deterministic=True )
trainer.fit(gmm_lightning_module, custom_dm)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/charlotte.avery/.virtualenvs/OpenSynth-BNsxhSIM/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/charlotte.avery/.virtualenvs/OpenSynth-BNsxhSIM/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
/Users/charlotte.avery/.virtualenvs/OpenSynth-BNsxhSIM/lib/python3.11/site-packages/pytorch_lightning/core/opt

Epoch 0:   0%|          | 0/1 [00:00<?, ?it/s] Encoded batch: 0.019506216049194336,Means: 0.16016706824302673
Epoch 1:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.13565151393413544
Epoch 2:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.11374605447053909
Epoch 3:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.09664156287908554
Epoch 4:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.08267594128847122
Epoch 5:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.0698365718126297
Epoch 6:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.06141422316431999
Epoch 7:   0%|          | 0/1 [00:00<?, ?it/s, v_num=14]        Encoded batch: 0.019506216049194336,Means: 0.05523926019668579
Ep

# Compare

In [18]:
IDX = 0

In [19]:
skgmm.means_[IDX]

array([-0.03410445, -1.55531795, -0.80513344,  1.46706329,  0.52191256,
       -0.07904954, -0.47972545,  3.15058638, -0.26191411, -1.60104445,
       -0.42080482, -1.32116084, -2.27573128,  0.9726397 ,  1.11706623,
       -1.15814226,  7.20815503,  2.88975046])

In [20]:
trained_model.means[IDX]

tensor([-3.8477e-03, -1.6204e+00, -9.1621e-01,  1.6438e+00,  6.1305e-01,
        -7.4405e-02, -5.8105e-01,  3.3147e+00, -2.8936e-01, -1.7627e+00,
        -5.0486e-01, -1.4561e+00, -2.4095e+00,  1.0130e+00,  1.1924e+00,
        -1.2135e+00,  7.4317e+00,  2.8693e+00])

In [21]:
means[IDX]

array([-0.01515262, -1.6526233 , -0.9028992 ,  1.63498979,  0.57621875,
       -0.07837   , -0.5976933 ,  3.35495689, -0.28491077, -1.8378896 ,
       -0.51163217, -1.38887519, -2.37020506,  1.00783342,  1.19473881,
       -1.26235768,  7.59674037,  2.95692276])

In [22]:
gmm_lightning_module.gmm_module.means[IDX]

tensor([-3.8477e-03, -1.6204e+00, -9.1621e-01,  1.6438e+00,  6.1305e-01,
        -7.4405e-02, -5.8105e-01,  3.3147e+00, -2.8936e-01, -1.7627e+00,
        -5.0486e-01, -1.4561e+00, -2.4095e+00,  1.0130e+00,  1.1924e+00,
        -1.2135e+00,  7.4317e+00,  2.8693e+00])

In [23]:
gmm_init_params["means"][IDX]

tensor([ 0.1804, -2.7538, -1.0911,  1.7346,  0.2788, -0.0517, -0.9778,  3.2770,
        -0.2799, -2.2817, -0.6563, -1.5545, -1.2570,  0.9383,  0.9554, -2.0788,
         5.9050,  3.9497])

In [24]:
skgmm.precisions_cholesky_[IDX][0]

array([ 3.57312093,  2.46106684,  2.99801459, -1.30749426, -2.95219555,
       -6.02259624,  2.66225723, 10.95334562,  1.99717904, -4.05830277,
        2.9830019 ,  3.36654426, -5.42788066, -4.29834805,  3.90808516,
        5.32548283,  2.71379078,  5.82315592])

In [25]:
trained_model.precision_cholesky[IDX][0]

tensor([ 3.5892,  2.4673,  2.3681, -1.0535, -2.7764, -4.0214,  3.3850, 12.1402,
         0.9446, -4.5697,  2.4469,  2.7109, -6.1373, -2.9488,  4.7330,  4.5295,
         1.4875,  4.2269])

In [26]:
prec_chol[IDX][0]

array([ 3.84591718,  2.35226099,  3.50887833, -1.23289717, -2.7024412 ,
       -4.06890568,  3.82486535, 12.50374967,  1.81350395, -6.01436896,
        2.7187965 ,  6.50805437, -5.51275162, -4.24942411,  4.47813211,
        3.78821435,  2.34999917,  4.8001615 ])

In [27]:
gmm_lightning_module.gmm_module.precision_cholesky[IDX][0]

tensor([ 3.5892,  2.4673,  2.3681, -1.0535, -2.7764, -4.0214,  3.3850, 12.1402,
         0.9446, -4.5697,  2.4469,  2.7109, -6.1373, -2.9488,  4.7330,  4.5295,
         1.4875,  4.2269])

In [28]:
gmm_init_params["precision_cholesky"][IDX][0]

tensor([ 4.1672,  1.6312,  2.5548, -2.7919, -3.7756, -4.2847,  2.0378,  6.5975,
        -0.3280, -2.4506, -0.2866,  3.9566, -2.3045, -2.7461, -1.5410, -0.6959,
         2.3353, -0.2032])

# Sampling

In [29]:
def sample(means_, covariances_, weights_, n_samples):
    rng = np.random.RandomState(RANDOM_STATE)
    n_samples_comp = rng.multinomial(n_samples, weights_)

    X = np.vstack(
            [
                rng.multivariate_normal(mean, covariance, int(sample))
                for (mean, covariance, sample) in zip(
                    means_, covariances_, n_samples_comp
                )
            ]
        )
    
    y = np.concatenate(
        [np.full(sample, j, dtype=int) for j, sample in enumerate(n_samples_comp)]
    )
    return (X, y)

In [30]:
N_SAMPLES = 250

In [31]:
samples = sample(means, covar, weights, n_samples = N_SAMPLES)

X, y = samples
X[IDX], y[IDX]

(array([ 0.01900764, -1.16245489, -0.77577511,  2.14979001,  2.07757197,
         0.07693889, -0.28857952,  2.92392678, -0.0494064 , -4.05625911,
        -0.67460345, -0.67508589, -2.63844236,  0.3934814 ,  3.08956995,
        -2.02227899,  6.05678877,  4.13936513]),
 0)

In [32]:
train_model_samples = sample(trained_model.means.detach().numpy(), trained_model.covariances.detach().numpy(), trained_model.weights.detach().numpy(), n_samples = N_SAMPLES)
train_model_X, train_model_y = train_model_samples
train_model_X[IDX], train_model_y[IDX]

  rng.multivariate_normal(mean, covariance, int(sample))


(array([ 0.03574808, -2.79003573, -0.76025084,  0.82357755, -0.20516164,
        -0.03460098, -0.07782398,  3.9214106 , -0.22837557,  0.32822871,
         0.69329133, -2.19083986, -2.50094329,  2.28448515,  0.32796495,
        -1.06755265, 12.07750982,  3.10206355]),
 0)

In [33]:
gmm_lightning_samples = sample(gmm_lightning_module.gmm_module.means.detach().numpy(), gmm_lightning_module.gmm_module.covariances.detach().numpy(), gmm_lightning_module.gmm_module.weights.detach().numpy(), n_samples = N_SAMPLES)
gmm_lightning_X, gmm_lightning_y = train_model_samples
gmm_lightning_X[IDX], gmm_lightning_y[IDX]

  rng.multivariate_normal(mean, covariance, int(sample))


(array([ 0.03574808, -2.79003573, -0.76025084,  0.82357755, -0.20516164,
        -0.03460098, -0.07782398,  3.9214106 , -0.22837557,  0.32822871,
         0.69329133, -2.19083986, -2.50094329,  2.28448515,  0.32796495,
        -1.06755265, 12.07750982,  3.10206355]),
 0)