In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import pandas as pd

# Draw a fixed-seed 100,000-row subsample of the full training CSV and write it
# next to the source file; later cells read this smaller file instead.
full_data = pd.read_csv(
    "../../data/processed/historical/train/lcl_data.csv"
)
df_100K = full_data.sample(n=100000, random_state=0)
df_100K.to_csv(
    "../../data/processed/historical/train/lcl_data_100K.csv",
    index=False,
)

# Load Data

In [32]:
import torch
import numpy as np
import random
# Seed every RNG this notebook can reach so that "Restart Kernel -> Run All"
# is reproducible. Seeding torch alone leaves NumPy and Python's `random`
# in a time-dependent state in the main process.
RANDOM_STATE = 0
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
# Make PyTorch raise instead of silently using a non-deterministic kernel.
torch.use_deterministic_algorithms(True)

# Separately seeded generator; presumably meant to be passed to a DataLoader
# (`generator=g`) -- it is not referenced by the cells visible here.
g = torch.Generator()
g.manual_seed(RANDOM_STATE)

def seed_worker(worker_id):
    """Re-seed NumPy and Python's `random` from the current torch seed.

    The signature has the shape of a DataLoader `worker_init_fn`; it is not
    wired into any loader in the cells visible here.

    Args:
        worker_id: Accepted for the callback signature; not used in the body.

    Returns:
        None. The function only mutates the global NumPy / `random` state.
    """
    # Fold the torch seed into the 32-bit range that np.random.seed accepts.
    seed_32bit = torch.initial_seed() % (1 << 32)
    for reseed in (np.random.seed, random.seed):
        reseed(seed_32bit)

In [33]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

# Input files: the 100K-row subsample written earlier plus its companion CSVs.
data_path = Path("../../data/processed/historical/train/lcl_data_100K.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
# NOTE(review): outlier_path is not referenced by any cell visible here.
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

# 100,000 samples in batches of 25,000 -> 4 batches per epoch, which matches
# the "4/4" progress bar shown by the training cell.
dm = LCLDataModule(
    data_path=data_path,
    stats_path=stats_path,
    batch_size=25000,
    n_samples=100000,
)
dm.setup()
dl = dm.train_dataloader()

In [34]:
import torch

# SECURITY: "vae_model.pt" stores a fully pickled module (the result is used
# directly as a model below), so loading it executes code embedded in the file.
# Only load checkpoints from a trusted source.
#
# * weights_only=False is spelled out because recent PyTorch releases default
#   to weights_only=True, which refuses to unpickle a whole nn.Module.
# * map_location="cpu" keeps the model on the CPU regardless of the device it
#   was saved from, consistent with the CPU-only trainer used in this notebook.
vae_model = torch.load("vae_model.pt", map_location="cpu", weights_only=False)
vae_model.eval()

FaradayVAE(
  (encoder): Encoder(
    (encoder_layers): Sequential(
      (0): Linear(in_features=50, out_features=512, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=512, out_features=256, bias=True)
      (3): GELU(approximate='none')
      (4): Linear(in_features=256, out_features=128, bias=True)
      (5): GELU(approximate='none')
      (6): Linear(in_features=128, out_features=64, bias=True)
      (7): GELU(approximate='none')
      (8): Linear(in_features=64, out_features=32, bias=True)
      (9): GELU(approximate='none')
      (10): Linear(in_features=32, out_features=16, bias=True)
    )
  )
  (decoder): Decoder(
    (latent): Linear(in_features=18, out_features=16, bias=True)
    (latent_activations): GELU(approximate='none')
    (decoder_layers): Sequential(
      (0): Linear(in_features=16, out_features=32, bias=True)
      (1): GELU(approximate='none')
      (2): Linear(in_features=32, out_features=64, bias=True)
      (3): GELU(approximate='no

In [35]:
# Hyperparameters for the GMM experiments in the cells below.
N_COMPONENTS = 200  # number of mixture components (Lightning and scikit-learn)
REG_COVAR = 1e-4  # passed as `reg_covar` to the GMM initialisation and the Lightning GMM
EPOCHS = 25  # Lightning `max_epochs`; also scikit-learn `max_iter`
IDX = 0  # index of the component that is printed / compared in later cells
CONVERGENCE_TOL = 1e-2  # Lightning `convergence_tolerance`; also scikit-learn `tol`


# Init GMM

In [36]:
from opensynth.models.faraday.gmm.gmm_init import initialise_gmm_params

# Starting parameters for the mixture model, computed from the training
# data loader and the loaded VAE.
gmm_init_params = initialise_gmm_params(
    dl,
    vae_module=vae_model,
    n_components=N_COMPONENTS,
    reg_covar=REG_COVAR,
)

# Sanity checks: one entry of component IDX's precision Cholesky factor,
# then the total of the initial mixture weights (printed as 1. in the output).
print(
    gmm_init_params["precision_cholesky"][IDX][0][0],
    gmm_init_params["weights"].sum(),
    sep="\n",
)

tensor(5.0246)
tensor(1.)


# Torch Lightning Batch Learning

In [37]:
# Dimensionality of the GMM input: the VAE latent size plus one dimension per
# entry in the batch's "features" mapping.
first_batch = next(iter(dl))
features = first_batch["features"]
obtained_feature_list = [*features.keys()]

num_features = vae_model.latent_dim + len(obtained_feature_list)
num_features

18

In [39]:
from opensynth.models.faraday.gmm.gmm_model import (
    GaussianMixtureLightningModule,
    GaussianMixtureModel,
)

# Build the mixture model and load the starting parameters computed above.
gmm_module = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features=num_features,
    reg_covar=REG_COVAR,
)
gmm_module.initialise(gmm_init_params)
print(f"Initial prec chol: {gmm_module.precision_cholesky[IDX][0][0]}. Initial mean: {gmm_module.means[IDX][0]}")

# Wrap the GMM and the VAE in the Lightning module; the sizes and
# regularisation are read back from the GMM so the two stay in sync.
gmm_lightning_module = GaussianMixtureLightningModule(
    gmm_module=gmm_module,
    vae_module=vae_model,
    num_components=gmm_module.num_components,
    num_features=gmm_module.num_features,
    reg_covar=gmm_module.reg_covar,
    convergence_tolerance=CONVERGENCE_TOL,
    sync_on_batch=False,
)

# CPU-only, deterministic run for at most EPOCHS epochs over the training loader.
trainer = pl.Trainer(max_epochs=EPOCHS, accelerator="cpu", deterministic=True)
trainer.fit(gmm_lightning_module, dl)

GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/charlotte.avery/.virtualenvs/OpenSynth-BNsxhSIM/lib/python3.11/site-packages/pytorch_lightning/trainer/setup.py:177: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/Users/charlotte.avery/.virtualenvs/OpenSynth-BNsxhSIM/lib/python3.11/site-packages/pytorch_lightning/core/optimizer.py:182: `LightningModule.configure_optimizers` returned `None`, this fit will run with no optimizer

  | Name                      | Type                    | Params | Mode 
------------------------------------------------------------------------------
0 | gmm_module                | GaussianMixtureModel    | 0      | train
1 | vae_module                | FaradayVAE              | 402 K  | eval 
2 | weight_metric             | WeightsMetric           | 0      | train
3 | mean_metric               | MeansMetric             | 0      | train
4 | precision_ch

Initial prec chol: 5.024597644805908. Initial mean: 0.021900895982980728
Epoch 0: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s, v_num=130]Local weights at rank: 0 - means: 0.0181, -0.0063
Reduced weights, means: 0.0181,-0.0063, 
NLL:  tensor(3.2172)
Epoch 1: 100%|██████████| 4/4 [00:03<00:00,  1.29it/s, v_num=130]Local weights at rank: 0 - means: 0.0219, -0.0053
Reduced weights, means: 0.0219,-0.0053, 
NLL:  tensor(2.7763)
Epoch 2: 100%|██████████| 4/4 [00:02<00:00,  1.38it/s, v_num=130]Local weights at rank: 0 - means: 0.0222, 0.0014
Reduced weights, means: 0.0222,0.0014, 
NLL:  tensor(2.5918)
Epoch 3: 100%|██████████| 4/4 [00:02<00:00,  1.37it/s, v_num=130]Local weights at rank: 0 - means: 0.0217, -0.0174
Reduced weights, means: 0.0217,-0.0174, 
NLL:  tensor(2.4985)
Epoch 4: 100%|██████████| 4/4 [00:02<00:00,  1.42it/s, v_num=130]Local weights at rank: 0 - means: 0.0211, -0.0167
Reduced weights, means: 0.0211,-0.0167, 
NLL:  tensor(2.4708)
Epoch 5: 100%|██████████| 4/4 [00:02<00:00, 

In [40]:
# Component means after the Lightning fit. The output below contains rows that
# are entirely zero; presumably these are components that received no data --
# TODO confirm.
gmm_lightning_module.gmm_module.means

tensor([[-0.0535, -3.0132,  0.5096,  ..., -2.3300,  7.0925,  2.5998],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4412, -8.3531, -1.8495,  ..., -4.1836,  7.1546,  2.9216],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [41]:
# Count the Lightning components whose mean vector sums to exactly zero,
# used here as a proxy for "all-zero row".
# NOTE(review): the name below is misspelled ("ligthning"); it is left as-is
# in case other cells refer to it.
ligthning_sum_components = gmm_lightning_module.gmm_module.means.sum(axis=1)
lightning_zero_sum_mask = ligthning_sum_components == 0
len(ligthning_sum_components[lightning_zero_sum_mask])

97

## scikit-learn batch learning

In [43]:
from sklearn.mixture import GaussianMixture
from opensynth.models.faraday.gmm import gmm_utils

# Reuse the initial weights and means computed for the Lightning run.
# NOTE(review): the initial precisions and REG_COVAR are not passed on, so
# the two fits do not start from fully identical settings.
init_weights = gmm_init_params["weights"]
init_means = gmm_init_params["means"]

# warm_start=True makes every fit() call start from the previous solution,
# so the loop below carries the parameters from one batch to the next.
skgmm = GaussianMixture(
    n_components=N_COMPONENTS,
    covariance_type='full',
    tol=CONVERGENCE_TOL,
    max_iter=EPOCHS,
    random_state=0,
    means_init=init_means,
    weights_init=init_weights,
    warm_start=True,
    verbose=1,
)

dl = dm.train_dataloader()
# NOTE(review): next_batch is not used by the cells visible here -- confirm
# whether this extra batch load is needed.
next_batch = next(iter(dl))
for batch_num, batch_data in enumerate(dl):
    print("Batch number: ", batch_num)
    # Encode the raw batch with the VAE, then hand a NumPy copy to scikit-learn.
    batch_data = gmm_utils.encode_data(batch_data, vae_model)
    input_data = batch_data.detach().numpy()
    skgmm.fit(input_data)


Batch number:  0
Initialization 0
  Iteration 10
  Iteration 20
Initialization converged.
Batch number:  1
Initialization 0
Initialization converged.
Batch number:  2
Initialization 0
Initialization converged.
Batch number:  3
Initialization 0
Initialization converged.


In [44]:
# Component means after the scikit-learn fit; as with the Lightning run, the
# output below contains rows that are entirely zero.
skgmm.means_

array([[-0.29326892, -2.31318905,  0.65633773, ..., -2.20927247,
         6.84006862,  2.75435577],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.50843498, -1.96700384,  0.91861898, ..., -2.20977999,
         6.33845569,  3.025821  ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [45]:
# Count the scikit-learn components whose mean vector sums to exactly zero,
# mirroring the check applied to the Lightning means.
sklearn_sum_components = skgmm.means_.sum(axis=1)
sklearn_zero_sum_mask = sklearn_sum_components == 0
len(sklearn_sum_components[sklearn_zero_sum_mask])

144

# Compare

In [46]:
# Component index used by the comparison tables below. This repeats the value
# already assigned to IDX in the hyperparameter cell.
IDX = 0

In [47]:
# Side-by-side means of component IDX from the two fits.
# The Lightning tensor is converted with .detach().numpy() -- the same
# conversion the precision-Cholesky and weights comparisons use -- so the
# column holds a plain NumPy array rather than a raw torch tensor. The frame
# is built in one call instead of growing an empty DataFrame column by column.
df_compare_means = pd.DataFrame(
    {
        "skgmm": skgmm.means_[IDX],
        "lightning": gmm_lightning_module.gmm_module.means.detach().numpy()[IDX],
    }
)
df_compare_means

Unnamed: 0,skgmm,lightning
0,-0.293269,-0.053504
1,-2.313189,-3.0132
2,0.656338,0.509599
3,-0.320346,-0.400431
4,0.273952,0.020384
5,0.014188,0.143856
6,0.91029,0.88202
7,1.899968,1.754849
8,0.05703,0.0485
9,-2.388877,-1.616659


In [49]:
# First row of component IDX's precision Cholesky factor, scikit-learn vs. Lightning.
df_compare_pre_chol = pd.DataFrame(
    {
        "skgmm": skgmm.precisions_cholesky_[IDX][0],
        "lightning": gmm_lightning_module.gmm_module.precision_cholesky.detach().numpy()[IDX][0],
    }
)
df_compare_pre_chol

Unnamed: 0,skgmm,lightning
0,8.798937,7.678572
1,6.465024,7.192576
2,7.943102,10.451228
3,-2.094277,-8.580624
4,-4.160939,-3.901158
5,-10.824012,-5.436331
6,11.254416,5.779688
7,6.267616,19.31114
8,-0.423693,0.986895
9,-7.770061,-21.248657


In [50]:
# Mixture weights of the first ten components, scikit-learn vs. Lightning.
df_compare_weights = pd.DataFrame(
    {
        "skgmm": skgmm.weights_[:10],
        "lightning": gmm_lightning_module.gmm_module.weights.detach().numpy()[:10],
    }
)
df_compare_weights

Unnamed: 0,skgmm,lightning
0,0.04468736,0.01883411
1,8.881784e-20,4.768372e-12
2,0.03703079,4.768372e-12
3,0.02218388,0.002291919
4,0.007290996,0.0111413
5,8.881784e-20,0.01019686
6,8.881784e-20,4.768372e-12
7,0.01168187,4.768372e-12
8,8.881784e-20,0.01190537
9,8.881784e-20,4.768372e-12
