In [1]:
%load_ext autoreload
%autoreload 2

# Load Data

In [2]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

# Files of the processed historical training split used in this notebook.
data_path = Path("../../data/processed/historical/train/lcl_data.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

# Build the data module and run its setup step before the training
# dataloader is requested in a later cell.
dm = LCLDataModule(
    data_path=data_path,
    stats_path=stats_path,
    batch_size=25000,
    n_samples=50000,
)
dm.setup()

In [3]:
import torch
from opensynth.models.faraday import FaradayVAE
# The loaded object is later passed straight in as `vae_module=vae_model`, so
# the checkpoint is presumably a whole pickled model rather than a state dict.
# Such files need `weights_only=False`; spelling it out keeps the call working
# on PyTorch releases where `weights_only=True` is the default and avoids the
# FutureWarning emitted by releases that still default to False.
# SECURITY: full unpickling can execute arbitrary code - only load checkpoints
# that come from a trusted source.
vae_model = torch.load("vae_model.pt", weights_only=False)

  vae_model = torch.load("vae_model.pt")


In [4]:
from opensynth.models.faraday.gaussian_mixture.prepare_gmm_input import encode_data_for_gmm

# Take the first batch from the training dataloader and pass it, together
# with the loaded VAE, to the GMM input-preparation helper.
train_loader = dm.train_dataloader()
next_batch = next(iter(train_loader))
input_tensor = encode_data_for_gmm(data=next_batch, vae_module=vae_model)

# Detach from autograd and expose the result as a NumPy array for the
# NumPy / scikit-learn code further down; also record the number of rows.
input_data = input_tensor.detach().numpy()
n_samples = len(input_tensor)

In [66]:
# Number of mixture components: passed to `initialise_gmm_params` and used as
# the K-means cluster count in the scikit-learn reference cell.
N_COMPONENTS = 25

# Init GMM

In [74]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params


# Initial GMM parameters from the project implementation; the following cells
# display entries of this result next to the scikit-learn reference values.
gmm_init_params = initialise_gmm_params(X=input_data, n_components=N_COMPONENTS)

In [75]:
# Slice [0][0] of the initial `precision_cholesky` entry; the scikit-learn
# reference section displays `precisions_cholesky_[0][0]` for comparison.
gmm_init_params["precision_cholesky"][0][0]

tensor([ 0.0922,  0.0889,  0.0228,  0.1915,  0.2978,  0.3017, -0.1140,  0.0758,
        -0.8019,  0.2783, -0.7721, -0.1721, -0.5064,  0.6894, -0.7149, -0.2384,
         0.4642, -0.1257])

In [76]:
# Initial `weights` entry as returned by `initialise_gmm_params`; compare with
# `weights * n_samples` in the scikit-learn reference section.
gmm_init_params['weights']

tensor([  50.,  253., 1637.,  474.,  425., 3605.,  358.,   66., 1340.,   23.,
        4134., 1708.,  373.,  568., 1958.,  165., 3618.,  189.,   58.,   54.,
         361.,   32.,   48.,  132., 3371.], dtype=torch.float64)

# scikit-learn Reference Outputs

In [70]:
import numpy as np
from scipy import linalg

def sk_estimate_gaussian_parameters(X, resp, reg_covar):
    """Estimate per-component soft counts, means and full covariances.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Input samples.
    resp : ndarray of shape (n_samples, n_components)
        Weight of every sample for every component.
    reg_covar : float
        Constant added to the diagonal of each covariance matrix.

    Returns
    -------
    nk : ndarray of shape (n_components,)
        Column sums of ``resp`` plus a tiny epsilon.
    means : ndarray of shape (n_components, n_features)
        ``resp``-weighted mean of ``X`` for each component.
    covariances : ndarray of shape (n_components, n_features, n_features)
        ``resp``-weighted covariance of ``X`` around each mean, with
        ``reg_covar`` added on the diagonal.
    """
    # The epsilon keeps the divisions below away from an exact zero when a
    # component receives no weight at all.
    zero_guard = 10 * np.finfo(resp.dtype).eps
    nk = resp.sum(axis=0) + zero_guard
    means = np.dot(resp.T, X) / nk[:, np.newaxis]

    n_components, n_features = means.shape
    covariances = np.empty((n_components, n_features, n_features))
    for k, (mean_k, count_k) in enumerate(zip(means, nk)):
        centred = X - mean_k
        weighted_scatter = np.dot(resp[:, k] * centred.T, centred)
        covariances[k] = weighted_scatter / count_k
        # A stride of n_features + 1 over the flattened matrix walks its diagonal.
        covariances[k].flat[:: n_features + 1] += reg_covar

    return nk, means, covariances


def sk_compute_precision_cholesky(covariances):
    """Compute the Cholesky factor of each component's precision matrix.

    For every covariance matrix ``S`` with lower Cholesky factor ``L``
    (``S = L @ L.T``) the result holds ``P = inv(L).T``, so that
    ``P @ P.T`` equals ``inv(S)``.

    Parameters
    ----------
    covariances : ndarray of shape (n_components, n_features, n_features)
        One covariance matrix per component.

    Returns
    -------
    precisions_chol : ndarray of shape (n_components, n_features, n_features)

    Raises
    ------
    ValueError
        If a covariance matrix has no Cholesky decomposition. The underlying
        ``LinAlgError`` is attached as ``__cause__``.
    """
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    n_components, n_features, _ = covariances.shape
    precisions_chol = np.empty((n_components, n_features, n_features))
    # Loop-invariant right-hand side; solve_triangular does not overwrite it
    # with its default arguments.
    identity = np.eye(n_features)
    for k, covariance in enumerate(covariances):
        try:
            cov_chol = linalg.cholesky(covariance, lower=True)
        except linalg.LinAlgError as error:
            # Chain explicitly so the numerical failure stays visible as the cause.
            raise ValueError(estimate_precision_error_message) from error
        # Solving L @ Z = I gives Z = inv(L); the transpose is the stored factor.
        precisions_chol[k] = linalg.solve_triangular(
            cov_chol, identity, lower=True
        ).T

    return precisions_chol

In [71]:
from sklearn.cluster import KMeans

# Reference initialisation built from the `sk_*` helpers defined in this section.
n_samples, _ = input_data.shape
n_components = N_COMPONENTS

# K-means initialisation: turn the cluster labels into hard (one-hot) weights,
# one row per sample and one column per component.
resp = np.zeros((n_samples, n_components))
label = (
    KMeans(n_clusters=n_components, n_init=1, random_state=0)
    .fit(input_data)
    .labels_
)
resp[np.arange(n_samples), label] = 1

# Initialise GMM: 1e-6 is the `reg_covar` value added to each covariance diagonal.
weights, means, covariances = sk_estimate_gaussian_parameters(
    input_data, resp, 1e-6
)
# The helper returns per-component sums; dividing by the sample count turns
# them into mixture proportions.
weights /= n_samples

weights_ = weights
means_ = means
covariances_ = covariances

precisions_cholesky_ = sk_compute_precision_cholesky(covariances)

In [72]:
# Same [0][0] slice as displayed for `gmm_init_params["precision_cholesky"]`
# in the "Init GMM" section, here from the reference computation.
precisions_cholesky_[0][0]

array([ 0.09219085,  0.0889303 ,  0.02281819,  0.19152657,  0.29779096,
        0.30174624, -0.11401279,  0.0758457 , -0.80188263,  0.27832801,
       -0.77204619, -0.17212504, -0.50637951,  0.68939058, -0.71485848,
       -0.2383334 ,  0.46413801, -0.12564614])

In [73]:
# `weights` was divided by `n_samples` when it was computed, so multiplying
# back recovers the per-component sums of `resp` (up to the tiny epsilon).
weights * n_samples

array([  50.,  253., 1637.,  474.,  425., 3605.,  358.,   66., 1340.,
         23., 4134., 1708.,  373.,  568., 1958.,  165., 3618.,  189.,
         58.,   54.,  361.,   32.,   48.,  132., 3371.])