In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load Data

In [2]:
from pathlib import Path
from opensynth.data_modules.lcl_data_module import LCLDataModule
import pytorch_lightning as pl

import matplotlib.pyplot as plt

# Input files for the training split, given relative to the notebook's
# working directory.
data_path = Path("../../data/processed/historical/train/lcl_data.csv")
stats_path = Path("../../data/processed/historical/train/mean_std.csv")
# NOTE(review): outlier_path is defined but not referenced by any later
# visible cell.
outlier_path = Path("../../data/processed/historical/train/outliers.csv")

# Build the project data module and run its setup step. Only the first
# training batch (batch_size=25000) is consumed further down.
dm = LCLDataModule(data_path=data_path, stats_path=stats_path, batch_size=25000, n_samples=50000)
dm.setup()

In [3]:
import torch
from opensynth.models.faraday import FaradayVAE
# The checkpoint is used below as a ready-to-call module (vae_module=vae_model),
# i.e. it is a fully pickled model object rather than a state_dict.
# weights_only=False is passed explicitly: it silences torch's FutureWarning
# about the changing default and keeps this load working once the default
# becomes weights_only=True, which rejects arbitrary pickled objects.
# NOTE(security): unpickling can execute arbitrary code -- only load
# checkpoints that come from a trusted source.
vae_model = torch.load("vae_model.pt", weights_only=False)

  vae_model = torch.load("vae_model.pt")


In [4]:
from opensynth.models.faraday.gaussian_mixture.prepare_gmm_input import encode_data_for_gmm

# Only the first batch from the training dataloader is used in this notebook.
next_batch = next(iter(dm.train_dataloader()))
# encode_data_for_gmm is a project helper; presumably it runs the batch through
# the VAE and returns the feature tensor the GMM is fitted on -- TODO confirm.
input_tensor = encode_data_for_gmm(data=next_batch, vae_module=vae_model)
# NumPy version for the NumPy / scikit-learn code paths; the tensor itself is
# what the torch training loop receives.
input_data = input_tensor.detach().numpy()
# NOTE(review): n_samples is not used by any later visible cell.
n_samples = len(input_tensor)

In [5]:
# Settings shared by the torch, manual NumPy and scikit-learn runs below.
# Number of mixture components.
N_COMPONENTS = 10
# Covariance regularisation; the manual M-step adds it to each covariance
# diagonal and it is passed as reg_covar to the project GMM code.
REG_COVAR = 1e-4
# Iteration budget: max_iter for the torch loop, loop count for the manual EM
# loop (the scikit-learn fit uses EPOCHS - 1).
EPOCHS = 10
# Index of the mixture component whose parameters are printed and compared.
IDX = 0


# Init GMM

In [6]:
from opensynth.models.faraday.new_gmm import gmm_utils

# Call the project's centroid initialisation on its own to inspect what it
# returns (labels, means, responsibilities).
labels_, means_, responsibilities_ = gmm_utils.initialise_centroids(
        X=input_data, n_components=N_COMPONENTS
    )
# Sanity check on the dtypes of the returned objects.
print(labels_.dtype, responsibilities_.dtype, means_.dtype)

torch.float32 torch.float32 torch.float32


In [7]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params

# Initial GMM parameters from the project helper; the returned mapping is
# indexed below by "precision_cholesky", "weights" and (later) "means".
gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
# Spot-check: first entry of component IDX's precision Cholesky factor, and
# the sum of the mixture weights (expected to be 1).
print(gmm_init_params["precision_cholesky"][IDX][0][0])
print(gmm_init_params["weights"].sum())

tensor(0.0923)
tensor(1.)


In [8]:
from opensynth.models.faraday.new_gmm.train_gmm import initialise_gmm_params, training_loop
from opensynth.models.faraday.new_gmm.new_gmm_model import GaussianMixtureModel


# The initial parameters are rebuilt with the same arguments as in the
# previous cell.
# NOTE(review): presumably so this cell can be re-run on its own from a fresh
# initial state -- confirm; otherwise the earlier result could be reused.
gmm_init_params = initialise_gmm_params(
    X=input_data,
    n_components = N_COMPONENTS,
    reg_covar=REG_COVAR,
)
# Project torch GMM; num_features is the column count of the encoded data and
# print_idx is given the component index inspected throughout the notebook.
torch_gmm = GaussianMixtureModel(
    num_components=N_COMPONENTS,
    num_features = input_data.shape[1],
    reg_covar=REG_COVAR,
    print_idx=IDX
)
# Seed the model with the initial parameters, then train on the torch tensor
# with max_iter=EPOCHS.
torch_gmm.initialise(gmm_init_params)
trained_model = training_loop(model=torch_gmm, data=input_tensor, max_iter=EPOCHS)

100%|██████████| 10/10 [00:00<00:00, 59.43it/s]


# scikit-learn GMM: manual NumPy re-implementation of the EM steps

In [9]:
import numpy as np
from scipy.special import logsumexp
from scipy import linalg

def _estimate_gaussian_parameters(X, resp, reg_covar=REG_COVAR):
    """Estimate per-component mass, mean and full covariance from responsibilities.

    Args:
        X: data array; its rows are samples and its columns are features.
        resp: responsibilities with one row per sample and one column per
            component.
        reg_covar: value added to the diagonal of every covariance matrix.

    Returns:
        Tuple ``(nk, means, covariances)`` where ``nk`` holds the summed
        responsibilities per component (not normalised), ``means`` has one row
        per component and ``covariances`` has one square matrix per component.
    """
    # A tiny offset keeps the divisions below finite for empty components.
    component_mass = resp.sum(axis=0) + 10 * np.finfo(resp.dtype).eps
    component_means = np.dot(resp.T, X) / component_mass[:, np.newaxis]

    n_components, n_features = component_means.shape
    full_covariances = np.empty((n_components, n_features, n_features))
    # Stepping through the flattened matrix by n_features + 1 visits its diagonal.
    diagonal_stride = n_features + 1
    for idx, (centre, mass) in enumerate(zip(component_means, component_mass)):
        centred = X - centre
        full_covariances[idx] = np.dot(resp[:, idx] * centred.T, centred) / mass
        full_covariances[idx].flat[::diagonal_stride] += reg_covar
    return component_mass, component_means, full_covariances

def _compute_precision_cholesky(covariances):
    estimate_precision_error_message = (
        "Fitting the mixture model failed because some components have "
        "ill-defined empirical covariance (for instance caused by singleton "
        "or collapsed samples). Try to decrease the number of components, "
        "or increase reg_covar."
    )

    n_components, n_features, _ = covariances.shape
    precisions_chol = np.empty((n_components, n_features, n_features))
    for k, covariance in enumerate(covariances):
        try:
            cov_chol = linalg.cholesky(covariance, lower=True)
        except linalg.LinAlgError:
            raise ValueError(estimate_precision_error_message)
        precisions_chol[k] = linalg.solve_triangular(
            cov_chol, np.eye(n_features), lower=True
        ).T
    return precisions_chol

def _compute_log_det_cholesky(matrix_chol, n_features):
    n_components, _, _ = matrix_chol.shape
    log_det_chol = np.sum(
        np.log(matrix_chol.reshape(n_components, -1)[:, :: n_features + 1]), 1
    )
    return log_det_chol

def _estimate_log_gaussian_prob(X, means, precisions_chol):
    """Log-density of every sample under every full-covariance Gaussian component.

    Args:
        X: array of shape (n_samples, n_features).
        means: array of shape (n_components, n_features).
        precisions_chol: one precision Cholesky factor per component.

    Returns:
        Array of shape (n_samples, n_components).
    """
    n_samples, n_features = X.shape
    n_components, _ = means.shape

    log_det_term = _compute_log_det_cholesky(precisions_chol, n_features)

    squared_distance = np.empty((n_samples, n_components))
    component_params = zip(means, precisions_chol)
    for col, (centre, chol) in enumerate(component_params):
        # Project the samples and the mean through the factor, then take the
        # squared norm of the difference per sample.
        whitened = np.dot(X, chol) - np.dot(centre, chol)
        squared_distance[:, col] = np.square(whitened).sum(axis=1)

    gaussian_constant = n_features * np.log(2 * np.pi)
    return -0.5 * (gaussian_constant + squared_distance) + log_det_term

def _estimate_log_weights(weights):
        return np.log(weights)

def _estimate_weighted_log_prob(X, means, precisions_chol, weights):
    """Add the log mixture weights to the per-component Gaussian log-densities.

    Returns:
        Array with one row per sample and one column per component.
    """
    log_density = _estimate_log_gaussian_prob(X, means, precisions_chol)
    log_prior = _estimate_log_weights(weights)
    return log_density + log_prior


def _estimate_log_prob_resp(X, means, precisions_chol, weights):
    """Compute per-sample log-likelihoods and log-responsibilities.

    Returns:
        Tuple ``(log_prob_norm, log_resp)``: the log-sum-exp over components of
        the weighted log-probabilities for each sample, and the weighted
        log-probabilities with that normaliser subtracted.
    """
    joint_log_prob = _estimate_weighted_log_prob(X, means, precisions_chol, weights)
    sample_log_likelihood = logsumexp(joint_log_prob, axis=1)
    # Underflow in the subtraction is silenced, as in the original code.
    with np.errstate(under="ignore"):
        log_resp = joint_log_prob - sample_log_likelihood[:, np.newaxis]
    return sample_log_likelihood, log_resp

def _e_step(X, means, precisions_chol, weights):
    """E-step: mean per-sample log-likelihood plus the log-responsibilities.

    Returns:
        Tuple ``(mean_log_likelihood, log_resp)``.
    """
    per_sample_log_likelihood, log_resp = _estimate_log_prob_resp(
        X, means, precisions_chol, weights
    )
    mean_log_likelihood = np.mean(per_sample_log_likelihood)
    return mean_log_likelihood, log_resp

def _m_step(X, log_reponsibilities, reg_covar=REG_COVAR):
    """M-step: re-estimate the mixture parameters from log-responsibilities.

    Args:
        X: data array with one row per sample.
        log_reponsibilities: log-responsibilities with one row per sample and
            one column per component (as returned by ``_e_step``).
        reg_covar: value added to the diagonal of every covariance matrix.

    Returns:
        Tuple ``(precision_cholesky, weights, means)``. ``weights`` is
        normalised so that it sums to 1.
    """
    responsibilities = np.exp(log_reponsibilities)
    weights_, means_, covariances_ = _estimate_gaussian_parameters(
        X, responsibilities, reg_covar=reg_covar
    )
    # _estimate_gaussian_parameters returns the raw responsibility mass per
    # component. Normalise it so the weights form a probability vector;
    # otherwise the next E-step's log-likelihood is shifted by a constant.
    weights_ = weights_ / weights_.sum()

    precision_cholesky_ = _compute_precision_cholesky(covariances=covariances_)

    return precision_cholesky_, weights_, means_

In [10]:
# Start the NumPy EM loop from the same initial parameters that were handed to
# the torch model, converted to NumPy arrays.
means = gmm_init_params["means"].detach().numpy()
weights = gmm_init_params["weights"].detach().numpy()
prec_chol = gmm_init_params["precision_cholesky"].detach().numpy()

print(f"Initial prec chol: {prec_chol[IDX][0][0]}. Initial mean: {means[IDX][0]}")

# Each pass runs one E-step followed by one M-step and prints the first
# precision-Cholesky entry and first mean coordinate of component IDX before
# and after the update.
for i in range(EPOCHS):
    print(f"Old Prec Chol: {prec_chol[IDX][0][0]}. Old means: {means[IDX][0]}")
    # log_prob (the mean log-likelihood from the E-step) is not used further.
    log_prob, log_resp = _e_step(input_data, means, prec_chol, weights)
    prec_chol, weights, means = _m_step(input_data, log_resp)
    print(f"New prec chol: {prec_chol[IDX][0][0]}. New means: {means[IDX][0]}")

Initial prec chol: 0.09233899414539337. Initial mean: -32.447509765625
Old Prec Chol: 0.09233899414539337. Old means: -32.447509765625
New prec chol: 0.06472880061429055. New means: -25.715593655928068
Old Prec Chol: 0.06472880061429055. Old means: -25.715593655928068
New prec chol: 0.06267556517053612. New means: -21.7309191674698
Old Prec Chol: 0.06267556517053612. Old means: -21.7309191674698
New prec chol: 0.062342531825448314. New means: -19.749283787407165
Old Prec Chol: 0.062342531825448314. Old means: -19.749283787407165
New prec chol: 0.05939968037941207. New means: -16.349924500403812
Old Prec Chol: 0.05939968037941207. Old means: -16.349924500403812
New prec chol: 0.058542095117219264. New means: -12.840604303459832
Old Prec Chol: 0.058542095117219264. Old means: -12.840604303459832
New prec chol: 0.05931896324566232. New means: -11.045933302446542
Old Prec Chol: 0.05931896324566232. Old means: -11.045933302446542
New prec chol: 0.060650940875511615. New means: -10.052943788

# scikit-learn GMM: fit with `GaussianMixture` for comparison

In [11]:
from sklearn.mixture import GaussianMixture

# Start scikit-learn from the same initial weights and means as the torch model
# and the manual NumPy loop.
init_weights = gmm_init_params["weights"].detach().numpy()
init_means = gmm_init_params["means"].detach().numpy()

# reg_covar is passed explicitly so all three implementations regularise the
# covariances with the same value; scikit-learn's default differs from REG_COVAR.
# NOTE(review): max_iter is EPOCHS - 1 while the other two runs use EPOCHS --
# presumably to offset scikit-learn's own initialisation step; confirm.
skgmm = GaussianMixture(
    n_components=N_COMPONENTS,
    covariance_type='full',
    reg_covar=REG_COVAR,
    max_iter=EPOCHS - 1,
    random_state=0,
    means_init=init_means,
    weights_init=init_weights,
)
skgmm.fit(input_data)
# NOTE(review): skgmm_pred is not used by any later visible cell.
skgmm_pred = skgmm.predict(input_data)



In [12]:
# Component index inspected in the comparison cells below.
# NOTE(review): IDX is already set to 0 in the settings cell near the top, so
# this re-assignment is redundant unless a different component is wanted here.
IDX = 0

In [13]:
# Mean of component IDX after the scikit-learn fit.
skgmm.means_[IDX]

array([-8.76955749,  6.29801683,  2.42422483,  6.4354165 , -2.36006574,
        8.17225987,  4.36265813, 12.39972776,  2.82658783, 17.80273158,
       -8.58646057,  6.5972387 ,  1.34069643, -9.11925838,  0.88472509,
        6.53774552,  5.96765772,  3.00442179])

In [14]:
# Mean of component IDX from the trained torch GMM.
trained_model.means[IDX]

tensor([-8.6170,  6.2405,  2.2864,  6.3637, -2.3627,  8.0549,  4.3183, 12.0865,
         2.8429, 17.5462, -8.4114,  6.3496,  1.4047, -9.0281,  0.7731,  6.4449,
         5.9858,  2.9858])

In [15]:
# Mean of component IDX after the manual NumPy EM loop.
means[IDX]

array([-8.69751708,  6.25565895,  2.35867001,  6.42641889, -2.39910511,
        8.07440443,  4.30034328, 12.18440497,  2.8318163 , 17.62923671,
       -8.48207977,  6.4152795 ,  1.36035473, -9.02610299,  0.75895445,
        6.50198245,  5.98516536,  2.99519298])

In [16]:
# Initial mean of component IDX, for reference against the fitted values above.
gmm_init_params["means"][IDX]

tensor([-32.4475,   8.4070,  24.8205,  17.4919,   0.3753,  16.8688,   4.2048,
         51.8951,  -8.5819,  43.8303, -35.0830,  33.5587, -17.8375,  -8.3878,
          6.0216,  21.4091,   5.0877,   3.8421])

In [17]:
# First row of component IDX's precision Cholesky factor after the scikit-learn fit.
skgmm.precisions_cholesky_[IDX][0]

array([ 0.06369507,  0.03458898,  0.09399889,  0.13416007,  0.07894018,
        0.327084  , -0.08046169, -0.36430293, -0.50419288, -0.15294315,
       -0.2228582 ,  0.09015276, -1.08597186,  0.6824299 , -0.44803864,
        0.07492746, -0.0064414 , -0.05672007])

In [18]:
# First row of component IDX's precision Cholesky factor from the trained torch GMM.
trained_model.precision_cholesky[IDX][0]

tensor([ 0.0643,  0.0350,  0.0946,  0.1342,  0.0798,  0.3298, -0.0822, -0.3671,
        -0.5088, -0.1548, -0.2256,  0.0875, -1.0989,  0.6875, -0.4523,  0.0728,
        -0.0057, -0.0554])

In [19]:
# First row of component IDX's precision Cholesky factor after the manual NumPy EM loop.
prec_chol[IDX][0]

array([ 0.06402143,  0.03480142,  0.09415445,  0.13400933,  0.07951304,
        0.32869236, -0.08124659, -0.36509731, -0.50625905, -0.15396889,
       -0.22486298,  0.08955377, -1.09077244,  0.68359076, -0.4499195 ,
        0.07333384, -0.00572745, -0.05363178])

In [20]:
# First row of component IDX's initial precision Cholesky factor, for reference
# against the fitted values above.
gmm_init_params["precision_cholesky"][IDX][0]

tensor([ 0.0923,  0.0632,  0.0379,  0.1822,  0.1628,  0.2921, -0.0557, -0.1410,
        -0.6086,  0.1975, -0.1225,  0.2013, -0.7367,  0.4754, -0.1810,  0.3120,
         0.3510, -0.1842])