# Gaussian Mixture Models (GMM)

In [17]:
import numpy as np
from logpdf_loglikelihood_GAU import logpdf_GAU_ND #for single density of a gaussian component
from scipy.special import logsumexp #for marginalizing the joints to retrieve the GMM log density
from mean_covariance import vrow, vcol

In [3]:
def logpdf_GMM(X, gmm):
    """
    Evaluate the log density of every sample in X under a Gaussian Mixture Model (GMM).

    The GMM is treated as a latent variable model: for each component the
    log-joint (component log density + log weight) is computed, and the
    component is then marginalized out with a log-sum-exp.

    Parameters
    -X: matrix of size (D, N), one sample per column (D features, N samples).
    -gmm: list of gaussian components. Each one is a tuple of (weight, mean, covariance).
           weight: scalar mixture weight
           mean: mean of the component, forwarded unchanged to logpdf_GAU_ND
           covariance: matrix of size (D, D)
        Example: gmm = [(w1, mu1, C1), (w2, mu2, C2), ...]
    Returns
    -logdens: vector of size (N,) with the GMM log density of each sample.
    """
    n_components = len(gmm)
    n_samples = X.shape[1]

    # log_joints[k, i] = log N(x_i | mu_k, C_k) + log w_k
    log_joints = np.zeros((n_components, n_samples))
    for idx, (w, mu, C) in enumerate(gmm):
        log_joints[idx, :] = logpdf_GAU_ND(X, mu, C) + np.log(w)

    # Sum the joints over the latent component (axis 0) in the log domain.
    return logsumexp(log_joints, axis=0)

We now test the function that computes the GMM log density on a reference dataset, using reference GMM parameters:

In [4]:
from GMM_load import load_gmm, save_gmm

In [14]:
# Load the reference GMM parameters and the matching dataset
gmm = load_gmm('./GMM_models/GMM_4D_3G_init.json')
example_X = np.load('./Data/GMM_data_4D.npy')

print(f'Shape of example_X: {example_X.shape}')

GMM_ll = logpdf_GMM(example_X, gmm)
print(f'GMM log likelihood shape: {GMM_ll.shape}')

# Reference per-sample log densities to compare against
GMM_ll_solution = np.load('./GMM_models/ll/GMM_4D_3G_init_ll.npy')

# Fail loudly on a mismatch: a silent "no output" is indistinguishable from a cell that
# simply printed nothing, so a wrong implementation could go unnoticed.
if not np.allclose(GMM_ll, GMM_ll_solution):
    raise AssertionError("The computed log likelihoods do NOT match the solution.")
print("The computed log likelihoods match the solution.")

Shape of example_X: (4, 1000)
GMM log likelihood shape: (1000,)
The computed log likelihoods match the solution.


In [16]:
# Check with 1-D data
gmm_1D = load_gmm('./GMM_models/GMM_1D_3G_init.json')
example_X_1D = np.load('./Data/GMM_data_1D.npy')
print(f'Shape of example_X_1D: {example_X_1D.shape}')
GMM_ll_1D = logpdf_GMM(example_X_1D, gmm_1D)
print(f'GMM log likelihood shape for 1D data: {GMM_ll_1D.shape}')

# Reference per-sample log densities for the 1-D case
GMM_ll_1D_solution = np.load('./GMM_models/ll/GMM_1D_3G_init_ll.npy')

# Fail loudly on a mismatch instead of silently printing nothing.
if not np.allclose(GMM_ll_1D, GMM_ll_1D_solution):
    raise AssertionError("The computed log likelihoods for 1D data do NOT match the solution.")
print("The computed log likelihoods for 1D data match the solution.")

Shape of example_X_1D: (1, 4000)
GMM log likelihood shape for 1D data: (4000,)
The computed log likelihoods for 1D data match the solution.


# GMM Estimation: E-M Algorithm

1) **E-step**: here we minimize the KL divergence between the distribution used for the latent cluster variables and their true posterior, so that the ELBO rises until it equals the GMM log-likelihood objective. In practice, the KL divergence reaches its minimum (zero) when that distribution is the cluster posterior itself, so we are required to compute the responsibilities, i.e. the cluster posteriors under the current parameters $\theta_t$. With them we build the **auxiliary function**:
    $$
    \text{AUX}(\theta, \theta_t) = \mathbb{E}_{C_1, ..., C_N \mid \mathbf{X}_1 = \mathbf{x}_1, ...., \mathbf{X}_N = \mathbf{x}_N, \theta_t} \left[ \log f_{\mathbf{X}_1, ..., \mathbf{X}_N, C_1, ..., C_N}(\mathbf{x}_1, ..., \mathbf{x}_N, c_1, ..., c_N \mid \theta)\right]
    $$
    Exploiting the *i.i.d* assumption for our samples, and the linearity property of the expectation operator (i.e. an expectation over a sum is equal to the sum of all the single expectations):
    $$
    \text{AUX}(\theta, \theta_t) = \sum_{i = 1}^N \mathbb{E}_{C_1, ..., C_N \mid \mathbf{X}_1 = \mathbf{x}_1, ...., \mathbf{X}_N = \mathbf{x}_N, \theta_t} \left[ \log f_{\mathbf{X}_i, C_i}(\mathbf{x}_i, c_i \mid \theta)\right]
    $$
    Then we can observe that, since each expectation only involves the log-joint of the i-th sample, it depends only on the posterior of the i-th sample:
    $$
    \text{AUX}(\theta, \theta_t) = \sum_{i = 1}^N \mathbb{E}_{C_i \mid \mathbf{X}_i = \mathbf{x}_i, \theta_t} \left[ \log f_{\mathbf{X}_i, C_i}(\mathbf{x}_i, c_i \mid \theta)\right]
    $$
    Then, computing the responsibilities, we can write the values of these expectations as:
    $$
    \text{AUX}(\theta, \theta_t) = \sum_{i = 1}^N \sum_{c = 1}^{K} \gamma_{c, i} \left[ \log \mathcal{N}(\mathbf{x}_i \mid \mu_c, \mathbf{\Sigma}_c) + \log w_c\right]
    $$
    So, $\text{AUX}(\theta, \theta_t)$ depends on the cluster posteriors, which we compute in this step like this:
    $$
    \gamma_{c, i} = f_{C \mid \mathbf{X}}(c \mid \mathbf{x}_i, \mathbf{\theta}_t) = \frac{\mathcal{N}(\mathbf{x}_i \mid \mu_c, \mathbf{\Sigma}_c) \, w_c}{\sum_{c' = 1}^K \mathcal{N}(\mathbf{x}_i \mid \mu_{c'}, \mathbf{\Sigma}_{c'}) \, w_{c'}}
    $$

2) **M-step**: in this step we use the responsibilities computed in the E-step to estimate a new set of model parameters. In practice, we maximize the auxiliary function $\text{AUX}(\theta, \theta_t)$ with respect to $\theta = \left[ \mathbf{M}, \mathbf{S}, \mathbf{w}\right]$, subject to the constraint $\sum_{c = 1}^K w_c = 1$, following a ML approach, and we find:
    $$
    \mu_c^{t+1} = \frac{\sum_{i = 1}^N \gamma_{c, i} \mathbf{x}_i}{\sum_{i = 1}^N \gamma_{c, i}}
    $$
    $$
    \mathbf{\Sigma}_c^{t+1} = \frac{\sum_{i = 1}^N \gamma_{c, i} \left( \mathbf{x}_i - \mu_c^{t+1}\right) \left( \mathbf{x}_i - \mu_c^{t+1}\right)^T}{\sum_{i = 1}^N \gamma_{c, i}}
    $$
    $$
    w_c^{t+1} = \frac{\sum_{i = 1}^N \gamma_{c, i}}{\sum_{c' = 1}^K \sum_{i = 1}^N \gamma_{c', i}}
    $$
    Here we can call $N_c = \sum_{i = 1}^N \gamma_{c, i}$ and $N = \sum_{c' = 1}^K \sum_{i = 1}^N \gamma_{c', i}$. Note that when computing these parameters for cluster $c$ we **always sum over all the points in the dataset, each weighted by its responsibility**, thus respecting the **soft assignment** approach. <br>
    When programming the algorithm, we can build and reuse for all the three estimates these zero, first and second order statistics:
    $$
    Z_c = \sum_{i = 1}^N \gamma_{c, i} \qquad
    
    \mathbf{F}_c = \sum_{i = 1}^N \gamma_{c, i} \mathbf{x}_i \qquad
    
    \mathbf{S}_c = \sum_{i = 1}^N \gamma_{c, i} \mathbf{x}_i \mathbf{x}_i^T
    $$
    and rewrite the new three estimates for cluster $c$ this way:
    $$
    \mu_c^{t+1} = \frac{\mathbf{F}_c}{Z_c}
    $$
    $$
    \mathbf{\Sigma}_c^{t+1} = \frac{\mathbf{S}_c}{Z_c} - \mu_c^{(t+1)} {\mu_c^{(t+1)}}^T
    $$
    $$
    w_c^{t+1} = \frac{Z_c}{N}
    $$

E-M in practice:

In [38]:
def GMM_EM_iteration(X, gmm_start):
    """
    Perform one single Expectation-Maximization update of a Gaussian Mixture Model (GMM).

    Parameters
    -X: matrix of size (D, N) where D is the number of features and N is the number of data points.
    -gmm_start: list of the current GMM components. Can be obtained with either K-Means or LBG Algorithm.
             Each one is a tuple of (weight, mean, covariance).
             weight: scalar
             mean: mean of the component, forwarded unchanged to logpdf_GAU_ND
             covariance: matrix of size (D, D)
    Returns
    -gmm: list of re-estimated gaussian components. Each one is a tuple of (weight, mean, covariance).
          weight: scalar
          mean: first-order statistic (reshaped by vcol) divided by the zero-order one
          covariance: matrix of size (D, D)
    """
    n_components = len(gmm_start)
    n_samples = X.shape[1]

    # ---- E-STEP: responsibilities (cluster posteriors) ----
    # log_joints[k, i] = log N(x_i | mu_k, C_k) + log w_k
    log_joints = np.zeros((n_components, n_samples))
    for idx, (w, mu, C) in enumerate(gmm_start):
        log_joints[idx, :] = logpdf_GAU_ND(X, mu, C) + np.log(w)

    # Log-marginal of each sample: the joints summed over the components, shape (N,)
    log_marginals = logsumexp(log_joints, axis=0)

    # (K, N) - (N,): the marginals are broadcast as a (1, N) row over the K components.
    # Exponentiating the log-posteriors gives the responsibilities, shape (K, N).
    responsibilities = np.exp(log_joints - log_marginals)

    # ---- M-STEP: re-estimate the parameters of every component ----
    gmm = []
    for gamma in responsibilities:
        weighted_X = vrow(gamma) * X  # each sample scaled by its responsibility

        # Zero, first and second order statistics of the component
        Z_gamma = np.sum(gamma)
        F_gamma = vcol(np.sum(weighted_X, axis=1))
        S_gamma = weighted_X @ X.T  # (D, N) @ (N, D) = (D, D)

        mu_new = F_gamma / Z_gamma
        cov_new = S_gamma / Z_gamma - vcol(mu_new) @ vrow(mu_new)
        # The responsibilities of each sample sum to 1, so they total N over all components
        w_new = Z_gamma / n_samples

        gmm.append((w_new, mu_new, cov_new))

    return gmm



def GMM_EM(X, gmm_start, threshold_stop=1e-6, max_iter=100):
    """
    EM algorithm for Gaussian Mixture Models (GMM).

    EM iterations are repeated until the improvement of the mean log likelihood
    falls below threshold_stop, or until max_iter iterations have been run.
    At least one EM iteration is always executed.

    Parameters
    -X: matrix of size (D, N) where D is the number of features and N is the number of data points.
    -gmm_start: sequence of starter GMM components. Can be obtained with either K-Means or LBG Algorithm.
             Each one is a tuple of (weight, mean, covariance). It is not modified.
    -threshold_stop: threshold for stopping the EM algorithm. If the increase of the mean
             log likelihood between two consecutive iterations is less than this value, stop.
    -max_iter: maximum number of iterations for the EM algorithm.
    Returns
    -gmm: list of gaussian components produced by the last EM iteration.
          Each one is a tuple of (weight, mean, covariance).
    """
    gmm_old = list(gmm_start)  # shallow copy: the caller's sequence is never rebound or altered

    # Mean log likelihood of the current parameters. It is computed once here and then
    # carried over between iterations, instead of being recomputed at every loop pass.
    GMM_ll_old = logpdf_GMM(X, gmm_old).mean()

    num_iters = 0
    while True:
        # Run 1 iteration of EM and score the new parameters
        gmm_new = GMM_EM_iteration(X, gmm_old)
        GMM_ll_new = logpdf_GMM(X, gmm_new).mean()

        # EM guarantees a non-decreasing likelihood: a decrease signals a numerical problem
        if GMM_ll_old > GMM_ll_new:
            print("Warning: mean GMM log likelihood decreased. This is unexpected.")
            print(f"GMM_ll_old (mean): {GMM_ll_old}, GMM_ll_new (mean): {GMM_ll_new}")

        # Converged: the improvement is below the threshold
        if GMM_ll_new - GMM_ll_old < threshold_stop:
            break

        num_iters += 1
        if num_iters >= max_iter:
            print(f"Reached maximum number of iterations: {max_iter}. Stopping EM.")
            break

        # The new parameters (and their already-computed likelihood) become the old ones
        gmm_old = gmm_new
        GMM_ll_old = GMM_ll_new

    return gmm_new  # parameters produced by the last EM iteration


Let's apply the E-M Algorithm to the example data, using example initial params:

In [39]:
example_X = np.load('./Data/GMM_data_4D.npy')
gmm_start = load_gmm('./GMM_models/GMM_4D_3G_init.json')
threshold_stop = 1e-6
gmm_final = GMM_EM(example_X, gmm_start, threshold_stop=threshold_stop)


# Reference GMM parameters to compare against
gmm_EM_solution = load_gmm('./GMM_models/GMM_4D_3G_EM.json')

# Fail loudly on any mismatch instead of silently printing nothing.
# The number of components is checked too: comparing only the first len(gmm_final)
# entries would miss a reference with extra components.
if len(gmm_final) != len(gmm_EM_solution):
    raise AssertionError(
        f"Number of components differs: computed {len(gmm_final)}, solution {len(gmm_EM_solution)}."
    )

for k in range(len(gmm_final)):
    estimated, reference = gmm_final[k], gmm_EM_solution[k]
    component_matches = (
        np.isclose(estimated[0], reference[0])        # weight
        and np.allclose(estimated[1], reference[1])   # mean
        and np.allclose(estimated[2], reference[2])   # covariance
    )
    if not component_matches:
        raise AssertionError(f"The computed GMM parameters do NOT match the solution (component {k}).")

print("The computed GMM parameters match the solution.")

The computed GMM parameters match the solution.
