In [1]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import logsumexp
from scipy.stats import multivariate_normal
%matplotlib inline

In [2]:
def log_sum_exp(Z):
    r"""Compute log(\sum_i exp(Z_i)) for some array Z in a numerically stable way.

    The maximum entry is factored out before exponentiating so that large
    positive or negative values do not overflow/underflow.

    Parameters
    ----------
    Z : array-like of float
        Values in the log domain. Must contain at least one element
        (``np.max`` raises ``ValueError`` on empty input).

    Returns
    -------
    numpy.float64
        log(sum(exp(Z))). Returns ``-inf`` when every entry is ``-inf`` and
        ``+inf`` when any entry is ``+inf``; ``nan`` entries propagate.
    """
    Z = np.asarray(Z, dtype=float)
    z_max = np.max(Z)

    # With a non-finite maximum, `Z - z_max` would evaluate inf - inf = nan.
    # The maximum itself is already the correct limit in those cases.
    if not np.isfinite(z_max):
        return z_max

    return z_max + np.log(np.sum(np.exp(Z - z_max)))

def loglikelihood(data, weights, means, covs):
    """Compute the log-likelihood of the data for a Gaussian mixture model.

    Parameters
    ----------
    data : array-like, shape (num_data, num_dim)
        One observation per row.
    weights : sequence of float, length num_clusters
        Mixing weight of each cluster.
    means : sequence of array-like, each of shape (num_dim,)
        Mean vector of each cluster.
    covs : sequence of array-like, each of shape (num_dim, num_dim)
        Covariance matrix of each cluster.

    Returns
    -------
    numpy.float64
        sum_i log( sum_k weights[k] * N(data[i] | means[k], covs[k]) ).

    Raises
    ------
    numpy.linalg.LinAlgError
        If a covariance matrix is exactly singular.
    """
    data = np.asarray(data, dtype=float)
    num_clusters = len(means)
    num_dim = data.shape[1]

    # Z[i, k] = log(weights[k]) + log N(data[i] | means[k], covs[k])
    Z = np.empty((data.shape[0], num_clusters))
    for k in range(num_clusters):
        cov = np.asarray(covs[k], dtype=float)
        delta = data - np.asarray(means[k], dtype=float)

        # (x-mu)^T * Sigma^{-1} * (x-mu) for every row at once. Solving the
        # linear system once per cluster avoids forming the explicit inverse
        # (and avoids re-inverting the same matrix for every data point).
        exponent_term = np.einsum('ij,ji->i', delta, np.linalg.solve(cov, delta.T))

        # slogdet returns log|det| directly, so the determinant itself can
        # never overflow to inf or underflow to 0 before the log is taken.
        sign, log_abs_det = np.linalg.slogdet(cov)
        # A non-positive determinant yields nan, as log(det) did before.
        log_det = log_abs_det if sign > 0 else np.nan

        Z[:, k] = np.log(weights[k]) - 1/2. * (num_dim * np.log(2*np.pi) + log_det + exponent_term)

    # Stable log-sum-exp across clusters for each point, then sum over points.
    return logsumexp(Z, axis=1).sum()

In [25]:
# NOTE(review): leftover debugging cell. No top-level `means` is defined in the
# visible notebook (the name only exists as a parameter/local inside
# loglikelihood and EM), hence the NameError recorded below. Delete this cell,
# or inspect `initial_means` after the cell that defines it.
len(means)

NameError: name 'means' is not defined

In [34]:
def EM(data, init_means, init_covariances, init_weights, maxiter=20, thresh=None, verbose=True):
    """Fit a Gaussian mixture model with the expectation-maximization algorithm.

    Parameters
    ----------
    data : array-like, shape (num_data, num_dim)
        One observation per row.
    init_means : sequence of array-like, each of shape (num_dim,)
        Initial mean of each cluster.
    init_covariances : sequence of array-like, each of shape (num_dim, num_dim)
        Initial covariance matrix of each cluster.
    init_weights : sequence of float
        Initial mixing weight of each cluster.
    maxiter : int, optional
        Maximum number of EM iterations.
    thresh : float or None, optional
        If given, stop early once the log-likelihood improves by less than
        this amount between two iterations. The default ``None`` always runs
        ``maxiter`` iterations.
    verbose : bool, optional
        Print the per-iteration progress messages.

    Returns
    -------
    dict
        'weights' (list), 'means' (list of arrays), 'covs' (list of arrays),
        'loglik' (log-likelihood before the first iteration followed by the
        value after each iteration) and 'resp' (array of shape
        (num_data, num_clusters) holding the final responsibilities).
    """
    data = np.asarray(data, dtype=float)

    # Real copies of the initial parameters. A plain `[:]` slice of an ndarray
    # is a view, so the updates below would overwrite the caller's arrays.
    means = [np.array(m, dtype=float) for m in init_means]
    covariances = [np.array(c, dtype=float) for c in init_covariances]
    weights = list(init_weights)

    # Infer dimensions of dataset and the number of clusters
    num_data = len(data)
    num_clusters = len(means)

    # Initialize some useful variables
    resp = np.zeros((num_data, num_clusters))
    ll = loglikelihood(data, weights, means, covariances)
    ll_trace = [ll]

    for i in range(maxiter):
        # E-step: compute responsibilities in the log domain. Multiplying raw
        # pdf values underflows to 0 for points far from every cluster, which
        # turns the normalization into 0/0.
        with np.errstate(divide='ignore'):
            log_weights = np.log(weights)  # a zero weight maps to -inf
        log_resp = np.empty((num_data, num_clusters))
        for k in range(num_clusters):
            log_resp[:, k] = log_weights[k] + multivariate_normal.logpdf(data, means[k], covariances[k])
        log_resp -= log_resp.max(axis=1)[:, np.newaxis]
        resp = np.exp(log_resp)
        resp /= resp.sum(axis=1)[:, np.newaxis]  # normalize over all possible cluster assignments

        # M-step: re-estimate each cluster from its soft counts
        counts = np.sum(resp, axis=0)
        if verbose:
            print ("num_data:{}".format(num_data))
        for k in range(num_clusters):
            weights[k] = counts[k]/num_data
            means[k] = np.dot(resp[:, k], data)/counts[k]

            # The covariance is taken around the freshly updated mean.
            delta = data - means[k]
            covariances[k] = np.dot((resp[:, k][:, np.newaxis]*delta).T, delta)/counts[k]

        # Compute the loglikelihood at this iteration
        ll_latest = loglikelihood(data, weights, means, covariances)
        ll_trace.append(ll_latest)
        improvement = ll_latest - ll
        ll = ll_latest

        if verbose:
            print("Iteration : {} - means:{}".format(i,means))

        # Optional early stop once the log-likelihood has plateaued
        if thresh is not None and improvement < thresh and ll_latest > -np.inf:
            break

    out = {'weights': weights, 'means': means, 'covs': covariances, 'loglik': ll_trace, 'resp': resp}

    return out

In [13]:
# Load the data set: np.loadtxt parses the whitespace-delimited text file into a float array.
# The EM run recorded further down reports 272 rows and 2-component means, so X is
# expected to have shape (272, 2) - presumably Old Faithful eruption duration and
# waiting time; confirm against the data file.
X = np.loadtxt('data/Faithful.txt')

In [28]:
# NOTE(review): in the visible notebook `initial_weights` is first assigned in a
# later cell, so this cell only works with leftover kernel state. The recorded
# output (2) presumably dates from the two-cluster configuration that is now
# commented out. Move this check below the initialization cell or remove it.
len(initial_weights)

2

In [36]:
# Seed NumPy's global RNG. Nothing in the visible cells draws random numbers,
# so this only matters if other cells do - TODO confirm it is still needed.
np.random.seed(234)

# Alternative configuration, kept for reference: two clusters
#initial_means = [np.array([3.467750,70.132353]),np.array([3.5078162,71.6617647])]
#initial_covs = [np.array([[1.2975376,13.9110994],[13.911099,183.559040]])]*2
#initial_weights = [0.50062804,0.49937196]

# Active configuration: three clusters
initial_means = [np.array([3.4459639,69.8433735]),np.array([3.6217053,72.1578947]),np.array([3.3893617,70.5531915])]
# Build one independent covariance array per cluster. `[arr]*3` would put the
# SAME ndarray object in all three slots, so any in-place update of one
# cluster's covariance would silently change the other two as well.
initial_covs = [np.array([[1.2877935,13.842302],[13.8423020,183.208932]]) for _ in range(3)]
initial_weights = [0.30514706,0.34926471,0.34558824]

# Run EM
results = EM(X, initial_means, initial_covs, initial_weights)

num_data:272
Iteration : 0 - means:[array([ 3.44630616, 69.85042535]), array([ 3.62064297, 72.13856517]), array([ 3.39011184, 70.56633575])]
num_data:272
Iteration : 1 - means:[array([ 3.43660804, 69.72137797]), array([ 3.63621595, 72.31525135]), array([ 3.38291679, 70.50148676])]
num_data:272
Iteration : 2 - means:[array([ 3.42351954, 69.54024329]), array([ 3.65789994, 72.5610335 ]), array([ 3.37254255, 70.41275219])]
num_data:272
Iteration : 3 - means:[array([ 3.40734429, 69.31384542]), array([ 3.68530248, 72.8707839 ]), array([ 3.35908913, 70.29901903])]
num_data:272
Iteration : 4 - means:[array([ 3.38730876, 69.03567538]), array([ 3.71981131, 73.26035801]), array([ 3.34179192, 70.14953437])]
num_data:272
Iteration : 5 - means:[array([ 3.36210127, 68.69296379]), array([ 3.76375171, 73.75592376]), array([ 3.31936386, 69.94808391])]
num_data:272
Iteration : 6 - means:[array([ 3.32966354, 68.26506744]), array([ 3.82072231, 74.39698842]), array([ 3.28975384, 69.67039693])]
num_data:272


In [37]:
# Mixing weights of the fitted clusters (one entry per cluster).
results['weights']

[0.24232013605095637, 0.6304087799254683, 0.12727108402357576]

In [38]:
# Fitted mean vector of each cluster.
results['means']

[array([ 1.98003614, 53.3049878 ]),
 array([ 4.31128663, 80.23636033]),
 array([ 2.27944563, 58.13167198])]

In [39]:
# Fitted covariance matrix of the first cluster (index 0).
results['covs'][0]

array([[ 0.03776105,  0.12703753],
       [ 0.12703753, 27.07162665]])