# Library Used in Coding Assignment
Below code is for import all Python Packages
Below Packges have imported & its purpose

(1) numpy - numerical python scripts, to store array, list and other data

(2) scipy.stats - multivariate_normal to calculate the probability density function (pdf)

(2) matplotlib.pyplot - matplotlib's Pyplot packages to plot the data with the predicted mean from the EM's algo


In [141]:
import numpy as np
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Function Name = "log_sub_exp"
Below Function calculates Log Sum Expression


In [142]:
def log_sum_exp(Z):
    #Compute log(\sum_i exp(Z_i)) for Z (array)
    return np.max(Z) + np.log(np.sum(np.exp(Z - np.max(Z))))

# Function Name = "loglikelihood"


The function generates the 20 centers, 10 centers from mean = (0,1) and other 10 center would be from the mean = (1,0) with standard devitation of 0.5. Both the center (10 each) needs to be appended and return back to the caller

The numpy function np.random.normal is used to generate the centers.

The function would be called from the caller with seed, so it would be consistently same accross multiple runs

Following formula is used to generate the 20 centers


In [106]:
def loglikelihood(data, G, weights, means, covs):
    #Compute the loglikelihood of the data for a Gaussian mixture model
    num_clusters = G
    num_dim = len(data[0])
    
    ll = 0
    for d in data:
        
        Z = np.zeros(num_clusters)
        for k in range(num_clusters):
            
            # Compute (x-mu)^T * Sigma^{-1} * (x-mu)
            delta = np.array(d) - means[k]
            exponent_term = np.dot(delta.T, np.dot(np.linalg.inv(covs[k]), delta))
            
            # Compute loglikelihood contribution for this data point and this cluster
            Z[k] += np.log(weights[k])
            Z[k] -= 1/2. * (num_dim * np.log(2*np.pi) + np.log(np.linalg.det(covs[k])) + exponent_term)
            
        # Increment loglikelihood contribution of this data point across all clusters
        ll += log_sum_exp(Z)
        
    return ll

# Function Name = "EStep"


In [134]:
# E-step
def EStep(data, G, init_weights, init_means, init_covariances):
    
    # Copy the initialize the variable
    weights = init_weights[:]    
    means = init_means[:]
    covariances = init_covariances[:]

    num_data = len(data)
    num_clusters = len(means)#G

    # Initialize resp
    resp = np.zeros((num_data, num_clusters))
    
    #Loop
    for j in range(num_data):
        for k in range(num_clusters):
            resp[j, k] = weights[k]*multivariate_normal.pdf(data[j],means[k],covariances[k])
        row_sums = resp.sum(axis=1)[:, np.newaxis]
        resp = resp / row_sums # normalize the responsibility
    return resp

# Function Name = "MStep"


In [135]:
# M-step        
def MStep(data, G, init_weights, init_means, init_covariances, resp):
    
    # Copy the initialize the variable
    weights = init_weights[:]
    means = init_means[:]
    covariances = init_covariances[:]

    num_data = len(data)
    num_dim = len(data[0])    
    num_clusters = len(means)#G
    
    # Initialize some useful variables
    ll = loglikelihood(data, G, weights, means, covariances)
    ll_trace = [ll]
    
    counts = np.sum(resp, axis=0)
    
    for k in range(num_clusters):
        weights[k] = counts[k]/num_data
        weighted_sum = 0
        for j in range(num_data):
            weighted_sum += (resp[j,k]*data[j])
        means[k] = weighted_sum/counts[k]

        weighted_sum = np.zeros((num_dim, num_dim))
        for j in range(num_data):
            weighted_sum += (resp[j,k]*np.outer(data[j]-means[k],data[j]-means[k]))
        covariances[k] = weighted_sum/counts[k]

    # Compute the loglikelihood at this iteration
    ll_latest = loglikelihood(data, G, weights, means, covariances)
    ll_trace.append(ll_latest)

    ll = ll_latest
    
    out = {'weights': weights, 'means': means, 'covs': covariances, 'loglik': ll_trace, 'resp': resp}
    return out

In [136]:
def myEM(data, G, init_weights, init_means, init_covariances, maxiter=20):
    
    for i in range(maxiter):
        print ("Value of G:{}".format(G))
        response = EStep(data, G, init_weights, init_means, init_covariances )
        out = MStep(data, G, init_weights, init_means, init_covariances, response)
        print("Iteration : {} - weights:{} means:{} sigma:{}".format(i,out['weights'],out['means'],out['covs']))
        
        #plt.figure(figsize=(12,8))
        #plt.scatter(X[:,0],X[:,1])
        #plt.scatter(means[0][0], means[0][1], color = "red")
        #plt.scatter(means[1][0], means[1][1],color="orange")
    
    return out

# Load the Data (Faithful.txt)

In [137]:
#Load the Data
X = np.loadtxt('../data/Faithful.txt')
print ("Data Loaded Successfully...")
print ("First 10 rows from the Faithful Dataset")
print (X[1:11,:])
print ("------------------------------------------")
print ("Size of the Dataset is: {}".format(X.shape))

Data Loaded Successfully...
First 10 rows from the Faithful Dataset
[[ 1.8   54.   ]
 [ 3.333 74.   ]
 [ 2.283 62.   ]
 [ 4.533 85.   ]
 [ 2.883 55.   ]
 [ 4.7   88.   ]
 [ 3.6   85.   ]
 [ 1.95  51.   ]
 [ 4.35  85.   ]
 [ 1.833 54.   ]]
------------------------------------------
Size of the Dataset is: (272, 2)


# Testing the Function myFunc (Two Cluster)

(1) weights / prob = 
          [0.50062804,0.49937196]

(2) means = 
          [3.467750,70.132353]
          [3.5078162,71.6617647]


(3) covariances / sigma = 
          [1.2975376,13.9110994]
          [13.911099,183.559040]
          

In [139]:
#Two Cluster Initialization
init_weights = [0.50062804,0.49937196]
init_means = [np.array([3.467750,70.132353]),np.array([3.5078162,71.6617647])]
init_covs = [np.array([[1.2975376,13.9110994],[13.911099,183.559040]])]*2

itmax = 20
K = len(init_weights) #Number of Clusters
out = myEM(data=X, G=K, init_weights = init_weights, init_means = init_means, init_covariances = init_covs, maxiter=itmax)
print ("------------------------------------------------------------------------------------------------------------")
print ("weight: {}".format(out['weights']))
print ("means: {}".format(out['means']))
print ("sigma: {}".format(out['covs']))
print ("------------------------------------------------------------------------------------------------------------")

Value of G:2
Iteration : 0 - weights:[0.5006526542469939, 0.49934734575300627] means:[array([ 3.46776969, 70.13260977]), array([ 3.5078488 , 71.66350617])] sigma:[array([[  1.32238528,  14.18954649],
       [ 14.18954649, 185.91012994]]), array([[  1.27262438,  13.63188485],
       [ 13.63188485, 181.19953113]])]
Value of G:2
Iteration : 1 - weights:[0.5006526542469939, 0.49934734575300627] means:[array([ 3.46776969, 70.13260977]), array([ 3.5078488 , 71.66350617])] sigma:[array([[  1.32238528,  14.18954649],
       [ 14.18954649, 185.91012994]]), array([[  1.27262438,  13.63188485],
       [ 13.63188485, 181.19953113]])]
Value of G:2
Iteration : 2 - weights:[0.5006526542469939, 0.49934734575300627] means:[array([ 3.46776969, 70.13260977]), array([ 3.5078488 , 71.66350617])] sigma:[array([[  1.32238528,  14.18954649],
       [ 14.18954649, 185.91012994]]), array([[  1.27262438,  13.63188485],
       [ 13.63188485, 181.19953113]])]
Value of G:2
Iteration : 3 - weights:[0.500652654246993

# Testing the Function myFunc (Three Cluster)

##### Below are the Initialization of the Parameters
(1) weights / prob = 
          [0.30514706,0.34926471,0.34558824]

(2) means = 
          [3.4459639,69.8433735]
          [3.6217053,72.1578947]
          [3.3893617,70.5531915]

(3) covariances / sigma = 
          [1.2877935,13.842302]
          [13.8423020,183.208932]

In [140]:
#Three Cluster Initialization
init_weights = [0.30514706,0.34926471,0.34558824]
init_means = [np.array([3.4459639,69.8433735]),np.array([3.6217053,72.1578947]),np.array([3.3893617,70.5531915])]
init_covs = [np.array([[1.2877935,13.842302],[13.8423020,183.208932]])]*3

itmax = 20
K = len(init_weights) #Number of Clusters
out = myEM(data=X, G=K, init_weights = init_weights, init_means = init_means, init_covariances = init_covs, maxiter=itmax)
print ("------------------------------------------------------------------------------------------------------------")
print ("weight: {}".format(out['weights']))
print ("means: {}".format(out['means']))
print ("sigma: {}".format(out['covs']))
print ("------------------------------------------------------------------------------------------------------------")

Value of G:3
Iteration : 0 - weights:[0.3051408625243811, 0.34929799829118785, 0.34556113918443054] means:[array([ 3.44630616, 69.85042535]), array([ 3.62064297, 72.13856517]), array([ 3.39011184, 70.56633575])] sigma:[array([[  1.33024023,  14.32319609],
       [ 14.32319609, 187.80379665]]), array([[  1.22398361,  13.05405907],
       [ 13.05405907, 173.45829482]]), array([[  1.31526944,  14.22048043],
       [ 14.22048043, 189.07832165]])]
Value of G:3
Iteration : 1 - weights:[0.3051408625243811, 0.34929799829118785, 0.34556113918443054] means:[array([ 3.44630616, 69.85042535]), array([ 3.62064297, 72.13856517]), array([ 3.39011184, 70.56633575])] sigma:[array([[  1.33024023,  14.32319609],
       [ 14.32319609, 187.80379665]]), array([[  1.22398361,  13.05405907],
       [ 13.05405907, 173.45829482]]), array([[  1.31526944,  14.22048043],
       [ 14.22048043, 189.07832165]])]
Value of G:3
Iteration : 2 - weights:[0.3051408625243811, 0.34929799829118785, 0.34556113918443054] means:

Iteration : 19 - weights:[0.3051408625243811, 0.34929799829118785, 0.34556113918443054] means:[array([ 3.44630616, 69.85042535]), array([ 3.62064297, 72.13856517]), array([ 3.39011184, 70.56633575])] sigma:[array([[  1.33024023,  14.32319609],
       [ 14.32319609, 187.80379665]]), array([[  1.22398361,  13.05405907],
       [ 13.05405907, 173.45829482]]), array([[  1.31526944,  14.22048043],
       [ 14.22048043, 189.07832165]])]
------------------------------------------------------------------------------------------------------------
weight: [0.3051408625243811, 0.34929799829118785, 0.34556113918443054]
means: [array([ 3.44630616, 69.85042535]), array([ 3.62064297, 72.13856517]), array([ 3.39011184, 70.56633575])]
sigma: [array([[  1.33024023,  14.32319609],
       [ 14.32319609, 187.80379665]]), array([[  1.22398361,  13.05405907],
       [ 13.05405907, 173.45829482]]), array([[  1.31526944,  14.22048043],
       [ 14.22048043, 189.07832165]])]
------------------------------------