In [126]:
import warnings
warnings.filterwarnings("ignore") 
from IPython.core.display import display, HTML

import time

import pandas as pd
import numpy as np
import scipy.stats as scs
from scipy.stats import multivariate_normal as mvn
import sklearn.mixture as mix


In [122]:
# Code sourced from:
#    http://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html

def em_gmm_orig(xs, pis, mus, sigmas, tol=0.01, max_iter=100):
    """Fit a Gaussian mixture model to ``xs`` with the EM algorithm.

    Runs ``max_iter`` EM iterations and, after each one, prints a small
    diagnostic table followed by the log-likelihood of the data under the
    updated parameters.

    Parameters
    ----------
    xs : ndarray of shape (n, p)
        Observations, one row per sample.
    pis : sequence of length k
        Initial mixing weights; ``len(pis)`` fixes the number of components.
    mus : array-like of shape (k, p)
        Initial component means.
    sigmas : array-like of shape (k, p, p)
        Initial component covariance matrices.
    tol : float, optional
        Intended early-stopping tolerance on the change in log-likelihood.
        Currently unused: the convergence test is deliberately disabled so
        that every call runs all ``max_iter`` iterations.
    max_iter : int, optional
        Number of EM iterations to run.

    Returns
    -------
    ll_new : float
        Log-likelihood of ``xs`` under the final parameters.
    pis : ndarray of shape (k,)
        Fitted mixing weights.
    mus : ndarray of shape (k, p)
        Fitted component means.
    sigmas : ndarray of shape (k, p, p)
        Fitted component covariance matrices.

    Notes
    -----
    The per-iteration table is a compact diagnostic, not a full parameter
    dump: column ``c`` of the ``mus`` row is ``mus[c-1, c-1]`` (the diagonal
    of the mean matrix) and column ``c`` of the ``sigs`` row is the smallest
    diagonal entry of ``sigmas[c-1]``. It has ``min(k, p)`` columns.
    """
    print("Max Iteration is:{}".format(max_iter))

    n, p = xs.shape
    k = len(pis)

    for iteration in range(max_iter):
        print('\nIteration: ', iteration)
        print()

        # E-step: responsibility of component j for every sample.
        # The frozen distribution is built once per component and evaluated
        # on all rows at once instead of once per sample.
        ws = np.zeros((k, n))
        for j in range(k):
            ws[j] = pis[j] * mvn(mus[j], sigmas[j]).pdf(xs)
        ws /= ws.sum(0)

        # M-step: re-estimate weights, means and covariances from the
        # responsibilities.
        resp_totals = ws.sum(axis=1)  # effective sample count per component
        pis = resp_totals / n
        mus = (ws @ xs) / resp_totals[:, np.newaxis]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            centered = xs - mus[j]
            # Responsibility-weighted scatter matrix; works for any p
            # (the previous reshape to (2, 1) only worked for p == 2).
            weighted = ws[j][:, np.newaxis] * centered
            sigmas[j] = weighted.T @ centered / resp_totals[j]

        # Diagnostic table (see Notes in the docstring for what it shows).
        shown = min(k, p)
        summary = pd.DataFrame(
            {
                'mus': np.diag(mus)[:shown],
                'sigs': [np.diag(sigmas[c]).min() for c in range(shown)],
            },
            index=range(1, shown + 1),
        )
        print(summary.T)

        # Log-likelihood of the data under the updated parameters.
        mixture_density = np.zeros(n)
        for j in range(k):
            mixture_density += pis[j] * mvn(mus[j], sigmas[j]).pdf(xs)
        ll_new = np.log(mixture_density).sum()
        print(f'log_likelihood: {ll_new:3.4f}')

        # NOTE: early stopping on the change in log-likelihood between
        # iterations (against `tol`) is intentionally disabled here.

    return ll_new, pis, mus, sigmas

In [97]:
# Inspect the component covariance matrices currently held in `sigmas`.
# NOTE(review): the only visible assignments to `sigmas` are in the EM cell
# further down, and this cell's execution count (97) predates that cell's
# (124), so this display relies on kernel state left over from an earlier run.
sigmas

array([[[ 4.42442672,  2.07290778],
        [ 2.07290778,  6.88369117]],

       [[ 7.17794316, -2.86128471],
        [-2.86128471,  3.13927751]]])

In [98]:
def normalize_data(X):
    """Min-max scale every column of ``X`` to the range [0, 1].

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to scale. It is not modified.

    Returns
    -------
    ndarray of shape (n_samples, n_features)
        Float array where each column is ``(x - min) / (max - min)`` using
        that column's own minimum and maximum. A constant column (zero
        range) is mapped to all zeros instead of producing NaN.
    """
    X = np.asarray(X, dtype=float)

    col_min = X.min(axis=0)
    col_range = X.max(axis=0) - col_min

    # Avoid 0/0 for constant columns: their numerator is already all zeros,
    # so dividing by 1 yields a column of zeros.
    safe_range = np.where(col_range == 0, 1.0, col_range)

    return (X - col_min) / safe_range

In [123]:
# Load the data set from a whitespace-delimited text file into a float array.
# NOTE(review): presumably two columns per row, since normalize_data indexes
# columns 0 and 1 — confirm against the file.
X = np.loadtxt('data/Faithful.txt')
# Min-max scaled copy of X; computed here but not used by the line below.
X_norm = normalize_data(X)
# EM is run on the raw data; the trailing comment marks the scaled alternative.
xs = X#X_norm

In [124]:
# Run EM on `xs` from fixed starting parameters and keep the fitted values.
#
# NOTE(review): `data` is not defined in any visible cell, so on a fresh kernel
# the next line raises NameError. It looks like a leftover from a
# height-by-gender example — confirm whether it is still needed.
height = data['Height (in)']
n = len(height)

# Ground truthish
# NOTE(review): _mus, _sigmas and _pis are only referenced by the commented-out
# sampling code near the end of this cell; nothing active here reads them.
_mus = np.array([[0, data.groupby('Gender').mean().iat[0, 0]], 
                 [data.groupby('Gender').mean().iat[1, 0], 0]])
_sigmas = np.array([[[5, 0], [0, 5]], 
                    [[5, 0],[0, 5]]])
_pis = np.array([0.5, 0.5]) # priors

# Seed the legacy global NumPy RNG.
# NOTE(review): no active line in this cell draws random numbers (the random
# initialisation below is commented out), so the seed has no visible effect.
np.random.seed(0)

# Disabled: random initial guesses for the parameters.
#pis = np.random.random(2)
#pis /= pis.sum()
#mus = np.random.random((2,2))
#sigmas = np.array([np.eye(2)] * 2) * height.std()

# Starting values actually used: equal weights, two nearby means, and the same
# covariance matrix repeated for both components.
pis = [0.5,0.5]
mus = np.array([[3.467750,70.132353],[3.5078162,71.6617647]])
sigmas = np.array(([[[1.2975376,13.9110994],[13.911099,183.559040]]]*2))

# Disabled: draw synthetic samples from the "ground truth" mixture above.
#xs = np.concatenate([np.random.multivariate_normal(mu, sigma, int(pi*n))
#                    for pi, mu, sigma in zip(_pis, _mus, _sigmas)])

# Run 20 EM iterations. This rebinds pis, mus and sigmas to the fitted values;
# re-running the whole cell restarts from the fixed starting values above.
ll, pis, mus, sigmas = em_gmm_orig(xs, pis, mus, sigmas, max_iter=20)

# In the below plots the white dots represent the observed heights.
# NOTE(review): no plotting code follows in the visible notebook; this remark
# appears to be stale.

Max Iteration is:20

Iteration:  0

             1          2
mus   3.467744  71.662546
sigs  1.322414   1.272658
log_likelihood: -1289.7026

Iteration:  1

             1          2
mus   3.462917  71.736508
sigs  1.329423   1.265211
log_likelihood: -1289.6482

Iteration:  2

             1          2
mus   3.456646  71.839409
sigs  1.334623   1.259302
log_likelihood: -1289.5968

Iteration:  3

             1          2
mus   3.449252  71.964966
sigs  1.340099   1.252786
log_likelihood: -1289.5405

Iteration:  4

             1          2
mus   3.440629  72.112725
sigs  1.346118   1.245272
log_likelihood: -1289.4742

Iteration:  5

             1          2
mus   3.430587  72.283691
sigs  1.352709   1.236560
log_likelihood: -1289.3936

Iteration:  6

             1          2
mus   3.418894  72.479646
sigs  1.359851   1.226427
log_likelihood: -1289.2944

Iteration:  7

             1          2
mus   3.405270  72.703040
sigs  1.367501   1.214585
log_likelihood: -1289.1712

Iteration: 

In [125]:
# Show the component means returned by em_gmm_orig in the previous cell
# (one row per mixture component).
mus

array([[ 2.73476657, 61.55115711],
       [ 4.25146916, 80.37538348]])