In [126]:
import warnings
warnings.filterwarnings("ignore") 
from IPython.core.display import display, HTML

import time

import pandas as pd
import numpy as np
import scipy.stats as scs
from scipy.stats import multivariate_normal as mvn
import sklearn.mixture as mix


In [127]:
# Code sourced from:
#    http://people.duke.edu/~ccc14/sta-663/EMAlgorithm.html

def _gmm_weighted_pdfs(xs, pis, mus, sigmas):
    """Return a (k, n) array whose [j, i] entry is pis[j] * N(xs[i] | mus[j], sigmas[j])."""
    k = len(pis)
    weighted = np.zeros((k, xs.shape[0]))
    for j in range(k):
        # Build the frozen distribution once per component and evaluate it on
        # every row of xs in a single call instead of once per sample.
        weighted[j] = pis[j] * mvn(mus[j], sigmas[j]).pdf(xs)
    return weighted


def em_gmm_orig(xs, pis, mus, sigmas, tol=0.01, max_iter=100):
    """Fit a Gaussian mixture model to ``xs`` with the EM algorithm.

    Each iteration performs an E-step (component responsibilities for every
    sample) followed by an M-step (re-estimation of the mixing weights, means
    and covariances), then prints the log-likelihood of the data under the
    updated parameters.

    Parameters
    ----------
    xs : array-like, shape (n, p)
        Observations, one sample per row.
    pis : sequence of length k
        Initial mixing weights.
    mus : array-like, shape (k, p)
        Initial component means.
    sigmas : array-like, shape (k, p, p)
        Initial component covariance matrices.
    tol : float, optional
        Currently unused. The convergence-based early stop is disabled, so the
        loop always runs ``max_iter`` iterations; the parameter is kept so that
        existing callers keep working.
    max_iter : int, optional
        Number of EM iterations to run.

    Returns
    -------
    ll_new : float
        Log-likelihood ``sum_i log(sum_j pis[j] * N(xs[i] | mus[j], sigmas[j]))``
        after the last iteration (``0.0`` if ``max_iter`` is 0).
    pis : ndarray, shape (k,)
        Updated mixing weights (the inputs are returned unchanged if
        ``max_iter`` is 0).
    mus : ndarray, shape (k, p)
        Updated component means.
    sigmas : ndarray, shape (k, p, p)
        Updated component covariances.
    """
    print ("Max Iteration is:{}".format(max_iter))

    xs = np.asarray(xs, dtype=float)
    n, p = xs.shape
    k = len(pis)

    # Defined up front so the return statement is valid even when max_iter == 0.
    ll_new = 0.0
    for iteration in range(max_iter):
        print('\nIteration: ', iteration)
        print()

        # E-step: responsibilities, normalised over components for each sample.
        ws = _gmm_weighted_pdfs(xs, pis, mus, sigmas)
        ws /= ws.sum(0)

        # M-step
        resp_mass = ws.sum(axis=1)  # total responsibility held by each component
        pis = resp_mass / n
        mus = (ws @ xs) / resp_mass[:, np.newaxis]

        sigmas = np.zeros((k, p, p))
        for j in range(k):
            centred = xs - mus[j]
            # Responsibility-weighted sum of outer products, valid for any p
            # (the previous version reshaped each residual to a fixed (2, 1)).
            sigmas[j] = (ws[j][:, np.newaxis] * centred).T @ centred / resp_mass[j]

        # Log-likelihood of the data under the freshly updated parameters.
        ll_new = np.log(_gmm_weighted_pdfs(xs, pis, mus, sigmas).sum(0)).sum()
        print(f'log_likelihood: {ll_new:3.4f}')
        # NOTE: no early stop on |ll_new - ll_old| < tol; the loop deliberately
        # runs for the full max_iter iterations.

    return ll_new, pis, mus, sigmas

In [128]:
# NOTE(review): within the cells visible here, `sigmas` is only assigned further
# down (in the cell that calls em_gmm_orig), so on a fresh top-to-bottom run this
# display would raise NameError -- confirm whether this cell belongs after the fit.
sigmas

array([[[  1.15930139,  11.84677091],
        [ 11.84677091, 149.42192184]],

       [[  0.28025611,   1.65973398],
        [  1.65973398,  40.93555667]]])

In [129]:
def normalize_data(X):
    """Min-max scale every column of ``X`` independently to the range [0, 1].

    Parameters
    ----------
    X : array-like, shape (n, d)
        Numeric data, one sample per row. Any number of columns is accepted
        (the previous version handled exactly two).

    Returns
    -------
    ndarray, shape (n, d)
        A new float array with ``(X - column_min) / (column_max - column_min)``
        applied per column. A column whose values are all equal maps to 0.0
        instead of producing NaN from a 0/0 division. ``X`` is not modified.
    """
    X = np.asarray(X, dtype=float)
    col_min = X.min(axis=0)
    col_range = X.max(axis=0) - col_min
    # Constant columns have zero range; divide by 1 so they come out as 0.0.
    safe_range = np.where(col_range == 0, 1.0, col_range)
    return (X - col_min) / safe_range

In [130]:
# Load the data.
# np.loadtxt reads a whitespace-delimited numeric text file into an array; the
# path is relative to the notebook's working directory.
# (presumably the Old Faithful geyser data set, going by the file name -- confirm)
X = np.loadtxt('data/Faithful.txt')
# Min-max scaled copy of X. It is computed here but not the array passed on as `xs`.
X_norm = normalize_data(X)
# `xs` is the array given to em_gmm_orig later on; it currently aliases the raw,
# un-normalised data. The trailing comment records the alternative (X_norm).
xs = X#X_norm

In [136]:
# Driver cell: sets hard-coded initial mixture parameters and runs em_gmm_orig on
# `xs` for 40 iterations, rebinding `pis`, `mus` and `sigmas` to the fitted values.
#
# NOTE(review): `data` is not defined in any cell visible here, so this cell
# depends on kernel state from elsewhere and may raise NameError on a fresh
# Restart & Run All -- confirm where `data` comes from.
# `height`, `n`, `_mus`, `_sigmas` and `_pis` are only referenced by lines in this
# cell that are commented out (the random initial guess and the synthetic-data
# generator), so they look like leftovers from a height/gender example -- verify
# before removing, since other cells may still read them.
height = data['Height (in)']
n = len(height)

# Ground truthish
_mus = np.array([[0, data.groupby('Gender').mean().iat[0, 0]], 
                 [data.groupby('Gender').mean().iat[1, 0], 0]])
_sigmas = np.array([[[5, 0], [0, 5]], 
                    [[5, 0],[0, 5]]])
_pis = np.array([0.5, 0.5]) # priors

# initial random guesses for parameters
# (the seed is set, but no un-commented statement in this cell draws random numbers)
np.random.seed(0)

#pis = np.random.random(2)
#pis /= pis.sum()
#mus = np.random.random((2,2))
#sigmas = np.array([np.eye(2)] * 2) * height.std()

# Hard-coded starting point: near-equal mixing weights, two nearby means, and the
# same 2x2 covariance matrix replicated for both components (the `*2`).
pis = [0.50062804,0.49937196]
mus = np.array([[3.467750,70.132353],[3.5078162,71.6617647]])
sigmas = np.array(([[[1.2975376,13.9110994],[13.911099,183.559040]]]*2))

# generate our noisy x values
#xs = np.concatenate([np.random.multivariate_normal(mu, sigma, int(pi*n))
#                    for pi, mu, sigma in zip(_pis, _mus, _sigmas)])

# Run EM; the initial `pis`, `mus` and `sigmas` are overwritten by the fitted
# values, so re-running this cell always restarts from the literals above.
ll, pis, mus, sigmas = em_gmm_orig(xs, pis, mus, sigmas, max_iter=40)

# In the below plots the white dots represent the observed heights.
# NOTE(review): no plotting code is visible in this section, and `xs` is presumably
# the data loaded from data/Faithful.txt rather than heights -- confirm this note.

Max Iteration is:40

Iteration:  0

log_likelihood: -1289.7026

Iteration:  1

log_likelihood: -1289.6482

Iteration:  2

log_likelihood: -1289.5968

Iteration:  3

log_likelihood: -1289.5405

Iteration:  4

log_likelihood: -1289.4742

Iteration:  5

log_likelihood: -1289.3936

Iteration:  6

log_likelihood: -1289.2943

Iteration:  7

log_likelihood: -1289.1710

Iteration:  8

log_likelihood: -1289.0167

Iteration:  9

log_likelihood: -1288.8215

Iteration:  10

log_likelihood: -1288.5709

Iteration:  11

log_likelihood: -1288.2418

Iteration:  12

log_likelihood: -1287.7962

Iteration:  13

log_likelihood: -1287.1660

Iteration:  14

log_likelihood: -1286.2202

Iteration:  15

log_likelihood: -1284.6797

Iteration:  16

log_likelihood: -1281.8691

Iteration:  17

log_likelihood: -1275.8796

Iteration:  18

log_likelihood: -1260.5699

Iteration:  19

log_likelihood: -1223.9573

Iteration:  20

log_likelihood: -1195.3494

Iteration:  21

log_likelihood: -1183.0144

Iteration:  22

log_l

In [137]:
# Component means as returned by the em_gmm_orig call above (one row per component).
mus

array([[ 2.03638845, 54.47851638],
       [ 4.28966197, 79.96811517]])

In [138]:
# Component covariance matrices as returned by the em_gmm_orig call above.
sigmas

array([[[ 0.06916767,  0.43516762],
        [ 0.43516762, 33.69728207]],

       [[ 0.16996844,  0.94060932],
        [ 0.94060932, 36.04621131]]])

In [139]:
# Mixing weights as returned by the em_gmm_orig call above.
pis

array([0.35587286, 0.64412714])