In [0]:
from scipy.stats import multivariate_normal
from scipy.special import logsumexp
from glob import glob
import soundfile as sf
from os import path
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so the random frame subsets drawn with
# np.random.choice in GaussianHMM.init_gaussian_params are reproducible.
np.random.seed(seed=273)

In [0]:
class GaussianHMM(object):
    """
    Gaussian Hidden Markov Model.
    ------
    Left-to-right HMM whose states emit from diagonal-covariance Gaussians.
    Inference (forward, backward, viterbi) is carried out in log-space.
    ------
    attributes (created by the init_* methods and updated by train):
    mu: 2d-array of shape (n_states, n_dims): per-state means
    sigma: 2d-array of shape (n_states, n_dims): per-state diagonal variances
    pi: 1d-array of shape n_states: start probabilities
    A: 2d-array of shape (n_states, n_states): transition probabilities
    """
    def __init__(self, n_states, n_dims):
        """
        Set up Gaussian HMM
        ------
        input:
        n_states: number of states in HMM (note: one of them will be a final state)
        n_dims: number of dimensions (13 MFCCs for this assignment)
        """
        self.n_states = n_states
        self.n_dims = n_dims

    def init_gaussian_params(self, X):
        """
        Initialize Gaussian parameters
        ------
        input:
        X: list of 2d-arrays with shapes (Ti, n_dims) for example i: each is a matrix of MFCCs for a digit utterance
        ------
        initialize mu and sigma for each state's Gaussian (where sigma is a diagonal covariance)
        from the mean/variance of a random subset of (at most 50) frames
        """
        X_concat = np.concatenate(X)
        n_frames = len(X_concat)
        # Sampling without replacement raises if more frames are requested
        # than exist, so cap the subset size at the number of frames.
        n_samples = min(50, n_frames)
        self.mu = np.zeros((self.n_states, self.n_dims))
        self.sigma = np.zeros((self.n_states, self.n_dims))
        for s in range(self.n_states):
            X_subset = X_concat[np.random.choice(n_frames, size=n_samples, replace=False)]
            self.mu[s] = X_subset.mean(axis=0)
            self.sigma[s] = X_subset.var(axis=0)

    def init_hmm_params(self):
        """
        Initialize HMM parameters
        ------
        initialize pi (starting probability vector) and A (transition probabilities)
        pi puts all mass on state 0; every non-final state either stays or moves
        to the next state with probability .5; the final state only self-loops
        """
        self.pi = np.zeros(self.n_states)
        self.pi[0] = 1.
        self.A = np.zeros((self.n_states, self.n_states))
        for s in range(self.n_states - 1):
            self.A[s, s:s + 2] = .5
        self.A[-1, -1] = 1.

    def get_emissions(self, x):
        """
        Compute Gaussian log-density at x for a diagonal model.
        ------
        input:
        x: 2d-array of shape (T, n_dims): features for a single example
        ------
        output:
        log_B: 2d-array of shape (n_states, T): log emission density of frame t under state s
        """
        T, _ = x.shape
        log_B = np.zeros((self.n_states, T))
        for s in range(self.n_states):
            log_B[s] = multivariate_normal.logpdf(x, mean=self.mu[s], cov=np.diag(self.sigma[s]))
        return log_B

    def forward(self, log_pi, log_A_t, log_B):
        """
        Forward algorithm.
        ------
        input:
        log_pi: 1d-array of shape n_states: log of start probability vector
        log_A_t: 2d-array of shape (n_states, n_states): *transposed* log of transition probability matrix
        log_B: 2d-array of shape (n_states, Tx): log of emission probabilities (Note: Tx depends on x)
        ------
        output:
        log_alpha: 2d-array of shape (n_states, Tx): log probability to state i at time t
        """
        _, T = log_B.shape
        log_alpha = np.zeros(log_B.shape)
        for t in range(T):
            if t == 0:
                log_alpha[:, t] = log_pi + log_B[:, 0]
            else:
                # Row j of log_A_t holds log A[i, j] for every source state i, so
                # broadcasting alpha over rows and reducing axis 1 sums over i.
                log_alpha[:, t] = logsumexp(log_alpha[:, t - 1] + log_A_t, axis=1) + log_B[:, t]
        return log_alpha

    def backward(self, log_A, log_B):
        """
        Backward algorithm.
        ------
        input:
        log_A: 2d-array of shape (n_states, n_states): log of transition probability matrix
        log_B: 2d-array of shape (n_states, Tx): log of emission probabilities (Note: Tx depends on x)
        ------
        output:
        log_beta: 2d-array of shape (n_states, Tx): log probability from state i at time t
        """
        _, T = log_B.shape
        log_beta = np.zeros(log_B.shape)
        for t in range(T - 1, -1, -1):
            if t == T - 1:
                log_beta[:, t] = 0 # log(1) = 0
            else:
                # Sum over destination states j (axis 1) of A[i, j] * B[j, t+1] * beta[j, t+1].
                log_beta[:, t] = logsumexp(log_A + log_B[:, t + 1] + log_beta[:, t + 1], axis=1)
        return log_beta

    def viterbi(self, log_pi, log_A, log_B):
        """
        Use viterbi algorithm to find the best path and associated log probability.
        ------
        input:
        log_pi: 1d-array of shape n_states: log of start probability vector
        log_A: 2d-array of shape (n_states, n_states): log of transition probability matrix
        log_B: 2d-array of shape (n_states, Tx): log of emission probabilities (Note: Tx depends on x)
        ------
        output:
        q: 1d-array of length T: optimal state sequence for observed sequence
        log_prob: log probability of observed sequence along the optimal state sequence
        ------
        raises:
        ValueError: if log_B contains no time steps
        """
        _, T = log_B.shape
        if T == 0:
            raise ValueError("cannot decode an empty observation sequence")

        log_delta = np.zeros(log_B.shape)
        # psi[t, j] is the best predecessor (state at time t - 1) of state j at time t.
        psi = np.empty((T, self.n_states), dtype='int') # backtrace
        for t in range(T):
            if t == 0:
                log_delta[:, t] = log_pi + log_B[:, 0]
                psi[t] = np.arange(self.n_states) # no predecessor at t = 0; never read
            else:
                temp = log_delta[:, t - 1][:, None] + log_A
                log_delta[:, t] = temp.max(axis=0) + log_B[:, t]
                psi[t] = temp.argmax(axis=0)

        q = np.zeros(T, dtype=np.int32)
        q[-1] = np.argmax(log_delta[:, -1])
        log_prob = np.max(log_delta[:, -1])
        for t in range(T - 2, -1, -1):
            # The predecessor of the state chosen at t + 1 is stored in psi[t + 1],
            # not psi[t]; indexing psi[t] shifts the whole path by one step.
            q[t] = psi[t + 1, q[t + 1]]

        return q, log_prob

    def score(self, x):
        """
        Use forward-backward algorithm to
        compute log probability and posteriors.
        ------
        input:
        x :2d-array of shape (T, n_dims): MFCCs for a single example
        ------
        output:
        log_prob :scalar: log probability of observed sequence
        log_alpha :2d-array of shape (n_states, T): log prob of getting to state at time t from start
        log_beta :2d-array of shape (n_states, T): log prob of getting from state at time t to end
        gamma :2d-array of shape (n_states, T): state posterior probability
        xi :2d-array of shape (n_states, n_states): transition posteriors summed over time,
            then divided row-wise by their row sums (row sums floored at 1e-1)
        """
        T = len(x)

        # pi and A contain structural zeros; log(0) = -inf is the intended
        # value, so silence the divide-by-zero warning NumPy would emit.
        with np.errstate(divide='ignore'):
            log_pi = np.log(self.pi) # starting log probabilities
            log_A = np.log(self.A) # transition log probabilities
        log_B = self.get_emissions(x) # emission log probabilities

        # forward() expects the transposed transition matrix
        log_alpha = self.forward(log_pi, log_A.T, log_B)
        log_beta = self.backward(log_A, log_B)

        log_prob = logsumexp(log_alpha[:, -1])

        # posteriors, no longer in log-space
        gamma = np.exp(log_alpha + log_beta - log_prob)

        # xi[t, i, j]: posterior of the transition i -> j between t and t + 1
        xi = np.zeros((T - 1, self.n_states, self.n_states))
        for t in range(T - 1):
            xi[t] = log_alpha[:, t][:, None] + log_beta[:, t + 1] + log_A + log_B[:, t + 1]
        xi -= log_prob
        xi = np.exp(xi)

        xi = xi.sum(axis=0) # sum over time
        xi /= xi.sum(axis=1, keepdims=True).clip(1e-1) # normalize by state probabilities (sum transitions over j)

        return log_prob, log_alpha, log_beta, gamma, xi

    def train(self, X):
        """
        Estimate model parameters (one re-estimation pass over X).
        ------
        input:
        X: list of 2d-arrays of shape (Tx, n_dims): list of single digit MFCC features
        ------
        output:
        plog: negative sum of the log probabilities of all examples, computed
              with the parameters in effect *before* this update
        ------
        update model parameters (A, mu, sigma)
        """
        stats = {
            "gamma": np.zeros((self.n_states, 1)),
            "A": np.zeros((self.n_states, self.n_states)),
            "X": np.zeros((self.n_states, self.n_dims)),
            "X**2": np.zeros((self.n_states, self.n_dims))
        }
        plog = 0
        for x in X:
            log_prob, _, _, gamma, xi = self.score(x)

            stats["gamma"] += gamma.sum(axis=1, keepdims=True)
            stats["A"] += xi
            stats["X"] += gamma.dot(x)
            stats["X**2"] += gamma.dot(x**2)
            plog += -log_prob

        # Add-one smoothing: keeps the occupancy denominators non-zero and
        # gives every non-final state some self-transition mass.
        stats["gamma"] += 1
        stats["A"][:-1, :-1] += np.eye(self.n_states - 1)

        # update means
        self.mu = stats["X"] / stats["gamma"]
        # update diagonal covariances: E[x^2] - mu^2, floored to stay positive
        self.sigma = stats["X**2"] / stats["gamma"] - self.mu ** 2
        self.sigma = self.sigma.clip(1e-1)

        # Entries of A that are exactly 0 or 1 are treated as fixed structure;
        # all other entries are replaced by the accumulated statistics.
        fixed = (self.A == 0.0) | (self.A == 1.0)
        self.A = np.where(fixed, self.A, stats["A"]) # update transition probabilities
        self.A /= self.A.sum(axis=1, keepdims=True) # normalize transition probabilities

        return plog

In [0]:
# Load the train/test splits from the local archive.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only use this with a trusted "mfccs.npz".
dataset = np.load("mfccs.npz", allow_pickle=True)
Xtrain = dataset["Xtrain"]
Ytrain = dataset["Ytrain"]
Xtest = dataset["Xtest"]
Ytest = dataset["Ytest"]

# Expected accuracies (the driver reports accuracy, not error rate):
# 15 states/15 iterations: 0.9714 forward, 0.9679 viterbi
# 25 states/25 iterations: 0.9821 forward, 0.9821 viterbi
# 50 states/50 iterations: 0.9804 forward, 0.9804 viterbi

In [0]:
def driver(n_states, n_dims, n_iter, plog_convergence=True):
    """
    Train one GaussianHMM per digit, then classify the test set.
    ------
    input:
    n_states: number of HMM states per digit model
    n_dims: feature dimensionality passed to GaussianHMM
    n_iter: maximum number of training iterations per digit
    plog_convergence: if True, stop training a digit as soon as the negative
                      log-likelihood returned by train() increases
    ------
    Reads the module-level Xtrain/Ytrain/Xtest/Ytest, prints training
    progress and per-digit accuracies, and shows the forward and Viterbi
    confusion matrices with matplotlib. Returns nothing.
    """
    print("n_states: {}, n_dims: {}, n_iter: {}".format(n_states, n_dims, n_iter))
    model = dict()
    digits = range(10)
    plogs = []
    for digit in digits:
        print("\nTraining HMM for digit %d" % digit)
        Xtrain_digit = [x for x, y in zip(Xtrain, Ytrain) if y == digit]
        hmm = GaussianHMM(n_states=n_states, n_dims=n_dims)
        hmm.init_gaussian_params(Xtrain_digit)
        hmm.init_hmm_params()
        model[digit] = hmm

        # plog is pre-set so that it is defined even when n_iter == 0.
        plog = prev_plog = float('inf')
        for i in range(n_iter):
            print("Starting iteration %d..." % i)
            plog = hmm.train(Xtrain_digit)
            if i % 10 == 0 or i == n_iter - 1:
                print('plog:', plog)
            if plog_convergence and plog > prev_plog: # break out early if possible
                print('prev plog: {}, plog: {}. Terminating training...'.format(prev_plog, plog))
                break
            prev_plog = plog
        plogs.append(plog) # the plog for the particular digit

    print('\n\nPlogs: ', plogs)

    print("\n\nTesting HMM")
    # The log parameters do not depend on the test utterance, so compute them
    # once per digit. log(0) = -inf is intended for the structural zeros.
    with np.errstate(divide='ignore'):
        log_params = {
            digit: (np.log(model[digit].pi), np.log(model[digit].A))
            for digit in digits
        }

    forward_confusion, viterbi_confusion = np.zeros((10, 10)), np.zeros((10, 10))
    for x, y in zip(Xtest, Ytest):
        forward_scores, viterbi_scores = [], []
        for digit in digits:
            log_pi, log_A = log_params[digit]
            log_B = model[digit].get_emissions(x)

            # forward() expects the transposed transition matrix
            log_alpha = model[digit].forward(log_pi, log_A.T, log_B)
            forward_log_prob = logsumexp(log_alpha[:, -1])
            _, viterbi_log_prob = model[digit].viterbi(log_pi, log_A, log_B)

            forward_scores.append(forward_log_prob)
            viterbi_scores.append(viterbi_log_prob)

        # Highest-scoring digit; argmax keeps the first one on ties.
        forward_top_digit = digits[int(np.argmax(forward_scores))]
        viterbi_top_digit = digits[int(np.argmax(viterbi_scores))]

        forward_confusion[y, forward_top_digit] += 1.
        viterbi_confusion[y, viterbi_top_digit] += 1.

    # Per-digit accuracy: correct predictions over the number of test examples of that digit.
    forward_accuracy = np.diag(forward_confusion) / forward_confusion.sum(axis=1)
    viterbi_accuracy = np.diag(viterbi_confusion) / viterbi_confusion.sum(axis=1)

    print("forward accuracy (%.4f)" % forward_accuracy.mean(), forward_accuracy)
    print("viterbi accuracy (%.4f)" % viterbi_accuracy.mean(), viterbi_accuracy)

    plt.matshow(forward_confusion)
    plt.title('Digit forward confusion (acc=%.2f' % (forward_accuracy.mean() * 100.) + '%)')
    plt.show()

    plt.matshow(viterbi_confusion)
    plt.title('Digit Viterbi confusion (acc=%.2f' % (viterbi_accuracy.mean() * 100.) + '%)')
    plt.show()

In [31]:
# 15 states, 13 feature dimensions, at most 15 iterations (early stopping on by default).
driver(n_states=15, n_dims=13, n_iter=15)

n_states: 15, n_dims: 13, n_iter: 15

Training HMM for digit 0
Starting iteration 0...




plog: 302192.1389190609
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 211333.34510669298
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
plog: 210920.20618929426

Training HMM for digit 1
Starting iteration 0...
plog: 306505.165429998
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 225358.80050492406
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
plog: 225027.986223602

Training HMM for digit 2
Starting iteration 0...
plog: 283587.1012854649
Starting iteration 1...
Starting iteration 2...
S



forward accuracy (0.9607) [0.96428571 1.         0.96428571 1.         0.96428571 0.96428571
 0.96428571 0.96428571 0.98214286 0.83928571]
viterbi accuracy (0.9625) [0.96428571 1.         0.96428571 1.         0.96428571 0.96428571
 0.96428571 0.96428571 0.98214286 0.85714286]


In [None]:
# Same configuration, but always run all 15 iterations (no early stopping).
driver(n_states=15, n_dims=13, n_iter=15, plog_convergence=False)

In [32]:
# 25 states, 13 feature dimensions, at most 25 iterations (early stopping on by default).
driver(n_states=25, n_dims=13, n_iter=25)

n_states: 25, n_dims: 13, n_iter: 25

Training HMM for digit 0
Starting iteration 0...




plog: 295453.5788020056
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 209505.03746230176
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
Starting iteration 15...
Starting iteration 16...
Starting iteration 17...
Starting iteration 18...
Starting iteration 19...
Starting iteration 20...
plog: 207606.44408731195
Starting iteration 21...
Starting iteration 22...
Starting iteration 23...
Starting iteration 24...
plog: 207182.94709199085

Training HMM for digit 1
Starting iteration 0...
plog: 306111.1769867881
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 1



forward accuracy (0.9839) [1.         1.         0.98214286 1.         0.96428571 0.94642857
 1.         1.         0.96428571 0.98214286]
viterbi accuracy (0.9839) [1.         1.         0.98214286 1.         0.96428571 0.94642857
 1.         1.         0.96428571 0.98214286]


In [33]:
# Same configuration, but always run all 25 iterations (no early stopping).
driver(n_states=25, n_dims=13, n_iter=25, plog_convergence=False)

n_states: 25, n_dims: 13, n_iter: 25

Training HMM for digit 0
Starting iteration 0...




plog: 300490.08130442165
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 207390.25241929156
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
Starting iteration 15...
Starting iteration 16...
Starting iteration 17...
Starting iteration 18...
Starting iteration 19...
Starting iteration 20...
plog: 206719.29242390598
Starting iteration 21...
Starting iteration 22...
Starting iteration 23...
Starting iteration 24...
plog: 206684.31521644793

Training HMM for digit 1
Starting iteration 0...
plog: 309005.0429751334
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 



forward accuracy (0.9786) [1.         1.         0.98214286 1.         0.83928571 0.98214286
 1.         1.         0.98214286 1.        ]
viterbi accuracy (0.9786) [1.         1.         0.98214286 1.         0.83928571 0.98214286
 1.         1.         0.98214286 1.        ]


In [34]:
# 50 states, 13 feature dimensions, at most 50 iterations (early stopping on by default).
driver(n_states=50, n_dims=13, n_iter=50)

n_states: 50, n_dims: 13, n_iter: 50

Training HMM for digit 0
Starting iteration 0...




plog: 291014.0853725728
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 208484.36235333275
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
Starting iteration 15...
Starting iteration 16...
Starting iteration 17...
Starting iteration 18...
Starting iteration 19...
Starting iteration 20...
plog: 207447.3144217509
Starting iteration 21...
Starting iteration 22...
Starting iteration 23...
Starting iteration 24...
Starting iteration 25...
Starting iteration 26...
Starting iteration 27...
Starting iteration 28...
Starting iteration 29...
Starting iteration 30...
plog: 207044.10212267257
Starting iteration 31...
Starting iteration 32...
Starting iteration 33...
Starting iteration 34...
Starting iteration 35...
Starting iteration 36...
Starting it



forward accuracy (0.9857) [1.         1.         0.98214286 1.         0.96428571 0.98214286
 1.         1.         0.96428571 0.96428571]
viterbi accuracy (0.9857) [1.         1.         0.98214286 1.         0.96428571 0.98214286
 1.         1.         0.96428571 0.96428571]


In [35]:
# Same configuration, but always run all 50 iterations (no early stopping).
driver(n_states=50, n_dims=13, n_iter=50, plog_convergence=False)

n_states: 50, n_dims: 13, n_iter: 50

Training HMM for digit 0
Starting iteration 0...




plog: 291664.1890083609
Starting iteration 1...
Starting iteration 2...
Starting iteration 3...
Starting iteration 4...
Starting iteration 5...
Starting iteration 6...
Starting iteration 7...
Starting iteration 8...
Starting iteration 9...
Starting iteration 10...
plog: 213217.07747073943
Starting iteration 11...
Starting iteration 12...
Starting iteration 13...
Starting iteration 14...
Starting iteration 15...
Starting iteration 16...
Starting iteration 17...
Starting iteration 18...
Starting iteration 19...
Starting iteration 20...
plog: 212831.70577350733
Starting iteration 21...
Starting iteration 22...
Starting iteration 23...
Starting iteration 24...
Starting iteration 25...
Starting iteration 26...
Starting iteration 27...
Starting iteration 28...
Starting iteration 29...
Starting iteration 30...
plog: 212693.42441885555
Starting iteration 31...
Starting iteration 32...
Starting iteration 33...
Starting iteration 34...
Starting iteration 35...
Starting iteration 36...
Starting i



forward accuracy (0.9875) [0.98214286 1.         0.98214286 1.         0.94642857 1.
 1.         1.         0.98214286 0.98214286]
viterbi accuracy (0.9875) [0.98214286 1.         0.98214286 1.         0.94642857 1.
 1.         1.         0.98214286 0.98214286]
